"""KuaiDaiLi free-proxy crawler plugin.

Builds on BaseHTTPPlugin, which provides:

- get_headers(url): Referer, Sec-Fetch-*, sec-ch-ua, and an API-vs-HTML Accept header.
- httpx AsyncClient / sync Client, with optional HTTP/2 (via the "h2" extra).
- On 403/429/503/520-523/525/567 or request errors, a retry through curl_cffi
  with chrome124 impersonation.
- POST: Origin, Referer, and Content-Type headers for form posts.
- The kuaidaili and ip3366 plugins forward get_headers(url=...).
"""
import asyncio
import random
import re
from typing import List, Optional

from bs4 import BeautifulSoup

from app.core.log import logger
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin

VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")
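
# A minimal sketch of the fetch fallback described in the module docstring.
# BaseHTTPPlugin is assumed to behave roughly like this; this helper and its
# local imports are illustrative only and are not used by the plugin below.
async def _fetch_with_fallback_sketch(url: str, headers: dict, timeout: float = 15) -> Optional[str]:
    import httpx  # primary client; http2=True requires the optional "h2" extra
    from curl_cffi.requests import AsyncSession  # fallback with a browser TLS fingerprint

    retry_statuses = {403, 429, 503, 520, 521, 522, 523, 525, 567}
    try:
        async with httpx.AsyncClient(http2=True, timeout=timeout) as client:
            resp = await client.get(url, headers=headers)
            if resp.status_code not in retry_statuses:
                return resp.text
    except httpx.HTTPError:
        pass  # request error: fall through to the impersonating client
    # Retry with curl_cffi impersonating Chrome 124, as the docstring says the
    # base plugin does on blocked status codes or request errors.
    async with AsyncSession(impersonate="chrome124") as session:
        resp = await session.get(url, headers=headers, timeout=timeout)
        return resp.text if resp.status_code == 200 else None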


class KuaiDaiLiPlugin(BaseHTTPPlugin):
    default_config = {"max_pages": 5}
    name = "kuaidaili"
    display_name = "快代理"
    description = "Crawls free proxies from the KuaiDaiLi website"

    def __init__(self):
        super().__init__()
        # The fps/dps list pages currently still return 200; inha/intr often
        # return 567 (anti-bot), so they sit last as a fallback.
        self.urls = [
            "https://www.kuaidaili.com/free/fps/",
            "https://www.kuaidaili.com/free/dps/",
            "https://www.kuaidaili.com/free/inha/1/",
            "https://www.kuaidaili.com/free/intr/1/",
        ]

    def get_headers(self, url: Optional[str] = None, **kwargs) -> dict:
        # Desktop-browser navigation headers layered over whatever the base adds.
        headers = super().get_headers(url=url, **kwargs)
        headers["Referer"] = "https://www.kuaidaili.com/free/"
        headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
        headers["Accept-Encoding"] = "gzip, deflate"
        headers["Accept-Language"] = "zh-CN,zh;q=0.9,en;q=0.8"
        headers["Sec-Fetch-Dest"] = "document"
        headers["Sec-Fetch-Mode"] = "navigate"
        headers["Sec-Fetch-Site"] = "same-origin"
        headers["Upgrade-Insecure-Requests"] = "1"
        return headers

    @staticmethod
    def _infer_protocol(texts: List[str]) -> str:
        """Infer the protocol from one row's cell texts (handles the fps/dps/inha layouts)."""
        for t in texts[2:]:
            tl = t.lower().replace(" ", "")
            if tl in VALID_PROTOCOLS:
                return tl
            if "http(s)" in tl or tl == "http/https":
                return "http"
            if "socks5" in tl:
                return "socks5"
            if "socks4" in tl:
                return "socks4"
        # Default when no protocol cell is recognized.
        return "http"

    def _parse_table(self, table) -> List[ProxyRaw]:
        out: List[ProxyRaw] = []
        for row in table.find_all("tr"):
            tds = row.find_all("td")
            if len(tds) < 2:
                continue
            texts = [td.get_text(strip=True) for td in tds]
            ip = texts[0]
            port_s = texts[1]
            if not re.match(r"^\d+\.\d+\.\d+\.\d+$", ip):
                continue
            if not port_s.isdigit() or not (1 <= int(port_s) <= 65535):
                continue
            protocol = self._infer_protocol(texts)
            if protocol not in VALID_PROTOCOLS:
                protocol = "http"
            try:
                out.append(ProxyRaw(ip, int(port_s), protocol))
            except ValueError:
                continue
        return out
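
    # Illustration (hypothetical row markup):
    #   <tr><td>1.2.3.4</td><td>8080</td><td>elite</td><td>HTTP(S)</td></tr>
    # parses to ProxyRaw("1.2.3.4", 8080, "http"); rows without a valid IPv4
    # and port pair are skipped.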

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        # Warm up the session on the landing page before hitting the list pages.
        await self.fetch("https://www.kuaidaili.com/free/", timeout=10)
        await asyncio.sleep(random.uniform(1, 2))

        for url in self.urls:
            html = await self.fetch(url, timeout=15)
            if not html:
                continue
            soup = BeautifulSoup(html, "lxml")
            table = soup.find("table")
            if not table:
                logger.warning(f"{self.display_name} no proxy table found, likely an anti-bot page: {url}")
                continue

            batch = self._parse_table(table)
            if batch:
                results.extend(batch)
                logger.info(f"{self.display_name} {url} parsed {len(batch)} entries")
            await asyncio.sleep(random.uniform(1, 2))

        if results:
            logger.info(f"{self.display_name} finished, collected {len(results)} candidate proxies")
        return results
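

# Minimal manual-run sketch (assumes KuaiDaiLiPlugin needs no extra app context;
# for ad-hoc testing only, since the plugin system normally drives crawl()):
if __name__ == "__main__":
    async def _demo() -> None:
        plugin = KuaiDaiLiPlugin()
        proxies = await plugin.crawl()
        print(f"got {len(proxies)} proxies")
        for proxy in proxies[:5]:
            print(proxy)

    asyncio.run(_demo())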