feat: fpw plugins, validation/crawl perf, WS stats, test DB isolation
- Add Free_Proxy_Website-style fpw_* plugins and register them
- Per-plugin crawl timeout (crawl_timeout_seconds=120); remove global crawl_timeout setting
- Validator: fix connect vs total timeout on save; SOCKS session LRU cache; drop redundant semaphore
- Validation handler uses single DB connection; batch upsert after crawl; WorkerPool put_nowait
- Remove unused max_retries from settings API/UI; settings maintenance SQL + init_db cleanup of deprecated keys
- WebSocket dashboard stats; ProxyList pool_filter and API alignment
- POST /api/proxies/delete-one for IPv6-safe deletes; task poll stops on 404
- pytest uses PROXYPOOL_DB_PATH=db/proxies.test.sqlite so tests do not wipe production DB
- .gitignore: explicit proxies.test.sqlite patterns; fix plugin_service ValidationException import

Made-with: Cursor
This commit is contained in:
@@ -18,17 +18,19 @@ class KuaiDaiLiPlugin(BaseHTTPPlugin):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# 减少页数,降低被反爬概率,确保至少能拿到数据
|
||||
# fps/dps 列表页目前仍可 200;inha/intr 常返回 567(反爬),作末位兜底
|
||||
self.urls = [
|
||||
"https://www.kuaidaili.com/free/fps/",
|
||||
"https://www.kuaidaili.com/free/dps/",
|
||||
"https://www.kuaidaili.com/free/inha/1/",
|
||||
"https://www.kuaidaili.com/free/intr/1/",
|
||||
]
|
||||
|
||||
def get_headers(self) -> dict:
|
||||
headers = super().get_headers()
|
||||
headers["Referer"] = "https://www.kuaidaili.com/free/inha/"
|
||||
headers["Referer"] = "https://www.kuaidaili.com/free/"
|
||||
headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
|
||||
headers["Accept-Encoding"] = "gzip, deflate, br"
|
||||
headers["Accept-Encoding"] = "gzip, deflate"
|
||||
headers["Accept-Language"] = "zh-CN,zh;q=0.9,en;q=0.8"
|
||||
headers["Sec-Fetch-Dest"] = "document"
|
||||
headers["Sec-Fetch-Mode"] = "navigate"
|
||||
@@ -36,15 +38,56 @@ class KuaiDaiLiPlugin(BaseHTTPPlugin):
|
||||
headers["Upgrade-Insecure-Requests"] = "1"
|
||||
return headers
|
||||
|
||||
@staticmethod
|
||||
def _infer_protocol(texts: List[str]) -> str:
|
||||
"""从一行单元格文本中推断协议(兼容 fps / dps / inha 等版式)。"""
|
||||
for t in texts[2:]:
|
||||
tl = t.lower().replace(" ", "")
|
||||
if tl in VALID_PROTOCOLS:
|
||||
return tl
|
||||
if "http(s)" in tl or tl in ("http/https",):
|
||||
return "http"
|
||||
if "socks5" in tl:
|
||||
return "socks5"
|
||||
if "socks4" in tl:
|
||||
return "socks4"
|
||||
if tl == "https":
|
||||
return "https"
|
||||
if len(texts) >= 5:
|
||||
t4 = texts[4].lower().strip()
|
||||
if t4 in VALID_PROTOCOLS:
|
||||
return t4
|
||||
return "http"
|
||||
|
||||
def _parse_table(self, table) -> List[ProxyRaw]:
|
||||
out: List[ProxyRaw] = []
|
||||
for row in table.find_all("tr"):
|
||||
tds = row.find_all("td")
|
||||
if len(tds) < 2:
|
||||
continue
|
||||
texts = [td.get_text(strip=True) for td in tds]
|
||||
ip = texts[0]
|
||||
port_s = texts[1]
|
||||
if not re.match(r"^\d+\.\d+\.\d+\.\d+$", ip):
|
||||
continue
|
||||
if not port_s.isdigit() or not (1 <= int(port_s) <= 65535):
|
||||
continue
|
||||
protocol = self._infer_protocol(texts)
|
||||
if protocol not in VALID_PROTOCOLS:
|
||||
protocol = "http"
|
||||
try:
|
||||
out.append(ProxyRaw(ip, int(port_s), protocol))
|
||||
except ValueError:
|
||||
continue
|
||||
return out
|
||||
|
||||
async def crawl(self) -> List[ProxyRaw]:
|
||||
results = []
|
||||
# 先访问首页预热会话,获取 cookie,降低被反爬概率
|
||||
await self.fetch("https://www.kuaidaili.com/", timeout=10)
|
||||
await asyncio.sleep(random.uniform(2, 4))
|
||||
await self.fetch("https://www.kuaidaili.com/free/", timeout=10)
|
||||
await asyncio.sleep(random.uniform(1, 2))
|
||||
|
||||
# 顺序请求免费代理页面
|
||||
for url in self.urls:
|
||||
html = await self.fetch(url, timeout=10)
|
||||
html = await self.fetch(url, timeout=15)
|
||||
if not html:
|
||||
continue
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
@@ -53,20 +96,11 @@ class KuaiDaiLiPlugin(BaseHTTPPlugin):
|
||||
logger.warning(f"{self.display_name} 未能找到表格,可能是触发了反爬: {url}")
|
||||
continue
|
||||
|
||||
for row in table.find_all("tr"):
|
||||
tds = row.find_all("td")
|
||||
if len(tds) >= 5:
|
||||
ip = tds[0].get_text(strip=True)
|
||||
port = tds[1].get_text(strip=True)
|
||||
protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else "http"
|
||||
if protocol not in VALID_PROTOCOLS:
|
||||
protocol = "http"
|
||||
if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit() and 1 <= int(port) <= 65535:
|
||||
try:
|
||||
results.append(ProxyRaw(ip, int(port), protocol))
|
||||
except ValueError:
|
||||
continue
|
||||
await asyncio.sleep(random.uniform(5, 8))
|
||||
batch = self._parse_table(table)
|
||||
if batch:
|
||||
results.extend(batch)
|
||||
logger.info(f"{self.display_name} {url} 解析 {len(batch)} 条")
|
||||
await asyncio.sleep(random.uniform(1, 2))
|
||||
|
||||
if results:
|
||||
logger.info(f"{self.display_name} 解析完成,获取 {len(results)} 个潜在代理")
|
||||
|
||||
Reference in New Issue
Block a user