Files
ProxyPool/app/plugins/kuaidaili.py
祀梦 07248ff4ee feat(crawl): browser-like headers, HTTP/2, curl_cffi TLS fingerprint fallback
- get_headers(url): Referer, Sec-Fetch-*, sec-ch-ua, API vs HTML Accept
- httpx AsyncClient/ sync Client with optional HTTP/2 (h2 extra)
- On 403/429/503/520-523/525/567 or request errors, retry via curl_cffi chrome124 impersonate
- POST: Origin, Referer, Content-Type for form posts
- kuaidaili/ip3366: forward get_headers(url=...)

Made-with: Cursor
2026-04-05 14:40:36 +08:00

108 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import asyncio
import random
from typing import List, Optional
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")
class KuaiDaiLiPlugin(BaseHTTPPlugin):
    """Crawl free proxies from the kuaidaili.com free-proxy list pages."""

    default_config = {"max_pages": 5}
    name = "kuaidaili"
    display_name = "快代理"
    description = "从快代理网站爬取免费代理"

    # Octet-accurate IPv4 pattern (each octet 0-255), compiled once at class
    # level. The previous r"^\d+\.\d+\.\d+\.\d+$" accepted e.g. "999.1.1.1".
    _IP_RE = re.compile(
        r"^(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)\.){3}"
        r"(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)$"
    )

    def __init__(self):
        super().__init__()
        # fps/dps list pages currently still return 200; inha/intr often
        # answer 567 (anti-crawl), so they are kept last as a fallback.
        self.urls = [
            "https://www.kuaidaili.com/free/fps/",
            "https://www.kuaidaili.com/free/dps/",
            "https://www.kuaidaili.com/free/inha/1/",
            "https://www.kuaidaili.com/free/intr/1/",
        ]

    def get_headers(self, url: Optional[str] = None, **kwargs) -> dict:
        """Return browser-like headers with a same-origin Referer.

        Builds on the base plugin's headers and overlays the navigation
        headers a real Chrome page-load would send for these list pages.
        """
        headers = super().get_headers(url=url, **kwargs)
        headers.update({
            "Referer": "https://www.kuaidaili.com/free/",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "same-origin",
            "Upgrade-Insecure-Requests": "1",
        })
        return headers

    @staticmethod
    def _infer_protocol(texts: List[str]) -> str:
        """Infer the proxy protocol from a row's cell texts.

        Compatible with the fps / dps / inha page layouts, whose protocol
        column position varies. Falls back to "http" when nothing matches.
        """
        for cell in texts[2:]:
            token = cell.lower().replace(" ", "")
            # Exact protocol names. This already covers "https", so the
            # former separate `== "https"` branch was dead code and is gone.
            if token in VALID_PROTOCOLS:
                return token
            if "http(s)" in token or token == "http/https":
                return "http"
            if "socks5" in token:
                return "socks5"
            if "socks4" in token:
                return "socks4"
        # Some layouts carry the protocol only in the fifth column.
        if len(texts) >= 5:
            candidate = texts[4].lower().strip()
            if candidate in VALID_PROTOCOLS:
                return candidate
        return "http"

    def _parse_table(self, table) -> List[ProxyRaw]:
        """Extract ProxyRaw entries from one result ``<table>`` element.

        Rows with fewer than two cells (headers/separators), invalid IPs,
        or out-of-range ports are skipped silently.
        """
        out: List[ProxyRaw] = []
        for row in table.find_all("tr"):
            tds = row.find_all("td")
            if len(tds) < 2:
                continue
            texts = [td.get_text(strip=True) for td in tds]
            ip, port_s = texts[0], texts[1]
            if not self._IP_RE.match(ip):
                continue
            if not port_s.isdigit() or not (1 <= int(port_s) <= 65535):
                continue
            protocol = self._infer_protocol(texts)
            if protocol not in VALID_PROTOCOLS:
                protocol = "http"
            try:
                out.append(ProxyRaw(ip, int(port_s), protocol))
            except ValueError:
                # ProxyRaw may reject values it considers invalid; skip row.
                continue
        return out

    async def crawl(self) -> List[ProxyRaw]:
        """Fetch every configured list page and return the parsed proxies."""
        results: List[ProxyRaw] = []
        # Warm-up: hit the index first so subsequent page loads look like
        # in-site navigation (cookie/Referer consistency).
        await self.fetch("https://www.kuaidaili.com/free/", timeout=10)
        await asyncio.sleep(random.uniform(1, 2))
        for url in self.urls:
            html = await self.fetch(url, timeout=15)
            if not html:
                continue
            soup = BeautifulSoup(html, "lxml")
            table = soup.find("table")
            if not table:
                logger.warning(f"{self.display_name} 未能找到表格,可能是触发了反爬: {url}")
                continue
            batch = self._parse_table(table)
            if batch:
                results.extend(batch)
                logger.info(f"{self.display_name} {url} 解析 {len(batch)}")
            # Randomized inter-page delay to reduce anti-crawl triggers.
            await asyncio.sleep(random.uniform(1, 2))
        if results:
            logger.info(f"{self.display_name} 解析完成,获取 {len(results)} 个潜在代理")
        return results