ProxyPool/app/plugins/base.py
祀梦 07248ff4ee feat(crawl): browser-like headers, HTTP/2, curl_cffi TLS fingerprint fallback
- get_headers(url): Referer, Sec-Fetch-*, sec-ch-ua, API vs HTML Accept
- httpx AsyncClient / sync Client with optional HTTP/2 (h2 extra)
- On 403/429/503/520-523/525/567 or request errors, retry via curl_cffi chrome124 impersonate
- POST: Origin, Referer, Content-Type for form posts
- kuaidaili/ip3366: forward get_headers(url=...)

Made-with: Cursor
2026-04-05 14:40:36 +08:00
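
As a rough illustration of how a concrete plugin might sit on top of this base class (a minimal sketch: ExamplePlugin, its source URL, and the crawl() hook name are hypothetical, not part of this commit):

    class ExamplePlugin(BaseHTTPPlugin):  # hypothetical subclass, illustrative only
        def __init__(self):
            super().__init__()
            self.urls = ["https://example.com/proxies.txt"]  # assumed source URL

        async def crawl(self):
            # Concurrent, semaphore-limited GETs with the full fallback chain
            pages = await self.fetch_all(self.urls)
            proxies = []
            for page in pages:
                proxies.extend(self.parse_text_proxies(page, protocol="http"))
            await self.close()  # release the reused AsyncClient
            return proxies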


"""通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装"""
import re
import random
import asyncio
import httpx
from typing import Dict, List, Optional
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from app.core.plugin_system import BaseCrawlerPlugin
from app.models.domain import ProxyRaw
try:
import h2 # noqa: F401
_HTTPX_HTTP2 = True
except ImportError:
_HTTPX_HTTP2 = False
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")
# 遇此类 HTTP 状态时尝试 curl_cffi 浏览器 TLS/JA3 指纹(比裸 httpx 更易过简单反爬)
_CURL_FALLBACK_STATUS = frozenset(
{403, 429, 503, 520, 521, 522, 523, 525, 567}
)
_CURL_IMPERSONATE = "chrome124"


class BaseHTTPPlugin(BaseCrawlerPlugin):
    """Base class for HTTP-based crawler plugins."""

    def __init__(self):
        super().__init__()
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        ]
        self.urls: List[str] = []
        self.current_url: str = ""
        self._client: Optional[httpx.AsyncClient] = None
        self.max_concurrency: int = 2

    def get_headers(
        self,
        url: Optional[str] = None,
        *,
        for_api: bool = False,
        for_post: bool = False,
    ) -> dict:
        """Build headers close to a real browser's; url feeds the Referer and Sec-Fetch-* values."""
        ua = random.choice(self.user_agents)
        is_chrome = "Chrome/" in ua and "Edg/" not in ua
        if for_api or (url and ("/api/" in url or url.endswith(".txt") or "/raw/" in url)):
            accept = (
                "text/plain,text/html,application/json,application/xhtml+xml,"
                "application/xml;q=0.9,*/*;q=0.8"
            )
            sec_dest = "empty"
            sec_mode = "cors"
        else:
            accept = (
                "text/html,application/xhtml+xml,application/xml;q=0.9,"
                "image/avif,image/webp,image/apng,*/*;q=0.8"
            )
            sec_dest = "document"
            sec_mode = "navigate" if not for_post else "same-origin"
        ref_host = ""
        if url:
            p = urlparse(url)
            if p.scheme and p.netloc:
                ref_host = p.netloc
                referer = f"{p.scheme}://{p.netloc}/"
            else:
                referer = ""
        else:
            referer = ""
        sec_site = "none"
        if referer and url:
            try:
                req_host = urlparse(url).netloc
                if req_host == ref_host:
                    sec_site = "same-origin"
                else:
                    sec_site = "cross-site"
            except Exception:
                sec_site = "cross-site"
        headers: Dict[str, str] = {
            "User-Agent": ua,
            "Accept": accept,
            "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": sec_dest,
            "Sec-Fetch-Mode": sec_mode,
            "Sec-Fetch-Site": sec_site,
            "Sec-Fetch-User": "?1",
            "Cache-Control": "max-age=0",
        }
        if is_chrome:
            headers["sec-ch-ua"] = (
                '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"'
            )
            headers["sec-ch-ua-mobile"] = "?0"
            headers["sec-ch-ua-platform"] = '"Windows"'
        if referer:
            headers["Referer"] = referer
        return headers

    def _get_client(self) -> httpx.AsyncClient:
        """Get, or lazily create, the reused AsyncClient."""
        if self._client is None or self._client.is_closed:
            transport = httpx.AsyncHTTPTransport(retries=0)
            self._client = httpx.AsyncClient(
                transport=transport,
                follow_redirects=True,
                http2=_HTTPX_HTTP2,
                trust_env=False,
            )
        return self._client

    async def _curl_get(self, url: str, headers: dict, timeout: float) -> str:
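        # curl_cffi's API is blocking, so the actual request runs in a worker
        # thread (asyncio.to_thread below) to keep the event loop responsive.
        # Accept-Encoding is dropped so curl_cffi negotiates encodings it can
        # decode itself.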
        try:
            from curl_cffi import requests as cr
        except ImportError:
            return ""

        def _run() -> str:
            try:
                h = {k: v for k, v in headers.items() if k.lower() != "accept-encoding"}
                r = cr.get(
                    url,
                    impersonate=_CURL_IMPERSONATE,
                    headers=h,
                    timeout=timeout,
                    allow_redirects=True,
                )
                if r.status_code == 200:
                    return r.text or ""
            except Exception:
                pass
            return ""

        return await asyncio.to_thread(_run)

    async def _curl_post(
        self, url: str, data: Dict[str, str], headers: dict, timeout: float
    ) -> str:
        try:
            from curl_cffi import requests as cr
        except ImportError:
            return ""

        def _run() -> str:
            try:
                h = {k: v for k, v in headers.items() if k.lower() != "accept-encoding"}
                r = cr.post(
                    url,
                    impersonate=_CURL_IMPERSONATE,
                    headers=h,
                    data=data,
                    timeout=timeout,
                    allow_redirects=True,
                )
                if r.status_code == 200:
                    return r.text or ""
            except Exception:
                pass
            return ""

        return await asyncio.to_thread(_run)

    @staticmethod
    def _http_timeout(seconds: float) -> httpx.Timeout:
        """Tighten the connect phase separately; avoids AsyncClient hanging on connect for long stretches in some environments."""
        t = max(2.0, float(seconds))
        # Too short a connect timeout causes mass timeouts on international links / bulk crawls
        c = min(12.0, max(4.0, t * 0.4))
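        # E.g. seconds=15.0 gives Timeout(15.0, connect=6.0); seconds=30.0 caps
        # the connect phase at 12.0.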
        return httpx.Timeout(t, connect=c)

    @staticmethod
    def _decode_response_body(response: httpx.Response) -> str:
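        # Prefer the declared/UTF-8 encoding; fall back to GBK (common on
        # older Chinese proxy-list sites), ignoring undecodable bytes.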
        content = response.content
        encoding = response.encoding
        if encoding == "utf-8" or not encoding:
            try:
                return content.decode("utf-8")
            except UnicodeDecodeError:
                return content.decode("gbk", errors="ignore")
        return content.decode(encoding, errors="ignore")

    def _sync_get(self, url: str, timeout: float, headers: dict) -> str:
        """Synchronous GET (on Windows, some sites make AsyncClient hit ConnectTimeout while a sync Client works fine)."""
        to = BaseHTTPPlugin._http_timeout(timeout)
        with httpx.Client(
            transport=httpx.HTTPTransport(retries=0),
            follow_redirects=True,
            trust_env=False,
            http2=_HTTPX_HTTP2,
        ) as c:
            r = c.get(url, headers=headers, timeout=to)
            if r.status_code == 200:
                return self._decode_response_body(r)
            if r.status_code in _CURL_FALLBACK_STATUS:
                try:
                    from curl_cffi import requests as cr
                    h = {k: v for k, v in headers.items() if k.lower() != "accept-encoding"}
                    r2 = cr.get(
                        url,
                        impersonate=_CURL_IMPERSONATE,
                        headers=h,
                        timeout=timeout,
                        allow_redirects=True,
                    )
                    if r2.status_code == 200:
                        return r2.text or ""
                except Exception:
                    pass
        return ""

    def _sync_post(
        self, url: str, data: Dict[str, str], timeout: float, headers: dict
    ) -> str:
        to = BaseHTTPPlugin._http_timeout(timeout)
        with httpx.Client(
            transport=httpx.HTTPTransport(retries=0),
            follow_redirects=True,
            trust_env=False,
            http2=_HTTPX_HTTP2,
        ) as c:
            r = c.post(url, headers=headers, data=data, timeout=to)
            if r.status_code == 200:
                return self._decode_response_body(r)
            if r.status_code in _CURL_FALLBACK_STATUS:
                try:
                    from curl_cffi import requests as cr
                    h = {k: v for k, v in headers.items() if k.lower() != "accept-encoding"}
                    r2 = cr.post(
                        url,
                        impersonate=_CURL_IMPERSONATE,
                        headers=h,
                        data=data,
                        timeout=timeout,
                        allow_redirects=True,
                    )
                    if r2.status_code == 200:
                        return r2.text or ""
                except Exception:
                    pass
        return ""

    @staticmethod
    def _is_textish_url(url: str) -> bool:
        return bool(
            url.endswith(".txt")
            or "/api/" in url
            or "raw.githubusercontent.com" in url
            or "cdn.jsdelivr.net" in url
        )

    async def fetch(
        self,
        url: str,
        timeout: float = 15.0,
        retries: int = 2,
        raise_for_status: bool = False,
    ) -> str:
        """Asynchronously fetch the HTML content of a URL.

        Note: even with raise_for_status=True, HTTP errors are caught internally
        and trigger the fallback chain rather than propagating to the caller.
        """
        from app.core.log import logger
        client = self._get_client()
        to = self._http_timeout(timeout)
        for_api = self._is_textish_url(url)
        for attempt in range(retries):
            headers = self.get_headers(url=url, for_api=for_api)
            try:
                response = await client.get(url, headers=headers, timeout=to)
                if raise_for_status:
                    response.raise_for_status()
                if response.status_code == 200:
                    return self._decode_response_body(response)
                logger.warning(f"Fetch {url} returned status {response.status_code}")
                if response.status_code in _CURL_FALLBACK_STATUS:
                    curl_text = await self._curl_get(url, headers, timeout)
                    if curl_text:
                        logger.info(f"Fetch {url} succeeded via browser-fingerprint fallback")
                        return curl_text
            except Exception as e:
                logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
                curl_text = await self._curl_get(url, headers, timeout)
                if curl_text:
                    logger.info(f"Fetch {url} browser-fingerprint fallback succeeded after exception")
                    return curl_text
            if attempt < retries - 1:
                await asyncio.sleep(random.uniform(1, 3))
        try:
            h = self.get_headers(url=url, for_api=for_api)
            text = await asyncio.to_thread(self._sync_get, url, timeout, h)
            if text:
                logger.info(f"Fetch {url} succeeded via sync fallback")
                return text
        except Exception as e:
            logger.warning(f"Fetch {url} sync fallback failed: {e}")
        return ""

    async def fetch_post(
        self,
        url: str,
        data: Optional[Dict[str, str]] = None,
        timeout: float = 15.0,
        retries: int = 2,
    ) -> str:
        """POST application/x-www-form-urlencoded (for form pages such as spys.one)."""
        from app.core.log import logger
        client = self._get_client()
        payload = data or {}
        to = self._http_timeout(timeout)
        for attempt in range(retries):
            headers = self.get_headers(url=url, for_post=True)
            p = urlparse(url)
            if p.scheme and p.netloc:
                headers["Origin"] = f"{p.scheme}://{p.netloc}"
            headers["Referer"] = url
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            try:
                response = await client.post(
                    url,
                    headers=headers,
                    data=payload,
                    timeout=to,
                )
                if response.status_code == 200:
                    return self._decode_response_body(response)
                logger.warning(f"POST {url} returned status {response.status_code}")
                if response.status_code in _CURL_FALLBACK_STATUS:
                    curl_text = await self._curl_post(url, payload, headers, timeout)
                    if curl_text:
                        logger.info(f"POST {url} succeeded via browser-fingerprint fallback")
                        return curl_text
            except Exception as e:
                logger.warning(f"POST {url} failed (attempt {attempt + 1}/{retries}): {e}")
                curl_text = await self._curl_post(url, payload, headers, timeout)
                if curl_text:
                    logger.info(f"POST {url} browser-fingerprint fallback succeeded after exception")
                    return curl_text
            if attempt < retries - 1:
                await asyncio.sleep(random.uniform(1, 3))
        try:
            headers = self.get_headers(url=url, for_post=True)
            p = urlparse(url)
            if p.scheme and p.netloc:
                headers["Origin"] = f"{p.scheme}://{p.netloc}"
            headers["Referer"] = url
            headers["Content-Type"] = "application/x-www-form-urlencoded"
            text = await asyncio.to_thread(
                self._sync_post, url, payload, timeout, headers
            )
            if text:
                logger.info(f"POST {url} succeeded via sync fallback")
                return text
        except Exception as e:
            logger.warning(f"POST {url} sync fallback failed: {e}")
        return ""

    async def fetch_all(
        self,
        urls: List[str],
        timeout: float = 15.0,
        retries: int = 2,
    ) -> List[str]:
        """Fetch multiple URLs concurrently, capping concurrency within a single plugin."""
        semaphore = asyncio.Semaphore(self.max_concurrency)

        async def _fetch_limited(url: str):
            async with semaphore:
                await asyncio.sleep(random.uniform(0.08, 0.45))
                return await self.fetch(url, timeout=timeout, retries=retries)

        tasks = [_fetch_limited(url) for url in urls]
        return await asyncio.gather(*tasks)

    def parse_text_proxies(self, text: str, protocol: str = "http") -> List[ProxyRaw]:
        """Parse a plain-text proxy list in ip:port format.

        Normalizes both \r\n and \n line endings and skips any blank lines.
        """
        results = []
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        for line in text.split("\n"):
            line = line.strip()
            if not line or ":" not in line:
                continue
            ip, _, port = line.rpartition(":")
            ip = ip.strip()
            port = port.strip()
            if ip and port.isdigit() and 1 <= int(port) <= 65535:
                try:
                    results.append(ProxyRaw(ip, int(port), protocol))
                except ValueError:
                    continue
        return results

    def parse_html_table(
        self,
        html: str,
        column_map: dict,
        protocol: str = "http",
    ) -> List[ProxyRaw]:
        """Generic HTML table parser.

        Args:
            html: HTML text
            column_map: mapping from column names to indices, e.g. {"ip": 0, "port": 1, "protocol": 4}
            protocol: default protocol, used when the table has no protocol column
        """
        results = []
        soup = BeautifulSoup(html, "lxml")
        table = soup.find("table")
        if not table:
            return results
        ip_idx = column_map.get("ip", 0)
        port_idx = column_map.get("port", 1)
        protocol_idx = column_map.get("protocol", -1)
        for row in table.find_all("tr"):
            tds = row.find_all("td")
            if len(tds) <= max(ip_idx, port_idx):
                continue
            ip = tds[ip_idx].get_text(strip=True)
            port = tds[port_idx].get_text(strip=True)
            if protocol_idx >= 0 and len(tds) > protocol_idx:
                proto = tds[protocol_idx].get_text(strip=True).lower()
                if proto not in VALID_PROTOCOLS:
                    proto = protocol
            else:
                proto = protocol
            if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit() and 1 <= int(port) <= 65535:
                try:
                    results.append(ProxyRaw(ip, int(port), proto))
                except ValueError:
                    continue
        return results

    async def close(self):
        """Close the reused HTTP client."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
        self._client = None