Files
ProxyPool/app/plugins/base.py
祀梦 a26ae50051 refactor(crawl): parallel plugins via JobExecutor; per-plugin throttle
- Remove global crawl_slot gate; all CrawlJobs share only executor semaphore
- max_concurrent_jobs = max(24, n_plugins+8) for crawl-all + aggregator headroom
- BaseHTTPPlugin max_concurrency 3->2; fpw multi-URL plugins 4->2
- fetch_all: short random delay before each request to ease single-host pressure

Made-with: Cursor
2026-04-05 14:08:26 +08:00

256 lines
9.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装"""
import re
import random
import asyncio
import httpx
from typing import Dict, List, Optional
from bs4 import BeautifulSoup
from app.core.plugin_system import BaseCrawlerPlugin
from app.models.domain import ProxyRaw
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")
class BaseHTTPPlugin(BaseCrawlerPlugin):
    """Base class for HTTP-request-based crawler plugins.

    Provides a reusable ``httpx.AsyncClient``, retrying GET/POST helpers with
    a synchronous-client fallback (some list sites ConnectTimeout under the
    async client on Windows but work fine synchronously), bounded-concurrency
    multi-URL fetching, and generic parsers for plain-text and HTML-table
    proxy lists.
    """

    def __init__(self):
        super().__init__()
        # Rotating UA pool; one is picked at random per request in get_headers().
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
        ]
        self.urls: List[str] = []
        self.current_url: str = ""
        # Lazily created, reused across requests; see _get_client().
        self._client: Optional[httpx.AsyncClient] = None
        # Per-plugin concurrency cap applied by fetch_all().
        self.max_concurrency: int = 2

    def get_headers(self) -> dict:
        """Build browser-like request headers with a random User-Agent."""
        return {
            "User-Agent": random.choice(self.user_agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
        }

    def _get_client(self) -> httpx.AsyncClient:
        """Return the shared AsyncClient, (re)creating it if absent or closed."""
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(
                transport=httpx.AsyncHTTPTransport(retries=0),
                follow_redirects=True,
                # Ignore system HTTP(S)_PROXY: a misconfigured env proxy would
                # make every list site unreachable.
                trust_env=False,
            )
        return self._client

    @staticmethod
    def _http_timeout(seconds: float) -> httpx.Timeout:
        """Build a Timeout with a separately tightened connect phase.

        AsyncClient can hang a long time in connect in some environments, so
        the connect timeout is clamped to [4, 12] seconds — too short and bulk
        crawls over long international links time out en masse.
        """
        total = max(2.0, float(seconds))
        connect = min(12.0, max(4.0, total * 0.4))
        return httpx.Timeout(total, connect=connect)

    @staticmethod
    def _decode_response_body(response: httpx.Response) -> str:
        """Decode a response body, tolerating mislabeled encodings.

        UTF-8 (or unspecified) bodies fall back to GBK on decode failure —
        common for Chinese proxy-list sites that omit/ lie about charset.
        """
        content = response.content
        encoding = response.encoding
        if encoding == "utf-8" or not encoding:
            try:
                return content.decode("utf-8")
            except UnicodeDecodeError:
                return content.decode("gbk", errors="ignore")
        return content.decode(encoding, errors="ignore")

    @staticmethod
    def _sync_client() -> httpx.Client:
        """One-shot synchronous client mirroring the async client's config."""
        return httpx.Client(
            transport=httpx.HTTPTransport(retries=0),
            follow_redirects=True,
            trust_env=False,
        )

    def _sync_get(self, url: str, timeout: float, headers: dict) -> str:
        """Synchronous GET fallback; returns "" on any non-200 status.

        Some sites hit ConnectTimeout with AsyncClient on Windows while the
        synchronous Client works normally.
        """
        to = BaseHTTPPlugin._http_timeout(timeout)
        with self._sync_client() as c:
            r = c.get(url, headers=headers, timeout=to)
            if r.status_code != 200:
                return ""
            return self._decode_response_body(r)

    def _sync_post(
        self, url: str, data: Dict[str, str], timeout: float, headers: dict
    ) -> str:
        """Synchronous POST fallback; returns "" on any non-200 status."""
        to = BaseHTTPPlugin._http_timeout(timeout)
        with self._sync_client() as c:
            r = c.post(url, headers=headers, data=data, timeout=to)
            if r.status_code != 200:
                return ""
            return self._decode_response_body(r)

    async def fetch(
        self,
        url: str,
        timeout: float = 15.0,
        retries: int = 2,
        raise_for_status: bool = False,
    ) -> str:
        """Fetch a URL's HTML asynchronously, retrying then falling back to a
        synchronous client in a worker thread. Returns "" if everything fails.

        Args:
            url: Target URL.
            timeout: Total timeout per attempt (connect phase tightened).
            retries: Number of async attempts before the sync fallback.
            raise_for_status: If True, treat any non-2xx as a failure to retry.
        """
        from app.core.log import logger

        client = self._get_client()
        to = self._http_timeout(timeout)
        for attempt in range(retries):
            try:
                response = await client.get(url, headers=self.get_headers(), timeout=to)
                if raise_for_status:
                    response.raise_for_status()
                if response.status_code == 200:
                    return self._decode_response_body(response)
                logger.warning(f"Fetch {url} returned status {response.status_code}")
            except Exception as e:
                logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
            if attempt < retries - 1:
                # Randomized backoff between attempts.
                await asyncio.sleep(random.uniform(1, 3))
        # Last resort: synchronous client in a thread (see _sync_get).
        try:
            text = await asyncio.to_thread(
                self._sync_get, url, timeout, self.get_headers()
            )
            if text:
                logger.info(f"Fetch {url} 使用同步回退成功")
                return text
        except Exception as e:
            logger.warning(f"Fetch {url} 同步回退失败: {e}")
        return ""

    async def fetch_post(
        self,
        url: str,
        data: Optional[Dict[str, str]] = None,
        timeout: float = 15.0,
        retries: int = 2,
    ) -> str:
        """POST application/x-www-form-urlencoded (e.g. spys.one form pages).

        Same retry + synchronous-fallback strategy as fetch(); returns "" on
        total failure.
        """
        from app.core.log import logger

        client = self._get_client()
        payload = data or {}
        to = self._http_timeout(timeout)
        for attempt in range(retries):
            try:
                response = await client.post(
                    url,
                    headers=self.get_headers(),
                    data=payload,
                    timeout=to,
                )
                if response.status_code == 200:
                    return self._decode_response_body(response)
                logger.warning(f"POST {url} returned status {response.status_code}")
            except Exception as e:
                logger.warning(f"POST {url} failed (attempt {attempt + 1}/{retries}): {e}")
            if attempt < retries - 1:
                await asyncio.sleep(random.uniform(1, 3))
        try:
            text = await asyncio.to_thread(
                self._sync_post, url, payload, timeout, self.get_headers()
            )
            if text:
                logger.info(f"POST {url} 使用同步回退成功")
                return text
        except Exception as e:
            logger.warning(f"POST {url} 同步回退失败: {e}")
        return ""

    async def fetch_all(
        self,
        urls: List[str],
        timeout: float = 15.0,
        retries: int = 2,
    ) -> List[str]:
        """Fetch several URLs concurrently, capped by self.max_concurrency."""
        semaphore = asyncio.Semaphore(self.max_concurrency)

        async def _fetch_limited(url: str) -> str:
            async with semaphore:
                # Short random delay eases pressure on single-host targets.
                await asyncio.sleep(random.uniform(0.08, 0.45))
                return await self.fetch(url, timeout=timeout, retries=retries)

        return await asyncio.gather(*(_fetch_limited(u) for u in urls))

    @staticmethod
    def _parse_port(port: str) -> Optional[int]:
        """Return the port as int if it is a valid TCP port, else None.

        int() is guarded: str.isdigit() accepts non-ASCII digit characters
        (e.g. "²") that int() rejects with ValueError.
        """
        if not port.isdigit():
            return None
        try:
            value = int(port)
        except ValueError:
            return None
        return value if 1 <= value <= 65535 else None

    def parse_text_proxies(self, text: str, protocol: str = "http") -> List[ProxyRaw]:
        """Parse a plain-text proxy list in ip:port-per-line format.

        Normalizes \\r\\n and \\r line endings and skips blank or malformed
        lines instead of raising.
        """
        results: List[ProxyRaw] = []
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        for line in text.split("\n"):
            line = line.strip()
            if not line or ":" not in line:
                continue
            ip, _, port = line.rpartition(":")
            ip = ip.strip()
            port_num = self._parse_port(port.strip())
            if ip and port_num is not None:
                try:
                    results.append(ProxyRaw(ip, port_num, protocol))
                except ValueError:
                    continue
        return results

    def parse_html_table(
        self,
        html: str,
        column_map: dict,
        protocol: str = "http",
    ) -> List[ProxyRaw]:
        """Generic HTML <table> proxy parser.

        Args:
            html: HTML text.
            column_map: Column-name-to-index map, e.g.
                {"ip": 0, "port": 1, "protocol": 4}.
            protocol: Default protocol when the table has no (valid) protocol
                column.
        """
        results: List[ProxyRaw] = []
        soup = BeautifulSoup(html, "lxml")
        table = soup.find("table")
        if not table:
            return results
        ip_idx = column_map.get("ip", 0)
        port_idx = column_map.get("port", 1)
        protocol_idx = column_map.get("protocol", -1)
        for row in table.find_all("tr"):
            tds = row.find_all("td")
            # Header rows (<th> only) yield no <td>s and are skipped here too.
            if len(tds) <= max(ip_idx, port_idx):
                continue
            ip = tds[ip_idx].get_text(strip=True)
            port = tds[port_idx].get_text(strip=True)
            proto = protocol
            if 0 <= protocol_idx < len(tds):
                cell = tds[protocol_idx].get_text(strip=True).lower()
                if cell in VALID_PROTOCOLS:
                    proto = cell
            port_num = self._parse_port(port)
            if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port_num is not None:
                try:
                    results.append(ProxyRaw(ip, port_num, proto))
                except ValueError:
                    continue
        return results

    async def close(self):
        """Close the reused HTTP client, if any."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
        self._client = None