ProxyPool/app/plugins/base.py
祀梦 0131c8b408 feat: fpw plugins, validation/crawl perf, WS stats, test DB isolation
- Add Free_Proxy_Website-style fpw_* plugins and register them
- Per-plugin crawl timeout (crawl_timeout_seconds=120); remove global crawl_timeout setting
- Validator: fix connect vs total timeout on save; SOCKS session LRU cache; drop redundant semaphore
- Validation handler uses single DB connection; batch upsert after crawl; WorkerPool put_nowait
- Remove unused max_retries from settings API/UI; settings maintenance SQL + init_db cleanup of deprecated keys
- WebSocket dashboard stats; ProxyList pool_filter and API alignment
- POST /api/proxies/delete-one for IPv6-safe deletes; task poll stops on 404
- pytest uses PROXYPOOL_DB_PATH=db/proxies.test.sqlite so tests do not wipe production DB
- .gitignore: explicit proxies.test.sqlite patterns; fix plugin_service ValidationException import

Made-with: Cursor
2026-04-05 13:39:19 +08:00

"""通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装"""
import re
import random
import asyncio
import httpx
from typing import Dict, List, Optional
from bs4 import BeautifulSoup
from app.core.plugin_system import BaseCrawlerPlugin
from app.models.domain import ProxyRaw
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")
class BaseHTTPPlugin(BaseCrawlerPlugin):
    """Base class for crawler plugins that work over HTTP."""

    def __init__(self):
        super().__init__()
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
        ]
        self.urls: List[str] = []
        self.current_url: str = ""
        self._client: Optional[httpx.AsyncClient] = None
        self.max_concurrency: int = 3

    def get_headers(self) -> dict:
        return {
            "User-Agent": random.choice(self.user_agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
        }

    def _get_client(self) -> httpx.AsyncClient:
        """Get or create the reused AsyncClient."""
        if self._client is None or self._client.is_closed:
            transport = httpx.AsyncHTTPTransport(retries=0)
            self._client = httpx.AsyncClient(
                transport=transport,
                follow_redirects=True,
                # Ignore system HTTP(S)_PROXY settings; a misconfigured proxy
                # would make every list site fail to connect.
                trust_env=False,
            )
        return self._client

    @staticmethod
    def _http_timeout(seconds: float) -> httpx.Timeout:
        """Tighten the connect phase separately, so AsyncClient does not stall in connect for long periods in some environments."""
        t = max(2.0, float(seconds))
        c = min(6.0, max(3.0, t * 0.35))
        return httpx.Timeout(t, connect=c)

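    # Worked values (illustrative, not from the original source):
    #   _http_timeout(15.0) -> read/write 15.0s, connect 5.25s  (15 * 0.35)
    #   _http_timeout(60.0) -> read/write 60.0s, connect 6.0s   (capped at 6.0)
    #   _http_timeout(2.0)  -> read/write 2.0s,  connect 3.0s   (floored at 3.0; for
    #   tiny budgets the connect allowance can exceed the read/write allowance)
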
    @staticmethod
    def _decode_response_body(response: httpx.Response) -> str:
        content = response.content
        encoding = response.encoding
        # Charset names are case-insensitive; treat a missing charset as UTF-8,
        # falling back to GBK for mislabelled Chinese pages.
        if not encoding or encoding.lower() in ("utf-8", "utf8"):
            try:
                return content.decode("utf-8")
            except UnicodeDecodeError:
                return content.decode("gbk", errors="ignore")
        try:
            return content.decode(encoding, errors="ignore")
        except LookupError:
            # The server declared a codec Python does not know.
            return content.decode("utf-8", errors="ignore")

    def _sync_get(self, url: str, timeout: float, headers: dict) -> str:
        """Synchronous GET (on Windows, some sites readily hit ConnectTimeout with AsyncClient while a synchronous Client works fine)."""
        to = BaseHTTPPlugin._http_timeout(timeout)
        with httpx.Client(
            transport=httpx.HTTPTransport(retries=0),
            follow_redirects=True,
            trust_env=False,
        ) as c:
            r = c.get(url, headers=headers, timeout=to)
            if r.status_code != 200:
                return ""
            return self._decode_response_body(r)

    def _sync_post(
        self, url: str, data: Dict[str, str], timeout: float, headers: dict
    ) -> str:
        """Synchronous POST counterpart of _sync_get, used by the fallback path."""
        to = BaseHTTPPlugin._http_timeout(timeout)
        with httpx.Client(
            transport=httpx.HTTPTransport(retries=0),
            follow_redirects=True,
            trust_env=False,
        ) as c:
            r = c.post(url, headers=headers, data=data, timeout=to)
            if r.status_code != 200:
                return ""
            return self._decode_response_body(r)

    async def fetch(
        self,
        url: str,
        timeout: float = 15.0,
        retries: int = 2,
        raise_for_status: bool = False,
    ) -> str:
        """Asynchronously fetch the HTML body of the given URL."""
        from app.core.log import logger

        client = self._get_client()
        to = self._http_timeout(timeout)
        for attempt in range(retries):
            try:
                response = await client.get(url, headers=self.get_headers(), timeout=to)
                if raise_for_status:
                    response.raise_for_status()
                if response.status_code == 200:
                    return self._decode_response_body(response)
                logger.warning(f"Fetch {url} returned status {response.status_code}")
            except Exception as e:
                logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
            if attempt < retries - 1:
                await asyncio.sleep(random.uniform(1, 3))
        # Every async attempt failed; retry once through the synchronous client in a thread.
        try:
            text = await asyncio.to_thread(
                self._sync_get, url, timeout, self.get_headers()
            )
            if text:
                logger.info(f"Fetch {url} succeeded via the sync fallback")
                return text
        except Exception as e:
            logger.warning(f"Fetch {url} sync fallback failed: {e}")
        return ""

    async def fetch_post(
        self,
        url: str,
        data: Optional[Dict[str, str]] = None,
        timeout: float = 15.0,
        retries: int = 2,
    ) -> str:
        """POST application/x-www-form-urlencoded, used for form pages such as spys.one."""
        from app.core.log import logger

        client = self._get_client()
        payload = data or {}
        to = self._http_timeout(timeout)
        for attempt in range(retries):
            try:
                response = await client.post(
                    url,
                    headers=self.get_headers(),
                    data=payload,
                    timeout=to,
                )
                if response.status_code == 200:
                    return self._decode_response_body(response)
                logger.warning(f"POST {url} returned status {response.status_code}")
            except Exception as e:
                logger.warning(f"POST {url} failed (attempt {attempt + 1}/{retries}): {e}")
            if attempt < retries - 1:
                await asyncio.sleep(random.uniform(1, 3))
        # Same sync fallback as fetch(), but for the POST path.
        try:
            text = await asyncio.to_thread(
                self._sync_post, url, payload, timeout, self.get_headers()
            )
            if text:
                logger.info(f"POST {url} succeeded via the sync fallback")
                return text
        except Exception as e:
            logger.warning(f"POST {url} sync fallback failed: {e}")
        return ""

    async def fetch_all(
        self,
        urls: List[str],
        timeout: float = 15.0,
        retries: int = 2,
    ) -> List[str]:
        """Fetch multiple URLs concurrently, capping concurrency within a single plugin."""
        semaphore = asyncio.Semaphore(self.max_concurrency)

        async def _fetch_limited(url: str):
            async with semaphore:
                return await self.fetch(url, timeout=timeout, retries=retries)

        tasks = [_fetch_limited(url) for url in urls]
        return await asyncio.gather(*tasks)

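    # Example (illustrative): bodies = await self.fetch_all(self.urls)
    # returns one body per URL, in input order, with "" for URLs whose every
    # attempt failed (fetch() swallows errors), so callers can zip(urls, bodies).
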
    def parse_text_proxies(self, text: str, protocol: str = "http") -> List[ProxyRaw]:
        """Parse a plain-text proxy list in ip:port format.

        Handles both \\r\\n and \\n line endings as well as any blank lines.
        """
        results = []
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        for line in text.split("\n"):
            line = line.strip()
            if not line or ":" not in line:
                continue
            # Split on the last colon so hosts that themselves contain ':' still parse.
            ip, _, port = line.rpartition(":")
            ip = ip.strip()
            port = port.strip()
            if ip and port.isdigit() and 1 <= int(port) <= 65535:
                try:
                    results.append(ProxyRaw(ip, int(port), protocol))
                except ValueError:
                    continue
        return results

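    # Example (illustrative):
    #   parse_text_proxies("1.2.3.4:8080\r\n5.6.7.8:1080\n\n", protocol="socks5")
    #   -> [ProxyRaw("1.2.3.4", 8080, "socks5"), ProxyRaw("5.6.7.8", 1080, "socks5")]
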
    def parse_html_table(
        self,
        html: str,
        column_map: dict,
        protocol: str = "http",
    ) -> List[ProxyRaw]:
        """Generic HTML table parser.

        Args:
            html: HTML text
            column_map: column-name-to-index mapping, e.g. {"ip": 0, "port": 1, "protocol": 4}
            protocol: default protocol, used when the table has no protocol column
        """
        results = []
        soup = BeautifulSoup(html, "lxml")
        table = soup.find("table")
        if not table:
            return results
        ip_idx = column_map.get("ip", 0)
        port_idx = column_map.get("port", 1)
        protocol_idx = column_map.get("protocol", -1)
        for row in table.find_all("tr"):
            tds = row.find_all("td")
            if len(tds) <= max(ip_idx, port_idx):
                continue
            ip = tds[ip_idx].get_text(strip=True)
            port = tds[port_idx].get_text(strip=True)
            if protocol_idx >= 0 and len(tds) > protocol_idx:
                proto = tds[protocol_idx].get_text(strip=True).lower()
                if proto not in VALID_PROTOCOLS:
                    proto = protocol
            else:
                proto = protocol
            if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit() and 1 <= int(port) <= 65535:
                try:
                    results.append(ProxyRaw(ip, int(port), proto))
                except ValueError:
                    continue
        return results

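    # Example (illustrative): for rows shaped like
    #   <tr><td>1.2.3.4</td><td>8080</td><td>CN</td><td>elite</td><td>socks5</td></tr>
    # pass column_map={"ip": 0, "port": 1, "protocol": 4}; a protocol cell that is
    # not in VALID_PROTOCOLS falls back to the `protocol` argument.
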
    async def close(self):
        """Close the reused HTTP client."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
        self._client = None
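

# Usage sketch (illustrative, not part of the original module). A minimal
# hypothetical subclass wiring the pieces together: declare source URLs, fetch
# them concurrently with fetch_all(), and feed the plain ip:port bodies through
# parse_text_proxies(). The plugin name, URL, and crawl() signature below are
# assumptions for illustration; real subclasses (e.g. the fpw_* plugins this
# commit adds) may hook into the plugin system differently.
#
# class ExampleTextListPlugin(BaseHTTPPlugin):
#     name = "example_text_list"  # hypothetical identifier
#
#     def __init__(self):
#         super().__init__()
#         self.urls = ["https://example.com/http.txt"]  # placeholder source
#
#     async def crawl(self) -> List[ProxyRaw]:
#         bodies = await self.fetch_all(self.urls, timeout=15.0)
#         proxies: List[ProxyRaw] = []
#         for body in bodies:
#             proxies.extend(self.parse_text_proxies(body, protocol="http"))
#         await self.close()
#         return proxies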