"""通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装""" import re import random import asyncio import httpx from typing import List, Optional from bs4 import BeautifulSoup from app.core.plugin_system import BaseCrawlerPlugin from app.models.domain import ProxyRaw VALID_PROTOCOLS = ("http", "https", "socks4", "socks5") class BaseHTTPPlugin(BaseCrawlerPlugin): """基于 HTTP 的爬虫插件基类""" def __init__(self): super().__init__() self.user_agents = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0", ] self.urls: List[str] = [] self.current_url: str = "" self._client: Optional[httpx.AsyncClient] = None self.max_concurrency: int = 3 def get_headers(self) -> dict: return { "User-Agent": random.choice(self.user_agents), "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2", "Connection": "keep-alive", } def _get_client(self) -> httpx.AsyncClient: """获取或创建复用的 AsyncClient""" if self._client is None or self._client.is_closed: transport = httpx.AsyncHTTPTransport(retries=0) self._client = httpx.AsyncClient( transport=transport, follow_redirects=True, ) return self._client async def fetch( self, url: str, timeout: float = 15.0, retries: int = 2, raise_for_status: bool = False, ) -> str: """异步抓取指定 URL 的 HTML 内容""" from app.core.log import logger client = self._get_client() for attempt in range(retries): try: response = await client.get(url, headers=self.get_headers(), timeout=timeout) if raise_for_status: response.raise_for_status() if response.status_code == 200: content = response.content encoding = response.encoding if encoding == "utf-8" or not encoding: try: return content.decode("utf-8") except UnicodeDecodeError: return content.decode("gbk", errors="ignore") return content.decode(encoding, errors="ignore") else: logger.warning(f"Fetch {url} returned status {response.status_code}") except Exception as e: logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}") if attempt < retries - 1: await asyncio.sleep(random.uniform(1, 3)) return "" async def fetch_all(self, urls: List[str], timeout: float = 15.0) -> List[str]: """并发抓取多个 URL,限制单个插件内部并发""" semaphore = asyncio.Semaphore(self.max_concurrency) async def _fetch_limited(url: str): async with semaphore: return await self.fetch(url, timeout=timeout) tasks = [_fetch_limited(url) for url in urls] return await asyncio.gather(*tasks) def parse_text_proxies(self, text: str, protocol: str = "http") -> List[ProxyRaw]: """解析 ip:port 格式的文本代理列表 统一处理 \r\n、\n 两种换行以及可能存在的空行。 """ results = [] text = text.replace("\r\n", "\n").replace("\r", "\n") for line in text.split("\n"): line = line.strip() if not line or ":" not in line: continue ip, _, port = line.rpartition(":") ip = ip.strip() port = port.strip() if ip and port.isdigit() and 1 <= int(port) <= 65535: try: results.append(ProxyRaw(ip, int(port), protocol)) except ValueError: continue return results def parse_html_table( self, html: str, column_map: dict, protocol: str = "http", ) -> List[ProxyRaw]: """通用 HTML 表格解析器 Args: html: HTML 文本 column_map: 列名到索引的映射,如 {"ip": 0, "port": 1, "protocol": 4} protocol: 默认协议,如果表格中没有协议列则使用此值 """ 
results = [] soup = BeautifulSoup(html, "lxml") table = soup.find("table") if not table: return results ip_idx = column_map.get("ip", 0) port_idx = column_map.get("port", 1) protocol_idx = column_map.get("protocol", -1) for row in table.find_all("tr"): tds = row.find_all("td") if len(tds) <= max(ip_idx, port_idx): continue ip = tds[ip_idx].get_text(strip=True) port = tds[port_idx].get_text(strip=True) if protocol_idx >= 0 and len(tds) > protocol_idx: proto = tds[protocol_idx].get_text(strip=True).lower() if proto not in VALID_PROTOCOLS: proto = protocol else: proto = protocol if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit() and 1 <= int(port) <= 65535: try: results.append(ProxyRaw(ip, int(port), proto)) except ValueError: continue return results async def close(self): """关闭复用的 HTTP 客户端""" if self._client and not self._client.is_closed: await self._client.aclose() self._client = None
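

# Usage sketch (illustrative, not part of the plugin API): a minimal concrete
# plugin built on BaseHTTPPlugin, runnable only when executing this module
# directly. The _DemoPlugin name, its crawl() method, and the source URL are
# hypothetical -- real plugins implement whatever hooks
# app.core.plugin_system.BaseCrawlerPlugin actually defines.
if __name__ == "__main__":

    class _DemoPlugin(BaseHTTPPlugin):
        """Hypothetical plugin that scrapes a plain-text ip:port list."""

        def __init__(self):
            super().__init__()
            self.urls = ["https://example.com/proxies.txt"]  # placeholder URL

        async def crawl(self) -> List[ProxyRaw]:
            # Fetch all source pages concurrently, then parse each as ip:port text.
            pages = await self.fetch_all(self.urls)
            found: List[ProxyRaw] = []
            for page in pages:
                found.extend(self.parse_text_proxies(page, protocol="http"))
            await self.close()
            return found

    print(asyncio.run(_DemoPlugin().crawl()))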