ProxyPool/app/plugins/base.py

"""通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装"""
import re
import random
import asyncio
import httpx
from typing import List, Optional
from bs4 import BeautifulSoup
from app.core.plugin_system import BaseCrawlerPlugin
from app.models.domain import ProxyRaw


VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")


class BaseHTTPPlugin(BaseCrawlerPlugin):
    """基于 HTTP 的爬虫插件基类"""

    def __init__(self):
        super().__init__()
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
        ]
        self.urls: List[str] = []
        self.current_url: str = ""
        self._client: Optional[httpx.AsyncClient] = None
        self.max_concurrency: int = 3

    def get_headers(self) -> dict:
        return {
            "User-Agent": random.choice(self.user_agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
        }

    def _get_client(self) -> httpx.AsyncClient:
        """获取或创建复用的 AsyncClient"""
        if self._client is None or self._client.is_closed:
            transport = httpx.AsyncHTTPTransport(retries=0)
            self._client = httpx.AsyncClient(
                transport=transport,
                follow_redirects=True,
            )
        return self._client

    async def fetch(
        self,
        url: str,
        timeout: float = 15.0,
        retries: int = 2,
        raise_for_status: bool = False,
    ) -> str:
        """异步抓取指定 URL 的 HTML 内容"""
        from app.core.log import logger
        client = self._get_client()
        for attempt in range(retries):
            try:
                response = await client.get(url, headers=self.get_headers(), timeout=timeout)
                if raise_for_status:
                    response.raise_for_status()
                if response.status_code == 200:
                    content = response.content
                    encoding = response.encoding
                    if encoding == "utf-8" or not encoding:
                        try:
                            return content.decode("utf-8")
                        except UnicodeDecodeError:
                            return content.decode("gbk", errors="ignore")
                    return content.decode(encoding, errors="ignore")
                else:
                    logger.warning(f"Fetch {url} returned status {response.status_code}")
            except Exception as e:
                logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
            if attempt < retries - 1:
                await asyncio.sleep(random.uniform(1, 3))
        return ""

    async def fetch_all(self, urls: List[str], timeout: float = 15.0) -> List[str]:
        """并发抓取多个 URL，限制单个插件内部并发"""
        semaphore = asyncio.Semaphore(self.max_concurrency)

        async def _fetch_limited(url: str):
            async with semaphore:
                return await self.fetch(url, timeout=timeout)

        tasks = [_fetch_limited(url) for url in urls]
        return await asyncio.gather(*tasks)

    def parse_text_proxies(self, text: str, protocol: str = "http") -> List[ProxyRaw]:
        """解析 ip:port 格式的文本代理列表

        统一处理 \r\n、\n 两种换行以及可能存在的空行。
        """
        results = []
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        for line in text.split("\n"):
            line = line.strip()
            if not line or ":" not in line:
                continue
            ip, _, port = line.rpartition(":")
            ip = ip.strip()
            port = port.strip()
            if ip and port.isdigit() and 1 <= int(port) <= 65535:
                try:
                    results.append(ProxyRaw(ip, int(port), protocol))
                except ValueError:
                    continue
        return results

    def parse_html_table(
        self,
        html: str,
        column_map: dict,
        protocol: str = "http",
    ) -> List[ProxyRaw]:
        """通用 HTML 表格解析器

        Args:
            html: HTML 文本
            column_map: 列名到索引的映射，如 {"ip": 0, "port": 1, "protocol": 4}
            protocol: 默认协议，如果表格中没有协议列则使用此值
        """
        results = []
        soup = BeautifulSoup(html, "lxml")
        table = soup.find("table")
        if not table:
            return results

        ip_idx = column_map.get("ip", 0)
        port_idx = column_map.get("port", 1)
        protocol_idx = column_map.get("protocol", -1)

        for row in table.find_all("tr"):
            tds = row.find_all("td")
            if len(tds) <= max(ip_idx, port_idx):
                continue
            ip = tds[ip_idx].get_text(strip=True)
            port = tds[port_idx].get_text(strip=True)
            if protocol_idx >= 0 and len(tds) > protocol_idx:
                proto = tds[protocol_idx].get_text(strip=True).lower()
                if proto not in VALID_PROTOCOLS:
                    proto = protocol
            else:
                proto = protocol

            if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit() and 1 <= int(port) <= 65535:
                try:
                    results.append(ProxyRaw(ip, int(port), proto))
                except ValueError:
                    continue
        return results

    async def close(self):
        """关闭复用的 HTTP 客户端"""
        if self._client and not self._client.is_closed:
            await self._client.aclose()
            self._client = None