ProxyPool/app/plugins/kuaidaili.py
祀梦 0131c8b408 feat: fpw plugins, validation/crawl perf, WS stats, test DB isolation
- Add Free_Proxy_Website-style fpw_* plugins and register them
- Per-plugin crawl timeout (crawl_timeout_seconds=120); remove global crawl_timeout setting (see the sketch below)
- Validator: fix connect vs total timeout on save; SOCKS session LRU cache; drop redundant semaphore
- Validation handler uses single DB connection; batch upsert after crawl; WorkerPool put_nowait
- Remove unused max_retries from settings API/UI; settings maintenance SQL + init_db cleanup of deprecated keys
- WebSocket dashboard stats; ProxyList pool_filter and API alignment
- POST /api/proxies/delete-one for IPv6-safe deletes; task poll stops on 404
- pytest uses PROXYPOOL_DB_PATH=db/proxies.test.sqlite so tests do not wipe production DB
- .gitignore: explicit proxies.test.sqlite patterns; fix plugin_service ValidationException import

Made-with: Cursor
2026-04-05 13:39:19 +08:00
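
A minimal sketch of the per-plugin crawl timeout mentioned above (hypothetical wrapper; only the crawl_timeout_seconds=120 default comes from the commit message, the function name and its placement are assumed):

import asyncio

async def run_crawl_with_timeout(plugin, crawl_timeout_seconds: int = 120):
    # Bound each plugin's crawl independently so one slow source
    # cannot stall the whole crawl round.
    try:
        return await asyncio.wait_for(plugin.crawl(), timeout=crawl_timeout_seconds)
    except asyncio.TimeoutError:
        return []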


import re
import asyncio
import random
from typing import List

from bs4 import BeautifulSoup

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger

VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")


class KuaiDaiLiPlugin(BaseHTTPPlugin):
    default_config = {"max_pages": 5}
    name = "kuaidaili"
    display_name = "快代理"
    description = "Crawl free proxies from the KuaiDaiLi website"

    def __init__(self):
        super().__init__()
        # The fps/dps list pages currently still return 200, while inha/intr
        # often return 567 (anti-crawl), so they are kept last as a fallback.
        self.urls = [
            "https://www.kuaidaili.com/free/fps/",
            "https://www.kuaidaili.com/free/dps/",
            "https://www.kuaidaili.com/free/inha/1/",
            "https://www.kuaidaili.com/free/intr/1/",
        ]

    def get_headers(self) -> dict:
        headers = super().get_headers()
        headers["Referer"] = "https://www.kuaidaili.com/free/"
        headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
        headers["Accept-Encoding"] = "gzip, deflate"
        headers["Accept-Language"] = "zh-CN,zh;q=0.9,en;q=0.8"
        headers["Sec-Fetch-Dest"] = "document"
        headers["Sec-Fetch-Mode"] = "navigate"
        headers["Sec-Fetch-Site"] = "same-origin"
        headers["Upgrade-Insecure-Requests"] = "1"
        return headers

    @staticmethod
    def _infer_protocol(texts: List[str]) -> str:
        """Infer the protocol from a row's cell texts (handles the fps / dps / inha layouts)."""
        for t in texts[2:]:
            tl = t.lower().replace(" ", "")
            if tl in VALID_PROTOCOLS:
                return tl
            if "http(s)" in tl or tl in ("http/https",):
                return "http"
            if "socks5" in tl:
                return "socks5"
            if "socks4" in tl:
                return "socks4"
        # Some layouts carry the protocol in a fixed fifth column.
        if len(texts) >= 5:
            t4 = texts[4].lower().strip()
            if t4 in VALID_PROTOCOLS:
                return t4
        return "http"

    def _parse_table(self, table) -> List[ProxyRaw]:
        out: List[ProxyRaw] = []
        for row in table.find_all("tr"):
            tds = row.find_all("td")
            if len(tds) < 2:
                continue
            texts = [td.get_text(strip=True) for td in tds]
            ip = texts[0]
            port_s = texts[1]
            # Keep only dotted-quad IPv4 rows with a sane port number.
            if not re.match(r"^\d+\.\d+\.\d+\.\d+$", ip):
                continue
            if not port_s.isdigit() or not (1 <= int(port_s) <= 65535):
                continue
            protocol = self._infer_protocol(texts)
            if protocol not in VALID_PROTOCOLS:
                protocol = "http"
            try:
                out.append(ProxyRaw(ip, int(port_s), protocol))
            except ValueError:
                continue
        return out
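
    # Example (hypothetical row): <tr><td>1.2.3.4</td><td>8080</td>
    # <td>高匿名</td><td>HTTP</td></tr> yields ProxyRaw("1.2.3.4", 8080, "http");
    # rows without a dotted-quad IP or a valid port are skipped.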

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        # Visit the index page first, then pace the list-page fetches with
        # short random delays to reduce the chance of tripping anti-crawl.
        await self.fetch("https://www.kuaidaili.com/free/", timeout=10)
        await asyncio.sleep(random.uniform(1, 2))
        for url in self.urls:
            html = await self.fetch(url, timeout=15)
            if not html:
                continue
            soup = BeautifulSoup(html, "lxml")
            table = soup.find("table")
            if not table:
                logger.warning(f"{self.display_name} no proxy table found, anti-crawl may have been triggered: {url}")
                continue
            batch = self._parse_table(table)
            if batch:
                results.extend(batch)
                logger.info(f"{self.display_name} {url} parsed {len(batch)} proxies")
            await asyncio.sleep(random.uniform(1, 2))
        if results:
            logger.info(f"{self.display_name} parsing finished, got {len(results)} candidate proxies")
        return results
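
For a quick standalone check, a minimal harness along these lines can drive the plugin (hypothetical usage, not part of the file: the real app invokes plugins through app.core.plugin_system, and this assumes BaseHTTPPlugin manages its own HTTP session):

if __name__ == "__main__":
    async def _demo() -> None:
        plugin = KuaiDaiLiPlugin()
        proxies = await plugin.crawl()
        # Print a small sample of the raw candidates.
        print(f"got {len(proxies)} candidates")
        for p in proxies[:5]:
            print(p)

    asyncio.run(_demo())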