ProxyPool/app/plugins/proxyscrape.py

"""ProxyScrape 测试爬虫 - 用于验证架构，支持全协议类型"""
import asyncio
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger


class ProxyScrapePlugin(BaseHTTPPlugin):
    default_config = {"max_pages": 5}
    """
    从 ProxyScrape 公开 API 获取代理库
    覆盖 http/https/socks4/socks5 全协议，专门用于测试插件系统的可扩展性
    """

    name = "proxyscrape"
    display_name = "ProxyScrape测试站"
    description = "从 ProxyScrape API 获取各类型代理（HTTP/HTTPS/SOCKS4/SOCKS5），用于测试架构扩展"
    enabled = True

    def __init__(self):
        super().__init__()
        # GitHub raw 首选；国内/高负载时 jsDelivr 镜像常更稳
        self.urls = [
            ("http", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt"),
            ("https", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/https.txt"),
            ("socks4", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt"),
            ("socks5", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt"),
        ]
        self._mirror_prefix = (
            "https://cdn.jsdelivr.net/gh/monosans/proxy-list@main/proxies/"
        )
        # ProxyScrape 官方 API 作为 fallback
        self.api_urls = {
            "http": "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
            "https": "https://api.proxyscrape.com/v2/?request=get&protocol=https&timeout=10000&country=all&ssl=all&anonymity=all",
            "socks4": "https://api.proxyscrape.com/v2/?request=get&protocol=socks4&timeout=10000&country=all&ssl=all&anonymity=all",
            "socks5": "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all&ssl=all&anonymity=all",
        }

    def _parse_proxies(self, text: str, protocol: str) -> List[ProxyRaw]:
        """解析 ip:port 每行的文本内容"""
        proxies = []
        for line in text.splitlines():
            line = line.strip()
            if not line or ":" not in line:
                continue
            ip, _, port_str = line.rpartition(":")
            ip = ip.strip()
            port_str = port_str.strip()
            if port_str.isdigit() and 1 <= int(port_str) <= 65535:
                try:
                    proxies.append(ProxyRaw(ip, int(port_str), protocol))
                except ValueError:
                    continue
        return proxies

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        protocols = [protocol for protocol, _ in self.urls]
        urls = [url for _, url in self.urls]
        fetch_timeout = 28.0

        # 1. GitHub raw：放宽总等待，避免 crawl-all 时与其它插件抢带宽导致集体超时
        tasks = [
            asyncio.create_task(self.fetch(url, timeout=fetch_timeout))
            for url in urls
        ]
        done, pending = await asyncio.wait(tasks, timeout=45)
        for task in pending:
            task.cancel()
        htmls: list[str] = []
        done_protocols: set[str] = set()
        for i, task in enumerate(tasks):
            try:
                if task in done:
                    htmls.append(task.result())
                    done_protocols.add(protocols[i])
                else:
                    htmls.append("")
            except Exception:
                htmls.append("")

        need_mirror: list[str] = []
        for protocol, html in zip(protocols, htmls):
            proxies = self._parse_proxies(html or "", protocol) if html else []
            if proxies:
                logger.info(
                    f"ProxyScrape {protocol.upper()} GitHub raw 获取 {len(proxies)} 个代理"
                )
                results.extend(proxies)
            else:
                if protocol in done_protocols:
                    logger.warning(
                        f"ProxyScrape {protocol.upper()} GitHub raw 返回空或无效，尝试镜像与 API"
                    )
                else:
                    logger.warning(
                        f"ProxyScrape {protocol.upper()} GitHub raw 请求超时，尝试镜像与 API"
                    )
                need_mirror.append(protocol)

        # 2. jsDelivr 镜像（顺序请求，减轻与其它插件的瞬时并发叠加）
        still_need_api: list[str] = []
        for protocol in need_mirror:
            mirror_url = f"{self._mirror_prefix}{protocol}.txt"
            text = await self.fetch(mirror_url, timeout=fetch_timeout, retries=2)
            proxies = self._parse_proxies(text or "", protocol) if text else []
            if proxies:
                logger.info(
                    f"ProxyScrape {protocol.upper()} jsDelivr 镜像获取 {len(proxies)} 个代理"
                )
                results.extend(proxies)
            else:
                still_need_api.append(protocol)

        # 3. ProxyScrape 官方 API
        if still_need_api:
            fallback_urls = [self.api_urls[p] for p in still_need_api]
            try:
                api_htmls = await asyncio.wait_for(
                    self.fetch_all(fallback_urls, timeout=25), timeout=35
                )
            except asyncio.TimeoutError:
                logger.warning(
                    f"ProxyScrape API fallback 批量请求超时，跳过 {len(still_need_api)} 个协议"
                )
                api_htmls = [""] * len(still_need_api)
            for protocol, api_html in zip(still_need_api, api_htmls):
                proxies = (
                    self._parse_proxies(api_html or "", protocol) if api_html else []
                )
                if proxies:
                    logger.info(
                        f"ProxyScrape {protocol.upper()} API 获取 {len(proxies)} 个代理"
                    )
                    results.extend(proxies)
                else:
                    logger.warning(f"ProxyScrape {protocol.upper()} API 返回空或无效")

        if results:
            logger.info(f"ProxyScrape 总计获取 {len(results)} 个代理")
        else:
            logger.warning("ProxyScrape 所有真实源均不可用，返回空列表")
        return results