# ProxyPool/app/plugins/proxylist_download.py

from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger


class ProxyListDownloadPlugin(BaseHTTPPlugin):
    """Crawl plain-text ``ip:port`` proxy lists, with per-source fallbacks.

    Every entry of ``self.sources`` names a primary URL, an ordered list of
    fallback URLs, and the protocol assigned to all proxies parsed from any
    of those URLs. A source's fallbacks are consulted whenever its primary
    URL yields no parseable proxy.
    """

    # Plugin defaults. "max_pages" is not read by any method of this class;
    # presumably it is consumed by the plugin framework -- TODO confirm.
    default_config = {"max_pages": 5}
    name = "proxylist_download"
    display_name = "ProxyListDownload"
    description = "从 GitHub 公开代理列表获取代理"

    def __init__(self):
        """Initialise the base plugin and register the proxy list sources."""
        super().__init__()
        # Preferred source is GitHub raw; the fallbacks are the jsdelivr CDN
        # mirror of the same file and the ProxyScrape API.
        self.sources = [
            {
                "primary": "https://raw.githubusercontent.com/komutan234/Proxy-List-Free/main/proxies/http.txt",
                "fallbacks": [
                    "https://cdn.jsdelivr.net/gh/komutan234/Proxy-List-Free@main/proxies/http.txt",
                    "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
                ],
                "protocol": "http",
            },
            {
                "primary": "https://raw.githubusercontent.com/komutan234/Proxy-List-Free/main/proxies/socks4.txt",
                "fallbacks": [
                    "https://cdn.jsdelivr.net/gh/komutan234/Proxy-List-Free@main/proxies/socks4.txt",
                    "https://api.proxyscrape.com/v2/?request=get&protocol=socks4&timeout=10000&country=all",
                ],
                "protocol": "socks4",
            },
            {
                "primary": "https://raw.githubusercontent.com/komutan234/Proxy-List-Free/main/proxies/socks5.txt",
                "fallbacks": [
                    "https://cdn.jsdelivr.net/gh/komutan234/Proxy-List-Free@main/proxies/socks5.txt",
                    "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all",
                ],
                "protocol": "socks5",
            },
        ]

    def _detect_protocol(self, url: str) -> str:
        """Guess the proxy protocol from a source URL.

        The ``https://`` scheme of the URL itself is deliberately not used as
        a signal; only the ``socks4`` / ``socks5`` markers are looked for.

        Args:
            url: Source URL of a proxy list.

        Returns:
            ``"socks4"`` or ``"socks5"`` when the URL contains that marker
            (``socks4`` is checked first), otherwise ``"http"``.
        """
        for marker in ("socks4", "socks5"):
            if marker in url:
                return marker
        return "http"

    def _parse_lines(self, html: str, protocol: str) -> List[ProxyRaw]:
        """Parse line-oriented ``ip:port`` text into ``ProxyRaw`` entries.

        Blank lines, lines without a colon, and lines whose port is not a
        valid TCP port number (1-65535) are skipped. Anything after a second
        colon on a line is ignored.

        Args:
            html: Response body with one proxy per line; any line-ending
                convention is accepted. ``None`` or an empty string yields an
                empty list.
            protocol: Protocol label attached to every parsed proxy.

        Returns:
            The parsed proxies, in input order.
        """
        if not html:
            return []

        results = []
        # splitlines() copes with \r\n, \r and \n without rewriting the text.
        for raw_line in html.splitlines():
            ip, colon, remainder = raw_line.strip().partition(":")
            if not colon:
                continue
            ip = ip.strip()
            port = remainder.split(":", 1)[0].strip()
            if not ip:
                continue
            # str.isdigit() alone also accepts non-ASCII digits such as "²",
            # which int() rejects with ValueError. The length cap drops
            # absurdly long digit runs before they reach int().
            if len(port) > 5 or not (port.isascii() and port.isdigit()):
                continue
            port_number = int(port)
            if not 1 <= port_number <= 65535:
                continue
            results.append(ProxyRaw(ip, port_number, protocol))
        return results

    async def _crawl_fallbacks(self, source: dict) -> List[ProxyRaw]:
        """Try a source's fallback URLs in order until one yields proxies.

        Args:
            source: One entry of ``self.sources``.

        Returns:
            Proxies parsed from the first fallback that produced at least
            one, or an empty list when none did.
        """
        for fallback_url in source["fallbacks"]:
            fallback_html = await self.fetch(fallback_url, timeout=15)
            if not isinstance(fallback_html, str):
                continue
            protocol = source.get("protocol") or self._detect_protocol(fallback_url)
            proxies = self._parse_lines(fallback_html, protocol)
            # A non-empty body without any proxy in it (error page, rate-limit
            # notice, ...) must not stop the search, so only stop on success.
            if proxies:
                return proxies
        logger.warning(f"{self.display_name} 所有 fallback 均未获得代理: {source['primary']}")
        return []

    async def crawl(self) -> List[ProxyRaw]:
        """Fetch every configured source and return all parsed proxies.

        All primary URLs are requested together through ``fetch_all``; a
        source whose primary response yields no proxy is retried through its
        fallback URLs, one at a time.

        Returns:
            Proxies collected from all sources; empty when nothing could be
            fetched or parsed.
        """
        results = []
        primary_urls = [source["primary"] for source in self.sources]
        # Assumes fetch_all returns one response per URL in request order,
        # as the original index-based pairing did -- TODO confirm.
        primary_htmls = list(await self.fetch_all(primary_urls, timeout=15) or [])

        for idx, source in enumerate(self.sources):
            # A source without a matching response is treated as a failed
            # primary so that its fallbacks still get a chance.
            html = primary_htmls[idx] if idx < len(primary_htmls) else None
            protocol = source.get("protocol") or self._detect_protocol(source["primary"])

            proxies = self._parse_lines(html, protocol) if isinstance(html, str) else []
            if proxies:
                results.extend(proxies)
                continue

            # The primary produced nothing usable (empty, whitespace only, or
            # content without any proxy line): walk the fallbacks.
            logger.warning(f"{self.display_name} 主源返回空，尝试 fallback: {source['primary']}")
            results.extend(await self._crawl_fallbacks(source))

        if results:
            logger.info(f"{self.display_name} 解析完成，获得 {len(results)} 个潜在代理")
        return results