- Fix BaseHTTPPlugin connection pooling, concurrency control, exception logging, and timeout policy (see the sketch after this list)
- Fix and harden the stability and fallback mechanisms of 8 crawler plugins
- Clean up 40k+ pending tasks in the validation_tasks table so the queue no longer stalls
- Fix the 500 errors caused by the missing global app instance in app/api/main.py
- Raise the frontend Axios timeout to 120 seconds to avoid dropped requests
- Fix plugin statistics persistence and scheduler lifecycle issues
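The first item refers to BaseHTTPPlugin, which the plugin below calls through self.fetch(url, timeout=12) but which is not itself part of this diff. Below is a minimal sketch of the pattern those fixes describe, assuming aiohttp; the class name SketchHTTPPlugin, the max_concurrency parameter, and the field names are hypothetical illustrations, not the project's real API.

import asyncio
import logging
from typing import Optional

import aiohttp

logger = logging.getLogger(__name__)


class SketchHTTPPlugin:
    """Hypothetical stand-in for BaseHTTPPlugin: shared pool, bounded concurrency, per-request timeout."""

    def __init__(self, max_concurrency: int = 5):
        self._session: Optional[aiohttp.ClientSession] = None
        # Bound in-flight requests so one plugin cannot exhaust the shared connection pool
        self._semaphore = asyncio.Semaphore(max_concurrency)

    async def _get_session(self) -> aiohttp.ClientSession:
        # Lazily create one session (one connection pool) and reuse it across requests
        if self._session is None or self._session.closed:
            connector = aiohttp.TCPConnector(limit=20, ttl_dns_cache=300)
            self._session = aiohttp.ClientSession(connector=connector)
        return self._session

    async def fetch(self, url: str, timeout: int = 12) -> Optional[str]:
        session = await self._get_session()
        try:
            async with self._semaphore:
                # Per-request total timeout instead of a single global default
                async with session.get(url, timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
                    if resp.status != 200:
                        logger.warning("fetch %s -> HTTP %s", url, resp.status)
                        return None
                    return await resp.text()
        except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
            # Log the failure with context instead of swallowing it silently
            logger.warning("fetch %s failed: %r", url, exc)
            return None

    async def close(self) -> None:
        # Release the pool when the scheduler shuts the plugin down
        if self._session and not self._session.closed:
            await self._session.close()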
import re
from typing import List

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger

VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")


class YunDaiLiPlugin(BaseHTTPPlugin):
    default_config = {"max_pages": 5}
    name = "yundaili"
    display_name = "云代理"
    description = "Fetches free proxies from a public proxy list on GitHub"

    def __init__(self):
        super().__init__()
        # Primary source: GitHub raw
        self.urls = [
            ("http", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/http.txt"),
            ("socks4", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks4.txt"),
            ("socks5", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks5.txt"),
        ]
        # Fallback: the same files served through the jsdelivr CDN
        self.fallback_urls = [
            ("http", "https://cdn.jsdelivr.net/gh/mmpx12/proxy-list@master/http.txt"),
            ("socks4", "https://cdn.jsdelivr.net/gh/mmpx12/proxy-list@master/socks4.txt"),
            ("socks5", "https://cdn.jsdelivr.net/gh/mmpx12/proxy-list@master/socks5.txt"),
        ]

    def _parse_htmls(self, htmls: List[str], url_mapping: List[tuple]) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        for (protocol, _), html in zip(url_mapping, htmls):
            if not html:
                logger.warning(
                    f"{self.display_name} {protocol.upper()} returned empty content; "
                    "the network may be restricted or the source may be dead"
                )
                continue

            count = 0
            # Each line is expected to be "ip:port"
            for line in html.splitlines():
                line = line.strip()
                if not line or ":" not in line:
                    continue
                parts = line.split(":")
                if len(parts) < 2:
                    continue
                ip = parts[0].strip()
                port_str = parts[1].strip()
                # Loose IPv4 shape check; octet ranges are left to downstream validation
                if not re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip):
                    continue
                if not port_str.isdigit() or not (1 <= int(port_str) <= 65535):
                    continue
                final_protocol = protocol if protocol in VALID_PROTOCOLS else "http"
                results.append(ProxyRaw(ip, int(port_str), final_protocol))
                count += 1

            if count:
                logger.info(f"{self.display_name} {protocol.upper()} parsed, {count} candidate proxies found")
        return results

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []

        # Request the primary sources sequentially so one stuck URL cannot stall the whole batch
        for protocol, url in self.urls:
            html = await self.fetch(url, timeout=12)
            if html:
                results.extend(self._parse_htmls([html], [(protocol, url)]))

        # If every primary source came back empty, try the fallbacks (also sequentially)
        if not results:
            logger.warning(f"{self.display_name} all GitHub primary sources returned empty, trying the jsdelivr fallback")
            for protocol, url in self.fallback_urls:
                html = await self.fetch(url, timeout=12)
                if html:
                    results.extend(self._parse_htmls([html], [(protocol, url)]))

        if results:
            logger.info(f"{self.display_name} finished, {len(results)} candidate proxies in total")
        else:
            logger.warning(f"{self.display_name} collected no proxies")
        return results
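For reference, a minimal sketch of driving the plugin by hand; the module path app.plugins.yundaili is an assumption, and in the real project the scheduler presumably owns plugin start-up and shutdown.

import asyncio

from app.plugins.yundaili import YunDaiLiPlugin  # module path assumed


async def main() -> None:
    plugin = YunDaiLiPlugin()
    proxies = await plugin.crawl()
    print(f"collected {len(proxies)} candidate proxies")


if __name__ == "__main__":
    asyncio.run(main())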