- fpw_proxy_list_download: parse JSON list/proxies bodies; jsDelivr monosans tier; crawl timeout 300s
- fpw_socks_ssl: try parse_html_table before regex
- fpw_hidemy: loose row scan when fixed columns fail
- fpw_proxynova: plain IP/port row fallback
- fpw_spys_one: HTTPS endpoints; crawl timeout 180s
- fpw_gatherproxy: HTTPS + extra JSON key patterns
- fpw_checkerproxy: lower min HTML length for parse
- fpw_premproxy: ip:port regex fallback when few table rows (sketched below)

Made-with: Cursor
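
Several of the changes above (fpw_proxynova, fpw_premproxy) share one fallback: scan the raw HTML for bare ip:port pairs once structured parsing yields too few rows. A minimal sketch of that fallback; the helper name and return type are illustrative, not lifted from those plugins:

import re
from typing import List, Tuple

IP_PORT_RE = re.compile(r"\b((?:\d{1,3}\.){3}\d{1,3}):(\d{1,5})\b")

def ip_port_fallback(html: str) -> List[Tuple[str, int]]:
    """Collect plausible ip:port pairs from raw HTML, range-checking the port."""
    return [
        (ip, int(port))
        for ip, port in IP_PORT_RE.findall(html)
        if 1 <= int(port) <= 65535
    ]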
73 lines · 2.6 KiB · Python
"""gatherproxy.com 页面内嵌 JSON(PROXY_IP / PROXY_PORT)。"""

import re
from typing import List

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger


class FpwGatherproxyPlugin(BaseHTTPPlugin):
    name = "fpw_gatherproxy"
    display_name = "GatherProxy"
    description = "Embedded proxy JSON on gatherproxy.com (the site is often rate-limited)"

    def __init__(self):
        super().__init__()
        self.urls = [
            "https://www.gatherproxy.com/proxylist/anonymity/?t=Elite",
            "https://www.gatherproxy.com/proxylist/country/?c=United%20States",
        ]

    def _extract_from_text(self, text: str) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        # The page embeds proxies in a few JSON-ish shapes; each pattern below
        # captures an (ip, port) pair. Cross-pattern de-duplication happens in crawl().
        patterns = [
            # Loose PROXY_IP ... PROXY_PORT pairs (quoted or bare keys, any case).
            (
                r"PROXY_IP['\"]?\s*:\s*['\"]([\d.]+)['\"]"
                r".{0,120}?PROXY_PORT['\"]?\s*:\s*['\"](\d+)['\"]",
                re.DOTALL | re.IGNORECASE,
            ),
            # Strict per-proxy objects: {"PROXY_IP": "...", "PROXY_PORT": "..."}.
            (
                r"\{[^{}]*\"PROXY_IP\"\s*:\s*\"([\d.]+)\""
                r"[^{}]*\"PROXY_PORT\"\s*:\s*\"(\d+)\"[^{}]*\}",
                0,
            ),
            # Generic "ip"/"port" keys, optionally prefixed with "proxy_".
            (
                r'"(?:proxy_)?ip"\s*:\s*"([\d.]+)"\s*,\s*"(?:proxy_)?port"\s*:\s*"?(\d+)"?',
                re.IGNORECASE,
            ),
        ]
        for pattern, flags in patterns:
            for m in re.finditer(pattern, text, flags):
                ip, port = m.group(1), m.group(2)
                # (\d+) already guarantees digits; only range-check the port.
                if not 1 <= int(port) <= 65535:
                    continue
                try:
                    results.append(ProxyRaw(ip, int(port), "http"))
                except ValueError:
                    # ProxyRaw may reject malformed IPs; skip those entries.
                    continue
        return results

    async def crawl(self) -> List[ProxyRaw]:
        seen = set()
        out: List[ProxyRaw] = []
        # One body per URL, aligned with self.urls; failed fetches come back falsy.
        htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
        for url, html in zip(self.urls, htmls):
            if not html:
                continue
            for p in self._extract_from_text(html):
                k = (p.ip, p.port)
                if k not in seen:
                    seen.add(k)
                    out.append(p)
            if out:
                logger.info(f"{self.display_name}: {len(out)} proxies accumulated through {url}")
        return out
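

# --- Illustrative offline check (a sketch, not part of the plugin API).
# Assumes ProxyRaw exposes .ip/.port as constructed above; the sample payload
# is synthetic, not a real gatherproxy.com response.
if __name__ == "__main__":
    sample = (
        'var data = [{"PROXY_IP": "203.0.113.7", "PROXY_PORT": "3128"},'
        ' {"ip": "198.51.100.2", "port": 8080}];'
    )
    found = {(p.ip, p.port) for p in FpwGatherproxyPlugin()._extract_from_text(sample)}
    print(found)  # expected: {('203.0.113.7', 3128), ('198.51.100.2', 8080)}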