"""Scrape gatherproxy.com's inline page JSON (PROXY_IP / PROXY_PORT pairs)."""
import re
from typing import List

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger

# Compiled once at import time instead of per _extract_from_text call.
# Pattern 1: tolerant JS-style pair — optional/mixed quoting, up to 120 chars
# between the PROXY_IP and PROXY_PORT keys (non-greedy, DOTALL).
_LOOSE_PAIR_RE = re.compile(
    r"PROXY_IP['\"]?\s*:\s*['\"]([\d.]+)['\"]"
    r".{0,120}?"
    r"PROXY_PORT['\"]?\s*:\s*['\"](\d+)['\"]",
    re.DOTALL | re.IGNORECASE,
)
# Pattern 2: strict one-level JSON object containing both keys, double-quoted.
_JSON_OBJ_RE = re.compile(
    r"\{[^{}]*\"PROXY_IP\"\s*:\s*\"([\d.]+)\""
    r"[^{}]*\"PROXY_PORT\"\s*:\s*\"(\d+)\"[^{}]*\}"
)


class FpwGatherproxyPlugin(BaseHTTPPlugin):
    """Proxy source plugin for gatherproxy.com.

    The site embeds its proxy list as JSON/JS inside the HTML page rather
    than serving a clean API, so proxies are pulled out with regexes.
    NOTE(review): the site is frequently rate-limited (see ``description``),
    hence the single retry in :meth:`crawl`.
    """

    name = "fpw_gatherproxy"
    display_name = "GatherProxy"
    description = "gatherproxy.com 内嵌代理 JSON(站点常有限流)"

    def __init__(self):
        super().__init__()
        # Two listing pages: elite-anonymity proxies and US proxies.
        # Both embed the same inline-JSON structure.
        self.urls = [
            "http://www.gatherproxy.com/proxylist/anonymity/?t=Elite",
            "http://www.gatherproxy.com/proxylist/country/?c=United%20States",
        ]

    def _extract_from_text(self, text: str) -> List[ProxyRaw]:
        """Extract ``ProxyRaw`` entries from a page body.

        Runs both patterns in order (loose pairs first, then strict JSON
        objects), so the same proxy may appear twice — the caller is
        responsible for deduplication.  Ports outside 1-65535 are skipped.
        """
        results: List[ProxyRaw] = []
        for pattern in (_LOOSE_PAIR_RE, _JSON_OBJ_RE):
            for m in pattern.finditer(text):
                ip, port = m.group(1), m.group(2)
                # (\d+) guarantees an all-digit string, so only the
                # numeric range needs validating (original also called
                # port.isdigit(), which could never be False here).
                if not 1 <= int(port) <= 65535:
                    continue
                try:
                    results.append(ProxyRaw(ip, int(port), "http"))
                except ValueError:
                    # ProxyRaw may reject a malformed IP; best-effort skip.
                    continue
        return results

    async def crawl(self) -> List[ProxyRaw]:
        """Fetch every listing URL and return deduplicated proxies.

        Deduplication key is (ip, port); first occurrence wins.  Pages
        that failed to download (falsy body) are skipped silently.
        """
        seen = set()
        out: List[ProxyRaw] = []
        htmls = await self.fetch_all(self.urls, timeout=10, retries=1)
        for url, html in zip(self.urls, htmls):
            if not html:
                continue
            for p in self._extract_from_text(html):
                k = (p.ip, p.port)
                if k not in seen:
                    seen.add(k)
                    out.append(p)
            if out:
                # Cumulative count across URLs processed so far.
                logger.info(f"{self.display_name} 自 {url} 累计 {len(out)} 条")
        return out