"""freeproxylists.net 及常见镜像路径(表格 / 纯文本)。""" import re from typing import List from bs4 import BeautifulSoup from app.core.plugin_system import ProxyRaw from app.plugins.base import BaseHTTPPlugin from app.core.log import logger class FpwFreeproxylistsPlugin(BaseHTTPPlugin): name = "fpw_freeproxylists" display_name = "FreeProxyLists" description = "freeproxylists.net 系列页面(易被 403,多 URL 尝试)" def __init__(self): super().__init__() self.urls = [ "http://www.freeproxylists.net/", "http://freeproxylists.net/", "http://www.freeproxylists.net/en/http-txt.html", ] def _parse_any(self, html: str) -> List[ProxyRaw]: ipport = re.findall( r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})\b", html, ) if len(ipport) >= 5: out: List[ProxyRaw] = [] for ip, ps in ipport: if ps.isdigit() and 1 <= int(ps) <= 65535: try: out.append(ProxyRaw(ip, int(ps), "http")) except ValueError: pass return out soup = BeautifulSoup(html, "lxml") results: List[ProxyRaw] = [] for tr in soup.find_all("tr"): tds = tr.find_all("td") if len(tds) < 2: continue ip = tds[0].get_text(strip=True) port = tds[1].get_text(strip=True) if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip) and port.isdigit(): if 1 <= int(port) <= 65535: try: results.append(ProxyRaw(ip, int(port), "http")) except ValueError: pass return results async def crawl(self) -> List[ProxyRaw]: seen = set() out: List[ProxyRaw] = [] htmls = await self.fetch_all(self.urls, timeout=10, retries=1) for url, html in zip(self.urls, htmls): if not html: continue for p in self._parse_any(html): key = (p.ip, p.port, p.protocol) if key not in seen: seen.add(key) out.append(p) if out: logger.info(f"{self.display_name} 自 {url} 累计 {len(out)} 条") return out