"""premproxy.com list-page table parser (fpw_premproxy plugin)."""

import re
from typing import List, Optional, Set, Tuple

from bs4 import BeautifulSoup

from app.core.log import logger
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin

# Compiled once at import time; reused for every page parse.
_IP_RE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
_IPPORT_RE = re.compile(r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})\b")


def _is_valid_ip(ip: str) -> bool:
    """Return True when *ip* is a dotted quad with every octet <= 255.

    The original regex-only check accepted impossible addresses such as
    ``999.1.2.3``; the extra octet-range test rejects them.
    """
    if _IP_RE.match(ip) is None:
        return False
    return all(int(part) <= 255 for part in ip.split("."))


class FpwPremproxyPlugin(BaseHTTPPlugin):
    """Scrape proxy lists from premproxy.com (HTTP and SOCKS pages)."""

    name = "fpw_premproxy"
    display_name = "PremProxy"
    description = "premproxy.com HTTP/SOCKS 列表页"

    def __init__(self):
        super().__init__()
        # Both list pages are fetched by crawl(); order matters because
        # results are zipped back against these URLs.
        self.urls = [
            "https://premproxy.com/list/",
            "https://premproxy.com/socks-list/",
        ]

    def _parse_ipport_embedded(
        self,
        html: str,
        default_proto: str = "http",
        skip: Optional[Set[Tuple[str, int]]] = None,
    ) -> List[ProxyRaw]:
        """Fallback parser: scrape bare ``ip:port`` pairs from raw HTML.

        Used when the table parse yields too few rows (markup change,
        obfuscation). The raw text carries no protocol label, so entries
        are tagged with *default_proto* (previously hard-coded "http",
        which mislabelled fallback results from the SOCKS list page).

        Args:
            html: Raw page HTML.
            default_proto: Protocol label for every match.
            skip: Optional set of ``(ip, port)`` pairs to exclude, so the
                caller can avoid duplicating table-parsed entries.

        Returns:
            List of ProxyRaw entries for every valid, non-skipped match.
        """
        out: List[ProxyRaw] = []
        for ip, port_str in _IPPORT_RE.findall(html):
            # The regex guarantees digits, so isdigit() is unnecessary;
            # still enforce valid octets and the TCP port range.
            port = int(port_str)
            if not _is_valid_ip(ip) or not (1 <= port <= 65535):
                continue
            if skip is not None and (ip, port) in skip:
                continue
            try:
                out.append(ProxyRaw(ip, port, default_proto))
            except ValueError:
                # ProxyRaw may reject a value; drop the entry and move on.
                continue
        return out

    def _parse_html(self, html: str, default_proto: str = "http") -> List[ProxyRaw]:
        """Parse the list-page table rows into ProxyRaw entries.

        The protocol is inferred from keywords in the full row text; rows
        with no recognizable keyword fall back to *default_proto*. When
        fewer than 5 rows parse (likely a markup change), a raw ip:port
        scan of the HTML supplements the results without duplicating
        already-captured addresses.

        Args:
            html: Raw page HTML.
            default_proto: Protocol for rows/fallback entries that carry
                no explicit protocol label.

        Returns:
            List of ProxyRaw entries parsed from the page.
        """
        soup = BeautifulSoup(html, "lxml")
        results: List[ProxyRaw] = []
        seen: Set[Tuple[str, int]] = set()
        for tr in soup.find_all("tr"):
            tds = tr.find_all("td")
            if len(tds) < 2:
                continue
            ip = tds[0].get_text(strip=True)
            port_str = tds[1].get_text(strip=True)
            if not _is_valid_ip(ip):
                continue
            if not port_str.isdigit():
                continue
            port = int(port_str)
            if not (1 <= port <= 65535):
                continue
            # Whole-row text decides the protocol; check socks5 before
            # the generic "socks" match (which covers socks4 and bare
            # "socks" labels).
            row = tr.get_text(" ", strip=True).lower()
            if "socks5" in row:
                proto = "socks5"
            elif "socks" in row:
                proto = "socks4"
            elif "https" in row:
                proto = "https"
            else:
                proto = default_proto
            try:
                results.append(ProxyRaw(ip, port, proto))
                seen.add((ip, port))
            except ValueError:
                continue
        # Suspiciously few rows: the table markup probably changed, so
        # fall back to a raw scan, skipping addresses already captured.
        if len(results) < 5:
            results.extend(
                self._parse_ipport_embedded(html, default_proto, skip=seen)
            )
        return results

    async def crawl(self) -> List[ProxyRaw]:
        """Fetch every configured list page and merge the parsed proxies.

        Returns:
            Combined ProxyRaw entries from all pages; empty list when
            every fetch failed or parsed nothing.
        """
        merged: List[ProxyRaw] = []
        htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
        for url, html in zip(self.urls, htmls):
            if not html:
                continue
            # Unlabelled entries on the SOCKS list page default to socks4
            # (matching the table parser's generic "socks" handling);
            # everything else defaults to http.
            default_proto = "socks4" if "socks" in url else "http"
            batch = self._parse_html(html, default_proto)
            if batch:
                merged.extend(batch)
            logger.info(f"{self.display_name} {url}: {len(batch)} 条")
        if merged:
            logger.info(f"{self.display_name} 合计 {len(merged)} 条")
        return merged