From e582067316dd13670070a0e9264281812dcba9d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A5=80=E6=A2=A6?= <3501646051@qq.com> Date: Sun, 5 Apr 2026 14:16:03 +0800 Subject: [PATCH] fix(plugins): fpw parsers for JSON API, mirrors, and looser HTML - fpw_proxy_list_download: parse JSON list/proxies bodies; jsDelivr monosans tier; crawl timeout 300s - fpw_socks_ssl: try parse_html_table before regex - fpw_hidemy: loose row scan when fixed columns fail - fpw_proxynova: plain IP/port row fallback - fpw_spys_one: HTTPS endpoints; crawl timeout 180s - fpw_gatherproxy: HTTPS + extra JSON key patterns - fpw_checkerproxy: lower min HTML length for parse - fpw_premproxy: ip:port regex fallback when few table rows Made-with: Cursor --- app/plugins/fpw_checkerproxy.py | 2 +- app/plugins/fpw_gatherproxy.py | 15 ++++- app/plugins/fpw_hidemy.py | 43 ++++++++++++ app/plugins/fpw_premproxy.py | 17 +++++ app/plugins/fpw_proxy_list_download.py | 90 +++++++++++++++++++++++--- app/plugins/fpw_proxynova.py | 30 +++++++++ app/plugins/fpw_socks_ssl_proxy.py | 8 ++- app/plugins/fpw_spys_one.py | 5 +- 8 files changed, 193 insertions(+), 17 deletions(-) diff --git a/app/plugins/fpw_checkerproxy.py b/app/plugins/fpw_checkerproxy.py index bdd81ea..aad4b0c 100644 --- a/app/plugins/fpw_checkerproxy.py +++ b/app/plugins/fpw_checkerproxy.py @@ -49,7 +49,7 @@ class FpwCheckerproxyPlugin(BaseHTTPPlugin): seen: Set[Tuple[str, int, str]] = set() htmls = await self.fetch_all(self.urls, timeout=25, retries=2) for html in htmls: - if not html or len(html) < 200: + if not html or len(html) < 80: continue for p in self._parse_ip_ports(html): k = (p.ip, p.port, p.protocol) diff --git a/app/plugins/fpw_gatherproxy.py b/app/plugins/fpw_gatherproxy.py index 3e68770..2e93bfb 100644 --- a/app/plugins/fpw_gatherproxy.py +++ b/app/plugins/fpw_gatherproxy.py @@ -15,8 +15,8 @@ class FpwGatherproxyPlugin(BaseHTTPPlugin): def __init__(self): super().__init__() self.urls = [ - "http://www.gatherproxy.com/proxylist/anonymity/?t=Elite", - "http://www.gatherproxy.com/proxylist/country/?c=United%20States", + "https://www.gatherproxy.com/proxylist/anonymity/?t=Elite", + "https://www.gatherproxy.com/proxylist/country/?c=United%20States", ] def _extract_from_text(self, text: str) -> List[ProxyRaw]: @@ -42,6 +42,17 @@ class FpwGatherproxyPlugin(BaseHTTPPlugin): results.append(ProxyRaw(ip, int(port), "http")) except ValueError: continue + for m in re.finditer( + r'"(?:proxy_)?ip"\s*:\s*"([\d.]+)"\s*,\s*"(?:proxy_)?port"\s*:\s*"?(\d+)"?', + text, + re.I, + ): + ip, port = m.group(1), m.group(2) + if port.isdigit() and 1 <= int(port) <= 65535: + try: + results.append(ProxyRaw(ip, int(port), "http")) + except ValueError: + continue return results async def crawl(self) -> List[ProxyRaw]: diff --git a/app/plugins/fpw_hidemy.py b/app/plugins/fpw_hidemy.py index 86aae01..d1a501c 100644 --- a/app/plugins/fpw_hidemy.py +++ b/app/plugins/fpw_hidemy.py @@ -1,4 +1,5 @@ """hidemyna.me 免费代理列表表格。""" +import re from typing import List from app.core.plugin_system import ProxyRaw @@ -19,6 +20,46 @@ class FpwHidemyPlugin(BaseHTTPPlugin): "https://hidemyna.me/en/proxy-list/?type=socks4", ] + def _parse_rows_loose(self, html: str) -> List[ProxyRaw]: + from bs4 import BeautifulSoup + + out: List[ProxyRaw] = [] + soup = BeautifulSoup(html, "lxml") + for tr in soup.find_all("tr"): + tds = tr.find_all("td") + if len(tds) < 2: + continue + row = " ".join(td.get_text(" ", strip=True) for td in tds) + ip_m = re.search( + r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b", + row, + ) + if 
not ip_m: + continue + ip = ip_m.group(1) + port_val = None + for td in tds: + t = td.get_text(strip=True) + if t.isdigit() and 1 <= int(t) <= 65535: + port_val = int(t) + break + if port_val is None: + continue + u = row.upper() + if "SOCKS5" in u: + proto = "socks5" + elif "SOCKS4" in u or "SOCKS" in u: + proto = "socks4" + elif "HTTPS" in u: + proto = "https" + else: + proto = "http" + try: + out.append(ProxyRaw(ip, port_val, proto)) + except ValueError: + continue + return out + async def crawl(self) -> List[ProxyRaw]: results: List[ProxyRaw] = [] htmls = await self.fetch_all(self.urls, timeout=25, retries=2) @@ -30,6 +71,8 @@ class FpwHidemyPlugin(BaseHTTPPlugin): column_map={"ip": 0, "port": 1, "protocol": 4}, protocol="http", ) + if not batch: + batch = self._parse_rows_loose(html) if batch: results.extend(batch) logger.info(f"{self.display_name} {url}: {len(batch)} 条") diff --git a/app/plugins/fpw_premproxy.py b/app/plugins/fpw_premproxy.py index 9660f8c..651029c 100644 --- a/app/plugins/fpw_premproxy.py +++ b/app/plugins/fpw_premproxy.py @@ -21,6 +21,21 @@ class FpwPremproxyPlugin(BaseHTTPPlugin): "https://premproxy.com/socks-list/", ] + def _parse_ipport_embedded(self, html: str) -> List[ProxyRaw]: + found = re.findall( + r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})\b", + html, + ) + out: List[ProxyRaw] = [] + for ip, ps in found: + if not ps.isdigit() or not (1 <= int(ps) <= 65535): + continue + try: + out.append(ProxyRaw(ip, int(ps), "http")) + except ValueError: + continue + return out + def _parse_html(self, html: str) -> List[ProxyRaw]: soup = BeautifulSoup(html, "lxml") results: List[ProxyRaw] = [] @@ -47,6 +62,8 @@ class FpwPremproxyPlugin(BaseHTTPPlugin): results.append(ProxyRaw(ip, int(port), proto)) except ValueError: continue + if len(results) < 5: + results.extend(self._parse_ipport_embedded(html)) return results async def crawl(self) -> List[ProxyRaw]: diff --git a/app/plugins/fpw_proxy_list_download.py b/app/plugins/fpw_proxy_list_download.py index 7b83871..e4e295e 100644 --- a/app/plugins/fpw_proxy_list_download.py +++ b/app/plugins/fpw_proxy_list_download.py @@ -1,5 +1,6 @@ -"""www.proxy-list.download 公开 API(README: Free_Proxy_Website)。""" -from typing import List +"""www.proxy-list.download 公开 API(常见为 JSON,内含 list 字段的 ip:port 文本)。""" +import json +from typing import Any, List from app.core.plugin_system import ProxyRaw from app.plugins.base import BaseHTTPPlugin @@ -10,6 +11,7 @@ class FpwProxyListDownloadPlugin(BaseHTTPPlugin): name = "fpw_proxy_list_download" display_name = "Proxy-List.download" description = "proxy-list.download 官方 API(http/https/socks4/socks5)" + crawl_timeout_seconds = 300.0 def __init__(self): super().__init__() @@ -20,6 +22,9 @@ class FpwProxyListDownloadPlugin(BaseHTTPPlugin): ("socks4", "https://www.proxy-list.download/api/v1/get?type=socks4"), ("socks5", "https://www.proxy-list.download/api/v1/get?type=socks5"), ] + self._mirror_prefix = ( + "https://cdn.jsdelivr.net/gh/monosans/proxy-list@main/proxies/" + ) self.fallback_pairs = [ ("http", "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all"), ("https", "https://api.proxyscrape.com/v2/?request=get&protocol=https&timeout=10000&country=all&ssl=all&anonymity=all"), @@ -27,28 +32,93 @@ class FpwProxyListDownloadPlugin(BaseHTTPPlugin): ("socks5", "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all&ssl=all&anonymity=all"), ] + def _items_to_proxies(self, items: List[Any], protocol: str) -> 
List[ProxyRaw]: + out: List[ProxyRaw] = [] + for it in items: + if isinstance(it, dict): + ip = str( + it.get("ip") + or it.get("IP") + or it.get("host") + or it.get("Host") + or "" + ).strip() + port = it.get("port") or it.get("Port") + if not ip or port is None: + continue + ps = str(port).strip() + if not ps.isdigit() or not (1 <= int(ps) <= 65535): + continue + try: + out.append(ProxyRaw(ip, int(ps), protocol)) + except ValueError: + continue + elif isinstance(it, str) and ":" in it: + out.extend(self.parse_text_proxies(it, protocol)) + return out + + def _parse_api_body(self, text: str, protocol: str) -> List[ProxyRaw]: + text = (text or "").strip() + if not text: + return [] + if text[0] in "{[": + try: + data = json.loads(text) + except json.JSONDecodeError: + return self.parse_text_proxies(text, protocol) + if isinstance(data, list): + return self._items_to_proxies(data, protocol) + if isinstance(data, dict): + for key in ("list", "LIST", "data", "Data", "proxies", "Proxies"): + raw = data.get(key) + if isinstance(raw, str) and raw.strip(): + return self.parse_text_proxies(raw, protocol) + if isinstance(raw, list): + return self._items_to_proxies(raw, protocol) + return [] + return [] + return self.parse_text_proxies(text, protocol) + async def crawl(self) -> List[ProxyRaw]: results: List[ProxyRaw] = [] + t_req, n_try = 18.0, 1 urls = [u for _, u in self.api_pairs] - htmls = await self.fetch_all(urls, timeout=25, retries=2) + htmls = await self.fetch_all(urls, timeout=t_req, retries=n_try) for (protocol, _), text in zip(self.api_pairs, htmls): if not text: continue - batch = self.parse_text_proxies(text, protocol) + batch = self._parse_api_body(text, protocol) if batch: results.extend(batch) - logger.info(f"{self.display_name} {protocol}: {len(batch)} 条") + logger.info(f"{self.display_name} API {protocol}: {len(batch)} 条") if not results: - logger.warning(f"{self.display_name} 主 API 无数据,尝试 ProxyScrape 备用") - fb_urls = [u for _, u in self.fallback_pairs] - fb_htmls = await self.fetch_all(fb_urls, timeout=25, retries=2) - for (protocol, _), text in zip(self.fallback_pairs, fb_htmls): + logger.warning(f"{self.display_name} 主 API 无数据,尝试 jsDelivr 文本镜像") + mirror_pairs = [ + (p, f"{self._mirror_prefix}{p}.txt") + for p in ("http", "https", "socks4", "socks5") + ] + m_urls = [u for _, u in mirror_pairs] + m_htmls = await self.fetch_all(m_urls, timeout=22.0, retries=2) + for (protocol, _), text in zip(mirror_pairs, m_htmls): if not text: continue batch = self.parse_text_proxies(text, protocol) if batch: results.extend(batch) - logger.info(f"{self.display_name} fallback {protocol}: {len(batch)} 条") + logger.info(f"{self.display_name} 镜像 {protocol}: {len(batch)} 条") + if not results: + logger.warning(f"{self.display_name} 镜像无数据,尝试 ProxyScrape API") + fb_urls = [u for _, u in self.fallback_pairs] + fb_htmls = await self.fetch_all(fb_urls, timeout=20.0, retries=1) + for (protocol, _), text in zip(self.fallback_pairs, fb_htmls): + if not text: + continue + batch = self._parse_api_body(text, protocol) + if not batch: + batch = self.parse_text_proxies(text, protocol) + if batch: + results.extend(batch) + logger.info(f"{self.display_name} ProxyScrape {protocol}: {len(batch)} 条") if results: logger.info(f"{self.display_name} 合计 {len(results)} 条") return results diff --git a/app/plugins/fpw_proxynova.py b/app/plugins/fpw_proxynova.py index ae493e5..e12abf5 100644 --- a/app/plugins/fpw_proxynova.py +++ b/app/plugins/fpw_proxynova.py @@ -64,11 +64,41 @@ class FpwProxynovaPlugin(BaseHTTPPlugin): 
continue
         return out
 
+    def _parse_plain_ip_port_rows(self, html: str) -> List[ProxyRaw]:
+        soup = BeautifulSoup(html, "lxml")
+        out: List[ProxyRaw] = []
+        for tr in soup.find_all("tr"):
+            tds = tr.find_all("td")
+            if len(tds) < 2:
+                continue
+            ip = tds[0].get_text(strip=True)
+            port_txt = tds[1].get_text(strip=True)
+            if not re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip):
+                continue
+            if not port_txt.isdigit() or not (1 <= int(port_txt) <= 65535):
+                continue
+            row_text = tr.get_text(" ", strip=True).upper()
+            if "SOCKS5" in row_text:
+                proto = "socks5"
+            elif "SOCKS4" in row_text:
+                proto = "socks4"
+            elif "HTTPS" in row_text:
+                proto = "https"
+            else:
+                proto = "http"
+            try:
+                out.append(ProxyRaw(ip, int(port_txt), proto))
+            except ValueError:
+                continue
+        return out
+
     async def crawl(self) -> List[ProxyRaw]:
         html = await self.fetch(self.urls[0], timeout=25, retries=2)
         if not html:
             return []
         results = self._parse_rows(html)
+        if not results:
+            results = self._parse_plain_ip_port_rows(html)
         if results:
             logger.info(f"{self.display_name} 解析 {len(results)} 条")
         return results
diff --git a/app/plugins/fpw_socks_ssl_proxy.py b/app/plugins/fpw_socks_ssl_proxy.py
index dffdd58..cb79386 100644
--- a/app/plugins/fpw_socks_ssl_proxy.py
+++ b/app/plugins/fpw_socks_ssl_proxy.py
@@ -26,7 +26,7 @@ class FpwSocksSslProxyPlugin(BaseHTTPPlugin):
     def _parse_page(self, html: str, default_protocol: str) -> List[ProxyRaw]:
         results = []
         pattern = re.compile(
-            r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>\s*<td[^>]*>\s*(\d+)",
+            r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>\s*<td[^>]*>\s*(\d{1,5})",
             re.I,
         )
         for ip, port in pattern.findall(html):
@@ -47,7 +47,11 @@ class FpwSocksSslProxyPlugin(BaseHTTPPlugin):
             proto = "socks4"
         else:
             proto = "http"
-        batch = self._parse_page(html, proto)
+        batch = self.parse_html_table(
+            html, column_map={"ip": 0, "port": 1}, protocol=proto
+        )
+        if not batch:
+            batch = self._parse_page(html, proto)
         results.extend(batch)
         if batch:
             logger.info(f"{self.display_name} {url}: {len(batch)} 条")
diff --git a/app/plugins/fpw_spys_one.py b/app/plugins/fpw_spys_one.py
index 29ee73f..994f439 100644
--- a/app/plugins/fpw_spys_one.py
+++ b/app/plugins/fpw_spys_one.py
@@ -12,12 +12,13 @@ class FpwSpysOnePlugin(BaseHTTPPlugin):
     name = "fpw_spys_one"
     display_name = "Spys.one"
     description = "spys.one HTTP/SOCKS 列表(POST 筛选 + XOR 端口解码)"
+    crawl_timeout_seconds = 180.0
 
     def __init__(self):
         super().__init__()
         self.pages: List[Tuple[str, str, str]] = [
-            ("http", "http://spys.one/en/http-proxy-list/", "1"),
-            ("socks5", "http://spys.one/en/socks-proxy-list/", "2"),
+            ("http", "https://spys.one/en/http-proxy-list/", "1"),
+            ("socks5", "https://spys.one/en/socks-proxy-list/", "2"),
         ]
 
     @staticmethod
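
Note for reviewers, appended after the diff: the proxy-list.download change
keys its fallback order on body shape (a JSON object or array first, then
bare ip:port text). Below is a minimal standalone sketch of that ordering
for quick local verification. parse_body and parse_text are hypothetical
stand-ins for the plugin's _parse_api_body and parse_text_proxies, and they
return plain (ip, port) tuples instead of ProxyRaw.

# sketch.py (illustrative only, not part of the patch)
import json
import re
from typing import List, Tuple

IP_PORT = re.compile(r"\b(\d{1,3}(?:\.\d{1,3}){3}):(\d{1,5})\b")

def parse_text(text: str) -> List[Tuple[str, int]]:
    # Plain "ip:port" bodies (the main API and the jsDelivr mirror).
    return [
        (ip, int(port))
        for ip, port in IP_PORT.findall(text)
        if 1 <= int(port) <= 65535
    ]

def parse_body(text: str) -> List[Tuple[str, int]]:
    text = (text or "").strip()
    if not text:
        return []
    if text[0] in "{[":  # looks like JSON: try a structured parse first
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            return parse_text(text)  # mislabeled JSON: regex scan instead
        if isinstance(data, list):
            # e.g. ["1.2.3.4:8080", ...]
            return parse_text(" ".join(str(x) for x in data))
        if isinstance(data, dict):
            # some deployments wrap the payload in a "list"-like field
            for key in ("list", "data", "proxies"):
                raw = data.get(key)
                if isinstance(raw, str):
                    return parse_text(raw)
        return []
    return parse_text(text)

assert parse_body("1.2.3.4:8080\n5.6.7.8:3128") == [("1.2.3.4", 8080), ("5.6.7.8", 3128)]
assert parse_body('{"list": "1.2.3.4:8080"}') == [("1.2.3.4", 8080)]
assert parse_body("not json at all") == []

Under these assumptions a wrapped JSON body and a bare two-line text body
decode to the same tuples, which is what the patch relies on when the main
API, the jsDelivr mirror, and the ProxyScrape fallback disagree on response
shape.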