fix(plugins): fpw parsers for JSON API, mirrors, and looser HTML
- fpw_proxy_list_download: parse JSON list/proxies bodies; jsDelivr monosans tier; crawl timeout 300s
- fpw_socks_ssl: try parse_html_table before regex
- fpw_hidemy: loose row scan when fixed columns fail
- fpw_proxynova: plain IP/port row fallback
- fpw_spys_one: HTTPS endpoints; crawl timeout 180s
- fpw_gatherproxy: HTTPS + extra JSON key patterns
- fpw_checkerproxy: lower min HTML length for parse
- fpw_premproxy: ip:port regex fallback when few table rows

Made-with: Cursor
This commit is contained in:
@@ -49,7 +49,7 @@ class FpwCheckerproxyPlugin(BaseHTTPPlugin):
|
|||||||
seen: Set[Tuple[str, int, str]] = set()
|
seen: Set[Tuple[str, int, str]] = set()
|
||||||
htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
|
htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
|
||||||
for html in htmls:
|
for html in htmls:
|
||||||
if not html or len(html) < 200:
|
if not html or len(html) < 80:
|
||||||
continue
|
continue
|
||||||
for p in self._parse_ip_ports(html):
|
for p in self._parse_ip_ports(html):
|
||||||
k = (p.ip, p.port, p.protocol)
|
k = (p.ip, p.port, p.protocol)
|
||||||
|
|||||||
@@ -15,8 +15,8 @@ class FpwGatherproxyPlugin(BaseHTTPPlugin):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.urls = [
|
self.urls = [
|
||||||
"http://www.gatherproxy.com/proxylist/anonymity/?t=Elite",
|
"https://www.gatherproxy.com/proxylist/anonymity/?t=Elite",
|
||||||
"http://www.gatherproxy.com/proxylist/country/?c=United%20States",
|
"https://www.gatherproxy.com/proxylist/country/?c=United%20States",
|
||||||
]
|
]
|
||||||
|
|
||||||
def _extract_from_text(self, text: str) -> List[ProxyRaw]:
|
def _extract_from_text(self, text: str) -> List[ProxyRaw]:
|
||||||
@@ -42,6 +42,17 @@ class FpwGatherproxyPlugin(BaseHTTPPlugin):
|
|||||||
results.append(ProxyRaw(ip, int(port), "http"))
|
results.append(ProxyRaw(ip, int(port), "http"))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
continue
|
continue
|
||||||
|
for m in re.finditer(
|
||||||
|
r'"(?:proxy_)?ip"\s*:\s*"([\d.]+)"\s*,\s*"(?:proxy_)?port"\s*:\s*"?(\d+)"?',
|
||||||
|
text,
|
||||||
|
re.I,
|
||||||
|
):
|
||||||
|
ip, port = m.group(1), m.group(2)
|
||||||
|
if port.isdigit() and 1 <= int(port) <= 65535:
|
||||||
|
try:
|
||||||
|
results.append(ProxyRaw(ip, int(port), "http"))
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
return results
|
return results
|
||||||
|
|
||||||
async def crawl(self) -> List[ProxyRaw]:
|
async def crawl(self) -> List[ProxyRaw]:
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
"""hidemyna.me 免费代理列表表格。"""
|
"""hidemyna.me 免费代理列表表格。"""
|
||||||
|
import re
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from app.core.plugin_system import ProxyRaw
|
from app.core.plugin_system import ProxyRaw
|
||||||
@@ -19,6 +20,46 @@ class FpwHidemyPlugin(BaseHTTPPlugin):
|
|||||||
"https://hidemyna.me/en/proxy-list/?type=socks4",
|
"https://hidemyna.me/en/proxy-list/?type=socks4",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def _parse_rows_loose(self, html: str) -> List[ProxyRaw]:
    """Scan every table row for an IPv4 address and a port, ignoring column order.

    ``crawl`` uses this as a fallback when the fixed-column table parse
    returns no rows.

    Args:
        html: Raw HTML of a proxy-list page.

    Returns:
        One ``ProxyRaw`` per ``<tr>`` that contains both a dotted-quad
        address and a cell holding only a port number in 1..65535. Rows that
        ``ProxyRaw`` rejects with ``ValueError`` are skipped.
    """
    from bs4 import BeautifulSoup

    # Compiled once instead of once per row.
    ip_pattern = re.compile(r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b")

    out: List[ProxyRaw] = []
    soup = BeautifulSoup(html, "lxml")
    for tr in soup.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) < 2:
            continue

        row = " ".join(td.get_text(" ", strip=True) for td in tds)
        ip_m = ip_pattern.search(row)
        if not ip_m:
            continue
        ip = ip_m.group(1)

        # The port is the first cell whose whole text is a valid port number.
        port_val = None
        for td in tds:
            t = td.get_text(strip=True)
            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as superscript digits that int() rejects, and
            # the resulting ValueError would abort the whole parse.
            if t.isdecimal() and 1 <= int(t) <= 65535:
                port_val = int(t)
                break
        if port_val is None:
            continue

        # Protocol is guessed from keywords anywhere in the row text.
        u = row.upper()
        if "SOCKS5" in u:
            proto = "socks5"
        elif "SOCKS" in u:
            # Covers "SOCKS4" and a bare "SOCKS" label alike.
            proto = "socks4"
        elif "HTTPS" in u:
            proto = "https"
        else:
            proto = "http"

        try:
            out.append(ProxyRaw(ip, port_val, proto))
        except ValueError:
            continue
    return out
|
||||||
|
|
||||||
async def crawl(self) -> List[ProxyRaw]:
|
async def crawl(self) -> List[ProxyRaw]:
|
||||||
results: List[ProxyRaw] = []
|
results: List[ProxyRaw] = []
|
||||||
htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
|
htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
|
||||||
@@ -30,6 +71,8 @@ class FpwHidemyPlugin(BaseHTTPPlugin):
|
|||||||
column_map={"ip": 0, "port": 1, "protocol": 4},
|
column_map={"ip": 0, "port": 1, "protocol": 4},
|
||||||
protocol="http",
|
protocol="http",
|
||||||
)
|
)
|
||||||
|
if not batch:
|
||||||
|
batch = self._parse_rows_loose(html)
|
||||||
if batch:
|
if batch:
|
||||||
results.extend(batch)
|
results.extend(batch)
|
||||||
logger.info(f"{self.display_name} {url}: {len(batch)} 条")
|
logger.info(f"{self.display_name} {url}: {len(batch)} 条")
|
||||||
|
|||||||
@@ -21,6 +21,21 @@ class FpwPremproxyPlugin(BaseHTTPPlugin):
|
|||||||
"https://premproxy.com/socks-list/",
|
"https://premproxy.com/socks-list/",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def _parse_ipport_embedded(self, html: str) -> List[ProxyRaw]:
    """Collect every ``ip:port`` literal embedded anywhere in the page text.

    ``_parse_html`` appends these results when the table parse produced
    fewer than five entries.

    Args:
        html: Raw page source to scan.

    Returns:
        A ``ProxyRaw`` with protocol ``"http"`` for each match whose port
        lies in 1..65535, in document order. Duplicates are not removed, and
        entries that ``ProxyRaw`` rejects with ``ValueError`` are dropped.
    """
    proxies: List[ProxyRaw] = []
    pattern = r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})\b"
    for match in re.finditer(pattern, html):
        host, port_text = match.groups()
        if port_text.isdigit() and 1 <= int(port_text) <= 65535:
            try:
                proxies.append(ProxyRaw(host, int(port_text), "http"))
            except ValueError:
                # Rejected by ProxyRaw; move on to the next match.
                pass
    return proxies
|
||||||
|
|
||||||
def _parse_html(self, html: str) -> List[ProxyRaw]:
|
def _parse_html(self, html: str) -> List[ProxyRaw]:
|
||||||
soup = BeautifulSoup(html, "lxml")
|
soup = BeautifulSoup(html, "lxml")
|
||||||
results: List[ProxyRaw] = []
|
results: List[ProxyRaw] = []
|
||||||
@@ -47,6 +62,8 @@ class FpwPremproxyPlugin(BaseHTTPPlugin):
|
|||||||
results.append(ProxyRaw(ip, int(port), proto))
|
results.append(ProxyRaw(ip, int(port), proto))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
continue
|
continue
|
||||||
|
if len(results) < 5:
|
||||||
|
results.extend(self._parse_ipport_embedded(html))
|
||||||
return results
|
return results
|
||||||
|
|
||||||
async def crawl(self) -> List[ProxyRaw]:
|
async def crawl(self) -> List[ProxyRaw]:
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
"""www.proxy-list.download 公开 API(README: Free_Proxy_Website)。"""
|
"""www.proxy-list.download 公开 API(常见为 JSON,内含 list 字段的 ip:port 文本)。"""
|
||||||
from typing import List
|
import json
|
||||||
|
from typing import Any, List
|
||||||
|
|
||||||
from app.core.plugin_system import ProxyRaw
|
from app.core.plugin_system import ProxyRaw
|
||||||
from app.plugins.base import BaseHTTPPlugin
|
from app.plugins.base import BaseHTTPPlugin
|
||||||
@@ -10,6 +11,7 @@ class FpwProxyListDownloadPlugin(BaseHTTPPlugin):
|
|||||||
name = "fpw_proxy_list_download"
|
name = "fpw_proxy_list_download"
|
||||||
display_name = "Proxy-List.download"
|
display_name = "Proxy-List.download"
|
||||||
description = "proxy-list.download 官方 API(http/https/socks4/socks5)"
|
description = "proxy-list.download 官方 API(http/https/socks4/socks5)"
|
||||||
|
crawl_timeout_seconds = 300.0
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@@ -20,6 +22,9 @@ class FpwProxyListDownloadPlugin(BaseHTTPPlugin):
|
|||||||
("socks4", "https://www.proxy-list.download/api/v1/get?type=socks4"),
|
("socks4", "https://www.proxy-list.download/api/v1/get?type=socks4"),
|
||||||
("socks5", "https://www.proxy-list.download/api/v1/get?type=socks5"),
|
("socks5", "https://www.proxy-list.download/api/v1/get?type=socks5"),
|
||||||
]
|
]
|
||||||
|
self._mirror_prefix = (
|
||||||
|
"https://cdn.jsdelivr.net/gh/monosans/proxy-list@main/proxies/"
|
||||||
|
)
|
||||||
self.fallback_pairs = [
|
self.fallback_pairs = [
|
||||||
("http", "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all"),
|
("http", "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all"),
|
||||||
("https", "https://api.proxyscrape.com/v2/?request=get&protocol=https&timeout=10000&country=all&ssl=all&anonymity=all"),
|
("https", "https://api.proxyscrape.com/v2/?request=get&protocol=https&timeout=10000&country=all&ssl=all&anonymity=all"),
|
||||||
@@ -27,28 +32,93 @@ class FpwProxyListDownloadPlugin(BaseHTTPPlugin):
|
|||||||
("socks5", "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all&ssl=all&anonymity=all"),
|
("socks5", "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all&ssl=all&anonymity=all"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def _items_to_proxies(self, items: List[Any], protocol: str) -> List[ProxyRaw]:
    """Convert a decoded JSON array into ``ProxyRaw`` entries.

    Two element shapes are understood:

    * ``dict`` -- address under ``ip``/``IP``/``host``/``Host`` and port
      under ``port``/``Port``;
    * ``str`` containing ``":"`` -- handed to ``self.parse_text_proxies``
      (defined outside this block).

    Elements of any other shape are ignored.

    Args:
        items: Elements of the decoded JSON list.
        protocol: Protocol label attached to every produced entry.

    Returns:
        The entries that could be built. Items with a missing address, a
        missing or out-of-range port, or values that ``ProxyRaw`` rejects
        with ``ValueError`` are skipped.
    """
    out: List[ProxyRaw] = []
    for it in items:
        if isinstance(it, dict):
            ip = str(
                it.get("ip")
                or it.get("IP")
                or it.get("host")
                or it.get("Host")
                or ""
            ).strip()
            port = it.get("port") or it.get("Port")
            if not ip or port is None:
                continue

            ps = str(port).strip()
            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as superscript digits that int() rejects, and
            # the resulting ValueError would escape and abort the conversion.
            if not ps.isdecimal() or not (1 <= int(ps) <= 65535):
                continue
            try:
                out.append(ProxyRaw(ip, int(ps), protocol))
            except ValueError:
                continue
        elif isinstance(it, str) and ":" in it:
            out.extend(self.parse_text_proxies(it, protocol))
    return out
|
||||||
|
|
||||||
|
def _parse_api_body(self, text: str, protocol: str) -> List[ProxyRaw]:
    """Parse an API response that may be plain text or JSON.

    * Text that does not start with ``{`` or ``[``, or that fails to decode
      as JSON, is passed to ``self.parse_text_proxies`` (defined outside
      this block).
    * A JSON array is passed to ``self._items_to_proxies``.
    * A JSON object is probed for the container keys ``list``, ``LIST``,
      ``data``, ``Data``, ``proxies``, ``Proxies`` in that order. The value
      may be a text blob or an array.

    Args:
        text: Raw response body; ``None`` or blank yields an empty list.
        protocol: Protocol label attached to every produced entry.

    Returns:
        The parsed proxies, or an empty list when nothing usable is found.
    """
    text = (text or "").strip()
    if not text:
        return []

    if text[0] not in "{[":
        return self.parse_text_proxies(text, protocol)

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Looked like JSON but is not; treat it as plain text.
        return self.parse_text_proxies(text, protocol)

    if isinstance(data, list):
        return self._items_to_proxies(data, protocol)

    if isinstance(data, dict):
        for key in ("list", "LIST", "data", "Data", "proxies", "Proxies"):
            raw = data.get(key)
            if isinstance(raw, str) and raw.strip():
                batch = self.parse_text_proxies(raw, protocol)
            elif isinstance(raw, list):
                batch = self._items_to_proxies(raw, protocol)
            else:
                continue
            # Keep probing when a key is present but yields nothing, so an
            # empty "list" does not hide a populated "proxies" field.
            if batch:
                return batch

    return []
|
||||||
|
|
||||||
async def crawl(self) -> List[ProxyRaw]:
|
async def crawl(self) -> List[ProxyRaw]:
|
||||||
results: List[ProxyRaw] = []
|
results: List[ProxyRaw] = []
|
||||||
|
t_req, n_try = 18.0, 1
|
||||||
urls = [u for _, u in self.api_pairs]
|
urls = [u for _, u in self.api_pairs]
|
||||||
htmls = await self.fetch_all(urls, timeout=25, retries=2)
|
htmls = await self.fetch_all(urls, timeout=t_req, retries=n_try)
|
||||||
for (protocol, _), text in zip(self.api_pairs, htmls):
|
for (protocol, _), text in zip(self.api_pairs, htmls):
|
||||||
if not text:
|
if not text:
|
||||||
continue
|
continue
|
||||||
batch = self.parse_text_proxies(text, protocol)
|
batch = self._parse_api_body(text, protocol)
|
||||||
if batch:
|
if batch:
|
||||||
results.extend(batch)
|
results.extend(batch)
|
||||||
logger.info(f"{self.display_name} {protocol}: {len(batch)} 条")
|
logger.info(f"{self.display_name} API {protocol}: {len(batch)} 条")
|
||||||
if not results:
|
if not results:
|
||||||
logger.warning(f"{self.display_name} 主 API 无数据,尝试 ProxyScrape 备用")
|
logger.warning(f"{self.display_name} 主 API 无数据,尝试 jsDelivr 文本镜像")
|
||||||
fb_urls = [u for _, u in self.fallback_pairs]
|
mirror_pairs = [
|
||||||
fb_htmls = await self.fetch_all(fb_urls, timeout=25, retries=2)
|
(p, f"{self._mirror_prefix}{p}.txt")
|
||||||
for (protocol, _), text in zip(self.fallback_pairs, fb_htmls):
|
for p in ("http", "https", "socks4", "socks5")
|
||||||
|
]
|
||||||
|
m_urls = [u for _, u in mirror_pairs]
|
||||||
|
m_htmls = await self.fetch_all(m_urls, timeout=22.0, retries=2)
|
||||||
|
for (protocol, _), text in zip(mirror_pairs, m_htmls):
|
||||||
if not text:
|
if not text:
|
||||||
continue
|
continue
|
||||||
batch = self.parse_text_proxies(text, protocol)
|
batch = self.parse_text_proxies(text, protocol)
|
||||||
if batch:
|
if batch:
|
||||||
results.extend(batch)
|
results.extend(batch)
|
||||||
logger.info(f"{self.display_name} fallback {protocol}: {len(batch)} 条")
|
logger.info(f"{self.display_name} 镜像 {protocol}: {len(batch)} 条")
|
||||||
|
if not results:
|
||||||
|
logger.warning(f"{self.display_name} 镜像无数据,尝试 ProxyScrape API")
|
||||||
|
fb_urls = [u for _, u in self.fallback_pairs]
|
||||||
|
fb_htmls = await self.fetch_all(fb_urls, timeout=20.0, retries=1)
|
||||||
|
for (protocol, _), text in zip(self.fallback_pairs, fb_htmls):
|
||||||
|
if not text:
|
||||||
|
continue
|
||||||
|
batch = self._parse_api_body(text, protocol)
|
||||||
|
if not batch:
|
||||||
|
batch = self.parse_text_proxies(text, protocol)
|
||||||
|
if batch:
|
||||||
|
results.extend(batch)
|
||||||
|
logger.info(f"{self.display_name} ProxyScrape {protocol}: {len(batch)} 条")
|
||||||
if results:
|
if results:
|
||||||
logger.info(f"{self.display_name} 合计 {len(results)} 条")
|
logger.info(f"{self.display_name} 合计 {len(results)} 条")
|
||||||
return results
|
return results
|
||||||
|
|||||||
@@ -64,11 +64,41 @@ class FpwProxynovaPlugin(BaseHTTPPlugin):
|
|||||||
continue
|
continue
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
def _parse_plain_ip_port_rows(self, html: str) -> List[ProxyRaw]:
    """Parse table rows whose first two cells are a plain IP and a plain port.

    ``crawl`` uses this as a fallback when ``_parse_rows`` returns nothing.

    Args:
        html: Raw HTML of the proxy-list page.

    Returns:
        One ``ProxyRaw`` per ``<tr>`` whose first cell is a dotted-quad
        address and whose second cell is a port in 1..65535. The protocol is
        guessed from keywords in the row text, defaulting to ``"http"``.
        Rows that ``ProxyRaw`` rejects with ``ValueError`` are skipped.
    """
    # Compiled once instead of once per row.
    ip_pattern = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

    soup = BeautifulSoup(html, "lxml")
    out: List[ProxyRaw] = []
    for tr in soup.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) < 2:
            continue

        ip = tds[0].get_text(strip=True)
        port_txt = tds[1].get_text(strip=True)
        if not ip_pattern.match(ip):
            continue
        # isdecimal() rather than isdigit(): isdigit() also accepts
        # characters such as superscript digits that int() rejects, and the
        # resulting ValueError would escape and abort the whole parse.
        if not port_txt.isdecimal() or not (1 <= int(port_txt) <= 65535):
            continue

        row_text = tr.get_text(" ", strip=True).upper()
        if "SOCKS5" in row_text:
            proto = "socks5"
        elif "SOCKS4" in row_text:
            proto = "socks4"
        elif "HTTPS" in row_text:
            proto = "https"
        else:
            proto = "http"

        try:
            out.append(ProxyRaw(ip, int(port_txt), proto))
        except ValueError:
            continue
    return out
|
||||||
|
|
||||||
async def crawl(self) -> List[ProxyRaw]:
|
async def crawl(self) -> List[ProxyRaw]:
|
||||||
html = await self.fetch(self.urls[0], timeout=25, retries=2)
|
html = await self.fetch(self.urls[0], timeout=25, retries=2)
|
||||||
if not html:
|
if not html:
|
||||||
return []
|
return []
|
||||||
results = self._parse_rows(html)
|
results = self._parse_rows(html)
|
||||||
|
if not results:
|
||||||
|
results = self._parse_plain_ip_port_rows(html)
|
||||||
if results:
|
if results:
|
||||||
logger.info(f"{self.display_name} 解析 {len(results)} 条")
|
logger.info(f"{self.display_name} 解析 {len(results)} 条")
|
||||||
return results
|
return results
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ class FpwSocksSslProxyPlugin(BaseHTTPPlugin):
|
|||||||
def _parse_page(self, html: str, default_protocol: str) -> List[ProxyRaw]:
|
def _parse_page(self, html: str, default_protocol: str) -> List[ProxyRaw]:
|
||||||
results = []
|
results = []
|
||||||
pattern = re.compile(
|
pattern = re.compile(
|
||||||
r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>\s*<td[^>]*>\s*(\d+)",
|
r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>\s*<td[^>]*>\s*(\d{1,5})",
|
||||||
re.I,
|
re.I,
|
||||||
)
|
)
|
||||||
for ip, port in pattern.findall(html):
|
for ip, port in pattern.findall(html):
|
||||||
@@ -47,7 +47,11 @@ class FpwSocksSslProxyPlugin(BaseHTTPPlugin):
|
|||||||
proto = "socks4"
|
proto = "socks4"
|
||||||
else:
|
else:
|
||||||
proto = "http"
|
proto = "http"
|
||||||
batch = self._parse_page(html, proto)
|
batch = self.parse_html_table(
|
||||||
|
html, column_map={"ip": 0, "port": 1}, protocol=proto
|
||||||
|
)
|
||||||
|
if not batch:
|
||||||
|
batch = self._parse_page(html, proto)
|
||||||
results.extend(batch)
|
results.extend(batch)
|
||||||
if batch:
|
if batch:
|
||||||
logger.info(f"{self.display_name} {url}: {len(batch)} 条")
|
logger.info(f"{self.display_name} {url}: {len(batch)} 条")
|
||||||
|
|||||||
@@ -12,12 +12,13 @@ class FpwSpysOnePlugin(BaseHTTPPlugin):
|
|||||||
name = "fpw_spys_one"
|
name = "fpw_spys_one"
|
||||||
display_name = "Spys.one"
|
display_name = "Spys.one"
|
||||||
description = "spys.one HTTP/SOCKS 列表(POST 筛选 + XOR 端口解码)"
|
description = "spys.one HTTP/SOCKS 列表(POST 筛选 + XOR 端口解码)"
|
||||||
|
crawl_timeout_seconds = 180.0
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.pages: List[Tuple[str, str, str]] = [
|
self.pages: List[Tuple[str, str, str]] = [
|
||||||
("http", "http://spys.one/en/http-proxy-list/", "1"),
|
("http", "https://spys.one/en/http-proxy-list/", "1"),
|
||||||
("socks5", "http://spys.one/en/socks-proxy-list/", "2"),
|
("socks5", "https://spys.one/en/socks-proxy-list/", "2"),
|
||||||
]
|
]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|||||||
Reference in New Issue
Block a user