"""www.proxy-list.download 公开 API(常见为 JSON,内含 list 字段的 ip:port 文本)。"""
|
||
import json
|
||
from typing import Any, List
|
||
|
||
from app.core.plugin_system import ProxyRaw
|
||
from app.plugins.base import BaseHTTPPlugin
|
||
from app.core.log import logger
|
||
|
||
|
||
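
# Response shapes this plugin knows how to parse (illustrative sketches only;
# the exact payload varies by endpoint and over time):
#   1. Plain text with one "ip:port" per line.
#   2. A JSON array of objects such as {"ip": "203.0.113.7", "port": 8080}.
#   3. A JSON object whose "list"/"data"/"proxies" key holds either of the above.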


class FpwProxyListDownloadPlugin(BaseHTTPPlugin):
    name = "fpw_proxy_list_download"
    display_name = "Proxy-List.download"
    description = "Official proxy-list.download API (http/https/socks4/socks5)"
    # Generous overall budget: four endpoints plus two fallback tiers.
    crawl_timeout_seconds = 300.0

    def __init__(self):
        super().__init__()
        self.max_concurrency = 2
        # Primary tier: the official API, one endpoint per protocol.
        self.api_pairs = [
            ("http", "https://www.proxy-list.download/api/v1/get?type=http"),
            ("https", "https://www.proxy-list.download/api/v1/get?type=https"),
            ("socks4", "https://www.proxy-list.download/api/v1/get?type=socks4"),
            ("socks5", "https://www.proxy-list.download/api/v1/get?type=socks5"),
        ]
        # Second tier: jsDelivr mirror of monosans/proxy-list (plain text files).
        self._mirror_prefix = (
            "https://cdn.jsdelivr.net/gh/monosans/proxy-list@main/proxies/"
        )
        # Third tier: ProxyScrape public API, same four protocols.
        self.fallback_pairs = [
            ("http", "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all"),
            ("https", "https://api.proxyscrape.com/v2/?request=get&protocol=https&timeout=10000&country=all&ssl=all&anonymity=all"),
            ("socks4", "https://api.proxyscrape.com/v2/?request=get&protocol=socks4&timeout=10000&country=all&ssl=all&anonymity=all"),
            ("socks5", "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all&ssl=all&anonymity=all"),
        ]

    def _items_to_proxies(self, items: List[Any], protocol: str) -> List[ProxyRaw]:
        """Convert a JSON array of dicts or "ip:port" strings into ProxyRaw entries."""
        out: List[ProxyRaw] = []
        for it in items:
            if isinstance(it, dict):
                # Field names vary across sources; probe the common casings.
                ip = str(
                    it.get("ip")
                    or it.get("IP")
                    or it.get("host")
                    or it.get("Host")
                    or ""
                ).strip()
                port = it.get("port") or it.get("Port")
                if not ip or port is None:
                    continue
                ps = str(port).strip()
                if not ps.isdigit() or not (1 <= int(ps) <= 65535):
                    continue
                try:
                    out.append(ProxyRaw(ip, int(ps), protocol))
                except ValueError:
                    continue
            elif isinstance(it, str) and ":" in it:
                out.extend(self.parse_text_proxies(it, protocol))
        return out
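
    # Illustrative inputs for _items_to_proxies (hypothetical values, not live
    # captures): a dict item like {"ip": "203.0.113.7", "port": 8080} (or the
    # IP/Host/Port casings), and a string item like "203.0.113.7:8080", which
    # is delegated to parse_text_proxies().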

    def _parse_api_body(self, text: str, protocol: str) -> List[ProxyRaw]:
        """Parse an API body that may be JSON (array or keyed object) or plain text."""
        text = (text or "").strip()
        if not text:
            return []
        if text[0] in "{[":
            try:
                data = json.loads(text)
            except json.JSONDecodeError:
                # Looked like JSON but failed to decode; try line-based parsing.
                return self.parse_text_proxies(text, protocol)
            if isinstance(data, list):
                return self._items_to_proxies(data, protocol)
            if isinstance(data, dict):
                # The payload usually sits under one of these keys, either as a
                # newline-joined string or as a JSON array.
                for key in ("list", "LIST", "data", "Data", "proxies", "Proxies"):
                    raw = data.get(key)
                    if isinstance(raw, str) and raw.strip():
                        return self.parse_text_proxies(raw, protocol)
                    if isinstance(raw, list):
                        return self._items_to_proxies(raw, protocol)
                return []
            return []
        return self.parse_text_proxies(text, protocol)
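
    # Branch-coverage sketch for _parse_api_body (hypothetical payloads):
    #   '{"list": "203.0.113.7:8080\n203.0.113.8:3128"}'  -> keyed-string path
    #   '[{"ip": "203.0.113.7", "port": 8080}]'           -> JSON-array path
    #   '203.0.113.7:8080'                                -> plain-text path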

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        t_req, n_try = 18.0, 1  # per-request timeout (seconds) and retry count

        # Tier 1: the official API, all four protocols fetched concurrently.
        urls = [u for _, u in self.api_pairs]
        htmls = await self.fetch_all(urls, timeout=t_req, retries=n_try)
        for (protocol, _), text in zip(self.api_pairs, htmls):
            if not text:
                continue
            batch = self._parse_api_body(text, protocol)
            if batch:
                results.extend(batch)
                logger.info(f"{self.display_name} API {protocol}: {len(batch)} proxies")

        # Tier 2: jsDelivr mirror of monosans/proxy-list (plain text, one ip:port per line).
        if not results:
            logger.warning(f"{self.display_name} primary API returned no data; trying jsDelivr text mirror")
            mirror_pairs = [
                (p, f"{self._mirror_prefix}{p}.txt")
                for p in ("http", "https", "socks4", "socks5")
            ]
            m_urls = [u for _, u in mirror_pairs]
            m_htmls = await self.fetch_all(m_urls, timeout=22.0, retries=2)
            for (protocol, _), text in zip(mirror_pairs, m_htmls):
                if not text:
                    continue
                batch = self.parse_text_proxies(text, protocol)
                if batch:
                    results.extend(batch)
                    logger.info(f"{self.display_name} mirror {protocol}: {len(batch)} proxies")

        # Tier 3: the ProxyScrape API as the last resort.
        if not results:
            logger.warning(f"{self.display_name} mirror returned no data; trying ProxyScrape API")
            fb_urls = [u for _, u in self.fallback_pairs]
            fb_htmls = await self.fetch_all(fb_urls, timeout=20.0, retries=1)
            for (protocol, _), text in zip(self.fallback_pairs, fb_htmls):
                if not text:
                    continue
                batch = self._parse_api_body(text, protocol)
                if not batch:
                    # Some responses come back as plain text despite the API shape.
                    batch = self.parse_text_proxies(text, protocol)
                if batch:
                    results.extend(batch)
                    logger.info(f"{self.display_name} ProxyScrape {protocol}: {len(batch)} proxies")

        if results:
            logger.info(f"{self.display_name} total: {len(results)} proxies")
        return results
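

# Minimal smoke-test sketch (assumption: the plugin can be constructed directly
# outside the plugin system; in the real app the framework drives crawl()):
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        plugin = FpwProxyListDownloadPlugin()
        proxies = await plugin.crawl()
        print(f"fetched {len(proxies)} proxies")

    asyncio.run(_demo())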