feat: fpw plugins, validation/crawl perf, WS stats, test DB isolation

- Add Free_Proxy_Website-style fpw_* plugins and register them
- Per-plugin crawl timeout (crawl_timeout_seconds=120); remove the global crawl_timeout setting
- Validator: fix connect vs. total timeout on save; LRU cache for SOCKS sessions; drop a redundant semaphore
- Validation handler uses a single DB connection; batch upsert after crawl; WorkerPool put_nowait
- Remove unused max_retries from the settings API/UI; settings maintenance SQL + init_db cleanup of deprecated keys
- WebSocket dashboard stats; ProxyList pool_filter and API alignment
- POST /api/proxies/delete-one for IPv6-safe deletes; task polling stops on 404
- pytest uses PROXYPOOL_DB_PATH=db/proxies.test.sqlite so tests do not wipe the production DB
- .gitignore: explicit proxies.test.sqlite patterns; fix the plugin_service ValidationException import

Made-with: Cursor
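A note on the test-DB bullet: the isolation only works if the override is exported before any application module reads its settings. A minimal conftest.py sketch (the setdefault timing is an assumption; the PROXYPOOL_DB_PATH variable and db/proxies.test.sqlite path come from this commit):

# conftest.py (sketch): point the app at a throwaway SQLite file before any
# app module imports settings, so a test run can never touch db/proxies.sqlite.
import os

os.environ.setdefault("PROXYPOOL_DB_PATH", "db/proxies.test.sqlite")
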
app/plugins/__init__.py

@@ -9,6 +9,15 @@ from .kuaidaili import KuaiDaiLiPlugin
 from .speedx import SpeedXPlugin
 from .yundaili import YunDaiLiPlugin
 from .proxyscrape import ProxyScrapePlugin
+from .fpw_proxy_list_download import FpwProxyListDownloadPlugin
+from .fpw_socks_ssl_proxy import FpwSocksSslProxyPlugin
+from .fpw_spys_one import FpwSpysOnePlugin
+from .fpw_proxynova import FpwProxynovaPlugin
+from .fpw_hidemy import FpwHidemyPlugin
+from .fpw_premproxy import FpwPremproxyPlugin
+from .fpw_freeproxylists import FpwFreeproxylistsPlugin
+from .fpw_gatherproxy import FpwGatherproxyPlugin
+from .fpw_checkerproxy import FpwCheckerproxyPlugin

 # Explicitly register all plugins
 registry.register(Fate0Plugin)

@@ -19,3 +28,12 @@ registry.register(KuaiDaiLiPlugin)
 registry.register(SpeedXPlugin)
 registry.register(YunDaiLiPlugin)
 registry.register(ProxyScrapePlugin)
+registry.register(FpwProxyListDownloadPlugin)
+registry.register(FpwSocksSslProxyPlugin)
+registry.register(FpwSpysOnePlugin)
+registry.register(FpwProxynovaPlugin)
+registry.register(FpwHidemyPlugin)
+registry.register(FpwPremproxyPlugin)
+registry.register(FpwFreeproxylistsPlugin)
+registry.register(FpwGatherproxyPlugin)
+registry.register(FpwCheckerproxyPlugin)

app/plugins/base.py

@@ -3,7 +3,7 @@ import re
 import random
 import asyncio
 import httpx
-from typing import List, Optional
+from typing import Dict, List, Optional
 from bs4 import BeautifulSoup
 from app.core.plugin_system import BaseCrawlerPlugin
 from app.models.domain import ProxyRaw

@@ -43,9 +43,56 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
         self._client = httpx.AsyncClient(
             transport=transport,
             follow_redirects=True,
+            # Ignore the system HTTP(S)_PROXY: a misconfigured one would make every list site unreachable
+            trust_env=False,
         )
         return self._client

+    @staticmethod
+    def _http_timeout(seconds: float) -> httpx.Timeout:
+        """Tighten the connect phase separately; AsyncClient otherwise hangs in connect for a long time on some setups."""
+        t = max(2.0, float(seconds))
+        c = min(6.0, max(3.0, t * 0.35))
+        return httpx.Timeout(t, connect=c)
+
+    @staticmethod
+    def _decode_response_body(response: httpx.Response) -> str:
+        content = response.content
+        encoding = response.encoding
+        if encoding == "utf-8" or not encoding:
+            try:
+                return content.decode("utf-8")
+            except UnicodeDecodeError:
+                return content.decode("gbk", errors="ignore")
+        return content.decode(encoding, errors="ignore")
+
+    def _sync_get(self, url: str, timeout: float, headers: dict) -> str:
+        """Synchronous GET (on Windows some sites hit ConnectTimeout with AsyncClient while the sync Client works)."""
+        to = BaseHTTPPlugin._http_timeout(timeout)
+        with httpx.Client(
+            transport=httpx.HTTPTransport(retries=0),
+            follow_redirects=True,
+            trust_env=False,
+        ) as c:
+            r = c.get(url, headers=headers, timeout=to)
+            if r.status_code != 200:
+                return ""
+            return self._decode_response_body(r)
+
+    def _sync_post(
+        self, url: str, data: Dict[str, str], timeout: float, headers: dict
+    ) -> str:
+        to = BaseHTTPPlugin._http_timeout(timeout)
+        with httpx.Client(
+            transport=httpx.HTTPTransport(retries=0),
+            follow_redirects=True,
+            trust_env=False,
+        ) as c:
+            r = c.post(url, headers=headers, data=data, timeout=to)
+            if r.status_code != 200:
+                return ""
+            return self._decode_response_body(r)
+
     async def fetch(
         self,
         url: str,
@@ -56,35 +103,81 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
         """Asynchronously fetch the HTML content of the given URL."""
         from app.core.log import logger

         client = self._get_client()
+        to = self._http_timeout(timeout)
         for attempt in range(retries):
             try:
-                response = await client.get(url, headers=self.get_headers(), timeout=timeout)
+                response = await client.get(url, headers=self.get_headers(), timeout=to)
                 if raise_for_status:
                     response.raise_for_status()
                 if response.status_code == 200:
-                    content = response.content
-                    encoding = response.encoding
-                    if encoding == "utf-8" or not encoding:
-                        try:
-                            return content.decode("utf-8")
-                        except UnicodeDecodeError:
-                            return content.decode("gbk", errors="ignore")
-                    return content.decode(encoding, errors="ignore")
-                else:
-                    logger.warning(f"Fetch {url} returned status {response.status_code}")
+                    return self._decode_response_body(response)
+                logger.warning(f"Fetch {url} returned status {response.status_code}")
             except Exception as e:
                 logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
                 if attempt < retries - 1:
                     await asyncio.sleep(random.uniform(1, 3))
+        try:
+            text = await asyncio.to_thread(
+                self._sync_get, url, timeout, self.get_headers()
+            )
+            if text:
+                logger.info(f"Fetch {url} succeeded via sync fallback")
+                return text
+        except Exception as e:
+            logger.warning(f"Fetch {url} sync fallback failed: {e}")
         return ""

-    async def fetch_all(self, urls: List[str], timeout: float = 15.0) -> List[str]:
+    async def fetch_post(
+        self,
+        url: str,
+        data: Optional[Dict[str, str]] = None,
+        timeout: float = 15.0,
+        retries: int = 2,
+    ) -> str:
+        """POST application/x-www-form-urlencoded; used for form pages such as spys.one."""
+        from app.core.log import logger
+
+        client = self._get_client()
+        payload = data or {}
+        to = self._http_timeout(timeout)
+        for attempt in range(retries):
+            try:
+                response = await client.post(
+                    url,
+                    headers=self.get_headers(),
+                    data=payload,
+                    timeout=to,
+                )
+                if response.status_code == 200:
+                    return self._decode_response_body(response)
+                logger.warning(f"POST {url} returned status {response.status_code}")
+            except Exception as e:
+                logger.warning(f"POST {url} failed (attempt {attempt + 1}/{retries}): {e}")
+                if attempt < retries - 1:
+                    await asyncio.sleep(random.uniform(1, 3))
+        try:
+            text = await asyncio.to_thread(
+                self._sync_post, url, payload, timeout, self.get_headers()
+            )
+            if text:
+                logger.info(f"POST {url} succeeded via sync fallback")
+                return text
+        except Exception as e:
+            logger.warning(f"POST {url} sync fallback failed: {e}")
+        return ""
+
+    async def fetch_all(
+        self,
+        urls: List[str],
+        timeout: float = 15.0,
+        retries: int = 2,
+    ) -> List[str]:
         """Fetch multiple URLs concurrently, capping per-plugin concurrency."""
         semaphore = asyncio.Semaphore(self.max_concurrency)

         async def _fetch_limited(url: str):
             async with semaphore:
-                return await self.fetch(url, timeout=timeout)
+                return await self.fetch(url, timeout=timeout, retries=retries)

         tasks = [_fetch_limited(url) for url in urls]
         return await asyncio.gather(*tasks)

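The connect-phase clamp in _http_timeout is easiest to see with concrete numbers; a standalone sketch of the same arithmetic:

# Standalone sketch of the timeout shaping above: total timeout t is clamped
# to at least 2 s, and the connect phase gets 35% of t, bounded to [3 s, 6 s].
import httpx

def shaped_timeout(seconds: float) -> httpx.Timeout:
    t = max(2.0, float(seconds))
    c = min(6.0, max(3.0, t * 0.35))
    return httpx.Timeout(t, connect=c)

for s in (5, 10, 15, 30):
    print(s, shaped_timeout(s).connect)  # 5 -> 3.0, 10 -> 3.5, 15 -> 5.25, 30 -> 6.0
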
app/plugins/fpw_checkerproxy.py (new file, 65 lines)

@@ -0,0 +1,65 @@
"""checkerproxy.net: try the common export paths plus ip:port pairs in the page body (excluding placeholder examples)."""
import re
from typing import List, Set, Tuple

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger


class FpwCheckerproxyPlugin(BaseHTTPPlugin):
    name = "fpw_checkerproxy"
    display_name = "CheckerProxy.net"
    description = "checkerproxy.net (may be empty without a stable public API; tries multiple paths)"

    def __init__(self):
        super().__init__()
        self.urls = [
            "https://checkerproxy.net/",
            "https://checkerproxy.net/export",
            "https://checkerproxy.net/api/export",
        ]

    @staticmethod
    def _parse_ip_ports(text: str) -> List[ProxyRaw]:
        bad = {"123.123.123.123", "127.0.0.1", "0.0.0.0"}
        seen: Set[Tuple[str, int]] = set()
        out: List[ProxyRaw] = []
        for m in re.finditer(
            r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})\b",
            text,
        ):
            ip, ps = m.group(1), m.group(2)
            if ip in bad:
                continue
            if not ps.isdigit() or not (1 <= int(ps) <= 65535):
                continue
            key = (ip, int(ps))
            if key in seen:
                continue
            seen.add(key)
            try:
                out.append(ProxyRaw(ip, int(ps), "http"))
            except ValueError:
                continue
        return out

    async def crawl(self) -> List[ProxyRaw]:
        merged: List[ProxyRaw] = []
        seen: Set[Tuple[str, int, str]] = set()
        htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
        for html in htmls:
            if not html or len(html) < 200:
                continue
            for p in self._parse_ip_ports(html):
                k = (p.ip, p.port, p.protocol)
                if k not in seen:
                    seen.add(k)
                    merged.append(p)
            if len(merged) >= 50:
                break
        if merged:
            logger.info(f"{self.display_name} parsed {len(merged)} proxies")
        else:
            logger.warning(f"{self.display_name} parsed no proxies (the site may only offer an online checker)")
        return merged

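The placeholder blacklist matters because the landing page prints literal examples such as 123.123.123.123; a quick demonstration of the extract-dedup-blacklist idea on a made-up fragment:

import re

# Made-up page fragment: one placeholder example and two real-looking entries,
# one of them duplicated.
text = "e.g. 123.123.123.123:8080 ... 51.15.4.2:3128 51.15.4.2:3128 185.2.3.4:80"
bad = {"123.123.123.123", "127.0.0.1", "0.0.0.0"}
seen = set()
for ip, port in re.findall(r"\b(\d{1,3}(?:\.\d{1,3}){3}):(\d{2,5})\b", text):
    if ip not in bad:
        seen.add((ip, int(port)))
print(sorted(seen))  # [('185.2.3.4', 80), ('51.15.4.2', 3128)]
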
app/plugins/fpw_freeproxylists.py (new file, 69 lines)

@@ -0,0 +1,69 @@
"""freeproxylists.net and common mirror paths (HTML tables / plain text)."""
import re
from typing import List

from bs4 import BeautifulSoup

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger


class FpwFreeproxylistsPlugin(BaseHTTPPlugin):
    name = "fpw_freeproxylists"
    display_name = "FreeProxyLists"
    description = "freeproxylists.net family of pages (prone to 403; tries multiple URLs)"

    def __init__(self):
        super().__init__()
        self.urls = [
            "http://www.freeproxylists.net/",
            "http://freeproxylists.net/",
            "http://www.freeproxylists.net/en/http-txt.html",
        ]

    def _parse_any(self, html: str) -> List[ProxyRaw]:
        ipport = re.findall(
            r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})\b",
            html,
        )
        if len(ipport) >= 5:
            out: List[ProxyRaw] = []
            for ip, ps in ipport:
                if ps.isdigit() and 1 <= int(ps) <= 65535:
                    try:
                        out.append(ProxyRaw(ip, int(ps), "http"))
                    except ValueError:
                        pass
            return out
        soup = BeautifulSoup(html, "lxml")
        results: List[ProxyRaw] = []
        for tr in soup.find_all("tr"):
            tds = tr.find_all("td")
            if len(tds) < 2:
                continue
            ip = tds[0].get_text(strip=True)
            port = tds[1].get_text(strip=True)
            if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip) and port.isdigit():
                if 1 <= int(port) <= 65535:
                    try:
                        results.append(ProxyRaw(ip, int(port), "http"))
                    except ValueError:
                        pass
        return results

    async def crawl(self) -> List[ProxyRaw]:
        seen = set()
        out: List[ProxyRaw] = []
        htmls = await self.fetch_all(self.urls, timeout=10, retries=1)
        for url, html in zip(self.urls, htmls):
            if not html:
                continue
            for p in self._parse_any(html):
                key = (p.ip, p.port, p.protocol)
                if key not in seen:
                    seen.add(key)
                    out.append(p)
            if out:
                logger.info(f"{self.display_name} accumulated {len(out)} proxies from {url}")
        return out

app/plugins/fpw_gatherproxy.py (new file, 61 lines)

@@ -0,0 +1,61 @@
"""gatherproxy.com JSON embedded in its pages (PROXY_IP / PROXY_PORT)."""
import re
from typing import List

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger


class FpwGatherproxyPlugin(BaseHTTPPlugin):
    name = "fpw_gatherproxy"
    display_name = "GatherProxy"
    description = "gatherproxy.com embedded proxy JSON (the site is often rate-limited)"

    def __init__(self):
        super().__init__()
        self.urls = [
            "http://www.gatherproxy.com/proxylist/anonymity/?t=Elite",
            "http://www.gatherproxy.com/proxylist/country/?c=United%20States",
        ]

    def _extract_from_text(self, text: str) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        for m in re.finditer(
            r"PROXY_IP['\"]?\s*:\s*['\"]([\d.]+)['\"].{0,120}?PROXY_PORT['\"]?\s*:\s*['\"](\d+)['\"]",
            text,
            re.DOTALL | re.IGNORECASE,
        ):
            ip, port = m.group(1), m.group(2)
            if port.isdigit() and 1 <= int(port) <= 65535:
                try:
                    results.append(ProxyRaw(ip, int(port), "http"))
                except ValueError:
                    continue
        for m in re.finditer(
            r"\{[^{}]*\"PROXY_IP\"\s*:\s*\"([\d.]+)\"[^{}]*\"PROXY_PORT\"\s*:\s*\"(\d+)\"[^{}]*\}",
            text,
        ):
            ip, port = m.group(1), m.group(2)
            if port.isdigit() and 1 <= int(port) <= 65535:
                try:
                    results.append(ProxyRaw(ip, int(port), "http"))
                except ValueError:
                    continue
        return results

    async def crawl(self) -> List[ProxyRaw]:
        seen = set()
        out: List[ProxyRaw] = []
        htmls = await self.fetch_all(self.urls, timeout=10, retries=1)
        for url, html in zip(self.urls, htmls):
            if not html:
                continue
            for p in self._extract_from_text(html):
                k = (p.ip, p.port)
                if k not in seen:
                    seen.add(k)
                    out.append(p)
            if out:
                logger.info(f"{self.display_name} accumulated {len(out)} proxies from {url}")
        return out

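The second pattern above targets one small JSON object per proxy record; a quick check of what it captures against a made-up fragment in that shape:

import re

# Made-up fragment in the shape the plugin expects from gatherproxy pages.
sample = '{"PROXY_IP":"1.2.3.4","PROXY_PORT":"8080","PROXY_TYPE":"Elite"}'
pat = re.compile(
    r"\{[^{}]*\"PROXY_IP\"\s*:\s*\"([\d.]+)\"[^{}]*\"PROXY_PORT\"\s*:\s*\"(\d+)\"[^{}]*\}"
)
print(pat.findall(sample))  # [('1.2.3.4', '8080')]
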
app/plugins/fpw_hidemy.py (new file, 38 lines)

@@ -0,0 +1,38 @@
"""hidemyna.me free proxy list tables."""
from typing import List

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger


class FpwHidemyPlugin(BaseHTTPPlugin):
    name = "fpw_hidemy"
    display_name = "HideMy.name"
    description = "hidemyna.me English proxy list (HTTP/HTTPS/SOCKS)"

    def __init__(self):
        super().__init__()
        self.urls = [
            "https://hidemyna.me/en/proxy-list/",
            "https://hidemyna.me/en/proxy-list/?type=hs",
            "https://hidemyna.me/en/proxy-list/?type=socks4",
        ]

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
        for url, html in zip(self.urls, htmls):
            if not html:
                continue
            batch = self.parse_html_table(
                html,
                column_map={"ip": 0, "port": 1, "protocol": 4},
                protocol="http",
            )
            if batch:
                results.extend(batch)
                logger.info(f"{self.display_name} {url}: {len(batch)} proxies")
        if results:
            logger.info(f"{self.display_name} total {len(results)} proxies")
        return results

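parse_html_table is inherited from BaseHTTPPlugin and is not part of this diff; a hypothetical stand-in sketch of what a column_map-driven parser of that shape might do (the real helper's signature and filtering may differ):

from bs4 import BeautifulSoup

# Hypothetical stand-in for BaseHTTPPlugin.parse_html_table: pick cells by the
# indices in column_map and fall back to a default protocol for blank cells.
def parse_html_table(html: str, column_map: dict, protocol: str = "http"):
    rows = []
    for tr in BeautifulSoup(html, "lxml").find_all("tr"):
        tds = [td.get_text(strip=True) for td in tr.find_all("td")]
        if len(tds) <= max(column_map.values()):
            continue
        ip, port = tds[column_map["ip"]], tds[column_map["port"]]
        proto = tds[column_map["protocol"]].lower() or protocol
        if port.isdigit():
            rows.append((ip, int(port), proto))
    return rows
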
app/plugins/fpw_premproxy.py (new file, 64 lines)

@@ -0,0 +1,64 @@
"""premproxy.com list page tables."""
import re
from typing import List

from bs4 import BeautifulSoup

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger


class FpwPremproxyPlugin(BaseHTTPPlugin):
    name = "fpw_premproxy"
    display_name = "PremProxy"
    description = "premproxy.com HTTP/SOCKS list pages"

    def __init__(self):
        super().__init__()
        self.urls = [
            "https://premproxy.com/list/",
            "https://premproxy.com/socks-list/",
        ]

    def _parse_html(self, html: str) -> List[ProxyRaw]:
        soup = BeautifulSoup(html, "lxml")
        results: List[ProxyRaw] = []
        for tr in soup.find_all("tr"):
            tds = tr.find_all("td")
            if len(tds) < 2:
                continue
            ip = tds[0].get_text(strip=True)
            port = tds[1].get_text(strip=True)
            if not re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip):
                continue
            if not port.isdigit() or not (1 <= int(port) <= 65535):
                continue
            row = tr.get_text(" ", strip=True).lower()
            if "socks5" in row:
                proto = "socks5"
            elif "socks4" in row or "socks" in row:
                proto = "socks4"
            elif "https" in row:
                proto = "https"
            else:
                proto = "http"
            try:
                results.append(ProxyRaw(ip, int(port), proto))
            except ValueError:
                continue
        return results

    async def crawl(self) -> List[ProxyRaw]:
        merged: List[ProxyRaw] = []
        htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
        for url, html in zip(self.urls, htmls):
            if not html:
                continue
            batch = self._parse_html(html)
            if batch:
                merged.extend(batch)
                logger.info(f"{self.display_name} {url}: {len(batch)} proxies")
        if merged:
            logger.info(f"{self.display_name} total {len(merged)} proxies")
        return merged

app/plugins/fpw_proxy_list_download.py (new file, 54 lines)

@@ -0,0 +1,54 @@
"""www.proxy-list.download public API (README: Free_Proxy_Website)."""
from typing import List

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger


class FpwProxyListDownloadPlugin(BaseHTTPPlugin):
    name = "fpw_proxy_list_download"
    display_name = "Proxy-List.download"
    description = "proxy-list.download official API (http/https/socks4/socks5)"

    def __init__(self):
        super().__init__()
        self.max_concurrency = 8
        self.api_pairs = [
            ("http", "https://www.proxy-list.download/api/v1/get?type=http"),
            ("https", "https://www.proxy-list.download/api/v1/get?type=https"),
            ("socks4", "https://www.proxy-list.download/api/v1/get?type=socks4"),
            ("socks5", "https://www.proxy-list.download/api/v1/get?type=socks5"),
        ]
        self.fallback_pairs = [
            ("http", "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all"),
            ("https", "https://api.proxyscrape.com/v2/?request=get&protocol=https&timeout=10000&country=all&ssl=all&anonymity=all"),
            ("socks4", "https://api.proxyscrape.com/v2/?request=get&protocol=socks4&timeout=10000&country=all&ssl=all&anonymity=all"),
            ("socks5", "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all&ssl=all&anonymity=all"),
        ]

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        urls = [u for _, u in self.api_pairs]
        htmls = await self.fetch_all(urls, timeout=10, retries=1)
        for (protocol, _), text in zip(self.api_pairs, htmls):
            if not text:
                continue
            batch = self.parse_text_proxies(text, protocol)
            if batch:
                results.extend(batch)
                logger.info(f"{self.display_name} {protocol}: {len(batch)} proxies")
        if not results:
            logger.warning(f"{self.display_name} primary API returned no data; trying the ProxyScrape fallback")
            fb_urls = [u for _, u in self.fallback_pairs]
            fb_htmls = await self.fetch_all(fb_urls, timeout=10, retries=1)
            for (protocol, _), text in zip(self.fallback_pairs, fb_htmls):
                if not text:
                    continue
                batch = self.parse_text_proxies(text, protocol)
                if batch:
                    results.extend(batch)
                    logger.info(f"{self.display_name} fallback {protocol}: {len(batch)} proxies")
        if results:
            logger.info(f"{self.display_name} total {len(results)} proxies")
        return results

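Both the primary API and the ProxyScrape fallback return plain text, one ip:port per line. parse_text_proxies lives on BaseHTTPPlugin outside this diff; a hypothetical minimal version of that per-line parsing could look like:

# Hypothetical stand-in for BaseHTTPPlugin.parse_text_proxies: one "ip:port"
# per line, tagged with the protocol the URL was queried for.
def parse_text_proxies(text: str, protocol: str):
    out = []
    for line in text.splitlines():
        line = line.strip()
        ip, sep, port = line.rpartition(":")
        if sep and ip and port.isdigit() and 1 <= int(port) <= 65535:
            out.append((ip, int(port), protocol))
    return out

print(parse_text_proxies("1.2.3.4:8080\r\n5.6.7.8:3128\n", "http"))
# [('1.2.3.4', 8080), ('5.6.7.8', 3128)]
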
app/plugins/fpw_proxynova.py (new file, 74 lines)

@@ -0,0 +1,74 @@
"""proxynova.com table with JS-obfuscated IPs and plaintext ports."""
import re
from typing import List, Optional

from bs4 import BeautifulSoup

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger


class FpwProxynovaPlugin(BaseHTTPPlugin):
    name = "fpw_proxynova"
    display_name = "ProxyNova"
    description = "proxynova.com proxy list (decodes document.write-obfuscated IPs)"

    def __init__(self):
        super().__init__()
        self.urls = ["https://www.proxynova.com/proxy-server-list/"]

    @staticmethod
    def _decode_proxynova_ip(script_inner: str) -> Optional[str]:
        """Decode document.write(\".081.301\".split(\"\").reverse()...concat(\"118.174\"...))."""
        m1 = re.search(r'document\.write\("([^"]+)"\.split', script_inner)
        m2 = re.search(r'\.concat\("([^"]+)"', script_inner)
        if not m1 or not m2:
            return None
        a, b = m1.group(1), m2.group(1)
        part1 = "".join(reversed(a))
        return part1 + b

    def _parse_rows(self, html: str) -> List[ProxyRaw]:
        soup = BeautifulSoup(html, "lxml")
        tbody = soup.find("tbody")
        if not tbody:
            return []
        out: List[ProxyRaw] = []
        for tr in tbody.find_all("tr"):
            tds = tr.find_all("td")
            if len(tds) < 2:
                continue
            script = tds[0].find("script")
            if not script or not script.string:
                continue
            ip = self._decode_proxynova_ip(script.string)
            port_txt = tds[1].get_text(strip=True)
            if not ip or not port_txt.isdigit():
                continue
            port = int(port_txt)
            if not (1 <= port <= 65535):
                continue
            row_text = tr.get_text(" ", strip=True).upper()
            if "SOCKS5" in row_text:
                proto = "socks5"
            elif "SOCKS4" in row_text:
                proto = "socks4"
            elif "HTTPS" in row_text:
                proto = "https"
            else:
                proto = "http"
            try:
                out.append(ProxyRaw(ip, port, proto))
            except ValueError:
                continue
        return out

    async def crawl(self) -> List[ProxyRaw]:
        html = await self.fetch(self.urls[0], timeout=14, retries=1)
        if not html:
            return []
        results = self._parse_rows(html)
        if results:
            logger.info(f"{self.display_name} parsed {len(results)} proxies")
        return results

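ProxyNova hides each IP behind a document.write that reverses one string fragment and concatenates another; a worked example with a made-up payload in the same shape as the docstring above:

import re

# Made-up script body in proxynova's shape: ".081.301" reversed is "103.180."
# and concat("118.174") completes the address.
script = 'document.write(".081.301".split("").reverse().join("").concat("118.174"))'
m1 = re.search(r'document\.write\("([^"]+)"\.split', script)
m2 = re.search(r'\.concat\("([^"]+)"', script)
print("".join(reversed(m1.group(1))) + m2.group(1))  # 103.180.118.174
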
app/plugins/fpw_socks_ssl_proxy.py (new file, 56 lines)

@@ -0,0 +1,56 @@
"""socks-proxy.net / sslproxies.org tables (see GetProxyFromSocks-proxy.py in the README)."""
import re
from typing import List

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger


class FpwSocksSslProxyPlugin(BaseHTTPPlugin):
    name = "fpw_socks_ssl_proxy"
    display_name = "Socks-Proxy / SSLProxies"
    description = "Front-page tables of socks-proxy.net and sslproxies.org (HTTP/HTTPS lists)"

    def __init__(self):
        super().__init__()
        self.max_concurrency = 6
        # Many mirrors share the sslproxies template; socks-proxy is unstable on some networks, so multiple sources raise the success rate
        self.urls = [
            "https://www.sslproxies.org/",
            "https://free-proxy-list.net/",
            "https://www.us-proxy.org/",
            "https://www.socks-proxy.net/",
        ]

    def _parse_page(self, html: str, default_protocol: str) -> List[ProxyRaw]:
        results = []
        pattern = re.compile(
            r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>\s*<td[^>]*>\s*(\d+)",
            re.I,
        )
        for ip, port in pattern.findall(html):
            if port.isdigit() and 1 <= int(port) <= 65535:
                try:
                    results.append(ProxyRaw(ip, int(port), default_protocol))
                except ValueError:
                    continue
        return results

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
        for url, html in zip(self.urls, htmls):
            if not html:
                continue
            if "socks-proxy" in url:
                proto = "socks4"
            else:
                proto = "http"
            batch = self._parse_page(html, proto)
            results.extend(batch)
            if batch:
                logger.info(f"{self.display_name} {url}: {len(batch)} proxies")
        if results:
            logger.info(f"{self.display_name} total {len(results)} proxies")
        return results

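These sites share one table template where the IP cell is immediately followed by the port cell, which is why a single regex covers every mirror; a quick check:

import re

# Minimal fragment in the shared table template: IP cell followed by port cell.
html = "<tr><td>1.2.3.4</td><td>8080</td><td>US</td></tr>"
pat = re.compile(
    r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>\s*<td[^>]*>\s*(\d+)", re.I
)
print(pat.findall(html))  # [('1.2.3.4', '8080')]
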
app/plugins/fpw_spys_one.py (new file, 148 lines)

@@ -0,0 +1,148 @@
"""spys.one form POST plus XOR port decoding (README: GetProxyFromSPYSONE.py)."""
import asyncio
import re
from typing import Dict, List, Tuple

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger


class FpwSpysOnePlugin(BaseHTTPPlugin):
    name = "fpw_spys_one"
    display_name = "Spys.one"
    description = "spys.one HTTP/SOCKS lists (POST filtering + XOR port decoding)"

    def __init__(self):
        super().__init__()
        self.pages: List[Tuple[str, str, str]] = [
            ("http", "http://spys.one/en/http-proxy-list/", "1"),
            ("socks5", "http://spys.one/en/socks-proxy-list/", "2"),
        ]

    @staticmethod
    def _exec_spys_decoder(body: str) -> Dict[str, int]:
        body = re.sub(r"\s+", "", body)
        stmts = [s.strip() for s in body.split(";") if s.strip() and "document" not in s]
        env: Dict[str, int] = {}
        for _ in range(8):
            progressed = False
            for stmt in stmts:
                if "=" not in stmt:
                    continue
                lhs, rhs = stmt.split("=", 1)
                lhs = lhs.strip()
                rhs = rhs.strip()
                if lhs in env:
                    continue
                if "^" not in rhs:
                    if rhs.isdigit():
                        env[lhs] = int(rhs)
                        progressed = True
                    continue
                a, b = rhs.split("^", 1)
                a, b = a.strip(), b.strip()

                def gv(x: str) -> int:
                    if x.isdigit():
                        return int(x)
                    return env[x]

                try:
                    env[lhs] = gv(a) ^ gv(b)
                    progressed = True
                except KeyError:
                    continue
            if not progressed:
                break
        return env

    def _decoder_env_from_html(self, html: str) -> Dict[str, int]:
        best: Dict[str, int] = {}
        for m in re.finditer(r"<script[^>]*>([\s\S]*?)</script>", html, re.IGNORECASE):
            chunk = m.group(1).strip()
            if "document.write" in chunk:
                continue
            xor_assigns = len(re.findall(r"\w+=\d+\^\w+", chunk))
            if xor_assigns < 4:
                continue
            env = self._exec_spys_decoder(chunk)
            if len(env) > len(best):
                best = env
        return best

    def _parse_page(self, html: str, default_protocol: str) -> List[ProxyRaw]:
        env = self._decoder_env_from_html(html)
        if not env:
            logger.warning(f"{self.display_name} could not extract the XOR variable table")
            return []

        results: List[ProxyRaw] = []
        for m in re.finditer(
            r"class=spy14>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})<script[^>]*>([\s\S]+?)</script>",
            html,
            re.IGNORECASE,
        ):
            ip = m.group(1)
            inner = m.group(2)
            dw = re.search(
                r'document\.write\("[^"]*"\+((?:\(\w+\^\w+\)\+?)+)\)',
                inner,
            )
            if not dw:
                continue
            pairs = re.findall(r"\((\w+)\^(\w+)\)", dw.group(1))
            if not pairs:
                continue
            try:
                digits = "".join(str(env[a] ^ env[b]) for a, b in pairs)
                port = int(digits)
            except (KeyError, ValueError):
                continue
            if not (1 <= port <= 65535):
                continue
            tail = html[m.end() : m.end() + 2000]
            u = tail.upper()
            if "SOCKS5" in u:
                proto = "socks5"
            elif "SOCKS4" in u:
                proto = "socks4"
            elif "HTTPS" in u:
                proto = "https"
            elif "HTTP" in u:
                proto = "http"
            else:
                proto = default_protocol
            try:
                results.append(ProxyRaw(ip, port, proto))
            except ValueError:
                continue
        return results

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        form_base = {
            "xpp": "3",
            "xf1": "0",
            "xf2": "0",
            "xf4": "0",
        }

        async def _one(proto: str, url: str, xf5: str) -> Tuple[str, str]:
            data = {**form_base, "xf5": xf5}
            html = await self.fetch_post(url, data=data, timeout=14, retries=1)
            return proto, html or ""

        pairs = await asyncio.gather(
            *(_one(proto, url, xf5) for proto, url, xf5 in self.pages)
        )
        for proto, html in pairs:
            if not html:
                continue
            batch = self._parse_page(html, proto)
            if batch:
                results.extend(batch)
                logger.info(f"{self.display_name} ({proto}): {len(batch)} proxies")
        if results:
            logger.info(f"{self.display_name} total {len(results)} proxies")
        return results

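spys.one ships a script that defines integer variables (many via XOR chains) and then document.writes each port digit as an XOR pair over that table; a toy example of the digit assembly step in _parse_page, with a made-up variable table:

# Toy variable table as _exec_spys_decoder would produce it, plus the digit
# pairs pulled out of document.write: each (a ^ b) yields one port digit.
env = {"p": 12, "q": 4}
pairs = [("p", "q"), ("q", "q"), ("p", "q"), ("q", "q")]  # digits 8, 0, 8, 0
port = int("".join(str(env[a] ^ env[b]) for a, b in pairs))
print(port)  # 8080
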
app/plugins/kuaidaili.py

@@ -18,17 +18,19 @@ class KuaiDaiLiPlugin(BaseHTTPPlugin):

     def __init__(self):
         super().__init__()
-        # Fewer pages to lower the anti-bot risk and make sure we get at least some data
+        # The fps/dps list pages still return 200; inha/intr often return 567 (anti-bot) and are kept as a last resort
         self.urls = [
             "https://www.kuaidaili.com/free/fps/",
             "https://www.kuaidaili.com/free/dps/",
+            "https://www.kuaidaili.com/free/inha/1/",
+            "https://www.kuaidaili.com/free/intr/1/",
         ]

     def get_headers(self) -> dict:
         headers = super().get_headers()
-        headers["Referer"] = "https://www.kuaidaili.com/free/inha/"
+        headers["Referer"] = "https://www.kuaidaili.com/free/"
         headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
-        headers["Accept-Encoding"] = "gzip, deflate, br"
+        headers["Accept-Encoding"] = "gzip, deflate"
         headers["Accept-Language"] = "zh-CN,zh;q=0.9,en;q=0.8"
         headers["Sec-Fetch-Dest"] = "document"
         headers["Sec-Fetch-Mode"] = "navigate"

@@ -36,15 +38,56 @@ class KuaiDaiLiPlugin(BaseHTTPPlugin):
         headers["Upgrade-Insecure-Requests"] = "1"
         return headers

+    @staticmethod
+    def _infer_protocol(texts: List[str]) -> str:
+        """Infer the protocol from a row's cell texts (handles the fps / dps / inha page layouts)."""
+        for t in texts[2:]:
+            tl = t.lower().replace(" ", "")
+            if tl in VALID_PROTOCOLS:
+                return tl
+            if "http(s)" in tl or tl in ("http/https",):
+                return "http"
+            if "socks5" in tl:
+                return "socks5"
+            if "socks4" in tl:
+                return "socks4"
+            if tl == "https":
+                return "https"
+        if len(texts) >= 5:
+            t4 = texts[4].lower().strip()
+            if t4 in VALID_PROTOCOLS:
+                return t4
+        return "http"
+
+    def _parse_table(self, table) -> List[ProxyRaw]:
+        out: List[ProxyRaw] = []
+        for row in table.find_all("tr"):
+            tds = row.find_all("td")
+            if len(tds) < 2:
+                continue
+            texts = [td.get_text(strip=True) for td in tds]
+            ip = texts[0]
+            port_s = texts[1]
+            if not re.match(r"^\d+\.\d+\.\d+\.\d+$", ip):
+                continue
+            if not port_s.isdigit() or not (1 <= int(port_s) <= 65535):
+                continue
+            protocol = self._infer_protocol(texts)
+            if protocol not in VALID_PROTOCOLS:
+                protocol = "http"
+            try:
+                out.append(ProxyRaw(ip, int(port_s), protocol))
+            except ValueError:
+                continue
+        return out
+
     async def crawl(self) -> List[ProxyRaw]:
         results = []
         # Warm up the session on the homepage first to pick up cookies and lower the anti-bot risk
         await self.fetch("https://www.kuaidaili.com/", timeout=10)
         await asyncio.sleep(random.uniform(2, 4))
         await self.fetch("https://www.kuaidaili.com/free/", timeout=10)
         await asyncio.sleep(random.uniform(1, 2))

         # Request the free-proxy pages sequentially
         for url in self.urls:
-            html = await self.fetch(url, timeout=10)
+            html = await self.fetch(url, timeout=15)
             if not html:
                 continue
             soup = BeautifulSoup(html, "lxml")

@@ -53,20 +96,11 @@ class KuaiDaiLiPlugin(BaseHTTPPlugin):
                 logger.warning(f"{self.display_name} could not find the table; anti-bot may have been triggered: {url}")
                 continue

-            for row in table.find_all("tr"):
-                tds = row.find_all("td")
-                if len(tds) >= 5:
-                    ip = tds[0].get_text(strip=True)
-                    port = tds[1].get_text(strip=True)
-                    protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else "http"
-                    if protocol not in VALID_PROTOCOLS:
-                        protocol = "http"
-                    if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit() and 1 <= int(port) <= 65535:
-                        try:
-                            results.append(ProxyRaw(ip, int(port), protocol))
-                        except ValueError:
-                            continue
-            await asyncio.sleep(random.uniform(5, 8))
+            batch = self._parse_table(table)
+            if batch:
+                results.extend(batch)
+                logger.info(f"{self.display_name} {url} parsed {len(batch)} proxies")
+            await asyncio.sleep(random.uniform(1, 2))

         if results:
             logger.info(f"{self.display_name} finished parsing, got {len(results)} candidate proxies")

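The _infer_protocol heuristic scans cells from the third column onward for a protocol label. A toy run of the same idea, assuming VALID_PROTOCOLS = {"http", "https", "socks4", "socks5"} (its definition is not part of this diff):

# Assumed protocol set; the real constant is defined elsewhere in the module.
VALID_PROTOCOLS = {"http", "https", "socks4", "socks5"}

def infer_protocol(texts):
    # Scan cells from the third column on, matching known protocol labels.
    for t in texts[2:]:
        tl = t.lower().replace(" ", "")
        if tl in VALID_PROTOCOLS:
            return tl
        if "http(s)" in tl:
            return "http"
    return "http"

print(infer_protocol(["1.2.3.4", "8080", "anon", "HTTP(S)"]))  # http
print(infer_protocol(["1.2.3.4", "1080", "anon", "SOCKS5"]))   # socks5
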
app/plugins/proxyscrape.py

@@ -109,21 +109,5 @@ class ProxyScrapePlugin(BaseHTTPPlugin):
         if results:
             logger.info(f"ProxyScrape got {len(results)} proxies in total")
         else:
-            # Fallback: generate test proxies so the full pipeline can also be exercised in test environments
-            logger.warning("ProxyScrape: all real sources unavailable, generating test proxies for architecture validation")
-            results = self._generate_test_proxies()
+            logger.warning("ProxyScrape: all real sources unavailable, returning an empty list")
         return results
-
-    def _generate_test_proxies(self) -> List[ProxyRaw]:
-        """Generate test proxy data covering every protocol type, used to validate the plugin system."""
-        import random
-        test_proxies = []
-        protocols = ["http", "https", "socks4", "socks5"]
-        for protocol in protocols:
-            for _ in range(3):
-                # Generate a random public-looking IP (for exercising the flow only)
-                ip = f"{random.randint(1, 223)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 254)}"
-                port = random.randint(1024, 65535)
-                test_proxies.append(ProxyRaw(ip, port, protocol))
-        logger.info(f"Generated {len(test_proxies)} test proxies, 3 each of HTTP/HTTPS/SOCKS4/SOCKS5")
-        return test_proxies