ProxyPool/app/plugins/proxyscrape.py
祀梦 0131c8b408 feat: fpw plugins, validation/crawl perf, WS stats, test DB isolation
- Add Free_Proxy_Website-style fpw_* plugins and register them
- Per-plugin crawl timeout (crawl_timeout_seconds=120); remove global crawl_timeout setting
- Validator: fix connect vs total timeout on save (see the sketch after the commit message); SOCKS session LRU cache; drop redundant semaphore
- Validation handler uses single DB connection; batch upsert after crawl; WorkerPool put_nowait
- Remove unused max_retries from settings API/UI; settings maintenance SQL + init_db cleanup of deprecated keys
- WebSocket dashboard stats; ProxyList pool_filter and API alignment
- POST /api/proxies/delete-one for IPv6-safe deletes; task poll stops on 404
- pytest uses PROXYPOOL_DB_PATH=db/proxies.test.sqlite so tests do not wipe production DB
- .gitignore: explicit proxies.test.sqlite patterns; fix plugin_service ValidationException import

Made-with: Cursor
2026-04-05 13:39:19 +08:00
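
To illustrate the validator's connect-vs-total timeout fix, a minimal sketch, assuming the validator is built on aiohttp (the split between the two timeouts is the point; this is not the repo's actual validator code, and the check URL is a placeholder):

import aiohttp

# `total` caps the whole request (connect + send + receive), while `connect`
# caps only connection establishment; persisting one where the other belongs
# skews the latency recorded on save.
TIMEOUT = aiohttp.ClientTimeout(total=10, connect=5)

async def check(proxy_url: str) -> bool:
    # hypothetical check target; any stable plain-HTTP endpoint works
    async with aiohttp.ClientSession(timeout=TIMEOUT) as session:
        async with session.get("http://httpbin.org/ip", proxy=proxy_url) as resp:
            return resp.status == 200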

"""ProxyScrape 测试爬虫 - 用于验证架构,支持全协议类型"""
import asyncio
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class ProxyScrapePlugin(BaseHTTPPlugin):
default_config = {"max_pages": 5}
"""
从 ProxyScrape 公开 API 获取代理库
覆盖 http/https/socks4/socks5 全协议,专门用于测试插件系统的可扩展性
"""
name = "proxyscrape"
display_name = "ProxyScrape测试站"
description = "从 ProxyScrape API 获取各类型代理HTTP/HTTPS/SOCKS4/SOCKS5用于测试架构扩展"
enabled = True
def __init__(self):
super().__init__()
# GitHub raw 源作为首选
self.urls = [
("http", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt"),
("https", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/https.txt"),
("socks4", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt"),
("socks5", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt"),
]
# ProxyScrape 官方 API 作为 fallback
self.api_urls = {
"http": "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
"https": "https://api.proxyscrape.com/v2/?request=get&protocol=https&timeout=10000&country=all&ssl=all&anonymity=all",
"socks4": "https://api.proxyscrape.com/v2/?request=get&protocol=socks4&timeout=10000&country=all&ssl=all&anonymity=all",
"socks5": "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all&ssl=all&anonymity=all",
}

    def _parse_proxies(self, text: str, protocol: str) -> List[ProxyRaw]:
        """Parse text containing one ip:port entry per line."""
        proxies: List[ProxyRaw] = []
        for line in text.splitlines():
            line = line.strip()
            if not line or ":" not in line:
                continue
            ip, _, port_str = line.rpartition(":")
            ip = ip.strip()
            port_str = port_str.strip()
            if port_str.isdigit() and 1 <= int(port_str) <= 65535:
                try:
                    proxies.append(ProxyRaw(ip, int(port_str), protocol))
                except ValueError:
                    continue
        return proxies
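
    # Example: "203.0.113.7:8080" parses to ProxyRaw("203.0.113.7", 8080, protocol).
    # rpartition(":") splits on the last colon, so a bracketed IPv6 entry such as
    # "[2001:db8::1]:1080" keeps the whole address in the host part instead of
    # being cut at its first colon.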

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        protocols = [protocol for protocol, _ in self.urls]
        urls = [url for _, url in self.urls]
        # 1. Fetch all GitHub raw mirrors concurrently with an overall 10s
        #    budget; keep the results of whatever finishes in time (the 12s
        #    per-request timeout is effectively bounded by the 10s wait)
        tasks = [asyncio.create_task(self.fetch(url, timeout=12)) for url in urls]
        done, pending = await asyncio.wait(tasks, timeout=10)
        for task in pending:
            task.cancel()
        htmls = []
        done_protocols = set()
        for i, task in enumerate(tasks):
            try:
                if task in done:
                    htmls.append(task.result())
                    done_protocols.add(protocols[i])
                else:
                    htmls.append("")
            except Exception:
                # on error, stay out of done_protocols so the API fallback triggers
                htmls.append("")
        fallback_protocols = []
        for protocol, html in zip(protocols, htmls):
            proxies = self._parse_proxies(html, protocol) if html else []
            if proxies:
                logger.info(f"ProxyScrape {protocol.upper()}: got {len(proxies)} proxies from GitHub raw")
                results.extend(proxies)
            else:
                if protocol in done_protocols:
                    logger.warning(f"ProxyScrape {protocol.upper()}: GitHub raw returned empty or invalid data, trying API fallback")
                else:
                    logger.warning(f"ProxyScrape {protocol.upper()}: GitHub raw request timed out, trying API fallback")
                fallback_protocols.append(protocol)
        # 2. For protocols where GitHub raw failed, query the ProxyScrape API fallback concurrently
        if fallback_protocols:
            fallback_urls = [self.api_urls[p] for p in fallback_protocols]
            try:
                api_htmls = await asyncio.wait_for(
                    self.fetch_all(fallback_urls, timeout=10), timeout=10
                )
            except asyncio.TimeoutError:
                logger.warning(f"ProxyScrape API fallback batch request timed out, skipping {len(fallback_protocols)} protocols")
                api_htmls = [""] * len(fallback_protocols)
            for protocol, api_html in zip(fallback_protocols, api_htmls):
                proxies = self._parse_proxies(api_html, protocol) if api_html else []
                if proxies:
                    logger.info(f"ProxyScrape {protocol.upper()}: got {len(proxies)} proxies from the API")
                    results.extend(proxies)
                else:
                    logger.warning(f"ProxyScrape {protocol.upper()}: API returned empty or invalid data")
        if results:
            logger.info(f"ProxyScrape: fetched {len(results)} proxies in total")
        else:
            logger.warning("ProxyScrape: no real source was reachable; returning an empty list")
        return results
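
For reference, a minimal standalone driver for the plugin above, assuming ProxyScrapePlugin needs no setup beyond construction (a sketch; the app's worker pool presumably drives plugins differently):

import asyncio

from app.plugins.proxyscrape import ProxyScrapePlugin

async def main() -> None:
    plugin = ProxyScrapePlugin()
    proxies = await plugin.crawl()
    print(f"fetched {len(proxies)} proxies; first few: {proxies[:3]}")

asyncio.run(main())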