ProxyPool/app/plugins/proxyscrape.py
祀梦 4ef7931941 fix: comprehensively fix code issues and optimize the architecture
Fixes:
- Add the missing httpx dependency to requirements.txt
- Fix the frontend batch-delete parameter format mismatching the backend
  (array -> array of objects; see the sketch after this list)
- Remove redundant code in app/api/main.py that created the app twice
- Fix the Vue warning in Plugins.vue caused by v-model mutating store state directly
- Fix README port/startup-command docs that disagreed with the actual configuration
- Correct the outdated pytest.ini setting (asyncio_default_fixture_loop_scope)
- Set the WebUI index.html language to zh-CN
- Fix .gitignore wrongly ignoring the tests/ directory
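
A minimal sketch of the batch-delete payload change; the "id" field name is illustrative, not taken from the actual API:

    # Before: the frontend sent a bare array of IDs
    payload = [1, 2, 3]
    # After: an array of objects, matching what the backend deserializes
    payload = [{"id": 1}, {"id": 2}, {"id": 3}]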

Backend optimizations:
- Change the scheduler's default interval from 5 seconds to 30 minutes, avoiding unbounded validation
- Fix a bug where validate_all_now could not run while the scheduler was stopped
- Hot-reload the validation interval of a running scheduler after settings are saved
- Rewrite update_score as a single atomic SQL transaction, eliminating the
  concurrency race (see the sketch after this list)
- Switch export to true streaming batched reads (iter_batches), cutting memory usage for large exports
- Add the missing response_time_ms field to the ProxyResponse schema
- Read the log level dynamically from configuration instead of hard-coding INFO
- Clean up redundant try/finally code in validator_service
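
A sketch of the atomic update_score idea, assuming a hypothetical proxies table with id and score columns and a db.execute helper; the project's actual schema and DB layer may differ:

    # A single UPDATE adjusts and clamps the score in one statement, so two
    # concurrent validators can no longer interleave a read-modify-write race.
    await db.execute(
        "UPDATE proxies SET score = MAX(0, MIN(100, score + :delta)) WHERE id = :proxy_id",
        {"delta": delta, "proxy_id": proxy_id},
    )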

Plugin robustness:
- Fix port-range checks and IPv6 address parsing in
  ip3366/ip89/kuaidaili/proxylist_download/speedx/yundaili/proxyscrape
  (now rsplit plus 1-65535 validation)
- Fix a race condition in PluginService.list_plugins
- Align the run_all_plugins dedup logic with the database UNIQUE constraint
  (see the sketch after this list)
- Fix a proxyscrape bug where an exception wrongly skipped the fallback
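
A sketch of the dedup alignment, assuming the UNIQUE constraint covers (ip, port, protocol) and that ProxyRaw exposes those attributes:

    # Deduplicate on exactly the columns covered by the UNIQUE constraint,
    # so the in-memory filter and the database agree on what a duplicate is.
    seen: set[tuple[str, int, str]] = set()
    unique: list[ProxyRaw] = []
    for p in proxies:
        key = (p.ip, p.port, p.protocol)
        if key not in seen:
            seen.add(key)
            unique.append(p)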

Tests:
- Add 7 unit tests for plugin parsing (a representative example follows below)
- Add tests for update_score auto-deletion and iter_batches streaming reads
- All 74 tests pass
2026-04-04 21:03:43 +08:00
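
A representative parsing test, assuming ProxyScrapePlugin() can be constructed without arguments as in the file below; the exact test names in the suite may differ:

    from app.plugins.proxyscrape import ProxyScrapePlugin

    def test_parse_proxies_filters_invalid_lines():
        plugin = ProxyScrapePlugin()
        text = "1.2.3.4:8080\n5.6.7.8:99999\nnot-a-proxy\n"
        proxies = plugin._parse_proxies(text, "http")
        assert len(proxies) == 1  # only the line with a valid 1-65535 port survives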


"""ProxyScrape 测试爬虫 - 用于验证架构,支持全协议类型"""
import asyncio
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class ProxyScrapePlugin(BaseHTTPPlugin):
default_config = {"max_pages": 5}
"""
从 ProxyScrape 公开 API 获取代理库
覆盖 http/https/socks4/socks5 全协议,专门用于测试插件系统的可扩展性
"""
name = "proxyscrape"
display_name = "ProxyScrape测试站"
description = "从 ProxyScrape API 获取各类型代理HTTP/HTTPS/SOCKS4/SOCKS5用于测试架构扩展"
enabled = True
    def __init__(self):
        super().__init__()
        # GitHub raw sources are preferred
        self.urls = [
            ("http", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt"),
            ("https", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/https.txt"),
            ("socks4", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt"),
            ("socks5", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt"),
        ]
        # The official ProxyScrape API serves as the fallback
        self.api_urls = {
            "http": "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
            "https": "https://api.proxyscrape.com/v2/?request=get&protocol=https&timeout=10000&country=all&ssl=all&anonymity=all",
            "socks4": "https://api.proxyscrape.com/v2/?request=get&protocol=socks4&timeout=10000&country=all&ssl=all&anonymity=all",
            "socks5": "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all&ssl=all&anonymity=all",
        }
    def _parse_proxies(self, text: str, protocol: str) -> List[ProxyRaw]:
        """Parse text with one ip:port entry per line."""
        proxies = []
        for line in text.splitlines():
            line = line.strip()
            if not line or ":" not in line:
                continue
            # Split on the LAST colon so IPv6 addresses, which contain colons
            # themselves, keep their host part intact.
            ip, _, port_str = line.rpartition(":")
            ip = ip.strip()
            port_str = port_str.strip()
            if port_str.isdigit() and 1 <= int(port_str) <= 65535:
                try:
                    proxies.append(ProxyRaw(ip, int(port_str), protocol))
                except ValueError:
                    continue
        return proxies
    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        protocols = [protocol for protocol, _ in self.urls]
        urls = [url for _, url in self.urls]
        # 1. Request all GitHub raw sources concurrently with an overall 10s
        #    budget; keep the results of whichever tasks finish in time.
        tasks = [asyncio.create_task(self.fetch(url, timeout=12)) for url in urls]
        done, pending = await asyncio.wait(tasks, timeout=10)
        for task in pending:
            task.cancel()
        htmls = []
        done_protocols = set()
        for i, task in enumerate(tasks):
            try:
                if task in done:
                    htmls.append(task.result())
                    done_protocols.add(protocols[i])
                else:
                    htmls.append("")
            except Exception:
                htmls.append("")
                # On exception, leave the protocol out of done_protocols so the
                # API fallback is triggered for it.
        fallback_protocols = []
        for protocol, html in zip(protocols, htmls):
            proxies = self._parse_proxies(html or "", protocol) if html else []
            if proxies:
                logger.info(f"ProxyScrape {protocol.upper()}: fetched {len(proxies)} proxies from GitHub raw")
                results.extend(proxies)
            else:
                if protocol in done_protocols:
                    logger.warning(f"ProxyScrape {protocol.upper()}: GitHub raw returned empty or invalid data, trying API fallback")
                else:
                    logger.warning(f"ProxyScrape {protocol.upper()}: GitHub raw request timed out, trying API fallback")
                fallback_protocols.append(protocol)
        # 2. For protocols where GitHub raw failed, request the ProxyScrape API
        #    fallback concurrently.
        if fallback_protocols:
            fallback_urls = [self.api_urls[p] for p in fallback_protocols]
            try:
                api_htmls = await asyncio.wait_for(
                    self.fetch_all(fallback_urls, timeout=10), timeout=10
                )
            except asyncio.TimeoutError:
                logger.warning(f"ProxyScrape API fallback batch request timed out, skipping {len(fallback_protocols)} protocols")
                api_htmls = [""] * len(fallback_protocols)
            for protocol, api_html in zip(fallback_protocols, api_htmls):
                proxies = self._parse_proxies(api_html or "", protocol) if api_html else []
                if proxies:
                    logger.info(f"ProxyScrape {protocol.upper()}: fetched {len(proxies)} proxies from the API")
                    results.extend(proxies)
                else:
                    logger.warning(f"ProxyScrape {protocol.upper()}: API returned empty or invalid data")
        if results:
            logger.info(f"ProxyScrape: fetched {len(results)} proxies in total")
        else:
            # Fallback: generate test proxies so the full pipeline can still be
            # verified in test environments.
            logger.warning("ProxyScrape: all real sources unavailable, generating test proxies for architecture verification")
            results = self._generate_test_proxies()
        return results
    def _generate_test_proxies(self) -> List[ProxyRaw]:
        """Generate test proxies covering every protocol type, for verifying the plugin system."""
        import random

        test_proxies = []
        protocols = ["http", "https", "socks4", "socks5"]
        for protocol in protocols:
            for _ in range(3):
                # Random public-looking IP, used only to exercise the pipeline
                ip = f"{random.randint(1, 223)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 254)}"
                port = random.randint(1024, 65535)
                test_proxies.append(ProxyRaw(ip, port, protocol))
        logger.info(f"Generated {len(test_proxies)} test proxies (3 each for HTTP/HTTPS/SOCKS4/SOCKS5)")
        return test_proxies
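
For a quick standalone check, assuming BaseHTTPPlugin needs no setup beyond __init__, the crawler can be exercised directly:

    import asyncio

    from app.plugins.proxyscrape import ProxyScrapePlugin

    async def main() -> None:
        proxies = await ProxyScrapePlugin().crawl()
        print(f"fetched {len(proxies)} proxies")

    asyncio.run(main())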