fix: repair crawler network layer, stuck validation queue, and API 500 errors
- Fix BaseHTTPPlugin connection pooling, concurrency control, exception logging, and timeout policy
- Fix/harden the stability and fallback mechanisms of 8 crawler plugins
- Purge 40k+ pending tasks from the validation_tasks table to keep the queue from stalling
- Fix 500 errors caused by the missing global app instance in app/api/main.py (see the sketch below)
- Raise the frontend Axios timeout to 120 seconds to avoid dropped requests
- Fix plugin statistics persistence and scheduler lifecycle issues
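The app/api/main.py change itself is not part of the diff below. As a minimal sketch of the failure mode, assuming the module previously built the app only inside a factory: an ASGI server importing app.api.main:app finds no module-level instance, and every request surfaces as a 500. The factory name and FastAPI details here are assumptions, not the repository's actual code:

# app/api/main.py - sketch of restoring the missing module-level instance.
# Uvicorn/Gunicorn import "app.api.main:app"; without this global, the
# import fails and requests error out.
from fastapi import FastAPI

def create_app() -> FastAPI:
    application = FastAPI(title="proxy-pool API")  # title is an assumption
    # ... routers and middleware would be registered here ...
    return application

# The global instance the ASGI server expects to find.
app = create_app()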
@@ -1,4 +1,5 @@
"""ProxyScrape test crawler - validates the plugin architecture; supports all protocol types."""
import asyncio
from typing import List

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
@@ -19,39 +20,89 @@ class ProxyScrapePlugin(BaseHTTPPlugin):
    def __init__(self):
        super().__init__()
        # GitHub raw sources are the primary choice
        self.urls = [
            ("http", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt"),
            ("https", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/https.txt"),
            ("socks4", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt"),
            ("socks5", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt"),
        ]
        # ProxyScrape official API as fallback
        self.api_urls = {
            "http": "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
            "https": "https://api.proxyscrape.com/v2/?request=get&protocol=https&timeout=10000&country=all&ssl=all&anonymity=all",
            "socks4": "https://api.proxyscrape.com/v2/?request=get&protocol=socks4&timeout=10000&country=all&ssl=all&anonymity=all",
            "socks5": "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all&ssl=all&anonymity=all",
        }
    def _parse_proxies(self, text: str, protocol: str) -> List[ProxyRaw]:
        """Parse text with one ip:port entry per line."""
        proxies = []
        for line in text.splitlines():
            line = line.strip()
            if not line or ":" not in line:
                continue
            parts = line.split(":")
            if len(parts) >= 2:
                ip = parts[0].strip()
                port_str = parts[1].strip()
                if port_str.isdigit():
                    proxies.append(ProxyRaw(ip, int(port_str), protocol))
        return proxies
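    # Illustrative example (hypothetical input) for the parser above:
    #   "1.2.3.4:8080\n1.2.3.4\nnot-a-proxy\n" with protocol="http"
    #   yields [ProxyRaw("1.2.3.4", 8080, "http")]; lines without a
    #   colon or without a numeric port are skipped.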
    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        protocols = [protocol for protocol, _ in self.urls]
        urls = [url for _, url in self.urls]

        # 1. Fetch all GitHub raw sources concurrently with an overall 10s budget;
        #    keep whatever finished in time and cancel the rest.
        tasks = [asyncio.create_task(self.fetch(url, timeout=12)) for url in urls]
        done, pending = await asyncio.wait(tasks, timeout=10)
        for task in pending:
            task.cancel()
        htmls = []
        done_protocols = set()
        for i, task in enumerate(tasks):
            try:
                if task in done:
                    htmls.append(task.result())
                    done_protocols.add(protocols[i])
                else:
                    htmls.append("")
            except Exception:
                # The request completed but raised; treat it as done-but-empty.
                htmls.append("")
                done_protocols.add(protocols[i])

        fallback_protocols = []
        for protocol, html in zip(protocols, htmls):
            proxies = self._parse_proxies(html, protocol) if html else []
            if proxies:
                logger.info(f"ProxyScrape {protocol.upper()} GitHub raw yielded {len(proxies)} proxies")
                results.extend(proxies)
            else:
                if protocol in done_protocols:
                    logger.warning(f"ProxyScrape {protocol.upper()} GitHub raw returned empty/invalid content, trying API fallback")
                else:
                    logger.warning(f"ProxyScrape {protocol.upper()} GitHub raw request timed out, trying API fallback")
                fallback_protocols.append(protocol)

        # 2. For protocols where GitHub raw failed, hit the ProxyScrape API fallback concurrently
        if fallback_protocols:
            fallback_urls = [self.api_urls[p] for p in fallback_protocols]
            try:
                api_htmls = await asyncio.wait_for(
                    self.fetch_all(fallback_urls, timeout=10), timeout=10
                )
            except asyncio.TimeoutError:
                logger.warning(f"ProxyScrape API fallback batch request timed out, skipping {len(fallback_protocols)} protocols")
                api_htmls = [""] * len(fallback_protocols)
            for protocol, api_html in zip(fallback_protocols, api_htmls):
                proxies = self._parse_proxies(api_html, protocol) if api_html else []
                if proxies:
                    logger.info(f"ProxyScrape {protocol.upper()} API yielded {len(proxies)} proxies")
                    results.extend(proxies)
                else:
                    logger.warning(f"ProxyScrape {protocol.upper()} API returned empty/invalid content")

        if results:
            logger.info(f"ProxyScrape yielded {len(results)} proxies in total")
        return results
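The validation-queue cleanup from the commit message happens outside this file. A minimal sketch, assuming SQLAlchemy and that validation_tasks rows carry a status column; the connection URL and schema details are assumptions:

from sqlalchemy import create_engine, text

# Assumed connection string; replace with the project's real database URL.
engine = create_engine("sqlite:///proxy_pool.db")

with engine.begin() as conn:
    # Drop stale pending rows so the validation queue can drain again.
    result = conn.execute(text("DELETE FROM validation_tasks WHERE status = 'pending'"))
    print(f"removed {result.rowcount} stale pending tasks")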