fix: repair crawler network layer, stuck validation queue, and API 500 errors
- Fix BaseHTTPPlugin connection pooling, concurrency control, exception logging, and timeout policy
- Fix/harden the stability and fallback mechanisms of 8 crawler plugins
- Purge 40k+ pending tasks from the validation_tasks table to keep the queue from stalling
- Fix 500 errors caused by the missing global app instance in app/api/main.py (see the sketch below)
- Raise the frontend Axios timeout to 120 seconds to avoid dropped requests
- Fix plugin statistics persistence and scheduler lifecycle issues
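The app/api/main.py change itself is not part of the diff below. As a minimal sketch of the failure mode, assuming the module previously built the app only inside a factory: an ASGI server importing app.api.main:app finds no module-level instance, and every request surfaces as a 500. The factory name and FastAPI details here are assumptions, not the repository's actual code:

# app/api/main.py - sketch of restoring the missing module-level instance.
# Uvicorn/Gunicorn import "app.api.main:app"; without this global, the
# import fails and requests error out.
from fastapi import FastAPI

def create_app() -> FastAPI:
    application = FastAPI(title="proxy-pool API")  # title is an assumption
    # ... routers and middleware would be registered here ...
    return application

# The global instance the ASGI server expects to find.
app = create_app()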
@@ -1,4 +1,5 @@
"""ProxyScrape test crawler - validates the plugin architecture; supports all protocol types."""
import asyncio
from typing import List

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
@@ -19,39 +20,89 @@ class ProxyScrapePlugin(BaseHTTPPlugin):
    def __init__(self):
        super().__init__()
        # GitHub raw sources are the primary choice
        self.urls = [
            ("http", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt"),
            ("https", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/https.txt"),
            ("socks4", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt"),
            ("socks5", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt"),
        ]
        # ProxyScrape official API as fallback
        self.api_urls = {
            "http": "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
            "https": "https://api.proxyscrape.com/v2/?request=get&protocol=https&timeout=10000&country=all&ssl=all&anonymity=all",
            "socks4": "https://api.proxyscrape.com/v2/?request=get&protocol=socks4&timeout=10000&country=all&ssl=all&anonymity=all",
            "socks5": "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all&ssl=all&anonymity=all",
        }
    def _parse_proxies(self, text: str, protocol: str) -> List[ProxyRaw]:
        """Parse text with one ip:port entry per line."""
        proxies = []
        for line in text.splitlines():
            line = line.strip()
            if not line or ":" not in line:
                continue
            parts = line.split(":")
            if len(parts) >= 2:
                ip = parts[0].strip()
                port_str = parts[1].strip()
                if port_str.isdigit():
                    proxies.append(ProxyRaw(ip, int(port_str), protocol))
        return proxies
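    # Illustrative example (hypothetical input) for the parser above:
    #   "1.2.3.4:8080\n1.2.3.4\nnot-a-proxy\n" with protocol="http"
    #   yields [ProxyRaw("1.2.3.4", 8080, "http")]; lines without a
    #   colon or without a numeric port are skipped.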
    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        protocols = [protocol for protocol, _ in self.urls]
        urls = [url for _, url in self.urls]

        # 1. Fetch all GitHub raw sources concurrently with an overall 10s budget;
        #    keep whatever finished in time and cancel the rest.
        tasks = [asyncio.create_task(self.fetch(url, timeout=12)) for url in urls]
        done, pending = await asyncio.wait(tasks, timeout=10)
        for task in pending:
            task.cancel()
        htmls = []
        done_protocols = set()
        for i, task in enumerate(tasks):
            try:
                if task in done:
                    htmls.append(task.result())
                    done_protocols.add(protocols[i])
                else:
                    htmls.append("")
            except Exception:
                # The request completed but raised; treat it as done-but-empty.
                htmls.append("")
                done_protocols.add(protocols[i])

        fallback_protocols = []
        for protocol, html in zip(protocols, htmls):
            proxies = self._parse_proxies(html, protocol) if html else []
            if proxies:
                logger.info(f"ProxyScrape {protocol.upper()} GitHub raw yielded {len(proxies)} proxies")
                results.extend(proxies)
            else:
                if protocol in done_protocols:
                    logger.warning(f"ProxyScrape {protocol.upper()} GitHub raw returned empty/invalid content, trying API fallback")
                else:
                    logger.warning(f"ProxyScrape {protocol.upper()} GitHub raw request timed out, trying API fallback")
                fallback_protocols.append(protocol)

        # 2. For protocols where GitHub raw failed, hit the ProxyScrape API fallback concurrently
        if fallback_protocols:
            fallback_urls = [self.api_urls[p] for p in fallback_protocols]
            try:
                api_htmls = await asyncio.wait_for(
                    self.fetch_all(fallback_urls, timeout=10), timeout=10
                )
            except asyncio.TimeoutError:
                logger.warning(f"ProxyScrape API fallback batch request timed out, skipping {len(fallback_protocols)} protocols")
                api_htmls = [""] * len(fallback_protocols)
            for protocol, api_html in zip(fallback_protocols, api_htmls):
                proxies = self._parse_proxies(api_html, protocol) if api_html else []
                if proxies:
                    logger.info(f"ProxyScrape {protocol.upper()} API yielded {len(proxies)} proxies")
                    results.extend(proxies)
                else:
                    logger.warning(f"ProxyScrape {protocol.upper()} API returned empty/invalid content")

        if results:
            logger.info(f"ProxyScrape yielded {len(results)} proxies in total")
        return results
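The validation-queue cleanup from the commit message happens outside this file. A minimal sketch, assuming SQLAlchemy and that validation_tasks rows carry a status column; the connection URL and schema details are assumptions:

from sqlalchemy import create_engine, text

# Assumed connection string; replace with the project's real database URL.
engine = create_engine("sqlite:///proxy_pool.db")

with engine.begin() as conn:
    # Drop stale pending rows so the validation queue can drain again.
    result = conn.execute(text("DELETE FROM validation_tasks WHERE status = 'pending'"))
    print(f"removed {result.rowcount} stale pending tasks")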