fix: 修复爬虫网络层、验证队列卡死及 API 500 错误

- 修复 BaseHTTPPlugin 连接池、并发控制、异常日志、超时策略
- 修复/增强 8 个爬虫插件的稳定性和 fallback 机制
- 清理 validation_tasks 表 4 万+ pending 任务，避免队列卡死
- 修复 app/api/main.py 缺失全局 app 实例导致的 500 错误
- 提升前端 Axios 超时到 120 秒，避免请求断开
- 修复插件统计持久化和调度器生命周期问题
2026-04-04 19:27:36 +08:00
parent 635c524a7e
commit f09a8e16c4
19 changed files with 505 additions and 161 deletions
--- a/app/plugins/base.py
+++ b/app/plugins/base.py
@@ -1,7 +1,7 @@
 """通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装"""
 import random
 import asyncio
-import aiohttp
+import httpx
 from typing import List
 from app.core.plugin_system import BaseCrawlerPlugin

@@ -28,25 +28,39 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
            "Connection": "keep-alive",
        }

-    async def fetch(self, url: str, timeout: float = 10.0, retries: int = 3) -> str:
+    async def fetch(self, url: str, timeout: float = 15.0, retries: int = 2) -> str:
        """异步抓取指定 URL 的 HTML 内容"""
+        from app.core.log import logger
        headers = self.get_headers()
-        async with aiohttp.ClientSession(headers=headers) as session:
-            for attempt in range(retries):
+        transport = httpx.AsyncHTTPTransport(retries=0)
+        for attempt in range(retries):
+            async with httpx.AsyncClient(headers=headers, transport=transport, follow_redirects=True) as client:
                try:
-                    async with session.get(
-                        url, timeout=aiohttp.ClientTimeout(total=timeout)
-                    ) as response:
-                        if response.status == 200:
-                            content = await response.read()
-                            encoding = response.get_encoding()
-                            if encoding == "utf-8" or not encoding:
-                                try:
-                                    return content.decode("utf-8")
-                                except UnicodeDecodeError:
-                                    return content.decode("gbk", errors="ignore")
-                            return content.decode(encoding, errors="ignore")
-                except Exception:
-                    pass
-                await asyncio.sleep(random.uniform(1, 3))
+                    response = await client.get(url, timeout=timeout)
+                    if response.status_code == 200:
+                        content = response.content
+                        encoding = response.encoding
+                        if encoding == "utf-8" or not encoding:
+                            try:
+                                return content.decode("utf-8")
+                            except UnicodeDecodeError:
+                                return content.decode("gbk", errors="ignore")
+                        return content.decode(encoding, errors="ignore")
+                    else:
+                        logger.warning(f"Fetch {url} returned status {response.status_code}")
+                except Exception as e:
+                    logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
+                if attempt < retries - 1:
+                    await asyncio.sleep(random.uniform(1, 3))
        return ""
+
+    async def fetch_all(self, urls: List[str], timeout: float = 15.0) -> List[str]:
+        """并发抓取多个 URL，限制单个插件内部并发为 3"""
+        semaphore = asyncio.Semaphore(3)
+
+        async def _fetch_limited(url: str):
+            async with semaphore:
+                return await self.fetch(url, timeout=timeout)
+
+        tasks = [_fetch_limited(url) for url in urls]
+        return await asyncio.gather(*tasks)