fix: 修复爬虫网络层、验证队列卡死及 API 500 错误
- 修复 BaseHTTPPlugin 连接池、并发控制、异常日志、超时策略
- 修复/增强 8 个爬虫插件的稳定性和 fallback 机制
- 清理 validation_tasks 表 4 万+ pending 任务,避免队列卡死
- 修复 app/api/main.py 缺失全局 app 实例导致的 500 错误
- 提升前端 Axios 超时到 120 秒,避免请求断开
- 修复插件统计持久化和调度器生命周期问题
This commit is contained in:
@@ -31,11 +31,20 @@ class PluginService:
|
||||
if "config" in state and isinstance(state["config"], dict):
|
||||
plugin.update_config(state["config"])
|
||||
|
||||
stat = self._stats.get(plugin.name, {
|
||||
"success_count": 0,
|
||||
"failure_count": 0,
|
||||
"last_run": None,
|
||||
})
|
||||
# 合并数据库统计与内存统计(内存优先)
|
||||
db_stat = state.get("stats", {})
|
||||
stat = {
|
||||
"success_count": db_stat.get("success_count", 0),
|
||||
"failure_count": db_stat.get("failure_count", 0),
|
||||
"last_run": datetime.fromisoformat(db_stat["last_run"]) if db_stat.get("last_run") else None,
|
||||
}
|
||||
mem_stat = self._stats.get(plugin.name, {})
|
||||
if mem_stat:
|
||||
stat["success_count"] = mem_stat.get("success_count", stat["success_count"])
|
||||
stat["failure_count"] = mem_stat.get("failure_count", stat["failure_count"])
|
||||
if mem_stat.get("last_run"):
|
||||
stat["last_run"] = mem_stat["last_run"]
|
||||
|
||||
result.append(PluginInfo(
|
||||
id=plugin.name,
|
||||
name=plugin.name,
|
||||
@@ -105,11 +114,19 @@ class PluginService:
|
||||
self._record_stat(plugin_id, failure=1)
|
||||
logger.error(f"Plugin {plugin_id} crawl failed: {e}")
|
||||
return []
|
||||
finally:
|
||||
await self._save_stats(plugin_id)
|
||||
|
||||
async def run_all_plugins(self) -> List[ProxyRaw]:
|
||||
"""执行所有启用插件的爬取"""
|
||||
"""执行所有启用插件的爬取,限制并发数以避免触发目标站反爬"""
|
||||
all_results: List[ProxyRaw] = []
|
||||
tasks = [self.run_plugin(plugin.name) for plugin in registry.list_plugins() if plugin.enabled]
|
||||
semaphore = asyncio.Semaphore(5)
|
||||
|
||||
async def _run_with_limit(plugin_name: str):
|
||||
async with semaphore:
|
||||
return await self.run_plugin(plugin_name)
|
||||
|
||||
tasks = [_run_with_limit(plugin.name) for plugin in registry.list_plugins() if plugin.enabled]
|
||||
results_list = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
for results in results_list:
|
||||
if isinstance(results, Exception):
|
||||
@@ -137,3 +154,14 @@ class PluginService:
|
||||
self._stats[plugin_id]["failure_count"] += failure
|
||||
if success or failure:
|
||||
self._stats[plugin_id]["last_run"] = datetime.now()
|
||||
|
||||
async def _save_stats(self, plugin_id: str):
    """Persist the in-memory statistics of one plugin to the database.

    Reads ``self._stats[plugin_id]`` (a missing entry is treated as an empty
    dict, i.e. zero counts and no last run) and writes the resulting payload
    through ``self.plugin_settings_repo.set_stats``.

    Args:
        plugin_id: Key of the plugin in ``self._stats``; also passed to the
            repository as the row identifier.
    """
    stats = self._stats.get(plugin_id, {})
    # Look the timestamp up once instead of twice inside the dict literal.
    last_run = stats.get("last_run")
    payload = {
        "success_count": stats.get("success_count", 0),
        "failure_count": stats.get("failure_count", 0),
        # Serialized with isoformat(); None when no last_run is recorded.
        "last_run": last_run.isoformat() if last_run else None,
    }
    async with get_db() as db:
        await self.plugin_settings_repo.set_stats(db, plugin_id, payload)
|
||||
|
||||
@@ -22,6 +22,7 @@ class SchedulerService:
|
||||
self.running = False
|
||||
self._stop_event = asyncio.Event()
|
||||
self._task: asyncio.Task | None = None
|
||||
self._validate_task: asyncio.Task | None = None
|
||||
|
||||
async def start(self):
|
||||
if self.running:
|
||||
@@ -48,7 +49,9 @@ class SchedulerService:
|
||||
|
||||
async def validate_all_now(self):
    """Start one full validation pass in the background without blocking.

    The pass runs as an asyncio task wrapping ``self._do_validate_all()``.
    If the task started by a previous call has not finished yet, this call
    returns immediately and does not start a second pass.
    """
    current = self._validate_task
    if current is not None and not current.done():
        # A pass is already in flight; do not stack another one on top.
        return
    # Keep the task on the instance so it stays referenced while it runs and
    # so the next call can tell whether it is still in progress.
    self._validate_task = asyncio.create_task(self._do_validate_all())
|
||||
|
||||
async def _run_loop(self):
|
||||
"""定时循环"""
|
||||
@@ -65,27 +68,30 @@ class SchedulerService:
|
||||
|
||||
async def _do_validate_all(self):
    """Re-validate every proxy currently stored in the database.

    Loads all proxies via ``self.proxy_repo.list_all``, then feeds them to
    ``self.validation_queue`` in batches of 100, draining the queue after each
    batch before submitting the next. The loop stops early once
    ``self.running`` becomes false.

    Any ``Exception`` raised along the way is logged and swallowed, so a
    failure here does not propagate out of the background task running it.
    """
    try:
        logger.info("Starting scheduled validation for all proxies")
        async with get_db() as db:
            proxies = await self.proxy_repo.list_all(db)
            if not proxies:
                logger.info("No proxies to validate")
                return

        logger.info(f"Validating {len(proxies)} proxies from database")
        # Imported at call time, as in the original block.
        from app.models.domain import ProxyRaw

        # Submit to the validation queue in fixed-size batches.
        batch_size = 100
        # Ceiling division; loop-invariant, so computed once for the log line.
        total_batches = (len(proxies) - 1) // batch_size + 1
        for i in range(0, len(proxies), batch_size):
            if not self.running:
                # The scheduler is no longer running: abandon remaining batches.
                break
            batch = proxies[i : i + batch_size]
            await self.validation_queue.submit([
                ProxyRaw(p.ip, p.port, p.protocol) for p in batch
            ])
            # Wait for the current batch to be processed before queueing more.
            await self.validation_queue.drain()
            logger.info(f"Validated batch {i//batch_size + 1}/{total_batches}")

        logger.info("Scheduled validation completed")
    except Exception as e:
        logger.error(f"Scheduled validation error: {e}")
|
||||
|
||||
Reference in New Issue
Block a user