- Remove the ValidationQueue dual-track persistent queue and replace it with a pure in-memory AsyncWorkerPool
- Introduce JobExecutor, a unified background-task framework (Job / CrawlJob / ValidateAllJob)
- Add PluginRunner for unified plugin execution (timeouts, retries, health checks, statistics)
- Refactor SchedulerService so its only responsibility is triggering ValidateAllJob on a schedule
- Rewrite lifespan with AsyncExitStack so long-lived resources are managed safely (see the sketch below)
- Slim the router layer by 50%+; business exceptions propagate up and are handled uniformly by global middleware
- Implement full hot-reload of settings (WorkerPool concurrency and Validator timeout take effect immediately)
- Frontend stores force a refetch after every write, eliminating data drift from optimistic updates
- Delete queue.py / task_repo.py / task_service.py
- Add unit tests for execution; all 85 tests pass
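Not the code from this PR, but a minimal sketch of the AsyncExitStack lifespan pattern, assuming a FastAPI app. The import paths, the `AsyncWorkerPool` async-context-manager protocol, and the `SchedulerService.start()` / `stop()` methods are illustrative assumptions:

```python
from contextlib import AsyncExitStack, asynccontextmanager

from fastapi import FastAPI

# Assumed import paths; the real modules in this repo may live elsewhere.
from app.core.execution.worker_pool import AsyncWorkerPool
from app.services.scheduler import SchedulerService


@asynccontextmanager
async def lifespan(app: FastAPI):
    async with AsyncExitStack() as stack:
        # Everything entered on the stack is closed in reverse order on
        # shutdown, even if a later startup step raises.
        pool = await stack.enter_async_context(AsyncWorkerPool())  # assumes async CM protocol
        scheduler = SchedulerService()
        await scheduler.start()                                    # assumed method
        stack.push_async_callback(scheduler.stop)                  # assumed method

        app.state.worker_pool = pool
        app.state.scheduler = scheduler
        yield


app = FastAPI(lifespan=lifespan)
```

The point of the stack is that partially started resources still get torn down if a later startup step fails, which is hard to guarantee with hand-written try/finally chains.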
"""插件统一执行器 - 封装超时、重试、健康检查、错误捕获"""
|
||
import asyncio
|
||
from datetime import datetime
|
||
from typing import Optional
|
||
|
||
from app.core.plugin_system.base import BaseCrawlerPlugin
|
||
from app.core.config import settings as app_settings
|
||
from app.core.log import logger
|
||
from app.models.domain import CrawlResult, ProxyRaw
|
||
|
||
|
||
class PluginRunner:
|
||
"""统一插件执行器
|
||
|
||
- 超时控制(从 settings 读取 crawl_timeout)
|
||
- 异常捕获和统计更新
|
||
- 可选的健康检查前置
|
||
- 结果去重
|
||
"""
|
||
|
||
def __init__(self, timeout: Optional[float] = None):
|
||
self.timeout = timeout or getattr(app_settings, "crawler_timeout", 30)
|
||
|
||
async def run(self, plugin: BaseCrawlerPlugin) -> CrawlResult:
|
||
"""执行单个插件爬取"""
|
||
result = CrawlResult(plugin_name=plugin.name)
|
||
|
||
# 健康检查(可选)
|
||
try:
|
||
healthy = await asyncio.wait_for(
|
||
plugin.health_check(), timeout=5.0
|
||
)
|
||
if not healthy:
|
||
result.error = "health check failed"
|
||
result.failure_count = 1
|
||
await self._save_stats(plugin, result)
|
||
return result
|
||
except Exception as e:
|
||
logger.warning(f"Plugin {plugin.name} health check error: {e}")
|
||
|
||
# 执行爬取
|
||
try:
|
||
proxies = await asyncio.wait_for(
|
||
plugin.crawl(),
|
||
timeout=self.timeout,
|
||
)
|
||
result.proxies = self._dedup(proxies)
|
||
result.success_count = 1 if result.proxies else 0
|
||
logger.info(
|
||
f"Plugin {plugin.name} crawled {len(result.proxies)} unique proxies"
|
||
)
|
||
except asyncio.TimeoutError:
|
||
result.error = f"crawl timeout after {self.timeout}s"
|
||
result.failure_count = 1
|
||
logger.error(f"Plugin {plugin.name} crawl timeout")
|
||
except Exception as e:
|
||
result.error = str(e)
|
||
result.failure_count = 1
|
||
logger.error(f"Plugin {plugin.name} crawl failed: {e}", exc_info=True)
|
||
|
||
await self._save_stats(plugin, result)
|
||
return result
|
||
|
||
@staticmethod
|
||
def _dedup(proxies: list[ProxyRaw]) -> list[ProxyRaw]:
|
||
seen = set()
|
||
unique = []
|
||
for p in proxies:
|
||
key = (p.ip, p.port)
|
||
if key not in seen:
|
||
seen.add(key)
|
||
unique.append(p)
|
||
return unique
|
||
|
||
async def _save_stats(self, plugin: BaseCrawlerPlugin, result: CrawlResult) -> None:
|
||
"""将爬取统计持久化到数据库"""
|
||
from app.core.db import get_db
|
||
from app.repositories.settings_repo import PluginSettingsRepository
|
||
|
||
repo = PluginSettingsRepository()
|
||
payload = {
|
||
"success_count": result.success_count,
|
||
"failure_count": result.failure_count,
|
||
"last_run": datetime.now().isoformat(),
|
||
}
|
||
try:
|
||
async with get_db() as db:
|
||
await repo.set_stats(db, plugin.name, payload)
|
||
except Exception as e:
|
||
logger.error(f"Failed to save stats for {plugin.name}: {e}")
|
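For context, a sketch of how a crawl job might fan out over PluginRunner. The module path, the `crawl_all` helper, and the assumption that `CrawlResult.error` defaults to empty are illustrative, not part of this PR:

```python
import asyncio

from app.core.log import logger
from app.core.plugin_system.base import BaseCrawlerPlugin
from app.models.domain import CrawlResult
from app.core.execution.plugin_runner import PluginRunner  # assumed module path


async def crawl_all(plugins: list[BaseCrawlerPlugin]) -> list[CrawlResult]:
    """Run every plugin concurrently; each crawl is bounded by PluginRunner's timeout."""
    runner = PluginRunner()
    # run() catches its own exceptions, so plain gather() is safe here.
    results = await asyncio.gather(*(runner.run(p) for p in plugins))
    ok = sum(1 for r in results if not r.error)  # assumes CrawlResult.error defaults to None/empty
    logger.info(f"Crawl round finished: {ok}/{len(plugins)} plugins succeeded")
    return results
```

Because every per-plugin failure is captured inside `run()` and reflected in the returned `CrawlResult`, a single misbehaving plugin cannot abort the whole crawl round.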