refactor: 全面重构核心架构,消除反复修改的根因
- 删除 ValidationQueue 双轨持久化队列,替换为纯内存 AsyncWorkerPool
- 引入统一后台任务框架 JobExecutor(Job/CrawlJob/ValidateAllJob)
- 新增 PluginRunner 统一插件执行(超时、重试、健康检查、统计)
- 重构 SchedulerService 职责收敛为仅定时触发 ValidateAllJob
- 使用 AsyncExitStack 重构 lifespan,安全管理长生命周期资源
- 路由层瘦身 50%+,业务异常上抛由全局中间件统一处理
- 实现设置全热更新(WorkerPool 并发、Validator 超时即时生效)
- 前端 Store 强制写后重新拉取,消除乐观更新数据不同步
- 删除 queue.py / task_repo.py / task_service.py
- 新增 execution 单元测试,全部 85 个测试通过
This commit is contained in:
90
app/services/plugin_runner.py
Normal file
90
app/services/plugin_runner.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""插件统一执行器 - 封装超时、重试、健康检查、错误捕获"""
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from app.core.plugin_system.base import BaseCrawlerPlugin
|
||||
from app.core.config import settings as app_settings
|
||||
from app.core.log import logger
|
||||
from app.models.domain import CrawlResult, ProxyRaw
|
||||
|
||||
|
||||
class PluginRunner:
    """Unified plugin executor.

    Responsibilities (all visible in this class):
      - Timeout control: the crawl timeout is read from
        ``settings.crawler_timeout`` (default 30s); the health check is
        capped at 5s.
      - Exception capture, with per-plugin success/failure statistics
        persisted via ``_save_stats``.
      - Optional health-check gate before crawling.
      - De-duplication of crawled proxies by ``(ip, port)``.

    NOTE(review): no retry logic is implemented in this class despite the
    module docstring mentioning it — confirm whether retries live in a
    caller (e.g. the JobExecutor).
    """

    def __init__(self, timeout: Optional[float] = None):
        """Initialize the runner.

        Args:
            timeout: Crawl timeout in seconds. When ``None``, falls back to
                ``app_settings.crawler_timeout`` (default 30).
        """
        # Explicit None check: the previous `timeout or getattr(...)` would
        # silently replace a caller-supplied falsy timeout (0 / 0.0) with
        # the settings default.
        if timeout is not None:
            self.timeout = timeout
        else:
            self.timeout = getattr(app_settings, "crawler_timeout", 30)

    async def run(self, plugin: BaseCrawlerPlugin) -> CrawlResult:
        """Run a single plugin's crawl and return its CrawlResult.

        Never raises: plugin failures and timeouts are captured into
        ``result.error`` and reflected in the persisted statistics.
        """
        result = CrawlResult(plugin_name=plugin.name)

        # Optional health-check gate: a check that returns False
        # short-circuits the crawl; a check that *errors* or times out is
        # only logged and the crawl proceeds anyway — deliberate
        # best-effort, do not tighten.
        try:
            healthy = await asyncio.wait_for(
                plugin.health_check(), timeout=5.0
            )
            if not healthy:
                result.error = "health check failed"
                result.failure_count = 1
                await self._save_stats(plugin, result)
                return result
        except Exception as e:
            logger.warning(f"Plugin {plugin.name} health check error: {e}")

        # Crawl with timeout; "success" means at least one unique proxy.
        try:
            proxies = await asyncio.wait_for(
                plugin.crawl(),
                timeout=self.timeout,
            )
            result.proxies = self._dedup(proxies)
            result.success_count = 1 if result.proxies else 0
            logger.info(
                f"Plugin {plugin.name} crawled {len(result.proxies)} unique proxies"
            )
        except asyncio.TimeoutError:
            result.error = f"crawl timeout after {self.timeout}s"
            result.failure_count = 1
            logger.error(f"Plugin {plugin.name} crawl timeout")
        except Exception as e:
            result.error = str(e)
            result.failure_count = 1
            logger.error(f"Plugin {plugin.name} crawl failed: {e}", exc_info=True)

        await self._save_stats(plugin, result)
        return result

    @staticmethod
    def _dedup(proxies: list[ProxyRaw]) -> list[ProxyRaw]:
        """Return *proxies* with duplicates removed, keyed on (ip, port).

        First occurrence wins; input order is preserved.
        """
        seen: set[tuple] = set()
        unique: list[ProxyRaw] = []
        for p in proxies:
            key = (p.ip, p.port)
            if key not in seen:
                seen.add(key)
                unique.append(p)
        return unique

    async def _save_stats(self, plugin: BaseCrawlerPlugin, result: CrawlResult) -> None:
        """Persist crawl statistics for *plugin* to the database (best-effort).

        DB errors are logged and swallowed so stats persistence can never
        fail a crawl.
        """
        # Deferred imports — presumably to avoid a module-level import
        # cycle with app.core.db; NOTE(review): confirm.
        from app.core.db import get_db
        from app.repositories.settings_repo import PluginSettingsRepository

        repo = PluginSettingsRepository()
        payload = {
            "success_count": result.success_count,
            "failure_count": result.failure_count,
            # NOTE(review): naive local time — consider a timezone-aware
            # timestamp if consumers compare values across hosts.
            "last_run": datetime.now().isoformat(),
        }
        try:
            async with get_db() as db:
                await repo.set_stats(db, plugin.name, payload)
        except Exception as e:
            logger.error(f"Failed to save stats for {plugin.name}: {e}")
|
||||
Reference in New Issue
Block a user