refactor: 全面重构核心架构,消除反复修改的根因

- 删除 ValidationQueue 双轨持久化队列,替换为纯内存 AsyncWorkerPool
- 引入统一后台任务框架 JobExecutor(Job/CrawlJob/ValidateAllJob)
- 新增 PluginRunner 统一插件执行(超时、重试、健康检查、统计)
- 重构 SchedulerService 职责收敛为仅定时触发 ValidateAllJob
- 使用 AsyncExitStack 重构 lifespan,安全管理长生命周期资源
- 路由层瘦身 50%+,业务异常上抛由全局中间件统一处理
- 实现设置全热更新(WorkerPool 并发、Validator 超时即时生效)
- 前端 Store 强制写后重新拉取,消除乐观更新数据不同步
- 删除 queue.py / task_repo.py / task_service.py
- 新增 execution 单元测试,全部 85 个测试通过
This commit is contained in:
祀梦
2026-04-04 22:36:57 +08:00
parent 4ef7931941
commit b972b64616
33 changed files with 1168 additions and 864 deletions

View File

@@ -0,0 +1,90 @@
"""插件统一执行器 - 封装超时、重试、健康检查、错误捕获"""
import asyncio
from datetime import datetime
from typing import Optional
from app.core.plugin_system.base import BaseCrawlerPlugin
from app.core.config import settings as app_settings
from app.core.log import logger
from app.models.domain import CrawlResult, ProxyRaw
class PluginRunner:
    """Unified plugin executor.

    Wraps a single crawler-plugin run with:
      - timeout control (default read from ``settings.crawler_timeout``)
      - exception capture and per-plugin stats persistence
      - an optional, best-effort health check before crawling
      - de-duplication of the crawled proxies by ``(ip, port)``
    """

    def __init__(self, timeout: Optional[float] = None):
        """
        Args:
            timeout: Crawl timeout in seconds. When ``None``, falls back to
                ``settings.crawler_timeout`` (default 30).
        """
        # Explicit None check: an intentional timeout of 0 must be honored
        # rather than silently replaced by the configured default (the old
        # `timeout or ...` form treated 0 as "unset").
        if timeout is None:
            timeout = getattr(app_settings, "crawler_timeout", 30)
        self.timeout = timeout

    async def run(self, plugin: BaseCrawlerPlugin) -> CrawlResult:
        """Run one plugin crawl and return its (deduplicated) result.

        Never raises: timeouts and plugin errors are captured into
        ``result.error`` / ``result.failure_count``, and stats are
        persisted on every code path.
        """
        result = CrawlResult(plugin_name=plugin.name)

        # Optional health check (hard 5s cap): a *failing* check aborts the
        # crawl, but an *erroring* check is best-effort and only logged.
        try:
            healthy = await asyncio.wait_for(
                plugin.health_check(), timeout=5.0
            )
            if not healthy:
                result.error = "health check failed"
                result.failure_count = 1
                await self._save_stats(plugin, result)
                return result
        except Exception as e:
            logger.warning(f"Plugin {plugin.name} health check error: {e}")

        # Crawl under self.timeout; all failures are recorded, never propagated.
        try:
            proxies = await asyncio.wait_for(
                plugin.crawl(),
                timeout=self.timeout,
            )
            result.proxies = self._dedup(proxies)
            # NOTE: an error-free crawl that yields zero proxies counts as
            # neither a success nor a failure.
            result.success_count = 1 if result.proxies else 0
            logger.info(
                f"Plugin {plugin.name} crawled {len(result.proxies)} unique proxies"
            )
        except asyncio.TimeoutError:
            result.error = f"crawl timeout after {self.timeout}s"
            result.failure_count = 1
            logger.error(f"Plugin {plugin.name} crawl timeout")
        except Exception as e:
            result.error = str(e)
            result.failure_count = 1
            logger.error(f"Plugin {plugin.name} crawl failed: {e}", exc_info=True)

        await self._save_stats(plugin, result)
        return result

    @staticmethod
    def _dedup(proxies: list[ProxyRaw]) -> list[ProxyRaw]:
        """Drop duplicate proxies by ``(ip, port)``, keeping the first
        occurrence; relative input order is otherwise preserved."""
        seen: set[tuple] = set()
        unique: list[ProxyRaw] = []
        for p in proxies:
            key = (p.ip, p.port)
            if key not in seen:
                seen.add(key)
                unique.append(p)
        return unique

    async def _save_stats(self, plugin: BaseCrawlerPlugin, result: CrawlResult) -> None:
        """Persist crawl statistics for *plugin* to the database.

        Best-effort: persistence failures are logged, never raised, so a
        stats write can never mask the crawl result itself.
        """
        # Imported locally, presumably to avoid an import cycle with the
        # db / repository layers — TODO confirm before hoisting to top-level.
        from app.core.db import get_db
        from app.repositories.settings_repo import PluginSettingsRepository

        repo = PluginSettingsRepository()
        payload = {
            "success_count": result.success_count,
            "failure_count": result.failure_count,
            # NOTE(review): naive local time; consider datetime.now(tz=UTC),
            # but that changes the persisted format — confirm consumers first.
            "last_run": datetime.now().isoformat(),
        }
        try:
            async with get_db() as db:
                await repo.set_stats(db, plugin.name, payload)
        except Exception as e:
            logger.error(f"Failed to save stats for {plugin.name}: {e}")