"""插件统一执行器 - 封装超时、重试、健康检查、错误捕获""" import asyncio from datetime import datetime from typing import Optional from app.core.plugin_system.base import BaseCrawlerPlugin from app.core.config import settings as app_settings from app.core.log import logger from app.models.domain import CrawlResult, ProxyRaw class PluginRunner: """统一插件执行器 - 超时控制(从 settings 读取 crawl_timeout) - 异常捕获和统计更新 - 可选的健康检查前置 - 结果去重 """ def __init__(self, timeout: Optional[float] = None): self.timeout = timeout or getattr(app_settings, "crawler_timeout", 30) async def run(self, plugin: BaseCrawlerPlugin) -> CrawlResult: """执行单个插件爬取""" result = CrawlResult(plugin_name=plugin.name) # 健康检查(可选) try: healthy = await asyncio.wait_for( plugin.health_check(), timeout=5.0 ) if not healthy: result.error = "health check failed" result.failure_count = 1 await self._save_stats(plugin, result) return result except Exception as e: logger.warning(f"Plugin {plugin.name} health check error: {e}") # 执行爬取 try: proxies = await asyncio.wait_for( plugin.crawl(), timeout=self.timeout, ) result.proxies = self._dedup(proxies) result.success_count = 1 if result.proxies else 0 logger.info( f"Plugin {plugin.name} crawled {len(result.proxies)} unique proxies" ) except asyncio.TimeoutError: result.error = f"crawl timeout after {self.timeout}s" result.failure_count = 1 logger.error(f"Plugin {plugin.name} crawl timeout") except Exception as e: result.error = str(e) result.failure_count = 1 logger.error(f"Plugin {plugin.name} crawl failed: {e}", exc_info=True) await self._save_stats(plugin, result) return result @staticmethod def _dedup(proxies: list[ProxyRaw]) -> list[ProxyRaw]: seen = set() unique = [] for p in proxies: key = (p.ip, p.port) if key not in seen: seen.add(key) unique.append(p) return unique async def _save_stats(self, plugin: BaseCrawlerPlugin, result: CrawlResult) -> None: """将爬取统计持久化到数据库""" from app.core.db import get_db from app.repositories.settings_repo import PluginSettingsRepository repo = PluginSettingsRepository() payload = { "success_count": result.success_count, "failure_count": result.failure_count, "last_run": datetime.now().isoformat(), } try: async with get_db() as db: await repo.set_stats(db, plugin.name, payload) except Exception as e: logger.error(f"Failed to save stats for {plugin.name}: {e}")