Files
ProxyPool/app/services/plugin_runner.py
祀梦 b972b64616 refactor: 全面重构核心架构,消除反复修改的根因
- 删除 ValidationQueue 双轨持久化队列,替换为纯内存 AsyncWorkerPool
- 引入统一后台任务框架 JobExecutor(Job/CrawlJob/ValidateAllJob)
- 新增 PluginRunner 统一插件执行(超时、重试、健康检查、统计)
- 重构 SchedulerService 职责收敛为仅定时触发 ValidateAllJob
- 使用 AsyncExitStack 重构 lifespan,安全管理长生命周期资源
- 路由层瘦身 50%+,业务异常上抛由全局中间件统一处理
- 实现设置全热更新(WorkerPool 并发、Validator 超时即时生效)
- 前端 Store 强制写后重新拉取,消除乐观更新数据不同步
- 删除 queue.py / task_repo.py / task_service.py
- 新增 execution 单元测试,全部 85 个测试通过
2026-04-04 22:36:57 +08:00

91 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""插件统一执行器 - 封装超时、重试、健康检查、错误捕获"""
import asyncio
from datetime import datetime
from typing import Optional
from app.core.plugin_system.base import BaseCrawlerPlugin
from app.core.config import settings as app_settings
from app.core.log import logger
from app.models.domain import CrawlResult, ProxyRaw
class PluginRunner:
"""统一插件执行器
- 超时控制(从 settings 读取 crawl_timeout
- 异常捕获和统计更新
- 可选的健康检查前置
- 结果去重
"""
def __init__(self, timeout: Optional[float] = None):
self.timeout = timeout or getattr(app_settings, "crawler_timeout", 30)
async def run(self, plugin: BaseCrawlerPlugin) -> CrawlResult:
"""执行单个插件爬取"""
result = CrawlResult(plugin_name=plugin.name)
# 健康检查(可选)
try:
healthy = await asyncio.wait_for(
plugin.health_check(), timeout=5.0
)
if not healthy:
result.error = "health check failed"
result.failure_count = 1
await self._save_stats(plugin, result)
return result
except Exception as e:
logger.warning(f"Plugin {plugin.name} health check error: {e}")
# 执行爬取
try:
proxies = await asyncio.wait_for(
plugin.crawl(),
timeout=self.timeout,
)
result.proxies = self._dedup(proxies)
result.success_count = 1 if result.proxies else 0
logger.info(
f"Plugin {plugin.name} crawled {len(result.proxies)} unique proxies"
)
except asyncio.TimeoutError:
result.error = f"crawl timeout after {self.timeout}s"
result.failure_count = 1
logger.error(f"Plugin {plugin.name} crawl timeout")
except Exception as e:
result.error = str(e)
result.failure_count = 1
logger.error(f"Plugin {plugin.name} crawl failed: {e}", exc_info=True)
await self._save_stats(plugin, result)
return result
@staticmethod
def _dedup(proxies: list[ProxyRaw]) -> list[ProxyRaw]:
seen = set()
unique = []
for p in proxies:
key = (p.ip, p.port)
if key not in seen:
seen.add(key)
unique.append(p)
return unique
async def _save_stats(self, plugin: BaseCrawlerPlugin, result: CrawlResult) -> None:
"""将爬取统计持久化到数据库"""
from app.core.db import get_db
from app.repositories.settings_repo import PluginSettingsRepository
repo = PluginSettingsRepository()
payload = {
"success_count": result.success_count,
"failure_count": result.failure_count,
"last_run": datetime.now().isoformat(),
}
try:
async with get_db() as db:
await repo.set_stats(db, plugin.name, payload)
except Exception as e:
logger.error(f"Failed to save stats for {plugin.name}: {e}")