refactor: 全面重构核心架构,消除反复修改的根因

- 删除 ValidationQueue 双轨持久化队列,替换为纯内存 AsyncWorkerPool
- 引入统一后台任务框架 JobExecutor(Job/CrawlJob/ValidateAllJob)
- 新增 PluginRunner 统一插件执行(超时、重试、健康检查、统计)
- 重构 SchedulerService 职责收敛为仅定时触发 ValidateAllJob
- 使用 AsyncExitStack 重构 lifespan,安全管理长生命周期资源
- 路由层瘦身 50%+,业务异常上抛由全局中间件统一处理
- 实现设置全热更新(WorkerPool 并发、Validator 超时即时生效)
- 前端 Store 强制写后重新拉取,消除乐观更新数据不同步
- 删除 queue.py / task_repo.py / task_service.py
- 新增 execution 单元测试,全部 85 个测试通过
This commit is contained in:
祀梦
2026-04-04 22:36:57 +08:00
parent 4ef7931941
commit b972b64616
33 changed files with 1168 additions and 864 deletions

View File

@@ -0,0 +1,13 @@
"""统一后台任务执行框架"""
from .job import JobStatus, Job, CrawlJob, ValidateAllJob
from .worker_pool import AsyncWorkerPool
from .executor import JobExecutor
__all__ = [
"JobStatus",
"Job",
"CrawlJob",
"ValidateAllJob",
"AsyncWorkerPool",
"JobExecutor",
]

View File

@@ -0,0 +1,138 @@
"""Job 执行器 - 统一管理所有后台 Job 的生命周期"""
import asyncio
from contextlib import asynccontextmanager
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from app.core.execution.job import Job, JobStatus
from app.core.execution.worker_pool import AsyncWorkerPool
from app.core.log import logger
class JobExecutor:
    """Background job executor.

    Responsibilities:
    - Maintain an in-memory table of Job states (no persistence).
    - Bound the number of concurrently *running* jobs with a semaphore.
    - Periodically evict finished jobs that have aged out.

    Jobs are fire-and-forget: ``submit_job`` returns immediately and callers
    poll ``get_job`` / ``list_jobs`` for progress.
    """

    def __init__(
        self,
        worker_pool: Optional[AsyncWorkerPool] = None,
        max_concurrent_jobs: int = 10,
        cleanup_interval_seconds: int = 300,
    ):
        self.worker_pool = worker_pool
        self.max_concurrent_jobs = max_concurrent_jobs
        self._jobs: Dict[str, Job] = {}
        self._running = False
        self._semaphore = asyncio.Semaphore(max_concurrent_jobs)
        self._cleanup_interval = cleanup_interval_seconds
        self._cleanup_task: Optional[asyncio.Task] = None
        # Strong references to in-flight job tasks. The event loop keeps only
        # weak references to tasks, so without this set a running job's task
        # could be garbage-collected and silently vanish.
        self._job_tasks: set = set()

    async def __aenter__(self):
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.stop()

    async def start(self) -> None:
        """Start the background cleanup loop (idempotent)."""
        if self._running:
            return
        self._running = True
        self._cleanup_task = asyncio.create_task(self._cleanup_loop())
        logger.info("JobExecutor started")

    async def stop(self) -> None:
        """Cancel all jobs, await their tasks, and stop the cleanup loop.

        After ``stop()`` returns no job task is still executing, so the
        executor can be torn down safely (e.g. from an AsyncExitStack).
        """
        if not self._running:
            return
        self._running = False
        await self.cancel_all()
        # Cancel and await in-flight job tasks; previously they were left
        # running after "stop", which made shutdown nondeterministic.
        for task in list(self._job_tasks):
            task.cancel()
        if self._job_tasks:
            await asyncio.gather(*self._job_tasks, return_exceptions=True)
        self._job_tasks.clear()
        if self._cleanup_task:
            self._cleanup_task.cancel()
            try:
                await self._cleanup_task
            except asyncio.CancelledError:
                pass
        logger.info("JobExecutor stopped")

    def submit_job(self, job: Job) -> str:
        """Register *job* and schedule it to run in the background.

        Returns the job id so callers can poll its status later.
        """
        self._jobs[job.id] = job
        task = asyncio.create_task(self._run_job(job))
        # Hold a strong reference until the task finishes, then drop it.
        self._job_tasks.add(task)
        task.add_done_callback(self._job_tasks.discard)
        return job.id

    async def _run_job(self, job: Job) -> None:
        """Run one job under the concurrency semaphore, recording its outcome."""
        async with self._semaphore:
            try:
                if job.is_cancelled:
                    logger.info(f"Job {job.id} was cancelled before running")
                    return
                result = await job.run()
                # If the subclass did not set a terminal status itself,
                # default to COMPLETED with whatever run() returned.
                if job.status not in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED):
                    job._set_completed(result)
                logger.info(f"Job {job.id} completed: {result}")
            except asyncio.CancelledError:
                job.status = JobStatus.CANCELLED
                job._touch()
                logger.info(f"Job {job.id} cancelled during execution")
            except Exception as e:
                job._set_failed(str(e))
                logger.error(f"Job {job.id} failed: {e}", exc_info=True)

    def get_job(self, job_id: str) -> Optional[Job]:
        """Return the job with *job_id*, or ``None`` if unknown/evicted."""
        return self._jobs.get(job_id)

    def list_jobs(
        self,
        status: Optional[JobStatus] = None,
        limit: int = 100,
    ) -> List[Job]:
        """List known jobs, newest first, optionally filtered by *status*."""
        jobs = list(self._jobs.values())
        if status:
            jobs = [j for j in jobs if j.status == status]
        jobs.sort(key=lambda j: j.created_at, reverse=True)
        return jobs[:limit]

    async def cancel_job(self, job_id: str) -> bool:
        """Request cancellation of one job. Returns False if the id is unknown."""
        job = self._jobs.get(job_id)
        if not job:
            return False
        job.cancel()
        return True

    async def cancel_all(self) -> int:
        """Request cancellation of every pending/running job; return the count."""
        cancelled = 0
        for job in list(self._jobs.values()):
            if job.status in (JobStatus.PENDING, JobStatus.RUNNING):
                job.cancel()
                cancelled += 1
        return cancelled

    async def _cleanup_loop(self) -> None:
        """Periodically evict finished jobs that have aged out."""
        while self._running:
            try:
                await asyncio.sleep(self._cleanup_interval)
                self._cleanup_old_jobs()
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"JobExecutor cleanup error: {e}")

    def _cleanup_old_jobs(self) -> int:
        """Drop terminal jobs not updated within the cleanup interval."""
        cutoff = datetime.now() - timedelta(seconds=self._cleanup_interval)
        to_remove = [
            job_id
            for job_id, job in self._jobs.items()
            if job.status in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED)
            and job.updated_at < cutoff
        ]
        for job_id in to_remove:
            del self._jobs[job_id]
        if to_remove:
            logger.info(f"JobExecutor cleaned up {len(to_remove)} old jobs")
        return len(to_remove)

159
app/core/execution/job.py Normal file
View File

@@ -0,0 +1,159 @@
"""Job 定义 - 所有后台异步任务的统一抽象"""
import uuid
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional
from app.models.domain import ProxyRaw
class JobStatus(Enum):
    """Lifecycle states of a background Job.

    COMPLETED, FAILED and CANCELLED are terminal; PENDING and RUNNING
    jobs may still transition (including to CANCELLED via Job.cancel()).
    """
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
@dataclass
class Job(ABC):
    """Base class for all background tasks.

    Subclasses implement ``run()`` and may use the ``_set_*`` helpers to
    record state transitions; the executor falls back to marking a job
    COMPLETED if ``run()`` returns without setting a terminal status.
    """
    # Unique identifier used as the key in the executor's job table.
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    status: JobStatus = JobStatus.PENDING
    progress: float = 0.0  # 0-100
    result: Any = None
    error: Optional[str] = None
    created_at: datetime = field(default_factory=datetime.now)
    # Bumped by _touch() on every state change; drives cleanup eviction.
    updated_at: datetime = field(default_factory=datetime.now)
    _cancelled: bool = field(default=False, repr=False)

    def cancel(self) -> None:
        """Request cancellation.

        Sets the cancel flag immediately; a PENDING/RUNNING job is also
        moved to CANCELLED. A running job must observe ``is_cancelled``
        cooperatively — cancellation is not forced.
        """
        self._cancelled = True
        if self.status in (JobStatus.PENDING, JobStatus.RUNNING):
            self.status = JobStatus.CANCELLED
            self._touch()

    def _touch(self) -> None:
        # Refresh updated_at so the cleanup loop sees recent activity.
        self.updated_at = datetime.now()

    def _set_running(self) -> None:
        self.status = JobStatus.RUNNING
        self._touch()

    def _set_completed(self, result: Any = None) -> None:
        # Completion implies 100% progress regardless of prior value.
        self.status = JobStatus.COMPLETED
        self.result = result
        self.progress = 100.0
        self._touch()

    def _set_failed(self, error: str) -> None:
        self.status = JobStatus.FAILED
        self.error = error
        self._touch()

    @property
    def is_cancelled(self) -> bool:
        """True once cancel() has been requested (regardless of status)."""
        return self._cancelled

    @abstractmethod
    async def run(self) -> Any:
        """Execute the task's core logic; subclasses must implement this."""
        raise NotImplementedError

    def to_dict(self) -> Dict[str, Any]:
        """Serialize job state for API responses (datetimes as ISO strings)."""
        return {
            "id": self.id,
            "status": self.status.value,
            "progress": round(self.progress, 2),
            "result": self.result,
            "error": self.error,
            "created_at": self.created_at.isoformat(),
            "updated_at": self.updated_at.isoformat(),
        }
@dataclass
class CrawlJob(Job):
    """Plugin crawl task: run one plugin and submit its proxies for validation."""
    plugin_id: str = ""
    # Injected collaborators; Any-typed to avoid import cycles at module load.
    plugin_runner: Any = field(repr=False, default=None)
    proxy_service: Any = field(repr=False, default=None)
    validator_pool: Any = field(repr=False, default=None)

    async def run(self) -> Dict[str, Any]:
        from app.services.plugin_service import PluginService
        from app.core.log import logger
        self._set_running()
        if not self.plugin_runner:
            raise RuntimeError("plugin_runner is not set")
        # NOTE(review): the field is named `proxy_service` but is used as a
        # plugin service (falls back to PluginService and must expose
        # get_plugin). Presumably callers inject a PluginService here —
        # confirm, and consider renaming the field to `plugin_service`.
        plugin_service = self.proxy_service or PluginService()
        plugin = plugin_service.get_plugin(self.plugin_id)
        if not plugin:
            raise ValueError(f"Plugin '{self.plugin_id}' not found")
        result = await self.plugin_runner.run(plugin)
        proxies: List[ProxyRaw] = result.proxies if result else []
        if proxies and self.validator_pool:
            await self.validator_pool.submit(proxies)
            logger.info(f"CrawlJob {self.id}: submitted {len(proxies)} proxies for validation")
        payload = {
            "plugin_id": self.plugin_id,
            "proxy_count": len(proxies),
        }
        if result:
            payload["success_count"] = result.success_count
            payload["failure_count"] = result.failure_count
        self._set_completed(payload)
        return payload
@dataclass
class ValidateAllJob(Job):
    """Full-revalidation task: submit every stored proxy for validation.

    Proxies are read once from the database, then submitted to the validator
    pool in batches of ``batch_size``, updating ``progress`` along the way.
    Cancellation is checked between batches.
    """
    # Injected collaborators; Any-typed to avoid import cycles at module load.
    proxy_repo: Any = field(repr=False, default=None)
    validator_pool: Any = field(repr=False, default=None)
    batch_size: int = 100

    async def run(self) -> Dict[str, Any]:
        from app.repositories.proxy_repo import ProxyRepository
        from app.core.db import get_db
        from app.core.log import logger
        self._set_running()
        repo = self.proxy_repo or ProxyRepository()
        async with get_db() as db:
            proxies = await repo.list_all(db)
        if not proxies:
            self._set_completed({"total": 0, "submitted": 0})
            return self.result
        total = len(proxies)
        submitted = 0
        for i in range(0, total, self.batch_size):
            if self.is_cancelled:
                logger.info(f"ValidateAllJob {self.id}: cancelled")
                break
            batch = proxies[i : i + self.batch_size]
            raws = [ProxyRaw(p.ip, p.port, p.protocol) for p in batch]
            if self.validator_pool:
                await self.validator_pool.submit(raws)
            submitted += len(raws)
            self.progress = min(100.0, (submitted / total) * 100)
            self._touch()
        payload = {"total": total, "submitted": submitted}
        if self.is_cancelled:
            # cancel() already moved status to CANCELLED; calling
            # _set_completed here would overwrite it with COMPLETED and
            # report a partially-done job as fully finished.
            self.result = payload
            self._touch()
        else:
            self._set_completed(payload)
        logger.info(f"ValidateAllJob {self.id}: submitted {submitted}/{total} proxies")
        return payload

View File

@@ -0,0 +1,97 @@
"""轻量级纯内存异步 Worker Pool"""
import asyncio
from typing import Callable, Coroutine, List, TypeVar
from app.core.log import logger
T = TypeVar("T")
Handler = Callable[[T], Coroutine[None, None, None]]
class AsyncWorkerPool:
    """Pure in-memory async worker pool.

    Single responsibility: accept batches of items, distribute them over N
    worker coroutines, and invoke ``handler`` on each. No persistence, no
    complex state machine.

    Shutdown protocol: ``stop()`` flips ``_running`` off and enqueues exactly
    one ``None`` sentinel per worker. Workers discard (without handling) any
    real items seen while shutting down, and exit only when they consume a
    sentinel — so after ``stop()`` the queue holds no stale sentinels and a
    subsequent ``start()`` (e.g. via ``resize``) gets fresh, live workers.
    """

    def __init__(
        self,
        worker_count: int,
        handler: Handler,
        queue_maxsize: int = 10000,
        name: str = "WorkerPool",
    ):
        self.worker_count = worker_count
        self.handler = handler
        self.name = name
        self._queue: asyncio.Queue = asyncio.Queue(maxsize=queue_maxsize)
        self._workers: List[asyncio.Task] = []
        self._running = False
        self._shutdown_event = asyncio.Event()

    async def __aenter__(self):
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.stop()

    async def start(self) -> None:
        """Spawn the worker tasks (idempotent)."""
        if self._running:
            return
        self._running = True
        self._shutdown_event.clear()
        self._workers = [
            asyncio.create_task(self._worker_loop(i), name=f"{self.name}-worker-{i}")
            for i in range(self.worker_count)
        ]
        logger.info(f"{self.name} started with {self.worker_count} workers")

    async def stop(self) -> None:
        """Stop all workers, discarding any still-queued items."""
        if not self._running:
            return
        self._running = False
        self._shutdown_event.set()
        # Exactly one sentinel per worker. Previously len(workers)+qsize
        # sentinels were enqueued, but each worker consumed only one item
        # before exiting — the surplus None sentinels stayed in the queue and
        # instantly killed the new workers on the next start()/resize().
        for _ in range(len(self._workers)):
            try:
                self._queue.put_nowait(None)
            except asyncio.QueueFull:
                # Workers are draining (discarding) items now that _running
                # is False, so space frees up and this put cannot deadlock.
                await self._queue.put(None)
        if self._workers:
            await asyncio.gather(*self._workers, return_exceptions=True)
        self._workers.clear()
        logger.info(f"{self.name} stopped")

    async def submit(self, items: List[T]) -> None:
        """Enqueue a batch of items (blocks when full — natural backpressure)."""
        for item in items:
            await self._queue.put(item)

    async def drain(self) -> None:
        """Wait until every enqueued item has been fully processed."""
        await self._queue.join()

    async def resize(self, new_worker_count: int) -> None:
        """Change the worker count by stopping, then restarting, the pool.

        Stop-then-start is the safe path: old and new workers never compete
        for the same sentinels, which could otherwise deadlock.
        """
        if new_worker_count == self.worker_count:
            return
        logger.info(f"{self.name} resizing from {self.worker_count} to {new_worker_count}")
        await self.stop()
        self.worker_count = new_worker_count
        await self.start()

    async def _worker_loop(self, worker_id: int) -> None:
        """Consume items until a None sentinel arrives; never dies on errors."""
        while True:
            item = await self._queue.get()
            try:
                if item is None:
                    # Sentinel: this worker's cue to exit.
                    break
                if not self._running:
                    # Shutting down: drop the item without handling it.
                    continue
                await self.handler(item)
            except Exception as e:
                logger.error(f"{self.name} worker {worker_id} handler error: {e}", exc_info=True)
            finally:
                # Always balance get() so drain()/join() cannot hang.
                self._queue.task_done()