refactor(crawl): parallel plugins via JobExecutor; per-plugin throttle

- Remove the global crawl_slot gate; CrawlJobs now contend only for the executor semaphore
- max_concurrent_jobs = max(24, n_plugins+8) for crawl-all + aggregator headroom
- BaseHTTPPlugin max_concurrency 3->2; fpw multi-URL plugins 4->2
- fetch_all: short random delay before each request to ease single-host pressure

Made-with: Cursor
祀梦 · 2026-04-05 14:08:26 +08:00
parent 957cee3100 · commit a26ae50051
6 changed files with 11 additions and 38 deletions
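
For context on the sizing bullet above, a minimal sketch of the rule, assuming an asyncio.Semaphore-backed executor; the helper name executor_slots and the plugin count are illustrative, not taken from the diff:

import asyncio

def executor_slots(n_plugins: int) -> int:
    # A floor of 24 keeps headroom even with few plugins registered;
    # n_plugins + 8 lets crawl-all run every plugin at once and still
    # leaves 8 slots for validation/aggregation jobs.
    return max(24, n_plugins + 8)

semaphore = asyncio.Semaphore(executor_slots(n_plugins=30))  # 30 is a made-up count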

app/core/execution/crawl_gate.py (deleted)

@@ -1,22 +0,0 @@
-"""Cap how many plugins issue HTTP requests at once during bulk crawls, so crawl-all does not race validation/aggregation jobs for every executor slot."""
-import asyncio
-from contextlib import asynccontextmanager
-from typing import AsyncIterator
-
-# Multiplied by each plugin's internal max_concurrency this should still be friendly to remote hosts; too small a value stretches total crawl time.
-CRAWL_MAX_CONCURRENT = 4
-
-_sem: asyncio.Semaphore | None = None
-
-def _get_sem() -> asyncio.Semaphore:
-    global _sem
-    if _sem is None:
-        _sem = asyncio.Semaphore(CRAWL_MAX_CONCURRENT)
-    return _sem
-
-@asynccontextmanager
-async def crawl_slot() -> AsyncIterator[None]:
-    async with _get_sem():
-        yield
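
A typical call site for this removed gate would have looked like the following sketch; crawl_one and plugin.fetch_all are illustrative names, not taken from the repository:

async def crawl_one(plugin) -> None:
    # At most CRAWL_MAX_CONCURRENT (4) crawls could touch the network at once;
    # the rest parked here rather than holding executor slots.
    async with crawl_slot():
        await plugin.fetch_all()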


@@ -1,11 +1,9 @@
 """Job executor - manages the lifecycle of all background jobs in one place"""
 import asyncio
 from contextlib import asynccontextmanager
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional
-from app.core.execution.job import CrawlJob, Job, JobStatus
-from app.core.execution.crawl_gate import crawl_slot
+from app.core.execution.job import Job, JobStatus
 from app.core.execution.worker_pool import AsyncWorkerPool
 from app.core.log import logger
@@ -86,14 +84,8 @@ class JobExecutor:
                 logger.error(f"Job {job.id} failed: {e}", exc_info=True)

         try:
-            # CrawlJob waits for a crawl slot before taking an executor slot, so a dozen-odd jobs don't fill max_concurrent_jobs while merely queuing for the external network
-            if isinstance(job, CrawlJob):
-                async with crawl_slot():
-                    async with self._semaphore:
-                        await _execute()
-            else:
-                async with self._semaphore:
-                    await _execute()
+            async with self._semaphore:
+                await _execute()
         finally:
             self._tasks.pop(job.id, None)
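
The per-plugin throttle named in the subject lives in the four changed files not shown above. A minimal sketch of the behavior the message describes; the delay bounds and the _fetch helper are assumptions, not code from the repository:

import asyncio
import random

class BaseHTTPPlugin:
    max_concurrency = 2  # lowered from 3; fpw multi-URL plugins went 4 -> 2

    async def fetch_all(self, urls: list[str]) -> list[str]:
        sem = asyncio.Semaphore(self.max_concurrency)

        async def fetch_one(url: str) -> str:
            async with sem:
                # Short random delay before each request spreads load on a
                # single host instead of bursting max_concurrency at once.
                await asyncio.sleep(random.uniform(0.2, 0.8))
                return await self._fetch(url)

        return await asyncio.gather(*(fetch_one(u) for u in urls))

    async def _fetch(self, url: str) -> str:
        # Stand-in for the real HTTP call (aiohttp/httpx in practice).
        return url

With the global gate gone, total in-flight requests are bounded by max_concurrency per plugin rather than by one shared semaphore, and the jitter keeps same-host requests from landing simultaneously.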