refactor(crawl): parallel plugins via JobExecutor; per-plugin throttle
- Remove the global crawl_slot gate; all CrawlJobs now share only the executor semaphore.
- Set max_concurrent_jobs = max(24, n_plugins + 8) to leave headroom for crawl-all plus the aggregator.
- Lower BaseHTTPPlugin max_concurrency from 3 to 2; lower fpw multi-URL plugins from 4 to 2.
- fetch_all: add a short random delay before each request to ease single-host pressure.

Made-with: Cursor
This commit is contained in:
@@ -1,11 +1,9 @@
|
||||
"""Job 执行器 - 统一管理所有后台 Job 的生命周期"""
|
||||
import asyncio
|
||||
from contextlib import asynccontextmanager
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from app.core.execution.job import CrawlJob, Job, JobStatus
|
||||
from app.core.execution.crawl_gate import crawl_slot
|
||||
from app.core.execution.job import Job, JobStatus
|
||||
from app.core.execution.worker_pool import AsyncWorkerPool
|
||||
from app.core.log import logger
|
||||
|
||||
@@ -86,14 +84,8 @@ class JobExecutor:
|
||||
logger.error(f"Job {job.id} failed: {e}", exc_info=True)
|
||||
|
||||
try:
|
||||
# CrawlJob 先等爬取槽位再占执行器,避免十几个任务占满 max_concurrent_jobs 却只排队等外网
|
||||
if isinstance(job, CrawlJob):
|
||||
async with crawl_slot():
|
||||
async with self._semaphore:
|
||||
await _execute()
|
||||
else:
|
||||
async with self._semaphore:
|
||||
await _execute()
|
||||
async with self._semaphore:
|
||||
await _execute()
|
||||
finally:
|
||||
self._tasks.pop(job.id, None)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user