fix(crawl): throttle concurrent CrawlJobs and relax fpw/proxyscrape HTTP
- CrawlJob waits on crawl_slot before the JobExecutor semaphore, so crawl-all does not fill executor slots while queued
- BaseHTTPPlugin: longer connect budget for slow international links
- proxyscrape: jsDelivr mirror + longer GitHub/API phases
- fpw_*: higher timeouts/retries; lower internal concurrency on heavy multi-URL plugins

Made-with: Cursor
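The BaseHTTPPlugin and fpw_* changes are not included in the diff below. As a rough, hypothetical sketch of what "longer connect budget" plus "lower internal concurrency" can look like for a multi-URL plugin (the aiohttp choice, timeout values, concurrency number, and function names are illustrative assumptions, not the plugins' actual code):

import asyncio
import aiohttp

# Hypothetical budgets; the real values are set in the plugins, not here.
CONNECT_TIMEOUT_S = 20    # generous connect budget for slow international links
TOTAL_TIMEOUT_S = 60
INTERNAL_CONCURRENCY = 3  # lowered per-plugin concurrency for heavy multi-URL plugins

async def fetch_all(urls: list[str]) -> list[str | None]:
    """Fetch a plugin's URL list with a small internal concurrency cap."""
    timeout = aiohttp.ClientTimeout(total=TOTAL_TIMEOUT_S, connect=CONNECT_TIMEOUT_S)
    sem = asyncio.Semaphore(INTERNAL_CONCURRENCY)

    async def fetch_one(session: aiohttp.ClientSession, url: str) -> str | None:
        async with sem:  # at most INTERNAL_CONCURRENCY requests in flight per plugin
            try:
                async with session.get(url) as resp:
                    resp.raise_for_status()
                    return await resp.text()
            except (aiohttp.ClientError, asyncio.TimeoutError):
                return None  # retries omitted in this sketch

    async with aiohttp.ClientSession(timeout=timeout) as session:
        return await asyncio.gather(*(fetch_one(session, u) for u in urls))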
app/core/execution/crawl_gate.py (new file, +22)
@@ -0,0 +1,22 @@
+"""When crawling in bulk, limit how many plugins issue HTTP requests at once, so crawl-all does not grab all executor slots from validation/aggregation jobs."""
+import asyncio
+from contextlib import asynccontextmanager
+from typing import AsyncIterator
+
+# Even multiplied by each plugin's internal max_concurrency this should stay friendly to external sites; too small a value stretches the total run time.
+CRAWL_MAX_CONCURRENT = 4
+
+_sem: asyncio.Semaphore | None = None
+
+
+def _get_sem() -> asyncio.Semaphore:
+    global _sem
+    if _sem is None:
+        _sem = asyncio.Semaphore(CRAWL_MAX_CONCURRENT)
+    return _sem
+
+
+@asynccontextmanager
+async def crawl_slot() -> AsyncIterator[None]:
+    async with _get_sem():
+        yield
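The comment on CRAWL_MAX_CONCURRENT refers to the product of the two limits: with 4 crawl slots and, say, a per-plugin max_concurrency of 3, roughly 4 * 3 = 12 requests are in flight across crawl-all. A minimal sketch of how the two gates compose (the plugin-side names and numbers are assumptions; in the real code the crawl slot is taken by JobExecutor around the whole job, not inside the plugin):

import asyncio

from app.core.execution.crawl_gate import crawl_slot

# Hypothetical plugin-side cap; the real value lives in each fpw_* plugin.
PLUGIN_MAX_CONCURRENCY = 3

async def crawl_plugin(urls: list[str]) -> None:
    """At most CRAWL_MAX_CONCURRENT plugins crawl at once, and each plugin keeps
    at most PLUGIN_MAX_CONCURRENCY requests in flight, so the global ceiling is
    roughly 4 * 3 = 12 concurrent requests."""
    async with crawl_slot():                 # global, cross-plugin gate
        sem = asyncio.Semaphore(PLUGIN_MAX_CONCURRENCY)

        async def fetch(url: str) -> None:
            async with sem:                  # per-plugin gate
                await asyncio.sleep(0.1)     # stand-in for the HTTP call

        await asyncio.gather(*(fetch(u) for u in urls))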
@@ -4,7 +4,8 @@ from contextlib import asynccontextmanager
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional
 
-from app.core.execution.job import Job, JobStatus
+from app.core.execution.job import CrawlJob, Job, JobStatus
+from app.core.execution.crawl_gate import crawl_slot
 from app.core.execution.worker_pool import AsyncWorkerPool
 from app.core.log import logger
 
@@ -67,24 +68,32 @@ class JobExecutor:
         return job.id
 
     async def _run_job(self, job: Job) -> None:
+        async def _execute() -> None:
+            try:
+                if job.is_cancelled:
+                    logger.info(f"Job {job.id} was cancelled before running")
+                    return
+                result = await job.run()
+                if job.status not in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED):
+                    job._set_completed(result)
+                logger.info(f"Job {job.id} completed: {result}")
+            except asyncio.CancelledError:
+                job.status = JobStatus.CANCELLED
+                job._touch()
+                logger.info(f"Job {job.id} cancelled during execution")
+            except Exception as e:
+                job._set_failed(str(e))
+                logger.error(f"Job {job.id} failed: {e}", exc_info=True)
+
         try:
-            async with self._semaphore:
-                try:
-                    if job.is_cancelled:
-                        logger.info(f"Job {job.id} was cancelled before running")
-                        return
-                    result = await job.run()
-                    # If the subclass did not explicitly set a final status, mark it completed automatically.
-                    if job.status not in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED):
-                        job._set_completed(result)
-                    logger.info(f"Job {job.id} completed: {result}")
-                except asyncio.CancelledError:
-                    job.status = JobStatus.CANCELLED
-                    job._touch()
-                    logger.info(f"Job {job.id} cancelled during execution")
-                except Exception as e:
-                    job._set_failed(str(e))
-                    logger.error(f"Job {job.id} failed: {e}", exc_info=True)
+            # A CrawlJob waits for a crawl slot before claiming an executor slot, so a dozen-odd crawl tasks cannot fill max_concurrent_jobs while merely queuing for the external network.
+            if isinstance(job, CrawlJob):
+                async with crawl_slot():
+                    async with self._semaphore:
+                        await _execute()
+            else:
+                async with self._semaphore:
+                    await _execute()
         finally:
             self._tasks.pop(job.id, None)
 
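To see why the crawl slot is acquired before the executor semaphore, a small self-contained simulation helps: sixteen crawl jobs queue on crawl_slot first, so only CRAWL_MAX_CONCURRENT (4) of them hold executor slots at any time and a short non-crawl job still starts immediately. Everything here apart from crawl_slot itself (the executor size, job counts, and sleep times) is an assumed stand-in, and the import assumes the sketch runs inside this repo:

import asyncio

from app.core.execution.crawl_gate import crawl_slot

EXECUTOR_SLOTS = 8  # stand-in for max_concurrent_jobs; assumed value
executor_sem = asyncio.Semaphore(EXECUTOR_SLOTS)

async def crawl_job(i: int) -> None:
    # Same nesting order as JobExecutor._run_job: crawl slot first, executor slot second.
    async with crawl_slot():
        async with executor_sem:
            await asyncio.sleep(1.0)  # stand-in for a slow external crawl

async def validate_job() -> None:
    async with executor_sem:
        await asyncio.sleep(0.1)      # short local job; should not wait behind crawls

async def main() -> None:
    crawls = [asyncio.create_task(crawl_job(i)) for i in range(16)]
    await asyncio.sleep(0)            # let the crawl jobs queue up on crawl_slot
    t0 = asyncio.get_running_loop().time()
    await validate_job()              # finishes in ~0.1s because 4 executor slots stay free
    print(f"validate finished after {asyncio.get_running_loop().time() - t0:.2f}s")
    await asyncio.gather(*crawls)

asyncio.run(main())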