fix(crawl): throttle concurrent CrawlJobs and relax fpw/proxyscrape HTTP

- CrawlJob waits on crawl_slot before acquiring the JobExecutor semaphore, so crawl-all no longer fills executor slots with jobs that are merely queued for the external network
- BaseHTTPPlugin: longer connect budget for slow international links (see the first sketch below)
- proxyscrape: jsDelivr mirror fallback plus longer GitHub/API phase timeouts
- fpw_*: higher timeouts/retries; lower internal concurrency on heavy multi-URL plugins (see the second sketch below)
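
The BaseHTTPPlugin and proxyscrape bullets share one shape. A minimal sketch of what the relaxed client could look like, assuming aiohttp and the standard raw.githubusercontent.com -> cdn.jsdelivr.net/gh mirror mapping; every name and number below is an assumption, not code from this commit:

import asyncio

import aiohttp

# Longer connect budget: slow international links spend most of their time in
# TCP/TLS setup, so connection establishment gets its own allowance instead of
# eating into the read budget (values assumed).
TIMEOUT = aiohttp.ClientTimeout(total=60, connect=20, sock_read=30)

def jsdelivr_mirror(raw_url: str) -> str:
    # raw.githubusercontent.com/<user>/<repo>/<ref>/<path>
    #   -> cdn.jsdelivr.net/gh/<user>/<repo>@<ref>/<path>
    user, repo, ref, path = raw_url.split("raw.githubusercontent.com/", 1)[1].split("/", 3)
    return f"https://cdn.jsdelivr.net/gh/{user}/{repo}@{ref}/{path}"

async def fetch_with_mirror(raw_url: str) -> str:
    # Try the GitHub raw URL first, then fall back to the jsDelivr mirror.
    async with aiohttp.ClientSession(timeout=TIMEOUT) as session:
        for url in (raw_url, jsdelivr_mirror(raw_url)):
            try:
                async with session.get(url) as resp:
                    resp.raise_for_status()
                    return await resp.text()
            except (aiohttp.ClientError, asyncio.TimeoutError):
                continue
        raise RuntimeError(f"all sources failed for {raw_url}")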

Made-with: Cursor
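
For the fpw_* bullet, the usual asyncio shape of "higher retries, lower internal concurrency" is a plugin-local semaphore around each URL plus a bounded retry loop. A sketch under that assumption; the cap, retry count, and backoff are illustrative, not the commit's values:

import asyncio
from typing import Awaitable, Callable, Iterable

INTERNAL_CONCURRENCY = 3  # lowered per-plugin cap for heavy multi-URL plugins (assumed)
RETRIES = 3               # raised retry count (assumed)

async def fetch_all(urls: Iterable[str],
                    fetch: Callable[[str], Awaitable[str]]) -> list:
    gate = asyncio.Semaphore(INTERNAL_CONCURRENCY)

    async def one(url: str) -> str:
        async with gate:  # at most INTERNAL_CONCURRENCY URLs in flight per plugin
            for attempt in range(RETRIES):
                try:
                    return await fetch(url)
                except Exception:
                    if attempt == RETRIES - 1:
                        raise
                    await asyncio.sleep(2 ** attempt)  # simple exponential backoff

    # return_exceptions=True so one dead URL does not sink the whole batch
    return await asyncio.gather(*(one(u) for u in urls), return_exceptions=True)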
祀梦
2026-04-05 13:48:41 +08:00
parent 2c98abaf91
commit 957cee3100
13 changed files with 116 additions and 52 deletions


@@ -4,7 +4,8 @@ from contextlib import asynccontextmanager
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional
-from app.core.execution.job import Job, JobStatus
+from app.core.execution.job import CrawlJob, Job, JobStatus
+from app.core.execution.crawl_gate import crawl_slot
 from app.core.execution.worker_pool import AsyncWorkerPool
 from app.core.log import logger
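
The crawl_gate module imported here is not part of the displayed diff. A plausible reconstruction, assuming crawl_slot is an asynccontextmanager over a module-level asyncio.Semaphore; the cap of 3 is a guess, not the repo's value:

# app/core/execution/crawl_gate.py -- hypothetical reconstruction
import asyncio
from contextlib import asynccontextmanager

# Global cap on CrawlJobs hitting the external network at once (value assumed).
_CRAWL_SLOTS = asyncio.Semaphore(3)

@asynccontextmanager
async def crawl_slot():
    # Blocks until a slot frees up; the slot is released on exit, even on error.
    async with _CRAWL_SLOTS:
        yield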
@@ -67,24 +68,32 @@ class JobExecutor:
         return job.id
 
     async def _run_job(self, job: Job) -> None:
+        async def _execute() -> None:
+            try:
+                if job.is_cancelled:
+                    logger.info(f"Job {job.id} was cancelled before running")
+                    return
+                result = await job.run()
+                if job.status not in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED):
+                    job._set_completed(result)
+                logger.info(f"Job {job.id} completed: {result}")
+            except asyncio.CancelledError:
+                job.status = JobStatus.CANCELLED
+                job._touch()
+                logger.info(f"Job {job.id} cancelled during execution")
+            except Exception as e:
+                job._set_failed(str(e))
+                logger.error(f"Job {job.id} failed: {e}", exc_info=True)
+
         try:
-            async with self._semaphore:
-                try:
-                    if job.is_cancelled:
-                        logger.info(f"Job {job.id} was cancelled before running")
-                        return
-                    result = await job.run()
-                    # If the subclass did not explicitly set a final status, automatically mark it completed
-                    if job.status not in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED):
-                        job._set_completed(result)
-                    logger.info(f"Job {job.id} completed: {result}")
-                except asyncio.CancelledError:
-                    job.status = JobStatus.CANCELLED
-                    job._touch()
-                    logger.info(f"Job {job.id} cancelled during execution")
-                except Exception as e:
-                    job._set_failed(str(e))
-                    logger.error(f"Job {job.id} failed: {e}", exc_info=True)
+            # CrawlJob waits for a crawl slot before occupying the executor, so a dozen-odd
+            # tasks cannot fill max_concurrent_jobs while merely queued for the external network
+            if isinstance(job, CrawlJob):
+                async with crawl_slot():
+                    async with self._semaphore:
+                        await _execute()
+            else:
+                async with self._semaphore:
+                    await _execute()
         finally:
             self._tasks.pop(job.id, None)
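
Why the crawl gate must wrap the semaphore rather than sit inside it: a slot-holder that is merely waiting on the network still counts against max_concurrent_jobs. A toy reproduction of the ordering, runnable on its own (all numbers illustrative, not from the repo):

import asyncio

MAX_CONCURRENT_JOBS = 4  # executor-wide cap (illustrative)
CRAWL_SLOTS = 2          # external-network cap (illustrative)

async def main() -> None:
    executor = asyncio.Semaphore(MAX_CONCURRENT_JOBS)
    crawl = asyncio.Semaphore(CRAWL_SLOTS)

    async def crawl_job(i: int) -> None:
        # Gate order from the diff: take a crawl slot FIRST, so a queued crawl
        # holds no executor slot while it is only waiting for the network.
        async with crawl:
            async with executor:
                await asyncio.sleep(0.2)  # simulated slow fetch

    async def local_job(i: int) -> None:
        async with executor:
            await asyncio.sleep(0.05)  # simulated quick local work

    # 10 crawls + 4 local jobs: with the crawl gate on the outside, at most 2
    # crawls ever occupy executor slots, so local jobs keep flowing instead of
    # starving behind a wall of queued crawl-all tasks.
    await asyncio.gather(*[crawl_job(i) for i in range(10)],
                         *[local_job(i) for i in range(4)])

asyncio.run(main())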