fix(crawl): throttle concurrent CrawlJobs and relax fpw/proxyscrape HTTP
- CrawlJob waits on crawl_slot before the JobExecutor semaphore, so crawl-all does not fill executor slots while queued
- BaseHTTPPlugin: longer connect budget for slow international links
- proxyscrape: jsDelivr mirror + longer GitHub/API phases
- fpw_*: higher timeouts/retries; lower internal concurrency on heavy multi-URL plugins

Made-with: Cursor
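The BaseHTTPPlugin and fpw_* changes are not included in the diff below. As a rough, hypothetical sketch of what "longer connect budget" plus "lower internal concurrency" can look like for a multi-URL plugin (the aiohttp choice, timeout values, concurrency number, and function names are illustrative assumptions, not the plugins' actual code):

import asyncio
import aiohttp

# Hypothetical budgets; the real values are set in the plugins, not here.
CONNECT_TIMEOUT_S = 20    # generous connect budget for slow international links
TOTAL_TIMEOUT_S = 60
INTERNAL_CONCURRENCY = 3  # lowered per-plugin concurrency for heavy multi-URL plugins

async def fetch_all(urls: list[str]) -> list[str | None]:
    """Fetch a plugin's URL list with a small internal concurrency cap."""
    timeout = aiohttp.ClientTimeout(total=TOTAL_TIMEOUT_S, connect=CONNECT_TIMEOUT_S)
    sem = asyncio.Semaphore(INTERNAL_CONCURRENCY)

    async def fetch_one(session: aiohttp.ClientSession, url: str) -> str | None:
        async with sem:  # at most INTERNAL_CONCURRENCY requests in flight per plugin
            try:
                async with session.get(url) as resp:
                    resp.raise_for_status()
                    return await resp.text()
            except (aiohttp.ClientError, asyncio.TimeoutError):
                return None  # retries omitted in this sketch

    async with aiohttp.ClientSession(timeout=timeout) as session:
        return await asyncio.gather(*(fetch_one(session, u) for u in urls))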
app/core/execution/crawl_gate.py (new file, +22)
@@ -0,0 +1,22 @@
+"""When crawling in bulk, limit how many plugins issue HTTP requests at once, so crawl-all does not grab all executor slots from validation/aggregation jobs."""
+import asyncio
+from contextlib import asynccontextmanager
+from typing import AsyncIterator
+
+# Even multiplied by each plugin's internal max_concurrency this should stay friendly to external sites; too small a value stretches the total run time.
+CRAWL_MAX_CONCURRENT = 4
+
+_sem: asyncio.Semaphore | None = None
+
+
+def _get_sem() -> asyncio.Semaphore:
+    global _sem
+    if _sem is None:
+        _sem = asyncio.Semaphore(CRAWL_MAX_CONCURRENT)
+    return _sem
+
+
+@asynccontextmanager
+async def crawl_slot() -> AsyncIterator[None]:
+    async with _get_sem():
+        yield
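The comment on CRAWL_MAX_CONCURRENT refers to the product of the two limits: with 4 crawl slots and, say, a per-plugin max_concurrency of 3, roughly 4 * 3 = 12 requests are in flight across crawl-all. A minimal sketch of how the two gates compose (the plugin-side names and numbers are assumptions; in the real code the crawl slot is taken by JobExecutor around the whole job, not inside the plugin):

import asyncio

from app.core.execution.crawl_gate import crawl_slot

# Hypothetical plugin-side cap; the real value lives in each fpw_* plugin.
PLUGIN_MAX_CONCURRENCY = 3

async def crawl_plugin(urls: list[str]) -> None:
    """At most CRAWL_MAX_CONCURRENT plugins crawl at once, and each plugin keeps
    at most PLUGIN_MAX_CONCURRENCY requests in flight, so the global ceiling is
    roughly 4 * 3 = 12 concurrent requests."""
    async with crawl_slot():                 # global, cross-plugin gate
        sem = asyncio.Semaphore(PLUGIN_MAX_CONCURRENCY)

        async def fetch(url: str) -> None:
            async with sem:                  # per-plugin gate
                await asyncio.sleep(0.1)     # stand-in for the HTTP call

        await asyncio.gather(*(fetch(u) for u in urls))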
@@ -4,7 +4,8 @@ from contextlib import asynccontextmanager
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional
 
-from app.core.execution.job import Job, JobStatus
+from app.core.execution.job import CrawlJob, Job, JobStatus
+from app.core.execution.crawl_gate import crawl_slot
 from app.core.execution.worker_pool import AsyncWorkerPool
 from app.core.log import logger
 
@@ -67,24 +68,32 @@ class JobExecutor:
         return job.id
 
     async def _run_job(self, job: Job) -> None:
+        async def _execute() -> None:
+            try:
+                if job.is_cancelled:
+                    logger.info(f"Job {job.id} was cancelled before running")
+                    return
+                result = await job.run()
+                if job.status not in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED):
+                    job._set_completed(result)
+                logger.info(f"Job {job.id} completed: {result}")
+            except asyncio.CancelledError:
+                job.status = JobStatus.CANCELLED
+                job._touch()
+                logger.info(f"Job {job.id} cancelled during execution")
+            except Exception as e:
+                job._set_failed(str(e))
+                logger.error(f"Job {job.id} failed: {e}", exc_info=True)
+
         try:
-            async with self._semaphore:
-                try:
-                    if job.is_cancelled:
-                        logger.info(f"Job {job.id} was cancelled before running")
-                        return
-                    result = await job.run()
-                    # If the subclass did not explicitly set a final status, mark it completed automatically.
-                    if job.status not in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED):
-                        job._set_completed(result)
-                    logger.info(f"Job {job.id} completed: {result}")
-                except asyncio.CancelledError:
-                    job.status = JobStatus.CANCELLED
-                    job._touch()
-                    logger.info(f"Job {job.id} cancelled during execution")
-                except Exception as e:
-                    job._set_failed(str(e))
-                    logger.error(f"Job {job.id} failed: {e}", exc_info=True)
+            # A CrawlJob waits for a crawl slot before claiming an executor slot, so a dozen-odd crawl tasks cannot fill max_concurrent_jobs while merely queuing for the external network.
+            if isinstance(job, CrawlJob):
+                async with crawl_slot():
+                    async with self._semaphore:
+                        await _execute()
+            else:
+                async with self._semaphore:
+                    await _execute()
         finally:
             self._tasks.pop(job.id, None)
 
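To see why the crawl slot is acquired before the executor semaphore, a small self-contained simulation helps: sixteen crawl jobs queue on crawl_slot first, so only CRAWL_MAX_CONCURRENT (4) of them hold executor slots at any time and a short non-crawl job still starts immediately. Everything here apart from crawl_slot itself (the executor size, job counts, and sleep times) is an assumed stand-in, and the import assumes the sketch runs inside this repo:

import asyncio

from app.core.execution.crawl_gate import crawl_slot

EXECUTOR_SLOTS = 8  # stand-in for max_concurrent_jobs; assumed value
executor_sem = asyncio.Semaphore(EXECUTOR_SLOTS)

async def crawl_job(i: int) -> None:
    # Same nesting order as JobExecutor._run_job: crawl slot first, executor slot second.
    async with crawl_slot():
        async with executor_sem:
            await asyncio.sleep(1.0)  # stand-in for a slow external crawl

async def validate_job() -> None:
    async with executor_sem:
        await asyncio.sleep(0.1)      # short local job; should not wait behind crawls

async def main() -> None:
    crawls = [asyncio.create_task(crawl_job(i)) for i in range(16)]
    await asyncio.sleep(0)            # let the crawl jobs queue up on crawl_slot
    t0 = asyncio.get_running_loop().time()
    await validate_job()              # finishes in ~0.1s because 4 executor slots stay free
    print(f"validate finished after {asyncio.get_running_loop().time() - t0:.2f}s")
    await asyncio.gather(*crawls)

asyncio.run(main())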