fix(crawl): throttle concurrent CrawlJobs and relax fpw/proxyscrape HTTP

- CrawlJob waits on crawl_slot before JobExecutor semaphore so crawl-all does not fill slots while queued
- BaseHTTPPlugin: longer connect budget for slow international links
- proxyscrape: jsDelivr mirror + longer GitHub/API phases
- fpw_*: higher timeouts/retries; lower internal concurrency on heavy multi-URL plugins

Made-with: Cursor
This commit is contained in:
祀梦
2026-04-05 13:48:41 +08:00
parent 2c98abaf91
commit 957cee3100
13 changed files with 116 additions and 52 deletions

View File

@@ -0,0 +1,22 @@
"""批量爬取时限制同时发起 HTTP 的插件数,避免 crawl-all 与验证/聚合任务抢满执行器槽位。"""
import asyncio
from contextlib import asynccontextmanager
from typing import AsyncIterator
# Even after multiplying by each plugin's internal max_concurrency, the total
# should stay friendly to external networks; too small a value lengthens the
# overall crawl time.
CRAWL_MAX_CONCURRENT = 4
# Module-level semaphore shared by all callers; stays None until _get_sem()
# creates it on first use.
_sem: asyncio.Semaphore | None = None
def _get_sem() -> asyncio.Semaphore:
    """Return the shared crawl semaphore, creating it on the first call.

    The semaphore is sized by ``CRAWL_MAX_CONCURRENT`` and cached in the
    module-level ``_sem`` so every caller receives the same instance.
    """
    global _sem
    existing = _sem
    if existing is not None:
        return existing
    # First use: build the semaphore and remember it for later callers.
    created = asyncio.Semaphore(CRAWL_MAX_CONCURRENT)
    _sem = created
    return created
@asynccontextmanager
async def crawl_slot() -> AsyncIterator[None]:
    """Hold one slot of the shared crawl semaphore for the ``async with`` body.

    Entering waits until a slot is available; leaving (normally or via an
    exception) gives the slot back.
    """
    slot_guard = _get_sem()
    await slot_guard.acquire()
    try:
        yield
    finally:
        # Always hand the slot back, even if the body raised.
        slot_guard.release()