- CrawlJob waits on crawl_slot before the JobExecutor semaphore, so crawl-all does not fill executor slots while queued
- BaseHTTPPlugin: longer connect budget for slow international links
- proxyscrape: jsDelivr mirror + longer GitHub/API phases
- fpw_*: higher timeouts/retries; lower internal concurrency on heavy multi-URL plugins

Made-with: Cursor
"""批量爬取时限制同时发起 HTTP 的插件数,避免 crawl-all 与验证/聚合任务抢满执行器槽位。"""
|
|
import asyncio
|
|
from contextlib import asynccontextmanager
|
|
from typing import AsyncIterator
|
|
|
|
# 与单插件内 max_concurrency 相乘后仍应对外网友好;过小会拉长总耗时。
|
|
CRAWL_MAX_CONCURRENT = 4
|
|
|
|
_sem: asyncio.Semaphore | None = None
|
|
|
|
|
|
def _get_sem() -> asyncio.Semaphore:
|
|
global _sem
|
|
if _sem is None:
|
|
_sem = asyncio.Semaphore(CRAWL_MAX_CONCURRENT)
|
|
return _sem
|
|
|
|
|
|
@asynccontextmanager
|
|
async def crawl_slot() -> AsyncIterator[None]:
|
|
async with _get_sem():
|
|
yield
|
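
Below is a minimal sketch of the acquisition order the first bullet describes: a crawl job takes crawl_slot before the executor's own semaphore, so queued crawl-all work waits on the crawl gate instead of holding executor slots. The import path, EXECUTOR_MAX_JOBS, run_crawl_job, and _do_crawl are illustrative assumptions, not names taken from this repository.

# Sketch only: everything below except crawl_slot is assumed for illustration.
import asyncio

from crawl_limit import crawl_slot  # hypothetical module path for the file above

EXECUTOR_MAX_JOBS = 8  # assumed stand-in for the JobExecutor's slot count
_executor_sem = asyncio.Semaphore(EXECUTOR_MAX_JOBS)


async def _do_crawl(plugin_name: str) -> None:
    # Placeholder for a plugin's real HTTP work.
    await asyncio.sleep(0.1)


async def run_crawl_job(plugin_name: str) -> None:
    # crawl_slot first, executor slot second: a queued crawl-all job
    # blocks here without occupying an executor slot.
    async with crawl_slot():
        async with _executor_sem:
            await _do_crawl(plugin_name)


async def main() -> None:
    await asyncio.gather(*(run_crawl_job(f"plugin_{i}") for i in range(10)))


if __name__ == "__main__":
    asyncio.run(main())

Nesting in this order means crawl-all jobs queue on the crawl gate rather than the executor; an executor slot is only taken once the job is about to do real HTTP work.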