fix(crawl): throttle concurrent CrawlJobs and relax fpw/proxyscrape HTTP

- CrawlJob acquires crawl_slot before the JobExecutor semaphore, so a queued crawl-all does not fill executor slots
- BaseHTTPPlugin: longer connect budget for slow international links
- proxyscrape: jsDelivr mirror + longer GitHub/API phases
- fpw_*: higher timeouts/retries; lower internal concurrency on heavy multi-URL plugins

Made-with: Cursor
祀梦
2026-04-05 13:48:41 +08:00
parent 2c98abaf91
commit 957cee3100
13 changed files with 116 additions and 52 deletions


@@ -52,7 +52,8 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
     def _http_timeout(seconds: float) -> httpx.Timeout:
         """Tighten the connect phase separately, so AsyncClient does not hang on connect for long in some environments."""
         t = max(2.0, float(seconds))
-        c = min(6.0, max(3.0, t * 0.35))
+        # Too short a connect budget on international links / bulk crawls causes mass timeouts
+        c = min(12.0, max(4.0, t * 0.4))
         return httpx.Timeout(t, connect=c)
     @staticmethod
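The effect of the new clamp can be checked standalone. The sketch below mirrors only the arithmetic of the patched `_http_timeout` (the `httpx.Timeout` construction is omitted so it runs without dependencies; `connect_budget` is a hypothetical name):

```python
def connect_budget(seconds: float) -> tuple:
    # Mirror of the patched math: total timeout floored at 2s,
    # connect phase at 40% of total, clamped to [4s, 12s].
    t = max(2.0, float(seconds))
    c = min(12.0, max(4.0, t * 0.4))
    return (t, c)

# Slow international links now get a larger connect window than the
# old min(6.0, max(3.0, t * 0.35)) clamp allowed:
print(connect_budget(10))  # (10.0, 4.0)
print(connect_budget(30))  # (30.0, 12.0)
```

Under the old formula a 30s total timeout capped connect at 6s; the new clamp doubles that ceiling while raising the floor from 3s to 4s.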