refactor(crawl): parallel plugins via JobExecutor; per-plugin throttle

- Remove global crawl_slot gate; all CrawlJobs share only the executor semaphore
- max_concurrent_jobs = max(24, n_plugins+8) for crawl-all + aggregator headroom
- BaseHTTPPlugin max_concurrency 3->2; fpw multi-URL plugins 4->2
- fetch_all: short random delay before each request to ease single-host pressure

Made-with: Cursor
2026-04-05 14:08:26 +08:00
parent 957cee3100
commit a26ae50051
6 changed files with 11 additions and 38 deletions
--- a/app/plugins/base.py
+++ b/app/plugins/base.py
@@ -26,7 +26,7 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
        self.urls: List[str] = []
        self.current_url: str = ""
        self._client: Optional[httpx.AsyncClient] = None
-        self.max_concurrency: int = 3
+        self.max_concurrency: int = 2

    def get_headers(self) -> dict:
        return {
@@ -178,6 +178,7 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):

        async def _fetch_limited(url: str):
            async with semaphore:
+                await asyncio.sleep(random.uniform(0.08, 0.45))
                return await self.fetch(url, timeout=timeout, retries=retries)

        tasks = [_fetch_limited(url) for url in urls]