refactor(crawl): parallel plugins via JobExecutor; per-plugin throttle
- Remove global crawl_slot gate; all CrawlJobs share only the executor semaphore
- max_concurrent_jobs = max(24, n_plugins + 8) for crawl-all plus aggregator headroom
- BaseHTTPPlugin max_concurrency 3 -> 2; fpw multi-URL plugins 4 -> 2
- fetch_all: short random delay before each request to ease single-host pressure

Made-with: Cursor
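The executor sizing rule in the bullets above is not part of the hunks below. As a rough illustration only, here is a minimal sketch of what that gate could look like, assuming JobExecutor wraps a plain asyncio.Semaphore; the class shape, the run() signature, and the job callable are assumptions, not code from this commit:

import asyncio
from typing import Awaitable, Callable

class JobExecutor:
    # Hypothetical sketch; only the sizing rule comes from the commit message.
    def __init__(self, n_plugins: int):
        # Enough slots for a full crawl-all fan-out (one job per plugin)
        # plus headroom for the aggregator, with a floor of 24.
        self.max_concurrent_jobs = max(24, n_plugins + 8)
        self._semaphore = asyncio.Semaphore(self.max_concurrent_jobs)

    async def run(self, job: Callable[[], Awaitable[None]]) -> None:
        # Every CrawlJob shares only this semaphore; the old global
        # crawl_slot gate is gone.
        async with self._semaphore:
            await job()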
@@ -26,7 +26,7 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
         self.urls: List[str] = []
         self.current_url: str = ""
         self._client: Optional[httpx.AsyncClient] = None
-        self.max_concurrency: int = 3
+        self.max_concurrency: int = 2
 
     def get_headers(self) -> dict:
         return {
@@ -178,6 +178,7 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
 
         async def _fetch_limited(url: str):
             async with semaphore:
+                await asyncio.sleep(random.uniform(0.08, 0.45))
                 return await self.fetch(url, timeout=timeout, retries=retries)
 
         tasks = [_fetch_limited(url) for url in urls]
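For context, a minimal reconstruction of the fetch_all method surrounding the hunk above, assuming the semaphore is built from self.max_concurrency; the method signature, the timeout/retries defaults, and return_exceptions=True are assumptions:

import asyncio
import random
from typing import List

class ThrottledFetcher:
    # Hypothetical stand-in for BaseHTTPPlugin; only the _fetch_limited
    # body matches the diff above.
    max_concurrency: int = 2

    async def fetch(self, url: str, timeout: float, retries: int) -> str:
        raise NotImplementedError  # the real plugin performs the HTTP GET

    async def fetch_all(self, urls: List[str], timeout: float = 10.0,
                        retries: int = 2) -> list:
        semaphore = asyncio.Semaphore(self.max_concurrency)

        async def _fetch_limited(url: str):
            async with semaphore:
                # Jitter each request so a burst of coroutines does not
                # hit the same host at the exact same instant.
                await asyncio.sleep(random.uniform(0.08, 0.45))
                return await self.fetch(url, timeout=timeout, retries=retries)

        tasks = [_fetch_limited(url) for url in urls]
        # return_exceptions=True (assumed) keeps one dead mirror from
        # cancelling the rest.
        return await asyncio.gather(*tasks, return_exceptions=True)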
@@ -13,7 +13,7 @@ class FpwProxyListDownloadPlugin(BaseHTTPPlugin):
 
     def __init__(self):
         super().__init__()
-        self.max_concurrency = 4
+        self.max_concurrency = 2
         self.api_pairs = [
             ("http", "https://www.proxy-list.download/api/v1/get?type=http"),
             ("https", "https://www.proxy-list.download/api/v1/get?type=https"),
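api_pairs pairs a protocol label with its endpoint URL. How the plugin consumes it is not shown in the diff; a hypothetical sketch, reusing the ThrottledFetcher stand-in above (crawl() and the line-per-proxy parsing are assumptions):

from typing import List, Tuple

class ApiPairsPlugin(ThrottledFetcher):
    # Hypothetical stand-in for FpwProxyListDownloadPlugin.
    def __init__(self):
        self.max_concurrency = 2
        self.api_pairs: List[Tuple[str, str]] = [
            ("http", "https://www.proxy-list.download/api/v1/get?type=http"),
            ("https", "https://www.proxy-list.download/api/v1/get?type=https"),
        ]

    async def crawl(self) -> List[Tuple[str, str]]:
        # Fetch every endpoint through the throttled fetch_all, then tag
        # each "host:port" line with its protocol label.
        pages = await self.fetch_all([url for _, url in self.api_pairs])
        proxies: List[Tuple[str, str]] = []
        for (label, _), page in zip(self.api_pairs, pages):
            if isinstance(page, Exception):
                continue  # one dead endpoint should not sink the crawl
            proxies.extend((label, line.strip())
                           for line in page.splitlines() if line.strip())
        return proxies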
@@ -14,7 +14,7 @@ class FpwSocksSslProxyPlugin(BaseHTTPPlugin):
 
     def __init__(self):
         super().__init__()
-        self.max_concurrency = 4
+        self.max_concurrency = 2
         # Many mirror sites share the sslproxies template; socks-proxy is unstable on some networks, so multiple sources raise the success rate
         self.urls = [
             "https://www.sslproxies.org/",
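The practical effect of the 4 -> 2 drop for these multi-URL plugins: at most two mirrors are in flight against the proxy-list hosts at any moment, each preceded by the short jitter from fetch_all. A hypothetical driver, again reusing the ThrottledFetcher sketch:

import asyncio

class MirrorListPlugin(ThrottledFetcher):
    # Hypothetical stand-in for FpwSocksSslProxyPlugin.
    def __init__(self):
        self.max_concurrency = 2
        self.urls = ["https://www.sslproxies.org/"]  # plus the other mirrors

    async def fetch(self, url, timeout, retries):
        return ""  # the real plugin does the HTTP GET via httpx

async def main():
    plugin = MirrorListPlugin()
    # With max_concurrency = 2, at most two mirrors download concurrently.
    pages = await plugin.fetch_all(plugin.urls)
    print(len(pages))

asyncio.run(main())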