fix(crawl): throttle concurrent CrawlJobs and relax fpw/proxyscrape HTTP timeouts
- CrawlJob waits on crawl_slot before the JobExecutor semaphore, so crawl-all does not fill executor slots while queued (acquisition order sketched below)
- BaseHTTPPlugin: longer connect budget for slow international links
- proxyscrape: jsDelivr mirror + longer GitHub/API phases
- fpw_*: higher timeouts/retries; lower internal concurrency on heavy multi-URL plugins

Made-with: Cursor
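The first item is an ordering change rather than a tuning change: a CrawlJob must hold a shared crawl slot before it competes for the JobExecutor's general slots, so a crawl-all burst queues on the crawl pool instead of occupying executor capacity. A minimal sketch of that acquisition order, assuming a module-level asyncio.Semaphore named crawl_slot and an executor-owned semaphore (names and pool sizes here are illustrative, not the project's actual API):

import asyncio

# Assumed globals: a small pool of crawl slots shared by every CrawlJob,
# and the executor's larger semaphore that caps running jobs of any kind.
crawl_slot = asyncio.Semaphore(2)           # assumed size, not from the commit
executor_semaphore = asyncio.Semaphore(16)  # assumed size, not from the commit

async def run_crawl_job(crawl_coro):
    # Take the crawl slot FIRST, while the job is still logically "queued".
    # A crawl-all burst blocks here and does not fill executor slots that
    # other job types need.
    async with crawl_slot:
        # Only once a crawl slot is held do we take an executor slot and run.
        async with executor_semaphore:
            return await crawl_coro()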
@@ -13,7 +13,7 @@ class FpwProxyListDownloadPlugin(BaseHTTPPlugin):

     def __init__(self):
         super().__init__()
-        self.max_concurrency = 8
+        self.max_concurrency = 4
         self.api_pairs = [
             ("http", "https://www.proxy-list.download/api/v1/get?type=http"),
             ("https", "https://www.proxy-list.download/api/v1/get?type=https"),
@@ -30,7 +30,7 @@ class FpwProxyListDownloadPlugin(BaseHTTPPlugin):
     async def crawl(self) -> List[ProxyRaw]:
         results: List[ProxyRaw] = []
         urls = [u for _, u in self.api_pairs]
-        htmls = await self.fetch_all(urls, timeout=10, retries=1)
+        htmls = await self.fetch_all(urls, timeout=25, retries=2)
         for (protocol, _), text in zip(self.api_pairs, htmls):
             if not text:
                 continue
@@ -41,7 +41,7 @@ class FpwProxyListDownloadPlugin(BaseHTTPPlugin):
         if not results:
             logger.warning(f"{self.display_name} 主 API 无数据,尝试 ProxyScrape 备用")
             fb_urls = [u for _, u in self.fallback_pairs]
-            fb_htmls = await self.fetch_all(fb_urls, timeout=10, retries=1)
+            fb_htmls = await self.fetch_all(fb_urls, timeout=25, retries=2)
             for (protocol, _), text in zip(self.fallback_pairs, fb_htmls):
                 if not text:
                     continue
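The fpw_* hunks above only move numbers; the behavior they tune presumably lives in BaseHTTPPlugin.fetch_all. A rough sketch of that shape, assuming fetch_all caps parallel requests with the plugin's max_concurrency and retries each URL up to retries extra times over aiohttp (the session handling, timeout split, and names below are assumptions, not the project's actual implementation):

import asyncio
from typing import List, Optional

import aiohttp

class BaseHTTPPluginSketch:
    # Lowered from 8 to 4 in this commit for heavy multi-URL plugins.
    max_concurrency = 4

    async def fetch_all(self, urls: List[str], timeout: int, retries: int) -> List[Optional[str]]:
        sem = asyncio.Semaphore(self.max_concurrency)
        # Generous connect budget for slow international links.
        client_timeout = aiohttp.ClientTimeout(total=timeout, connect=timeout)

        async def fetch_one(session: aiohttp.ClientSession, url: str) -> Optional[str]:
            for attempt in range(retries + 1):
                try:
                    async with sem, session.get(url) as resp:
                        resp.raise_for_status()
                        return await resp.text()
                except (aiohttp.ClientError, asyncio.TimeoutError):
                    if attempt == retries:
                        return None  # crawl() skips falsy entries
                    await asyncio.sleep(1)
            return None

        async with aiohttp.ClientSession(timeout=client_timeout) as session:
            return await asyncio.gather(*(fetch_one(session, u) for u in urls))

Under these assumptions, timeout=25 with retries=2 lets one slow endpoint hold its slot for up to roughly three 25-second attempts, which is why the per-plugin concurrency is dropped in the same commit.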