fix(crawl): throttle concurrent CrawlJobs and relax fpw/proxyscrape HTTP
- CrawlJob now acquires crawl_slot before the JobExecutor semaphore, so crawl-all does not fill executor slots while queued (see the sketch below)
- BaseHTTPPlugin: longer connect budget for slow international links
- proxyscrape: add a jsDelivr mirror and lengthen the GitHub/API phases
- fpw_*: raise timeouts/retries; lower internal concurrency on heavy multi-URL plugins

Made-with: Cursor
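A minimal sketch of the slot-ordering idea in the first bullet, assuming plain asyncio semaphores. The names CRAWL_SLOTS, EXECUTOR_SLOTS, and run_crawl_job are hypothetical stand-ins; the actual CrawlJob/JobExecutor code is not part of this diff.

import asyncio

CRAWL_SLOTS = asyncio.Semaphore(4)      # assumed cap on concurrent CrawlJobs
EXECUTOR_SLOTS = asyncio.Semaphore(16)  # assumed JobExecutor-wide cap

async def run_crawl_job(job_id: int) -> None:
    # Acquire the crawl slot FIRST: a queued crawl-all burst blocks here,
    # outside the executor, leaving executor slots free for other job types.
    async with CRAWL_SLOTS:
        async with EXECUTOR_SLOTS:
            await asyncio.sleep(0.1)  # stand-in for the actual crawl work
            print(f"crawl job {job_id} done")

async def main() -> None:
    await asyncio.gather(*(run_crawl_job(i) for i in range(10)))

asyncio.run(main())

With the acquisition order reversed, ten queued crawl jobs would each hold an executor slot while waiting for a crawl slot; acquiring the crawl slot first keeps at most four of them inside the executor.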
@@ -14,7 +14,7 @@ class FpwSocksSslProxyPlugin(BaseHTTPPlugin):
 
     def __init__(self):
         super().__init__()
-        self.max_concurrency = 6
+        self.max_concurrency = 4
         # Many mirrors share the sslproxies page template; socks-proxy is unstable on some networks, so multiple sources improve the success rate
         self.urls = [
            "https://www.sslproxies.org/",
@@ -39,7 +39,7 @@ class FpwSocksSslProxyPlugin(BaseHTTPPlugin):
 
     async def crawl(self) -> List[ProxyRaw]:
         results: List[ProxyRaw] = []
-        htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
+        htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
         for url, html in zip(self.urls, htmls):
             if not html:
                 continue
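For context, a hedged sketch of how a fetch_all helper honoring the three knobs tuned above (timeout, retries, max_concurrency) could be wired together. BaseHTTPPlugin's real implementation is not shown in this diff; the aiohttp-based body below is an illustrative assumption, not the project's code.

import asyncio
from typing import List, Optional

import aiohttp

async def fetch_all(
    urls: List[str],
    timeout: float = 25,
    retries: int = 2,
    max_concurrency: int = 4,
) -> List[Optional[str]]:
    # A lower concurrency cap eases the load on slow or flaky mirrors.
    sem = asyncio.Semaphore(max_concurrency)

    async def fetch_one(session: aiohttp.ClientSession, url: str) -> Optional[str]:
        async with sem:
            for attempt in range(retries + 1):
                try:
                    async with session.get(
                        url, timeout=aiohttp.ClientTimeout(total=timeout)
                    ) as resp:
                        resp.raise_for_status()
                        return await resp.text()
                except (aiohttp.ClientError, asyncio.TimeoutError):
                    if attempt == retries:
                        return None  # caller skips empty results, as crawl() does
                    await asyncio.sleep(1.0 * (attempt + 1))  # simple linear backoff
        return None

    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_one(session, u) for u in urls))

Under this reading, raising timeout from 12 to 25 and retries from 1 to 2 trades longer worst-case latency per URL for fewer empty results, while dropping max_concurrency from 6 to 4 limits how many mirrors are hit at once.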