fix(crawl): throttle concurrent CrawlJobs and relax fpw/proxyscrape HTTP timeouts

- CrawlJob waits on crawl_slot before the JobExecutor semaphore, so crawl-all does not fill executor slots while merely queued for the network
- BaseHTTPPlugin: longer connect budget for slow international links
- proxyscrape: jsDelivr mirror + longer GitHub/API phases
- fpw_*: higher timeouts/retries; lower internal concurrency on heavy multi-URL plugins

Made-with: Cursor
祀梦
2026-04-05 13:48:41 +08:00
parent 2c98abaf91
commit 957cee3100
13 changed files with 116 additions and 52 deletions

View File

@@ -0,0 +1,22 @@
"""批量爬取时限制同时发起 HTTP 的插件数,避免 crawl-all 与验证/聚合任务抢满执行器槽位。"""
import asyncio
from contextlib import asynccontextmanager
from typing import AsyncIterator
# 与单插件内 max_concurrency 相乘后仍应对外网友好;过小会拉长总耗时。
CRAWL_MAX_CONCURRENT = 4
_sem: asyncio.Semaphore | None = None
def _get_sem() -> asyncio.Semaphore:
global _sem
if _sem is None:
_sem = asyncio.Semaphore(CRAWL_MAX_CONCURRENT)
return _sem
@asynccontextmanager
async def crawl_slot() -> AsyncIterator[None]:
async with _get_sem():
yield
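A minimal usage sketch (not part of the commit): crawl_slot caps how many of these coroutines run their HTTP phase at once; fake_crawl and the 10-task loop are made up for illustration.

import asyncio

from app.core.execution.crawl_gate import crawl_slot  # CRAWL_MAX_CONCURRENT = 4

async def fake_crawl(i: int) -> None:
    # at most CRAWL_MAX_CONCURRENT coroutines are inside this block at any moment
    async with crawl_slot():
        print(f"plugin {i} fetching...")
        await asyncio.sleep(1)  # stand-in for the real HTTP phase

async def main() -> None:
    await asyncio.gather(*(fake_crawl(i) for i in range(10)))

asyncio.run(main())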

View File

@@ -4,7 +4,8 @@ from contextlib import asynccontextmanager
 from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional
-from app.core.execution.job import Job, JobStatus
+from app.core.execution.job import CrawlJob, Job, JobStatus
+from app.core.execution.crawl_gate import crawl_slot
 from app.core.execution.worker_pool import AsyncWorkerPool
 from app.core.log import logger
@@ -67,24 +68,32 @@ class JobExecutor:
         return job.id

     async def _run_job(self, job: Job) -> None:
+        async def _execute() -> None:
+            try:
+                if job.is_cancelled:
+                    logger.info(f"Job {job.id} was cancelled before running")
+                    return
+                result = await job.run()
+                if job.status not in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED):
+                    job._set_completed(result)
+                logger.info(f"Job {job.id} completed: {result}")
+            except asyncio.CancelledError:
+                job.status = JobStatus.CANCELLED
+                job._touch()
+                logger.info(f"Job {job.id} cancelled during execution")
+            except Exception as e:
+                job._set_failed(str(e))
+                logger.error(f"Job {job.id} failed: {e}", exc_info=True)
+
         try:
-            async with self._semaphore:
-                try:
-                    if job.is_cancelled:
-                        logger.info(f"Job {job.id} was cancelled before running")
-                        return
-                    result = await job.run()
-                    # If the subclass did not explicitly set a terminal status, mark it completed
-                    if job.status not in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED):
-                        job._set_completed(result)
-                    logger.info(f"Job {job.id} completed: {result}")
-                except asyncio.CancelledError:
-                    job.status = JobStatus.CANCELLED
-                    job._touch()
-                    logger.info(f"Job {job.id} cancelled during execution")
-                except Exception as e:
-                    job._set_failed(str(e))
-                    logger.error(f"Job {job.id} failed: {e}", exc_info=True)
+            # A CrawlJob waits for a crawl slot before taking an executor slot, so a dozen crawl
+            # jobs cannot fill max_concurrent_jobs while they are merely queued for the network
+            if isinstance(job, CrawlJob):
+                async with crawl_slot():
+                    async with self._semaphore:
+                        await _execute()
+            else:
+                async with self._semaphore:
+                    await _execute()
         finally:
             self._tasks.pop(job.id, None)

View File

@@ -52,7 +52,8 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
     def _http_timeout(seconds: float) -> httpx.Timeout:
         """Tighten the connect phase on its own, so AsyncClient does not sit stuck in connect for a long time in some environments."""
         t = max(2.0, float(seconds))
-        c = min(6.0, max(3.0, t * 0.35))
+        # On international links / during bulk crawls, a short connect budget makes requests time out en masse
+        c = min(12.0, max(4.0, t * 0.4))
         return httpx.Timeout(t, connect=c)

     @staticmethod
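For reference, here is what the new connect budget works out to at the timeouts used elsewhere in this commit (a standalone sketch that simply restates the formula above, not an import of the plugin):

def connect_budget(seconds: float) -> float:
    # mirrors the new _http_timeout: connect = min(12.0, max(4.0, 0.4 * total))
    t = max(2.0, float(seconds))
    return min(12.0, max(4.0, t * 0.4))

print(connect_budget(25))  # 10.0 -> the fpw_* fetch_all timeout after this commit
print(connect_budget(28))  # 11.2 -> ProxyScrape's fetch_timeout
print(connect_budget(5))   # 4.0  -> short requests keep a 4s connect floor
print(connect_budget(60))  # 12.0 -> capped at 12s for very long budgets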

View File

@@ -47,7 +47,7 @@ class FpwCheckerproxyPlugin(BaseHTTPPlugin):
     async def crawl(self) -> List[ProxyRaw]:
         merged: List[ProxyRaw] = []
         seen: Set[Tuple[str, int, str]] = set()
-        htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
+        htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
         for html in htmls:
             if not html or len(html) < 200:
                 continue

View File

@@ -55,7 +55,7 @@ class FpwFreeproxylistsPlugin(BaseHTTPPlugin):
     async def crawl(self) -> List[ProxyRaw]:
         seen = set()
         out: List[ProxyRaw] = []
-        htmls = await self.fetch_all(self.urls, timeout=10, retries=1)
+        htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
         for url, html in zip(self.urls, htmls):
             if not html:
                 continue

View File

@@ -47,7 +47,7 @@ class FpwGatherproxyPlugin(BaseHTTPPlugin):
     async def crawl(self) -> List[ProxyRaw]:
         seen = set()
         out: List[ProxyRaw] = []
-        htmls = await self.fetch_all(self.urls, timeout=10, retries=1)
+        htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
         for url, html in zip(self.urls, htmls):
             if not html:
                 continue

View File

@@ -21,7 +21,7 @@ class FpwHidemyPlugin(BaseHTTPPlugin):
     async def crawl(self) -> List[ProxyRaw]:
         results: List[ProxyRaw] = []
-        htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
+        htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
         for url, html in zip(self.urls, htmls):
             if not html:
                 continue

View File

@@ -51,7 +51,7 @@ class FpwPremproxyPlugin(BaseHTTPPlugin):
     async def crawl(self) -> List[ProxyRaw]:
         merged: List[ProxyRaw] = []
-        htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
+        htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
         for url, html in zip(self.urls, htmls):
             if not html:
                 continue

View File

@@ -13,7 +13,7 @@ class FpwProxyListDownloadPlugin(BaseHTTPPlugin):
     def __init__(self):
         super().__init__()
-        self.max_concurrency = 8
+        self.max_concurrency = 4
         self.api_pairs = [
             ("http", "https://www.proxy-list.download/api/v1/get?type=http"),
             ("https", "https://www.proxy-list.download/api/v1/get?type=https"),
@@ -30,7 +30,7 @@ class FpwProxyListDownloadPlugin(BaseHTTPPlugin):
     async def crawl(self) -> List[ProxyRaw]:
         results: List[ProxyRaw] = []
         urls = [u for _, u in self.api_pairs]
-        htmls = await self.fetch_all(urls, timeout=10, retries=1)
+        htmls = await self.fetch_all(urls, timeout=25, retries=2)
         for (protocol, _), text in zip(self.api_pairs, htmls):
             if not text:
                 continue
@@ -41,7 +41,7 @@ class FpwProxyListDownloadPlugin(BaseHTTPPlugin):
         if not results:
             logger.warning(f"{self.display_name} primary API returned nothing, trying the ProxyScrape fallback")
             fb_urls = [u for _, u in self.fallback_pairs]
-            fb_htmls = await self.fetch_all(fb_urls, timeout=10, retries=1)
+            fb_htmls = await self.fetch_all(fb_urls, timeout=25, retries=2)
             for (protocol, _), text in zip(self.fallback_pairs, fb_htmls):
                 if not text:
                     continue

View File

@@ -65,7 +65,7 @@ class FpwProxynovaPlugin(BaseHTTPPlugin):
         return out

     async def crawl(self) -> List[ProxyRaw]:
-        html = await self.fetch(self.urls[0], timeout=14, retries=1)
+        html = await self.fetch(self.urls[0], timeout=25, retries=2)
         if not html:
             return []
         results = self._parse_rows(html)

View File

@@ -14,7 +14,7 @@ class FpwSocksSslProxyPlugin(BaseHTTPPlugin):
     def __init__(self):
         super().__init__()
-        self.max_concurrency = 6
+        self.max_concurrency = 4
         # Many mirror sites share the sslproxies page template; socks-proxy is flaky on some networks, so multiple sources raise the success rate
         self.urls = [
             "https://www.sslproxies.org/",
@@ -39,7 +39,7 @@ class FpwSocksSslProxyPlugin(BaseHTTPPlugin):
     async def crawl(self) -> List[ProxyRaw]:
         results: List[ProxyRaw] = []
-        htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
+        htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
         for url, html in zip(self.urls, htmls):
             if not html:
                 continue

View File

@@ -130,7 +130,7 @@ class FpwSpysOnePlugin(BaseHTTPPlugin):
         async def _one(proto: str, url: str, xf5: str) -> Tuple[str, str]:
             data = {**form_base, "xf5": xf5}
-            html = await self.fetch_post(url, data=data, timeout=14, retries=1)
+            html = await self.fetch_post(url, data=data, timeout=25, retries=2)
             return proto, html or ""

         pairs = await asyncio.gather(

View File

@@ -20,13 +20,16 @@ class ProxyScrapePlugin(BaseHTTPPlugin):
     def __init__(self):
         super().__init__()
-        # GitHub raw sources are the first choice
+        # GitHub raw first; the jsDelivr mirror is often steadier from mainland China or under heavy load
         self.urls = [
             ("http", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt"),
             ("https", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/https.txt"),
             ("socks4", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt"),
             ("socks5", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt"),
         ]
+        self._mirror_prefix = (
+            "https://cdn.jsdelivr.net/gh/monosans/proxy-list@main/proxies/"
+        )
         # ProxyScrape official API as the fallback
         self.api_urls = {
             "http": "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
@@ -56,14 +59,18 @@ class ProxyScrapePlugin(BaseHTTPPlugin):
         results: List[ProxyRaw] = []
         protocols = [protocol for protocol, _ in self.urls]
         urls = [url for _, url in self.urls]
-        # 1. Request all GitHub raw sources concurrently with a 10s overall limit; keep whatever finished first
-        tasks = [asyncio.create_task(self.fetch(url, timeout=12)) for url in urls]
-        done, pending = await asyncio.wait(tasks, timeout=10)
+        fetch_timeout = 28.0
+        # 1. GitHub raw: relax the overall wait; during crawl-all, contending with other plugins for bandwidth otherwise makes everything time out at once
+        tasks = [
+            asyncio.create_task(self.fetch(url, timeout=fetch_timeout))
+            for url in urls
+        ]
+        done, pending = await asyncio.wait(tasks, timeout=45)
         for task in pending:
             task.cancel()
-        htmls = []
-        done_protocols = set()
+        htmls: list[str] = []
+        done_protocols: set[str] = set()
         for i, task in enumerate(tasks):
             try:
                 if task in done:
@@ -73,35 +80,60 @@ class ProxyScrapePlugin(BaseHTTPPlugin):
htmls.append("") htmls.append("")
except Exception: except Exception:
htmls.append("") htmls.append("")
# 异常时不加入 done_protocols以便触发 API fallback
fallback_protocols = [] need_mirror: list[str] = []
for protocol, html in zip(protocols, htmls): for protocol, html in zip(protocols, htmls):
proxies = self._parse_proxies(html or "", protocol) if html else [] proxies = self._parse_proxies(html or "", protocol) if html else []
if proxies: if proxies:
logger.info(f"ProxyScrape {protocol.upper()} GitHub raw 获取 {len(proxies)} 个代理") logger.info(
f"ProxyScrape {protocol.upper()} GitHub raw 获取 {len(proxies)} 个代理"
)
results.extend(proxies) results.extend(proxies)
else: else:
if protocol in done_protocols: if protocol in done_protocols:
logger.warning(f"ProxyScrape {protocol.upper()} GitHub raw 返回空或无效,将尝试 API fallback") logger.warning(
f"ProxyScrape {protocol.upper()} GitHub raw 返回空或无效,尝试镜像与 API"
)
else: else:
logger.warning(f"ProxyScrape {protocol.upper()} GitHub raw 请求超时,将尝试 API fallback") logger.warning(
fallback_protocols.append(protocol) f"ProxyScrape {protocol.upper()} GitHub raw 请求超时,尝试镜像与 API"
)
need_mirror.append(protocol)
# 2. 对 GitHub raw 失败的协议,并发请求 ProxyScrape API fallback # 2. jsDelivr 镜像(顺序请求,减轻与其它插件的瞬时并发叠加)
if fallback_protocols: still_need_api: list[str] = []
fallback_urls = [self.api_urls[p] for p in fallback_protocols] for protocol in need_mirror:
mirror_url = f"{self._mirror_prefix}{protocol}.txt"
text = await self.fetch(mirror_url, timeout=fetch_timeout, retries=2)
proxies = self._parse_proxies(text or "", protocol) if text else []
if proxies:
logger.info(
f"ProxyScrape {protocol.upper()} jsDelivr 镜像获取 {len(proxies)} 个代理"
)
results.extend(proxies)
else:
still_need_api.append(protocol)
# 3. ProxyScrape 官方 API
if still_need_api:
fallback_urls = [self.api_urls[p] for p in still_need_api]
try: try:
api_htmls = await asyncio.wait_for( api_htmls = await asyncio.wait_for(
self.fetch_all(fallback_urls, timeout=10), timeout=10 self.fetch_all(fallback_urls, timeout=25), timeout=35
) )
except asyncio.TimeoutError: except asyncio.TimeoutError:
logger.warning(f"ProxyScrape API fallback 批量请求超时,跳过 {len(fallback_protocols)} 个协议") logger.warning(
api_htmls = [""] * len(fallback_protocols) f"ProxyScrape API fallback 批量请求超时,跳过 {len(still_need_api)} 个协议"
for protocol, api_html in zip(fallback_protocols, api_htmls): )
proxies = self._parse_proxies(api_html or "", protocol) if api_html else [] api_htmls = [""] * len(still_need_api)
for protocol, api_html in zip(still_need_api, api_htmls):
proxies = (
self._parse_proxies(api_html or "", protocol) if api_html else []
)
if proxies: if proxies:
logger.info(f"ProxyScrape {protocol.upper()} API 获取 {len(proxies)} 个代理") logger.info(
f"ProxyScrape {protocol.upper()} API 获取 {len(proxies)} 个代理"
)
results.extend(proxies) results.extend(proxies)
else: else:
logger.warning(f"ProxyScrape {protocol.upper()} API 返回空或无效") logger.warning(f"ProxyScrape {protocol.upper()} API 返回空或无效")