fix: repair the settings-system disconnect, queue-count drift, resource leaks, and other issues across the board

- Unify the settings system: create_scheduler_service reads settings from the DB to override the defaults (see the sketch after this list)
- Fix the SQL in ProxyRepository.update_score that mistakenly deleted all invalid proxies
- ValidationQueue: fix worker-count drift and the starvation of recovery tasks at startup
- SchedulerService: remove the blocking drain() so the main loop responds to stop promptly
- TaskService: clean up expired tasks automatically within each scheduler cycle, preventing a memory leak
- lifespan/conftest: standardize the shutdown order, eliminating "Event loop closed" warnings
- Repository: add exc_info to exception logs; count "added today" by created_at
- ValidatorService: prevent double-closing of the HTTP session; remove the redundant SOCKS close
- Frontend: add the missing pluginsStore.isEmpty; raise the ProxyList minimum-score cap to 100
- Remove the redundant cors_origins_list property from config.py
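
For the first bullet, a minimal sketch of the defaults-then-DB-overlay pattern; SchedulerSettings and load_settings are hypothetical names for illustration, not this repo's actual identifiers:

from dataclasses import dataclass, fields, replace


@dataclass(frozen=True)
class SchedulerSettings:
    # Hypothetical fields; the real service defines its own.
    fetch_interval: int = 600       # seconds between crawl rounds
    validate_interval: int = 300    # seconds between validation rounds


def load_settings(db_rows: dict) -> SchedulerSettings:
    """Start from code defaults, then overlay only the keys the DB stores."""
    defaults = SchedulerSettings()
    known = {f.name for f in fields(defaults)}
    overrides = {k: v for k, v in db_rows.items() if k in known}
    return replace(defaults, **overrides)


# A partial or empty settings table degrades cleanly to the defaults:
print(load_settings({"fetch_interval": 120}))
# -> SchedulerSettings(fetch_interval=120, validate_interval=300)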
祀梦
2026-04-04 20:31:52 +08:00
parent 0788a13c8a
commit 875e61f17e
26 changed files with 568 additions and 355 deletions


@@ -2,7 +2,7 @@
 import random
 import asyncio
 import httpx
-from typing import List
+from typing import List, Optional
 
 from app.core.plugin_system import BaseCrawlerPlugin
@@ -19,6 +19,7 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
         ]
         self.urls: List[str] = []
         self.current_url: str = ""
+        self._client: Optional[httpx.AsyncClient] = None
 
     def get_headers(self) -> dict:
         return {
@@ -28,30 +29,38 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
             "Connection": "keep-alive",
         }
 
+    def _get_client(self) -> httpx.AsyncClient:
+        """Get or create the reusable AsyncClient."""
+        if self._client is None or self._client.is_closed:
+            transport = httpx.AsyncHTTPTransport(retries=0)
+            self._client = httpx.AsyncClient(
+                transport=transport,
+                follow_redirects=True,
+            )
+        return self._client
+
     async def fetch(self, url: str, timeout: float = 15.0, retries: int = 2) -> str:
         """Asynchronously fetch the HTML content of the given URL."""
         from app.core.log import logger
 
-        headers = self.get_headers()
-        transport = httpx.AsyncHTTPTransport(retries=0)
+        client = self._get_client()
         for attempt in range(retries):
-            async with httpx.AsyncClient(headers=headers, transport=transport, follow_redirects=True) as client:
-                try:
-                    response = await client.get(url, timeout=timeout)
-                    if response.status_code == 200:
-                        content = response.content
-                        encoding = response.encoding
-                        if encoding == "utf-8" or not encoding:
-                            try:
-                                return content.decode("utf-8")
-                            except UnicodeDecodeError:
-                                return content.decode("gbk", errors="ignore")
-                        return content.decode(encoding, errors="ignore")
-                    else:
-                        logger.warning(f"Fetch {url} returned status {response.status_code}")
-                except Exception as e:
-                    logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
-                    if attempt < retries - 1:
-                        await asyncio.sleep(random.uniform(1, 3))
+            try:
+                response = await client.get(url, headers=self.get_headers(), timeout=timeout)
+                if response.status_code == 200:
+                    content = response.content
+                    encoding = response.encoding
+                    if encoding == "utf-8" or not encoding:
+                        try:
+                            return content.decode("utf-8")
+                        except UnicodeDecodeError:
+                            return content.decode("gbk", errors="ignore")
+                    return content.decode(encoding, errors="ignore")
+                else:
+                    logger.warning(f"Fetch {url} returned status {response.status_code}")
+            except Exception as e:
+                logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
+                if attempt < retries - 1:
+                    await asyncio.sleep(random.uniform(1, 3))
         return ""
 
     async def fetch_all(self, urls: List[str], timeout: float = 15.0) -> List[str]:
@@ -64,3 +73,9 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
         tasks = [_fetch_limited(url) for url in urls]
         return await asyncio.gather(*tasks)
+
+    async def close(self):
+        """Close the reusable HTTP client."""
+        if self._client and not self._client.is_closed:
+            await self._client.aclose()
+            self._client = None
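
The new close() implies a matching lifecycle on the caller's side: reuse one client across many fetches, then close it exactly once. A standalone sketch of that contract, with ReusableFetcher standing in for BaseHTTPPlugin (the class name and URL are illustrative only):

import asyncio
from typing import Optional

import httpx


class ReusableFetcher:
    """Hypothetical stand-in for BaseHTTPPlugin's client-reuse contract."""

    def __init__(self) -> None:
        self._client: Optional[httpx.AsyncClient] = None

    def _get_client(self) -> httpx.AsyncClient:
        # Lazily create, and recreate if something closed it out from under us.
        if self._client is None or self._client.is_closed:
            self._client = httpx.AsyncClient(follow_redirects=True)
        return self._client

    async def fetch(self, url: str) -> str:
        # Every call shares the same connection pool.
        response = await self._get_client().get(url, timeout=15.0)
        return response.text

    async def close(self) -> None:
        # Idempotent: safe even if fetch() was never called.
        if self._client and not self._client.is_closed:
            await self._client.aclose()
            self._client = None


async def main() -> None:
    fetcher = ReusableFetcher()
    try:
        print(len(await fetcher.fetch("https://example.com")))
    finally:
        await fetcher.close()  # release pooled connections exactly once


asyncio.run(main())

Presumably the standardized shutdown order in lifespan/conftest from this commit plays that owner role for the real plugins.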