- Unified settings system: create_scheduler_service now reads settings from the DB to override the defaults (a minimal sketch of this override pattern follows the list)
- Fixed the SQL in ProxyRepository.update_score that mistakenly deleted all invalid proxies
- ValidationQueue: fixed worker-count drift and starvation of recovery tasks on startup
- SchedulerService: removed the blocking drain() call so the main loop can respond to stop
- TaskService: expired tasks are now cleaned up automatically within each scheduler cycle, preventing a memory leak
- lifespan/conftest: normalized shutdown ordering, eliminating "Event loop closed" warnings
- Repository: exception logs now include exc_info; "added today" is counted by created_at
- ValidatorService: prevented double-closing of the HTTP session; removed a redundant SOCKS close
- Frontend: added the missing pluginsStore.isEmpty; raised the ProxyList minimum-score upper bound to 100
- Removed the redundant cors_origins_list property from config.py
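The first item only names the new settings flow; since create_scheduler_service itself is not part of this excerpt, the following is a minimal sketch of the "DB settings override code defaults" pattern it describes. SchedulerConfig, apply_db_settings, and the key names are illustrative assumptions, not the project's actual API.

from dataclasses import dataclass


@dataclass
class SchedulerConfig:
    # Code defaults; field names and values here are illustrative assumptions.
    crawl_interval: int = 600      # seconds between crawl rounds
    validate_interval: int = 300   # seconds between validation rounds


def apply_db_settings(config: SchedulerConfig, stored: dict) -> SchedulerConfig:
    """Let rows from a key-value settings table win over code defaults."""
    for key, raw in stored.items():
        if hasattr(config, key):
            current = getattr(config, key)
            # Settings tables typically store strings; coerce to the default's type.
            setattr(config, key, type(current)(raw))
    return config


# Usage: apply_db_settings(SchedulerConfig(), {"crawl_interval": "120"})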
"""通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装"""
|
||
import random
|
||
import asyncio
|
||
import httpx
|
||
from typing import List, Optional
|
||
from app.core.plugin_system import BaseCrawlerPlugin
|
||
|
||
|
||
class BaseHTTPPlugin(BaseCrawlerPlugin):
|
||
"""基于 HTTP 的爬虫插件基类"""
|
||
|
||
def __init__(self):
|
||
super().__init__()
|
||
self.user_agents = [
|
||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
|
||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
|
||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
|
||
]
|
||
self.urls: List[str] = []
|
||
self.current_url: str = ""
|
||
self._client: Optional[httpx.AsyncClient] = None
|
||
|
||
def get_headers(self) -> dict:
|
||
return {
|
||
"User-Agent": random.choice(self.user_agents),
|
||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
||
"Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
|
||
"Connection": "keep-alive",
|
||
}
|
||
|
||
def _get_client(self) -> httpx.AsyncClient:
|
||
"""获取或创建复用的 AsyncClient"""
|
||
if self._client is None or self._client.is_closed:
|
||
transport = httpx.AsyncHTTPTransport(retries=0)
|
||
self._client = httpx.AsyncClient(
|
||
transport=transport,
|
||
follow_redirects=True,
|
||
)
|
||
return self._client
|
||
|
||
async def fetch(self, url: str, timeout: float = 15.0, retries: int = 2) -> str:
|
||
"""异步抓取指定 URL 的 HTML 内容"""
|
||
from app.core.log import logger
|
||
client = self._get_client()
|
||
for attempt in range(retries):
|
||
try:
|
||
response = await client.get(url, headers=self.get_headers(), timeout=timeout)
|
||
if response.status_code == 200:
|
||
content = response.content
|
||
encoding = response.encoding
|
||
if encoding == "utf-8" or not encoding:
|
||
try:
|
||
return content.decode("utf-8")
|
||
except UnicodeDecodeError:
|
||
return content.decode("gbk", errors="ignore")
|
||
return content.decode(encoding, errors="ignore")
|
||
else:
|
||
logger.warning(f"Fetch {url} returned status {response.status_code}")
|
||
except Exception as e:
|
||
logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
|
||
if attempt < retries - 1:
|
||
await asyncio.sleep(random.uniform(1, 3))
|
||
return ""
|
||
|
||
async def fetch_all(self, urls: List[str], timeout: float = 15.0) -> List[str]:
|
||
"""并发抓取多个 URL,限制单个插件内部并发为 3"""
|
||
semaphore = asyncio.Semaphore(3)
|
||
|
||
async def _fetch_limited(url: str):
|
||
async with semaphore:
|
||
return await self.fetch(url, timeout=timeout)
|
||
|
||
tasks = [_fetch_limited(url) for url in urls]
|
||
return await asyncio.gather(*tasks)
|
||
|
||
async def close(self):
|
||
"""关闭复用的 HTTP 客户端"""
|
||
if self._client and not self._client.is_closed:
|
||
await self._client.aclose()
|
||
self._client = None
|
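For context, a hypothetical concrete plugin built on this base class might look like the following (reusing the imports from the file above). Only fetch_all(), fetch()'s empty-string failure contract, and close() come from the code shown here; the crawl() entry point and the parsing step are assumptions, since BaseCrawlerPlugin is not part of this file.

class ExampleFreeProxyPlugin(BaseHTTPPlugin):
    # Hypothetical subclass for illustration; URLs and crawl() are assumptions.
    def __init__(self):
        super().__init__()
        self.urls = [
            "https://example.com/free-proxy-list/1",
            "https://example.com/free-proxy-list/2",
        ]

    async def crawl(self) -> List[str]:
        try:
            pages = await self.fetch_all(self.urls)  # per-plugin concurrency capped at 3
            proxies: List[str] = []
            for html in pages:
                if not html:  # fetch() returns "" when all retries fail
                    continue
                # A real plugin would parse ip:port pairs out of the HTML here.
                proxies.extend(line.strip() for line in html.splitlines() if ":" in line)
            return proxies
        finally:
            await self.close()  # release the shared AsyncClient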