主要变更: - 后端代码从根目录迁移到 app/ 目录 - 前端代码从 frontend/ 重命名为 WebUI/ - 更新所有导入路径以适配新结构 - 提取公共 API 响应函数到 app/api/common.py - 精简验证器服务代码 - 更新启动脚本和文档 测试: - 新增完整测试套件 (tests/) - 单元测试: 模型、仓库层 - 集成测试: 覆盖所有 22+ API 端点 - E2E 测试: 4个完整工作流场景 - 添加 pytest 配置和测试运行脚本
53 lines
2.4 KiB
Python
53 lines
2.4 KiB
Python
"""通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装"""
|
|
import random
|
|
import asyncio
|
|
import aiohttp
|
|
from typing import List
|
|
from app.core.plugin_system import BaseCrawlerPlugin
|
|
|
|
|
|
class BaseHTTPPlugin(BaseCrawlerPlugin):
    """Base class for crawler plugins that fetch pages over plain HTTP.

    Provides a pool of browser User-Agent strings, a default request header
    set, and an async ``fetch`` helper with retries and tolerant decoding.
    """

    def __init__(self):
        super().__init__()
        # Desktop browser User-Agents; get_headers() picks one at random.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
        ]
        # Starts empty; presumably populated by concrete plugins - confirm
        # against subclasses.
        self.urls: List[str] = []
        self.current_url: str = ""

    def get_headers(self) -> dict:
        """Return request headers with a randomly chosen User-Agent."""
        return {
            "User-Agent": random.choice(self.user_agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
        }

    @staticmethod
    def _decode_body(content: bytes, encoding: str) -> str:
        """Decode a response body to text without ever raising.

        Args:
            content: Raw response bytes.
            encoding: Charset reported for the response; may be empty.

        Returns:
            The decoded text. Undecodable bytes are dropped on the fallback
            paths.
        """
        if not encoding or encoding.lower() in ("utf-8", "utf8"):
            try:
                return content.decode("utf-8")
            except UnicodeDecodeError:
                # Body is not valid UTF-8 despite the (missing or declared)
                # charset; fall back to GBK and drop undecodable bytes.
                return content.decode("gbk", errors="ignore")
        try:
            return content.decode(encoding, errors="ignore")
        except LookupError:
            # Unknown codec name: retrying the download cannot help, so
            # degrade to lossy UTF-8 instead of failing the whole fetch.
            return content.decode("utf-8", errors="ignore")

    async def fetch(self, url: str, timeout: float = 10.0, retries: int = 3) -> str:
        """Fetch the HTML content of ``url`` asynchronously.

        Args:
            url: Address to request with HTTP GET.
            timeout: Total time budget per attempt, in seconds.
            retries: Maximum number of attempts; values <= 0 make no request.

        Returns:
            The decoded body of the first response with status 200, or an
            empty string when every attempt fails. Failures are never raised
            to the caller.
        """
        # Local import keeps this block self-contained; the module itself
        # does not import logging.
        import logging

        logger = logging.getLogger(__name__)
        request_timeout = aiohttp.ClientTimeout(total=timeout)
        headers = self.get_headers()

        async with aiohttp.ClientSession(headers=headers) as session:
            for attempt in range(retries):
                try:
                    async with session.get(url, timeout=request_timeout) as response:
                        if response.status == 200:
                            content = await response.read()
                            return self._decode_body(
                                content, response.get_encoding()
                            )
                        logger.debug(
                            "fetch %s: HTTP %s (attempt %d/%d)",
                            url, response.status, attempt + 1, retries,
                        )
                except Exception as exc:
                    # Deliberate best-effort boundary: any failure counts as
                    # a failed attempt, but is recorded instead of hidden.
                    logger.debug(
                        "fetch %s failed (attempt %d/%d): %r",
                        url, attempt + 1, retries, exc,
                    )

                # Back off only when another attempt follows; sleeping after
                # the final attempt would just delay returning "".
                if attempt < retries - 1:
                    await asyncio.sleep(random.uniform(1, 3))

        return ""
|