Backend refactoring:
- Introduce a layered architecture: API Routes -> Services -> Repositories -> Infrastructure
- Remove all global singletons and adopt FastAPI dependency injection throughout (see the wiring sketch below)
- Add an api/ directory that splits the routes apart (proxies, plugins, scheduler, settings, stats)
- Add a services/ business-logic layer: ProxyService, PluginService, SchedulerService, ValidatorService, SettingsService
- Add a repositories/ data-access layer: ProxyRepository, SettingsRepository, PluginSettingsRepository
- Add a models/ layer: Pydantic schemas + domain models
- Rewrite core/config.py to manage configuration with Pydantic Settings (see the settings sketch below)
- Add core/db.py: asynccontextmanager-based connection management, with database-migration support
- Add core/exceptions.py: a unified hierarchy of business exceptions

Plugin system refactoring (the core change):
- Add core/plugin_system/: BaseCrawlerPlugin + PluginRegistry
- Adopt an explicit registration pattern (decorator + plugins/__init__.py) that is type-safe and test-friendly (see the registry sketch below)
- Add plugins/base.py: BaseHTTPPlugin, a generic base class for HTTP crawlers
- Migrate all 7 plugins to the new architecture (fate0, proxylist_download, ip3366, ip89, kuaidaili, speedx, yundaili)
- Persist plugin state to the plugin_settings table

Task scheduling refactoring:
- Add core/tasks/queue.py: ValidationQueue + WorkerPool (see the queue sketch below)
- Decouple crawling from validation: crawlers only crawl; proxies are submitted to a queue and validated asynchronously by workers
- The scheduler periodically pulls existing proxies from the database and feeds them into the validation queue in batches

Frontend adjustments:
- Add a frontend/src/services/ layer that splits out the API-call logic
- Update stores/ and views/ to go through the service layer
- The API stays compatible, so pages need no major changes

Misc:
- Add main.py as the new entry point
- Add DESIGN.md, an architecture design document
- Update requirements.txt to add pydantic-settings

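A minimal sketch of how the Routes -> Services -> Repositories wiring can look with FastAPI's Depends. The stub class bodies and the get_* factory names here are illustrative, not the actual implementation:

# Illustrative sketch only -- class bodies and factory names are made up.
from fastapi import APIRouter, Depends

class ProxyRepository:
    """Data-access layer: would wrap the shared connection from core/db.py."""
    async def list_all(self) -> list:
        return []

class ProxyService:
    """Business-logic layer: depends on the repository, not on the DB."""
    def __init__(self, repo: ProxyRepository) -> None:
        self.repo = repo

    async def list_proxies(self) -> list:
        return await self.repo.list_all()

router = APIRouter(prefix="/api/proxies")

def get_proxy_repository() -> ProxyRepository:
    return ProxyRepository()

def get_proxy_service(
    repo: ProxyRepository = Depends(get_proxy_repository),
) -> ProxyService:
    # No global singleton: each request resolves its own service graph.
    return ProxyService(repo)

@router.get("/")
async def list_proxies(service: ProxyService = Depends(get_proxy_service)):
    return await service.list_proxies()
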
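A minimal sketch of a pydantic-settings based core/config.py; the field names and the env prefix are assumptions for illustration:

# Illustrative sketch only -- field names and env prefix are made up.
from functools import lru_cache

from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_prefix="PROXYPOOL_", env_file=".env")

    db_path: str = "proxies.db"
    validation_workers: int = 20
    validation_timeout: float = 10.0

@lru_cache
def get_settings() -> Settings:
    # Cached factory instead of a module-level singleton, so the settings
    # object can be injected (and overridden in tests) via Depends.
    return Settings()
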
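The explicit registration pattern could look roughly like this; the real BaseCrawlerPlugin and PluginRegistry in core/plugin_system/ may differ in detail:

# Illustrative sketch only -- the real core/plugin_system/ may differ.
from typing import Dict, List, Type

class BaseCrawlerPlugin:
    name: str = ""

    async def crawl(self) -> List[str]:
        raise NotImplementedError

class PluginRegistry:
    def __init__(self) -> None:
        self._plugins: Dict[str, Type[BaseCrawlerPlugin]] = {}

    def register(self, cls: Type[BaseCrawlerPlugin]) -> Type[BaseCrawlerPlugin]:
        # Used as a decorator; registration is explicit code, so type
        # checkers see it and tests can build registries of fake plugins.
        self._plugins[cls.name] = cls
        return cls

    def create_all(self) -> List[BaseCrawlerPlugin]:
        return [cls() for cls in self._plugins.values()]

registry = PluginRegistry()

@registry.register
class Fate0Plugin(BaseCrawlerPlugin):
    name = "fate0"

    async def crawl(self) -> List[str]:
        return []

Importing the plugin modules from plugins/__init__.py is what triggers the decorators, so the set of registered plugins is an explicit import list rather than a filesystem scan.
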
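A rough sketch of the crawl/validation decoupling described above, assuming illustrative names and signatures for ValidationQueue and WorkerPool:

# Illustrative sketch only -- names and signatures are made up.
import asyncio
from typing import Awaitable, Callable, List

class ValidationQueue:
    """Crawlers submit proxies here; workers pull them for validation."""

    def __init__(self, maxsize: int = 10_000) -> None:
        self._queue: "asyncio.Queue[str]" = asyncio.Queue(maxsize=maxsize)

    async def submit(self, proxy: str) -> None:
        await self._queue.put(proxy)

    async def get(self) -> str:
        return await self._queue.get()

    def task_done(self) -> None:
        self._queue.task_done()

class WorkerPool:
    """Runs N coroutine workers that drain the validation queue."""

    def __init__(
        self,
        queue: ValidationQueue,
        validate: Callable[[str], Awaitable[bool]],
        size: int = 20,
    ) -> None:
        self.queue = queue
        self.validate = validate
        self.size = size
        self._tasks: List[asyncio.Task] = []

    async def _worker(self) -> None:
        while True:
            proxy = await self.queue.get()
            try:
                await self.validate(proxy)  # real code would update scores here
            finally:
                self.queue.task_done()

    def start(self) -> None:
        self._tasks = [
            asyncio.create_task(self._worker()) for _ in range(self.size)
        ]
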
plugins/base.py · 53 lines · 2.4 KiB · Python

"""通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装"""
|
|
import random
|
|
import asyncio
|
|
import aiohttp
|
|
from typing import List
|
|
from core.plugin_system import BaseCrawlerPlugin
|
|
|
|
|
|
class BaseHTTPPlugin(BaseCrawlerPlugin):
    """Base class for HTTP-based crawler plugins."""

    def __init__(self):
        super().__init__()
        # A small pool of desktop User-Agents; one is picked at random per
        # request to reduce the chance of being blocked.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
        ]
        self.urls: List[str] = []
        self.current_url: str = ""

    def get_headers(self) -> dict:
        """Build request headers with a randomly chosen User-Agent."""
        return {
            "User-Agent": random.choice(self.user_agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
        }

    async def fetch(self, url: str, timeout: float = 10.0, retries: int = 3) -> str:
        """Asynchronously fetch the HTML content of the given URL.

        Returns an empty string once all retry attempts are exhausted.
        """
        headers = self.get_headers()
        async with aiohttp.ClientSession(headers=headers) as session:
            for attempt in range(retries):
                try:
                    async with session.get(
                        url, timeout=aiohttp.ClientTimeout(total=timeout)
                    ) as response:
                        if response.status == 200:
                            content = await response.read()
                            encoding = response.get_encoding()
                            if encoding == "utf-8" or not encoding:
                                try:
                                    return content.decode("utf-8")
                                except UnicodeDecodeError:
                                    # Some Chinese proxy sites mislabel GBK
                                    # pages as UTF-8; fall back to GBK.
                                    return content.decode("gbk", errors="ignore")
                            return content.decode(encoding, errors="ignore")
                except Exception:
                    # Swallow network/timeout errors and retry.
                    pass
                # Random back-off between attempts; skip it after the last one.
                if attempt < retries - 1:
                    await asyncio.sleep(random.uniform(1, 3))
        return ""
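
For reference, a concrete plugin built on this base class might look like the hypothetical example below; the repository's actual plugins (fate0, ip3366, and the rest) differ in their URLs and per-site parsing:

# Hypothetical usage example -- not one of the repository's real plugins.
import re
from typing import List

class ExamplePlugin(BaseHTTPPlugin):
    name = "example"

    def __init__(self):
        super().__init__()
        self.urls = ["https://example.com/free-proxy-list"]

    async def crawl(self) -> List[str]:
        proxies: List[str] = []
        for url in self.urls:
            self.current_url = url
            html = await self.fetch(url)
            # Naive ip:port extraction; real plugins parse each site's markup.
            proxies.extend(re.findall(r"\d{1,3}(?:\.\d{1,3}){3}:\d{2,5}", html))
        return proxies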