重构: 迁移后端代码到 app 目录,前端移动到 WebUI,添加完整测试套件

主要变更:
- 后端代码从根目录迁移到 app/ 目录
- 前端代码从 frontend/ 重命名为 WebUI/
- 更新所有导入路径以适配新结构
- 提取公共 API 响应函数到 app/api/common.py
- 精简验证器服务代码
- 更新启动脚本和文档

测试:
- 新增完整测试套件 (tests/)
- 单元测试: 模型、仓库层
- 集成测试: 覆盖所有 22+ API 端点
- E2E 测试: 4个完整工作流场景
- 添加 pytest 配置和测试运行脚本
This commit is contained in:
祀梦
2026-04-04 13:32:36 +08:00
parent df3cc87f88
commit 38bd66128b
109 changed files with 2017 additions and 548 deletions

13
app/core/__init__.py Normal file
View File

@@ -0,0 +1,13 @@
"""核心基础设施包"""
from .config import settings
from .log import logger
from .exceptions import ProxyPoolException, PluginNotFoundException, ProxyNotFoundException, ValidationException
__all__ = [
"settings",
"logger",
"ProxyPoolException",
"PluginNotFoundException",
"ProxyNotFoundException",
"ValidationException",
]

59
app/core/config.py Normal file
View File

@@ -0,0 +1,59 @@
"""全局配置 - 使用 Pydantic Settings 支持环境变量和 .env 文件"""
import os
from typing import List
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application-wide configuration.

    Values come from the defaults declared below and can be overridden through
    environment variables or the ``.env`` file named in ``model_config``.
    """

    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")

    # Database
    db_path: str = "db/proxies.sqlite"

    # API server
    host: str = "0.0.0.0"
    port: int = 9949

    # Validator
    validator_timeout: int = 5
    validator_max_concurrency: int = 200
    validator_connect_timeout: int = 3

    # Crawler
    crawler_num_validators: int = 50
    crawler_max_queue_size: int = 500

    # Logging
    log_level: str = "INFO"
    log_dir: str = "logs"

    # Export
    export_max_records: int = 10000

    # Proxy scoring
    score_valid: int = 10
    score_invalid: int = -5
    score_min: int = 0
    score_max: int = 100

    # Plugins
    plugins_dir: str = "plugins"

    # CORS: comma-separated list of allowed origins
    cors_origins: str = "http://localhost:8080,http://localhost:5173,http://localhost:9948"

    @property
    def cors_origins_list(self) -> List[str]:
        """Return ``cors_origins`` split on commas, trimmed, with empty entries dropped."""
        trimmed = (piece.strip() for piece in self.cors_origins.split(","))
        return [piece for piece in trimmed if piece]

    @property
    def base_dir(self) -> str:
        """Return the directory three levels above this file's absolute path."""
        path = os.path.abspath(__file__)
        # Walk up three directory levels from this file's absolute path.
        for _ in range(3):
            path = os.path.dirname(path)
        return path
# 全局配置实例(启动时加载一次)
settings = Settings()

120
app/core/db.py Normal file
View File

@@ -0,0 +1,120 @@
"""数据库连接管理 - 使用上下文管理器,避免全局单例连接泄漏"""
import os
import aiosqlite
from contextlib import asynccontextmanager
from typing import AsyncIterator
from app.core.config import settings
from app.core.log import logger
DB_PATH = os.path.join(settings.base_dir, settings.db_path)
def ensure_db_dir():
    """Create the directory that will contain ``DB_PATH`` when it does not exist yet."""
    parent = os.path.dirname(DB_PATH)
    # Nothing to do for a bare filename (empty dirname) or an existing path.
    if not parent or os.path.exists(parent):
        return
    os.makedirs(parent, exist_ok=True)
async def _column_exists(db: aiosqlite.Connection, table: str, column: str) -> bool:
    """Return True when ``table`` already has a column named ``column``.

    ``table`` is interpolated into the statement because PRAGMA arguments
    cannot be bound as parameters; callers must pass hard-coded identifiers only.
    """
    async with db.execute(f"PRAGMA table_info({table})") as cursor:
        rows = await cursor.fetchall()
    # Each table_info row is (cid, name, type, notnull, dflt_value, pk).
    return any(row[1] == column for row in rows)


async def init_db():
    """Create the database schema and apply in-place column migrations.

    Creates the ``proxies``, ``plugin_settings``, ``validation_tasks`` and
    ``settings`` tables plus their indexes if they are missing, and adds
    columns that older versions of the tables lack. Column presence is checked
    explicitly instead of by catching errors from a probing SELECT, so that
    unrelated database failures are not mistaken for a missing column.
    """
    ensure_db_dir()
    async with aiosqlite.connect(DB_PATH) as db:
        await db.execute("PRAGMA journal_mode=WAL")
        await db.execute("PRAGMA synchronous=NORMAL")
        await db.execute("PRAGMA cache_size=-64000")
        await db.execute("PRAGMA temp_store=MEMORY")

        await db.execute("""
            CREATE TABLE IF NOT EXISTS proxies (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ip TEXT NOT NULL,
                port INTEGER NOT NULL,
                protocol TEXT DEFAULT 'http',
                score INTEGER DEFAULT 10,
                response_time_ms REAL,
                last_check TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(ip, port)
            )
        """)

        # Migration: add response_time_ms to tables created by older versions.
        if not await _column_exists(db, "proxies", "response_time_ms"):
            await db.execute("ALTER TABLE proxies ADD COLUMN response_time_ms REAL")
            logger.info("Migrated: added response_time_ms column")

        # Migration: add created_at to older tables. The column is added
        # without a default and then backfilled for the existing rows.
        if not await _column_exists(db, "proxies", "created_at"):
            await db.execute("ALTER TABLE proxies ADD COLUMN created_at TIMESTAMP")
            await db.execute("UPDATE proxies SET created_at = CURRENT_TIMESTAMP WHERE created_at IS NULL")
            logger.info("Migrated: added created_at column")

        await db.execute("CREATE INDEX IF NOT EXISTS idx_score ON proxies(score)")
        await db.execute("CREATE INDEX IF NOT EXISTS idx_protocol ON proxies(protocol)")
        await db.execute("CREATE INDEX IF NOT EXISTS idx_last_check ON proxies(last_check)")
        await db.execute("CREATE INDEX IF NOT EXISTS idx_ip_port ON proxies(ip, port)")

        # Plugin settings table
        await db.execute("""
            CREATE TABLE IF NOT EXISTS plugin_settings (
                plugin_id TEXT PRIMARY KEY,
                enabled INTEGER DEFAULT 1,
                config_json TEXT DEFAULT '{}',
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        # Migration: add config_json to older plugin_settings tables.
        if not await _column_exists(db, "plugin_settings", "config_json"):
            await db.execute("ALTER TABLE plugin_settings ADD COLUMN config_json TEXT DEFAULT '{}'")
            logger.info("Migrated: added config_json column to plugin_settings")

        # Validation task queue table
        await db.execute("""
            CREATE TABLE IF NOT EXISTS validation_tasks (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ip TEXT NOT NULL,
                port INTEGER NOT NULL,
                protocol TEXT DEFAULT 'http',
                status TEXT DEFAULT 'pending',
                result TEXT,
                response_time_ms REAL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        await db.execute("CREATE INDEX IF NOT EXISTS idx_validation_status ON validation_tasks(status)")
        await db.execute("CREATE INDEX IF NOT EXISTS idx_validation_created ON validation_tasks(created_at)")

        # System settings table
        await db.execute("""
            CREATE TABLE IF NOT EXISTS settings (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        await db.commit()
        logger.info("Database initialized")
@asynccontextmanager
async def get_db() -> AsyncIterator[aiosqlite.Connection]:
    """Async context manager that yields a fresh connection to ``DB_PATH``.

    The connection is opened on entry, switched to WAL journaling with
    ``synchronous=NORMAL``, and closed on exit.
    """
    ensure_db_dir()
    # The connection object is itself an async context manager that closes
    # the connection when the block exits, normally or through an exception.
    async with aiosqlite.connect(DB_PATH) as connection:
        for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA synchronous=NORMAL"):
            await connection.execute(pragma)
        yield connection

24
app/core/exceptions.py Normal file
View File

@@ -0,0 +1,24 @@
"""业务异常定义"""
class ProxyPoolException(Exception):
    """Base class for business errors.

    Attributes:
        message: Human-readable description; also passed to ``Exception``.
        code: Integer code attached to the error (defaults to 500).
    """

    def __init__(self, message: str, code: int = 500):
        super().__init__(message)
        self.message = message
        self.code = code
class PluginNotFoundException(ProxyPoolException):
    """Error carrying code 404 and a message naming the missing plugin id."""

    def __init__(self, plugin_id: str):
        message = f"Plugin '{plugin_id}' not found"
        super().__init__(message, 404)
class ProxyNotFoundException(ProxyPoolException):
    """Error carrying code 404 and a message naming the missing ``ip:port``."""

    def __init__(self, ip: str, port: int):
        message = f"Proxy {ip}:{port} not found"
        super().__init__(message, 404)
class ValidationException(ProxyPoolException):
    """Error carrying code 400 and the caller-supplied message."""

    def __init__(self, message: str):
        super().__init__(message, code=400)

47
app/core/log.py Normal file
View File

@@ -0,0 +1,47 @@
import logging
import os
from logging.handlers import RotatingFileHandler
from datetime import datetime
class LogHandler(logging.Logger):
    """Logger that writes to a dated log file and to the console.

    The log directory is ``logs`` under the directory three levels above this
    file. The file name is the date at construction time (``YYYY-MM-DD.log``),
    so a long-running process keeps writing to the file named after its start
    date. Files rotate by size (10 MB, 5 backups).

    Args:
        name: Logger name used in the formatted output.
        level: Minimum level handled by this logger.
    """

    def __init__(self, name='ProxyPool', level=logging.INFO):
        super().__init__(name, level)

        # Resolve the project root (three levels up) and make sure logs/ exists.
        # exist_ok=True avoids a FileExistsError when another process creates
        # the directory between an existence check and makedirs.
        base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        log_dir = os.path.join(base_dir, 'logs')
        os.makedirs(log_dir, exist_ok=True)

        # The log file is named after the current date.
        log_filename = f"{datetime.now().strftime('%Y-%m-%d')}.log"
        log_file = os.path.join(log_dir, log_filename)

        formatter = logging.Formatter(
            '[%(asctime)s] %(name)s [%(levelname)s] %(filename)s[line:%(lineno)d]: %(message)s'
        )

        # File handler with size-based rotation: 10 MB per file, 5 backups kept.
        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=10*1024*1024,
            backupCount=5,
            encoding='utf-8'
        )
        file_handler.setFormatter(formatter)
        self.addHandler(file_handler)

        # Console handler sharing the same format.
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        self.addHandler(console_handler)
# 实例化一个默认 logger 供外部直接使用
logger = LogHandler()
if __name__ == '__main__':
logger.info('这是一条按日期存储的日志测试')

View File

@@ -0,0 +1,5 @@
"""插件系统包"""
from .base import BaseCrawlerPlugin, ProxyRaw
from .registry import registry
__all__ = ["BaseCrawlerPlugin", "ProxyRaw", "registry"]

View File

@@ -0,0 +1,55 @@
"""插件基类 - 所有爬虫插件必须继承此基类"""
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Dict, Any
@dataclass
class ProxyRaw:
    """Raw proxy record produced by a crawler.

    Attributes:
        ip: Proxy host address.
        port: Proxy port.
        protocol: One of ``http``, ``https``, ``socks4``, ``socks5``. The value
            is trimmed and lower-cased on construction; anything else,
            including ``None`` or a non-string, becomes ``"http"``.
    """

    ip: str
    port: int
    protocol: str = "http"

    def __post_init__(self):
        # A missing or non-string protocol (e.g. a NULL database column) is
        # treated like any other unknown protocol instead of raising
        # AttributeError on .lower().
        raw = self.protocol if isinstance(self.protocol, str) else ""
        normalized = raw.strip().lower()
        if normalized not in ("http", "https", "socks4", "socks5"):
            normalized = "http"
        self.protocol = normalized
class BaseCrawlerPlugin(ABC):
    """Abstract base class that every crawler plugin must extend.

    To add a crawler:
    1. Subclass ``BaseCrawlerPlugin``.
    2. Implement ``crawl()`` so that it returns ``List[ProxyRaw]``.
    3. Register the class with ``@registry.register`` or register it explicitly.
    """

    name: str = ""
    display_name: str = ""
    description: str = ""
    enabled: bool = True
    default_config: Dict[str, Any] = {}

    def __init__(self):
        # Copy the class-level defaults so each instance has its own dict.
        defaults = self.default_config or {}
        self._config: Dict[str, Any] = dict(defaults)

    @property
    def config(self) -> Dict[str, Any]:
        """The instance's current configuration dict."""
        return self._config

    def update_config(self, updates: Dict[str, Any]) -> None:
        """Overwrite existing configuration keys; keys not already present are ignored."""
        recognised = {key: value for key, value in updates.items() if key in self._config}
        self._config.update(recognised)

    @abstractmethod
    async def crawl(self) -> List[ProxyRaw]:
        """Fetch proxies. Implementations should only crawl, not validate."""
        raise NotImplementedError

    async def health_check(self) -> bool:
        """Optional health probe; the default implementation reports healthy."""
        return True

View File

@@ -0,0 +1,77 @@
"""插件注册中心 - 显式注册,类型安全,测试友好"""
import importlib
import inspect
import os
from typing import Dict, List, Type, Optional
from app.core.plugin_system.base import BaseCrawlerPlugin
from app.core.log import logger
class PluginRegistry:
    """Registry mapping plugin names to plugin classes and lazily created instances."""

    def __init__(self):
        # name -> registered plugin class
        self._plugins: Dict[str, Type[BaseCrawlerPlugin]] = {}
        # name -> instance created on first get()
        self._instances: Dict[str, BaseCrawlerPlugin] = {}

    def register(self, plugin_cls: Type[BaseCrawlerPlugin]) -> Type[BaseCrawlerPlugin]:
        """Register a plugin class; usable as a class decorator.

        Args:
            plugin_cls: A subclass of ``BaseCrawlerPlugin`` with a non-empty ``name``.

        Returns:
            ``plugin_cls`` unchanged, so the method works as a decorator.

        Raises:
            ValueError: If ``plugin_cls`` is not a ``BaseCrawlerPlugin`` subclass
                or has an empty ``name``.
        """
        if not inspect.isclass(plugin_cls) or not issubclass(plugin_cls, BaseCrawlerPlugin):
            raise ValueError("Plugin must be a subclass of BaseCrawlerPlugin")
        if not plugin_cls.name:
            raise ValueError(f"Plugin {plugin_cls.__name__} must have a 'name' attribute")
        # When a different class takes over an existing name, drop the cached
        # instance so get() does not keep returning an object of the old class.
        # Re-registering the same class keeps its instance (and its config).
        if self._plugins.get(plugin_cls.name) is not plugin_cls:
            self._instances.pop(plugin_cls.name, None)
        self._plugins[plugin_cls.name] = plugin_cls
        logger.info(f"Plugin registered: {plugin_cls.name} ({plugin_cls.__name__})")
        return plugin_cls

    def get(self, name: str) -> Optional[BaseCrawlerPlugin]:
        """Return the instance for ``name``, creating it on first use.

        Returns ``None`` when no plugin is registered under ``name``.
        """
        if name not in self._instances:
            cls = self._plugins.get(name)
            if cls:
                self._instances[name] = cls()
        return self._instances.get(name)

    def list_plugins(self) -> List[BaseCrawlerPlugin]:
        """Return an instance of every registered plugin, in registration order."""
        instances = (self.get(name) for name in self._plugins)
        return [instance for instance in instances if instance]

    def get_plugin_names(self) -> List[str]:
        """Return the names of all registered plugins."""
        return list(self._plugins)

    def auto_discover(self, package_name: str):
        """Import every module of ``package_name`` and register the plugin classes found.

        Explicit registration is preferred for type safety and control; this
        method exists for compatibility. Failures are logged, never raised: a
        module that fails to import is skipped, and a class that fails to
        register does not stop the remaining classes from being registered.
        """
        try:
            package = importlib.import_module(package_name)
            package_dir = os.path.dirname(package.__file__)
        except Exception as e:
            logger.error(f"Auto discover failed for package {package_name}: {e}")
            return

        for filename in os.listdir(package_dir):
            if not filename.endswith(".py") or filename.startswith("__"):
                continue
            module_name = f"{package_name}.{filename[:-3]}"
            try:
                module = importlib.import_module(module_name)
            except Exception as e:
                logger.error(f"Failed to load module {module_name}: {e}")
                continue

            for attr_name in dir(module):
                # Guard each attribute separately so one bad class (for example
                # an intermediate base without a name) cannot abort discovery
                # of the other plugins in the same module.
                try:
                    obj = getattr(module, attr_name)
                    if (
                        inspect.isclass(obj)
                        and issubclass(obj, BaseCrawlerPlugin)
                        and obj is not BaseCrawlerPlugin
                        and obj not in self._plugins.values()
                    ):
                        self.register(obj)
                except Exception as e:
                    logger.error(f"Failed to register {attr_name} from module {module_name}: {e}")
# 全局注册中心实例
registry = PluginRegistry()

View File

@@ -0,0 +1,4 @@
"""任务队列包"""
from .queue import ValidationQueue
__all__ = ["ValidationQueue"]

149
app/core/tasks/queue.py Normal file
View File

@@ -0,0 +1,149 @@
"""验证任务队列 - 解耦爬取与验证,支持背压控制和持久化"""
import asyncio
from typing import Optional
from app.models.domain import ProxyRaw
from app.repositories.task_repo import ValidationTaskRepository
from app.core.db import get_db
from app.core.log import logger
class ValidationQueue:
    """Proxy validation queue persisted through the task repository.

    Workflow:
    1. Crawlers pass raw proxies to ``submit()``, which stores them as tasks
       and wakes workers through an in-memory signal queue.
    2. Workers acquire pending tasks from the database and validate them.
    3. Proxies that validate successfully are written through ``proxy_repo``.
    4. ``start()`` resets tasks that were interrupted while processing so
       they are picked up again.

    A woken worker keeps processing until no pending task is left, so the
    number of wake-up signals does not limit how many tasks get handled.
    """

    def __init__(
        self,
        validator,
        proxy_repo,
        worker_count: int = 50,
        score_valid: int = 10,
        score_invalid: int = -5,
        score_min: int = 0,
        score_max: int = 100,
    ):
        """Store collaborators and tuning values; no I/O happens here.

        Args:
            validator: Object whose awaitable ``validate(ip, port, protocol)``
                returns ``(is_valid, latency)``.
            proxy_repo: Repository used to persist validated proxies.
            worker_count: Number of worker tasks created by ``start()``.
            score_valid: Score passed to ``proxy_repo.insert_or_update`` for
                a valid proxy.
            score_invalid: Stored on the instance; not used by this class.
            score_min: Stored on the instance; not used by this class.
            score_max: Stored on the instance; not used by this class.
        """
        self.validator = validator
        self.proxy_repo = proxy_repo
        self.task_repo = ValidationTaskRepository()
        self.worker_count = worker_count
        self.score_valid = score_valid
        self.score_invalid = score_invalid
        self.score_min = score_min
        self.score_max = score_max
        # Wake-up channel: each item tells one worker to look for work.
        self._signal: asyncio.Queue[None] = asyncio.Queue()
        self._workers: list[asyncio.Task] = []
        self._running = False
        # Serializes this queue's database sections.
        self._db_lock = asyncio.Lock()
        # Statistics
        self.valid_count = 0
        self.invalid_count = 0

    async def start(self):
        """Recover interrupted tasks, spawn the workers and wake them if work is pending."""
        if self._running:
            return
        self._running = True
        # Recover tasks that were interrupted while processing.
        async with get_db() as db:
            recovered = await self.task_repo.reset_processing(db)
            pending = await self.task_repo.get_pending_count(db)
        if recovered:
            logger.info(f"ValidationQueue recovered {recovered} interrupted tasks")
        if pending:
            logger.info(f"ValidationQueue has {pending} pending tasks to process")
        for i in range(self.worker_count):
            self._workers.append(asyncio.create_task(self._worker_loop(i)))
        # Wake workers so the recovered pending tasks get processed.
        if pending:
            for _ in range(min(pending, self.worker_count)):
                self._signal.put_nowait(None)
        logger.info(f"ValidationQueue started with {self.worker_count} workers")

    async def stop(self):
        """Ask every worker to exit and wait until all of them have finished."""
        if not self._running:
            return
        self._running = False
        for _ in self._workers:
            self._signal.put_nowait(None)  # sentinel
        if self._workers:
            await asyncio.gather(*self._workers, return_exceptions=True)
        self._workers.clear()
        logger.info("ValidationQueue stopped")

    async def submit(self, proxies: list[ProxyRaw]):
        """Persist ``proxies`` as validation tasks and wake workers."""
        async with self._db_lock:
            async with get_db() as db:
                inserted = await self.task_repo.insert_batch(db, proxies)
        if inserted:
            # At most one signal per worker is needed because a woken worker
            # keeps going until no pending task remains.
            for _ in range(min(inserted, self.worker_count)):
                self._signal.put_nowait(None)

    async def submit_one(self, proxy: ProxyRaw):
        """Submit a single proxy; shorthand for ``submit([proxy])``."""
        await self.submit([proxy])

    async def drain(self):
        """Poll every 0.5 s until the repository reports no pending tasks."""
        while True:
            async with get_db() as db:
                count = await self.task_repo.get_pending_count(db)
            if count == 0:
                break
            await asyncio.sleep(0.5)

    async def _worker_loop(self, worker_id: int):
        """Worker body: wait for a signal, then process tasks until none are left.

        Exits when it is woken after ``stop()`` has cleared ``_running``.
        """
        while True:
            await self._signal.get()
            self._signal.task_done()
            if not self._running:
                break
            # Keep consuming pending tasks; one signal may stand for many tasks
            # because submit()/start() cap the signals at worker_count.
            while self._running:
                try:
                    processed = await self._process_one_task(worker_id)
                except Exception as e:
                    # Keep the worker alive; it resumes on the next signal.
                    logger.error(f"Worker {worker_id} task processing error: {e}")
                    break
                if not processed:
                    break

    async def _process_one_task(self, worker_id: int):
        """Acquire one pending task, validate it and record the outcome.

        Returns:
            ``True`` when a task was processed, ``False`` when no pending task
            was available.
        """
        async with self._db_lock:
            async with get_db() as db:
                task = await self.task_repo.acquire_pending(db)
        if not task:
            return False

        proxy = ProxyRaw(task["ip"], task["port"], task["protocol"])
        # Validation runs outside the DB lock so workers validate concurrently.
        try:
            is_valid, latency = await self.validator.validate(
                proxy.ip, proxy.port, proxy.protocol
            )
        except Exception as e:
            logger.error(f"Worker {worker_id} validation error: {e}")
            is_valid, latency = False, 0.0

        async with self._db_lock:
            async with get_db() as db:
                if is_valid:
                    await self.proxy_repo.insert_or_update(
                        db, proxy.ip, proxy.port, proxy.protocol, score=self.score_valid
                    )
                    # A falsy latency (0 or None) is not recorded.
                    if latency:
                        await self.proxy_repo.update_response_time(
                            db, proxy.ip, proxy.port, latency
                        )
                    await self.task_repo.complete_task(db, task["id"], True, latency)
                    self.valid_count += 1
                    logger.debug(f"ValidationQueue: valid {proxy.ip}:{proxy.port}")
                else:
                    await self.task_repo.complete_task(db, task["id"], False, 0.0)
                    self.invalid_count += 1
                    logger.debug(f"ValidationQueue: invalid {proxy.ip}:{proxy.port}")
        return True

    def reset_stats(self):
        """Reset the valid/invalid counters to zero."""
        self.valid_count = 0
        self.invalid_count = 0