重构: 迁移后端代码到 app 目录,前端移动到 WebUI,添加完整测试套件

主要变更:
- 后端代码从根目录迁移到 app/ 目录
- 前端代码从 frontend/ 重命名为 WebUI/
- 更新所有导入路径以适配新结构
- 提取公共 API 响应函数到 app/api/common.py
- 精简验证器服务代码
- 更新启动脚本和文档

测试:
- 新增完整测试套件 (tests/)
- 单元测试: 模型、仓库层
- 集成测试: 覆盖所有 22+ API 端点
- E2E 测试: 4个完整工作流场景
- 添加 pytest 配置和测试运行脚本
This commit is contained in:
祀梦
2026-04-04 13:32:36 +08:00
parent df3cc87f88
commit 38bd66128b
109 changed files with 2017 additions and 548 deletions

4
app/api/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
"""API 包"""
from .main import create_app
__all__ = ["create_app"]

41
app/api/common.py Normal file
View File

@@ -0,0 +1,41 @@
"""API 通用工具函数"""
from typing import Any, Optional
from fastapi.responses import JSONResponse
def success_response(message: str, data: Any = None) -> dict:
    """Build the standard success envelope used by every API route.

    Args:
        message: Human-readable status text.
        data: Optional payload, placed under the "data" key as-is.

    Returns:
        A plain dict with fixed code 200 for FastAPI to serialize.
    """
    envelope = {"code": 200, "message": message}
    envelope["data"] = data
    return envelope
def error_response(message: str, code: int = 500) -> JSONResponse:
    """Build the standard error envelope; the code doubles as HTTP status."""
    body = {"code": code, "message": message, "data": None}
    return JSONResponse(status_code=code, content=body)
def format_proxy(proxy) -> dict:
    """Serialize a proxy record into the wire format used by the API.

    `last_check` is rendered as an ISO-8601 string, or None when the
    proxy has no recorded check time.
    """
    checked_at = proxy.last_check
    return {
        "ip": proxy.ip,
        "port": proxy.port,
        "protocol": proxy.protocol,
        "score": proxy.score,
        "last_check": checked_at.isoformat() if checked_at else None,
    }
def format_plugin(plugin) -> dict:
    """Serialize a plugin record into the wire format used by the API.

    NOTE(review): "name" is populated from plugin.display_name rather than
    plugin.name — presumably for frontend compatibility; confirm this is
    intentional. `last_run` is ISO-8601, or None if the plugin never ran.
    """
    return {
        "id": plugin.id,
        "name": plugin.display_name,
        "display_name": plugin.display_name,
        "description": plugin.description,
        "enabled": plugin.enabled,
        "last_run": plugin.last_run.isoformat() if plugin.last_run else None,
        "success_count": plugin.success_count,
        "failure_count": plugin.failure_count,
    }

45
app/api/deps.py Normal file
View File

@@ -0,0 +1,45 @@
"""依赖注入"""
from fastapi import Request
from app.services.proxy_service import ProxyService
from app.services.plugin_service import PluginService
from app.services.scheduler_service import SchedulerService
from app.services.validator_service import ValidatorService
from app.repositories.proxy_repo import ProxyRepository
from app.core.tasks.queue import ValidationQueue
from app.core.config import settings as app_settings
def get_proxy_service() -> ProxyService:
    # Stateless service: a fresh instance per request is cheap.
    return ProxyService()

def get_plugin_service() -> PluginService:
    # Stateless service: a fresh instance per request is cheap.
    return PluginService()

def get_scheduler_service(request: Request) -> SchedulerService:
    # Singleton created at startup and stored on app.state (see lifespan).
    return request.app.state.scheduler_service

def get_validation_queue(request: Request) -> ValidationQueue:
    # Shared queue owned by the scheduler service, exposed on app.state.
    return request.app.state.validation_queue

def create_scheduler_service() -> SchedulerService:
    """Build the SchedulerService at application startup (non-request context).

    Wires validator, proxy repository and validation queue together from
    global settings; the lifespan hook stores the result on app.state.
    """
    validator = ValidatorService(
        timeout=app_settings.validator_timeout,
        connect_timeout=app_settings.validator_connect_timeout,
        max_concurrency=app_settings.validator_max_concurrency,
    )
    proxy_repo = ProxyRepository()
    queue = ValidationQueue(
        validator=validator,
        proxy_repo=proxy_repo,
        # NOTE(review): worker_count reuses validator_max_concurrency; there
        # is no dedicated queue-worker setting — confirm this is intended.
        worker_count=app_settings.validator_max_concurrency,
        score_valid=app_settings.score_valid,
        score_invalid=app_settings.score_invalid,
        score_min=app_settings.score_min,
        score_max=app_settings.score_max,
    )
    return SchedulerService(validation_queue=queue, proxy_repo=proxy_repo)

33
app/api/errors.py Normal file
View File

@@ -0,0 +1,33 @@
"""统一异常处理"""
from fastapi import Request
from fastapi.responses import JSONResponse
from pydantic import ValidationError
from app.core.exceptions import ProxyPoolException
from app.core.log import logger
async def proxy_pool_exception_handler(request: Request, exc: ProxyPoolException):
    """Map domain exceptions to the standard error envelope.

    exc.code doubles as the HTTP status and the body's "code" field.
    """
    return JSONResponse(
        status_code=exc.code,
        content={"code": exc.code, "message": exc.message, "data": None},
    )

async def pydantic_validation_handler(request: Request, exc: ValidationError):
    """Return 422 with pydantic's structured error list in "data"."""
    logger.error(f"Validation error: {exc}")
    return JSONResponse(
        status_code=422,
        content={
            "code": 422,
            "message": "参数验证失败",
            "data": exc.errors(),
        },
    )

async def general_exception_handler(request: Request, exc: Exception):
    """Last-resort handler: log the full traceback, return an opaque 500."""
    logger.error(f"Unhandled exception: {exc}", exc_info=True)
    return JSONResponse(
        status_code=500,
        content={"code": 500, "message": "服务器内部错误", "data": None},
    )

41
app/api/lifespan.py Normal file
View File

@@ -0,0 +1,41 @@
"""应用生命周期管理"""
from contextlib import asynccontextmanager
from fastapi import FastAPI
from app.core.db import init_db, get_db
from app.core.config import settings as app_settings
from app.core.log import logger
from app.api.deps import create_scheduler_service
from app.repositories.settings_repo import SettingsRepository
# Module-level repository: stateless, safe to share with the lifespan hook.
settings_repo = SettingsRepository()

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application startup/shutdown hook.

    Startup: init the DB, build the scheduler, expose it (plus its
    validation queue) on app.state, then apply persisted settings.
    Shutdown: stop the scheduler.
    """
    # Initialize the database schema (idempotent; runs migrations).
    await init_db()
    # Create the scheduler and attach it to app.state for request deps.
    scheduler_service = create_scheduler_service()
    app.state.scheduler_service = scheduler_service
    app.state.validation_queue = scheduler_service.validation_queue
    # Load persisted settings and decide whether to auto-start validation.
    try:
        async with get_db() as db:
            settings = await settings_repo.get_all(db)
            scheduler_service.interval_minutes = settings.get(
                # NOTE(review): the fallback is validator_timeout — a timeout
                # in *seconds* used here as an interval in *minutes*. Looks
                # like the wrong default; confirm the intended value.
                "validate_interval_minutes", app_settings.validator_timeout
            )
            if settings.get("auto_validate", True):
                await scheduler_service.start()
    except Exception as e:
        # Best effort: a settings failure must not block startup.
        logger.error(f"Failed to load settings on startup: {e}")
    logger.info("API server started")
    yield
    # Shutdown: stop scheduler workers.
    await scheduler_service.stop()
    logger.info("API server shutdown")

55
app/api/main.py Normal file
View File

@@ -0,0 +1,55 @@
"""FastAPI 应用工厂"""
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.api.lifespan import lifespan
from app.api.routes import api_router
from app.api.errors import proxy_pool_exception_handler, pydantic_validation_handler, general_exception_handler
from app.core.exceptions import ProxyPoolException
from pydantic import ValidationError
from app.core.config import settings as app_settings
# Imported for side effects: registers all crawler plugins with the
# registry (explicit registration mode).
import app.plugins

def create_app() -> FastAPI:
    """Application factory: build and configure the FastAPI instance."""
    app = FastAPI(
        title="代理池API",
        version="2.0.0",
        lifespan=lifespan,
    )
    # CORS: allowed origins come from settings (comma-separated string).
    app.add_middleware(
        CORSMiddleware,
        allow_origins=app_settings.cors_origins_list,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    # Exception handlers: most specific first, Exception as the catch-all.
    app.add_exception_handler(ProxyPoolException, proxy_pool_exception_handler)
    app.add_exception_handler(ValidationError, pydantic_validation_handler)
    app.add_exception_handler(Exception, general_exception_handler)
    # Mount all feature routers under one aggregate router.
    app.include_router(api_router)

    @app.get("/")
    async def root():
        return {"message": "欢迎使用代理池API", "status": "running", "data": None}

    @app.get("/health")
    async def health_check():
        from datetime import datetime
        scheduler = app.state.scheduler_service
        return {
            "status": "healthy",
            "timestamp": datetime.now().isoformat(),
            # NOTE(review): "connected" is hard-coded — the DB is not probed.
            "database": "connected",
            "scheduler": "running" if scheduler.running else "stopped",
            "version": "2.0.0",
        }

    return app

View File

@@ -0,0 +1,9 @@
"""路由包"""
from fastapi import APIRouter
from app.api.routes import proxies, plugins, scheduler, settings
# Aggregate router: collects every feature router so the app factory can
# mount them with a single include_router call.
api_router = APIRouter()
api_router.include_router(proxies.router)
api_router.include_router(plugins.router)
api_router.include_router(scheduler.router)
api_router.include_router(settings.router)

154
app/api/routes/plugins.py Normal file
View File

@@ -0,0 +1,154 @@
"""插件相关路由"""
import asyncio
from fastapi import APIRouter, Depends
from app.services.plugin_service import PluginService
from app.services.scheduler_service import SchedulerService
from app.api.deps import get_plugin_service, get_scheduler_service
from app.api.common import success_response, error_response, format_plugin
from app.core.log import logger
router = APIRouter(prefix="/api/plugins", tags=["plugins"])

@router.get("")
async def list_plugins(service: PluginService = Depends(get_plugin_service)):
    """All registered plugins with their status and run statistics."""
    try:
        plugins = await service.list_plugins()
        return success_response("获取插件列表成功", {"plugins": [format_plugin(p) for p in plugins]})
    except Exception as e:
        logger.error(f"List plugins failed: {e}")
        return error_response("获取插件列表失败", 500)

@router.put("/{plugin_id}/toggle")
async def toggle_plugin(
    plugin_id: str,
    request: dict,
    service: PluginService = Depends(get_plugin_service),
):
    """Enable or disable a plugin. Body: {"enabled": bool}."""
    enabled = request.get("enabled")
    if enabled is None:
        return error_response("缺少 enabled 参数", 400)
    try:
        success = await service.toggle_plugin(plugin_id, enabled)
        if not success:
            return error_response("插件不存在", 404)
        return success_response(
            f"插件 {plugin_id}{'启用' if enabled else '禁用'}",
            {"plugin_id": plugin_id, "enabled": enabled},
        )
    except Exception as e:
        logger.error(f"Toggle plugin failed: {e}")
        return error_response("切换插件状态失败", 500)

@router.get("/{plugin_id}/config")
async def get_plugin_config(
    plugin_id: str,
    service: PluginService = Depends(get_plugin_service),
):
    """Current config dict for one plugin; 404 for unknown ids."""
    try:
        config = await service.get_plugin_config(plugin_id)
        if config is None:
            return error_response("插件不存在", 404)
        return success_response("获取插件配置成功", {"plugin_id": plugin_id, "config": config})
    except Exception as e:
        logger.error(f"Get plugin config failed: {e}")
        return error_response("获取插件配置失败", 500)

@router.post("/{plugin_id}/config")
async def update_plugin_config(
    plugin_id: str,
    request: dict,
    service: PluginService = Depends(get_plugin_service),
):
    """Save a plugin's config. Body: {"config": {...}}."""
    config = request.get("config", {})
    if not isinstance(config, dict):
        return error_response("config 必须是对象", 400)
    try:
        success = await service.update_plugin_config(plugin_id, config)
        if not success:
            return error_response("插件不存在或配置无效", 404)
        return success_response("保存插件配置成功", {"plugin_id": plugin_id, "config": config})
    except Exception as e:
        logger.error(f"Update plugin config failed: {e}")
        return error_response("保存插件配置失败", 500)
async def _submit_and_wait(scheduler_service: SchedulerService, results, timeout: float) -> None:
    """Push crawled proxies through the validation queue and wait (bounded).

    Resets queue stats so valid/invalid counters reflect only this batch.
    A drain timeout is swallowed on purpose: validation keeps running in
    the background and the HTTP response just reports the counts so far.
    """
    scheduler_service.validation_queue.reset_stats()
    await scheduler_service.validation_queue.submit(results)
    try:
        await asyncio.wait_for(scheduler_service.validation_queue.drain(), timeout=timeout)
    except asyncio.TimeoutError:
        pass

@router.post("/{plugin_id}/crawl")
async def crawl_plugin(
    plugin_id: str,
    plugin_service: PluginService = Depends(get_plugin_service),
    scheduler_service: SchedulerService = Depends(get_scheduler_service),
):
    """Run one plugin's crawler, then validate the results (30s budget)."""
    plugin = plugin_service.get_plugin(plugin_id)
    if not plugin:
        return error_response("插件不存在", 404)
    try:
        results = await plugin_service.run_plugin(plugin_id)
        if not results:
            return success_response(
                f"插件 {plugin_id} 爬取完成,未获取到代理",
                {"plugin_id": plugin_id, "proxy_count": 0, "valid_count": 0},
            )
        logger.info(f"Plugin {plugin_id} crawled {len(results)} proxies")
        await _submit_and_wait(scheduler_service, results, timeout=30.0)
        return success_response(
            f"插件 {plugin_id} 爬取并验证完成",
            {
                "plugin_id": plugin_id,
                "proxy_count": len(results),
                "valid_count": scheduler_service.validation_queue.valid_count,
                "invalid_count": scheduler_service.validation_queue.invalid_count,
            },
        )
    except Exception as e:
        logger.error(f"Crawl plugin {plugin_id} failed: {e}")
        return error_response(f"插件爬取失败: {str(e)}", 500)

@router.post("/crawl-all")
async def crawl_all(
    plugin_service: PluginService = Depends(get_plugin_service),
    scheduler_service: SchedulerService = Depends(get_scheduler_service),
):
    """Run every enabled plugin, then validate the merged results (60s budget)."""
    try:
        results = await plugin_service.run_all_plugins()
        if not results:
            return success_response(
                "所有插件爬取完成,未获取到代理",
                {"total_crawled": 0, "valid_count": 0, "invalid_count": 0},
            )
        logger.info(f"All plugins crawled {len(results)} unique proxies")
        await _submit_and_wait(scheduler_service, results, timeout=60.0)
        return success_response(
            "所有插件爬取并验证完成",
            {
                "total_crawled": len(results),
                "valid_count": scheduler_service.validation_queue.valid_count,
                "invalid_count": scheduler_service.validation_queue.invalid_count,
            },
        )
    except Exception as e:
        logger.error(f"Crawl all failed: {e}")
        return error_response(f"批量爬取失败: {str(e)}", 500)

125
app/api/routes/proxies.py Normal file
View File

@@ -0,0 +1,125 @@
"""代理相关路由(含统计信息)"""
from typing import Optional
from fastapi import APIRouter, Depends, Query
from app.services.proxy_service import ProxyService
from app.services.scheduler_service import SchedulerService
from app.models.schemas import ProxyListRequest, BatchDeleteRequest
from app.api.deps import get_proxy_service, get_scheduler_service
from app.api.common import success_response, error_response, format_proxy
from app.core.log import logger
router = APIRouter(prefix="/api/proxies", tags=["proxies"])

@router.get("/stats")
async def get_stats(
    proxy_service: ProxyService = Depends(get_proxy_service),
    scheduler_service: SchedulerService = Depends(get_scheduler_service),
):
    """Pool statistics, augmented with the scheduler's running flag."""
    try:
        stats = await proxy_service.get_stats()
        stats["scheduler_running"] = scheduler_service.running
        return success_response("获取统计信息成功", stats)
    except Exception as e:
        logger.error(f"Get stats failed: {e}")
        return error_response("获取统计信息失败", 500)

@router.post("")
async def list_proxies(
    request: ProxyListRequest,
    service: ProxyService = Depends(get_proxy_service),
):
    """Paginated proxy listing with protocol/score filters and sorting.

    POST (not GET) so the filter payload can be a structured body.
    """
    try:
        proxies, total = await service.list_proxies(
            page=request.page,
            page_size=request.page_size,
            protocol=request.protocol,
            min_score=request.min_score,
            max_score=request.max_score,
            sort_by=request.sort_by,
            sort_order=request.sort_order,
        )
        return success_response(
            "获取代理列表成功",
            {
                "list": [format_proxy(p) for p in proxies],
                "total": total,
                "page": request.page,
                "page_size": request.page_size,
            },
        )
    except Exception as e:
        logger.error(f"List proxies failed: {e}")
        return error_response("获取代理列表失败", 500)

@router.get("/random")
async def get_random_proxy(service: ProxyService = Depends(get_proxy_service)):
    """A single random proxy; 404 when the service returns none."""
    try:
        proxy = await service.get_random_proxy()
        if not proxy:
            return error_response("没有找到可用的代理", 404)
        return success_response("获取随机代理成功", format_proxy(proxy))
    except Exception as e:
        logger.error(f"Get random proxy failed: {e}")
        return error_response("获取随机代理失败", 500)
@router.get("/export/{fmt}")
async def export_proxies(
    fmt: str,
    protocol: Optional[str] = None,
    limit: int = Query(default=10000, ge=1, le=100000),
    service: ProxyService = Depends(get_proxy_service),
):
    """Stream proxies to the client as a downloadable csv/txt/json file."""
    if fmt not in ("csv", "txt", "json"):
        return error_response("不支持的导出格式", 400)
    from fastapi.responses import StreamingResponse
    content_type = {"csv": "text/csv", "txt": "text/plain", "json": "application/json"}[fmt]

    async def stream_chunks():
        # Re-yield the service's async generator so nothing is buffered.
        async for piece in service.export_proxies(fmt, protocol, limit):
            yield piece

    return StreamingResponse(
        stream_chunks(),
        media_type=content_type,
        headers={"Content-Disposition": f"attachment; filename=proxies.{fmt}"},
    )
@router.delete("/{ip}/{port}")
async def delete_proxy(ip: str, port: int, service: ProxyService = Depends(get_proxy_service)):
    """Delete a single proxy identified by ip and port."""
    try:
        await service.delete_proxy(ip, port)
        return success_response("删除代理成功")
    except Exception as e:
        logger.error(f"Delete proxy failed: {e}")
        return error_response("删除代理失败", 500)

@router.post("/batch-delete")
async def batch_delete(
    request: BatchDeleteRequest,
    service: ProxyService = Depends(get_proxy_service),
):
    """Delete several proxies in one call; reports the removed count."""
    try:
        proxies = [(item.ip, item.port) for item in request.proxies]
        deleted = await service.batch_delete(proxies)
        return success_response(f"批量删除 {deleted} 个代理成功", {"deleted_count": deleted})
    except Exception as e:
        logger.error(f"Batch delete failed: {e}")
        return error_response("批量删除失败", 500)

@router.delete("/clean-invalid")
async def clean_invalid(service: ProxyService = Depends(get_proxy_service)):
    """Remove all proxies the service deems invalid (criteria live in the
    service layer)."""
    try:
        count = await service.clean_invalid()
        return success_response(f"清理了 {count} 个无效代理", {"deleted_count": count})
    except Exception as e:
        logger.error(f"Clean invalid failed: {e}")
        return error_response("清理无效代理失败", 500)

View File

@@ -0,0 +1,64 @@
"""调度器相关路由"""
from fastapi import APIRouter, Depends
from app.services.scheduler_service import SchedulerService
from app.repositories.settings_repo import SettingsRepository
from app.core.db import get_db
from app.api.deps import get_scheduler_service
from app.api.common import success_response, error_response
from app.core.log import logger
router = APIRouter(prefix="/api/scheduler", tags=["scheduler"])
# Stateless repository shared by this module's handlers.
settings_repo = SettingsRepository()

async def _save_auto_validate_setting(enabled: bool):
    """Persist the auto_validate flag so the on/off state survives restarts."""
    async with get_db() as db:
        settings = await settings_repo.get_all(db)
        settings["auto_validate"] = enabled
        # Imported locally — presumably to avoid an import cycle; confirm.
        from app.models.schemas import SettingsSchema
        # Round-trips through the schema, re-validating before saving.
        await settings_repo.save(db, SettingsSchema(**settings).model_dump())

@router.post("/start")
async def start_scheduler(scheduler: SchedulerService = Depends(get_scheduler_service)):
    """Start periodic validation; idempotent when already running."""
    try:
        if scheduler.running:
            return success_response("验证调度器已在运行", {"running": True})
        await scheduler.start()
        await _save_auto_validate_setting(True)
        return success_response("验证调度器已启动", {"running": True})
    except Exception as e:
        logger.error(f"Start scheduler failed: {e}")
        return error_response(f"启动调度器失败: {str(e)}", 500)

@router.post("/stop")
async def stop_scheduler(scheduler: SchedulerService = Depends(get_scheduler_service)):
    """Stop periodic validation; idempotent when already stopped."""
    try:
        if not scheduler.running:
            return success_response("验证调度器未运行", {"running": False})
        await scheduler.stop()
        await _save_auto_validate_setting(False)
        return success_response("验证调度器已停止", {"running": False})
    except Exception as e:
        logger.error(f"Stop scheduler failed: {e}")
        return error_response(f"停止调度器失败: {str(e)}", 500)

@router.post("/validate-now")
async def validate_now(scheduler: SchedulerService = Depends(get_scheduler_service)):
    """Kick off an immediate full validation pass."""
    try:
        # Not awaited: presumably fire-and-forget (schedules background
        # work and returns) — confirm validate_all_now is synchronous.
        scheduler.validate_all_now()
        return success_response("已开始全量验证", {"started": True})
    except Exception as e:
        logger.error(f"Validate now failed: {e}")
        return error_response(f"启动验证失败: {str(e)}", 500)

@router.get("/status")
async def scheduler_status(scheduler: SchedulerService = Depends(get_scheduler_service)):
    """Current running flag and the configured interval in minutes."""
    return success_response(
        "获取状态成功",
        {"running": scheduler.running, "interval_minutes": scheduler.interval_minutes},
    )

View File

@@ -0,0 +1,34 @@
"""设置相关路由"""
from fastapi import APIRouter
from app.core.db import get_db
from app.repositories.settings_repo import SettingsRepository
from app.models.schemas import SettingsSchema
from app.api.common import success_response, error_response
from app.core.log import logger
router = APIRouter(prefix="/api/settings", tags=["settings"])
# Stateless repository shared by this module's handlers.
settings_repo = SettingsRepository()

@router.get("")
async def get_settings():
    """Return all persisted system settings as a flat dict."""
    try:
        async with get_db() as db:
            settings = await settings_repo.get_all(db)
            return success_response("获取设置成功", settings)
    except Exception as e:
        logger.error(f"Get settings failed: {e}")
        return error_response("获取设置失败", 500)

@router.post("")
async def save_settings(request: SettingsSchema):
    """Persist system settings; the SettingsSchema body enforces validation."""
    try:
        async with get_db() as db:
            success = await settings_repo.save(db, request.model_dump())
            if not success:
                return error_response("保存设置失败", 500)
            return success_response("保存设置成功", request.model_dump())
    except Exception as e:
        logger.error(f"Save settings failed: {e}")
        return error_response(f"保存设置失败: {str(e)}", 500)

13
app/core/__init__.py Normal file
View File

@@ -0,0 +1,13 @@
"""核心基础设施包"""
from .config import settings
from .log import logger
from .exceptions import ProxyPoolException, PluginNotFoundException, ProxyNotFoundException, ValidationException
__all__ = [
"settings",
"logger",
"ProxyPoolException",
"PluginNotFoundException",
"ProxyNotFoundException",
"ValidationException",
]

59
app/core/config.py Normal file
View File

@@ -0,0 +1,59 @@
"""全局配置 - 使用 Pydantic Settings 支持环境变量和 .env 文件"""
import os
from typing import List
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Global configuration, loaded from environment variables and .env."""
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )
    # Database: SQLite file path, relative to base_dir.
    db_path: str = "db/proxies.sqlite"
    # API server bind address and port.
    host: str = "0.0.0.0"
    port: int = 9949
    # Validator: per-proxy timeouts (seconds) and concurrency cap.
    validator_timeout: int = 5
    validator_max_concurrency: int = 200
    validator_connect_timeout: int = 3
    # Crawler pipeline sizing.
    crawler_num_validators: int = 50
    crawler_max_queue_size: int = 500
    # Logging.
    log_level: str = "INFO"
    log_dir: str = "logs"
    # Export cap (records per export).
    export_max_records: int = 10000
    # Proxy scoring: score applied on success/failure, plus clamp bounds.
    score_valid: int = 10
    score_invalid: int = -5
    score_min: int = 0
    score_max: int = 100
    # Plugins directory name.
    plugins_dir: str = "plugins"
    # CORS: comma-separated list of allowed origins.
    cors_origins: str = "http://localhost:8080,http://localhost:5173,http://localhost:9948"

    @property
    def cors_origins_list(self) -> List[str]:
        """cors_origins split on commas, trimmed, empty entries dropped."""
        return [origin.strip() for origin in self.cors_origins.split(",") if origin.strip()]

    @property
    def base_dir(self) -> str:
        """Project root: three levels up from this file (app/core/config.py)."""
        return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Global settings instance, loaded once at import time.
settings = Settings()

120
app/core/db.py Normal file
View File

@@ -0,0 +1,120 @@
"""数据库连接管理 - 使用上下文管理器,避免全局单例连接泄漏"""
import os
import aiosqlite
from contextlib import asynccontextmanager
from typing import AsyncIterator
from app.core.config import settings
from app.core.log import logger
# Absolute path of the SQLite database file.
DB_PATH = os.path.join(settings.base_dir, settings.db_path)

def ensure_db_dir():
    """Ensure the directory holding the SQLite file exists.

    makedirs(exist_ok=True) replaces the previous exists()+makedirs pair,
    eliminating the TOCTOU race when several processes start concurrently.
    """
    db_dir = os.path.dirname(DB_PATH)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)
async def init_db():
    """Create or upgrade the database schema (idempotent).

    Runs lightweight "probe column, then ALTER" migrations for columns
    added after the original release, so old databases upgrade in place.
    """
    ensure_db_dir()
    async with aiosqlite.connect(DB_PATH) as db:
        # Performance pragmas: WAL journal, relaxed fsync, ~64 MB page
        # cache, temp tables kept in memory.
        await db.execute("PRAGMA journal_mode=WAL")
        await db.execute("PRAGMA synchronous=NORMAL")
        await db.execute("PRAGMA cache_size=-64000")
        await db.execute("PRAGMA temp_store=MEMORY")
        # Main proxy table; (ip, port) is the natural unique key.
        await db.execute("""
            CREATE TABLE IF NOT EXISTS proxies (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ip TEXT NOT NULL,
                port INTEGER NOT NULL,
                protocol TEXT DEFAULT 'http',
                score INTEGER DEFAULT 10,
                response_time_ms REAL,
                last_check TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(ip, port)
            )
        """)
        # Migration: add response_time_ms to pre-existing tables missing it.
        try:
            await db.execute("SELECT response_time_ms FROM proxies LIMIT 1")
        except Exception:
            await db.execute("ALTER TABLE proxies ADD COLUMN response_time_ms REAL")
            logger.info("Migrated: added response_time_ms column")
        # Migration: add created_at and backfill existing rows.
        try:
            await db.execute("SELECT created_at FROM proxies LIMIT 1")
        except Exception:
            await db.execute("ALTER TABLE proxies ADD COLUMN created_at TIMESTAMP")
            await db.execute("UPDATE proxies SET created_at = CURRENT_TIMESTAMP WHERE created_at IS NULL")
            logger.info("Migrated: added created_at column")
        await db.execute("CREATE INDEX IF NOT EXISTS idx_score ON proxies(score)")
        await db.execute("CREATE INDEX IF NOT EXISTS idx_protocol ON proxies(protocol)")
        await db.execute("CREATE INDEX IF NOT EXISTS idx_last_check ON proxies(last_check)")
        await db.execute("CREATE INDEX IF NOT EXISTS idx_ip_port ON proxies(ip, port)")
        # Per-plugin enabled flag and JSON config blob.
        await db.execute("""
            CREATE TABLE IF NOT EXISTS plugin_settings (
                plugin_id TEXT PRIMARY KEY,
                enabled INTEGER DEFAULT 1,
                config_json TEXT DEFAULT '{}',
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        # Migration: add config_json to older plugin_settings tables.
        try:
            await db.execute("SELECT config_json FROM plugin_settings LIMIT 1")
        except Exception:
            await db.execute("ALTER TABLE plugin_settings ADD COLUMN config_json TEXT DEFAULT '{}'")
            logger.info("Migrated: added config_json column to plugin_settings")
        # Persistent validation task queue (consumed by ValidationQueue).
        await db.execute("""
            CREATE TABLE IF NOT EXISTS validation_tasks (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ip TEXT NOT NULL,
                port INTEGER NOT NULL,
                protocol TEXT DEFAULT 'http',
                status TEXT DEFAULT 'pending',
                result TEXT,
                response_time_ms REAL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        await db.execute("CREATE INDEX IF NOT EXISTS idx_validation_status ON validation_tasks(status)")
        await db.execute("CREATE INDEX IF NOT EXISTS idx_validation_created ON validation_tasks(created_at)")
        # Key/value store for system settings.
        await db.execute("""
            CREATE TABLE IF NOT EXISTS settings (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        await db.commit()
        logger.info("Database initialized")
@asynccontextmanager
async def get_db() -> AsyncIterator[aiosqlite.Connection]:
    """Async context manager yielding a fresh connection, closed on exit.

    A per-use connection avoids the leak/teardown problems of a global
    singleton; the WAL/NORMAL pragmas are re-applied per connection.
    """
    ensure_db_dir()
    db = await aiosqlite.connect(DB_PATH)
    try:
        await db.execute("PRAGMA journal_mode=WAL")
        await db.execute("PRAGMA synchronous=NORMAL")
        yield db
    finally:
        await db.close()

24
app/core/exceptions.py Normal file
View File

@@ -0,0 +1,24 @@
"""业务异常定义"""
class ProxyPoolException(Exception):
    """Base business exception carrying an HTTP-style status code.

    Attributes:
        message: Human-readable description.
        code: Status code forwarded to the API error handler.
    """

    def __init__(self, message: str, code: int = 500):
        self.message = message
        self.code = code
        super().__init__(message)

class PluginNotFoundException(ProxyPoolException):
    """A plugin id matched no registered plugin (404)."""

    def __init__(self, plugin_id: str):
        detail = f"Plugin '{plugin_id}' not found"
        super().__init__(detail, 404)

class ProxyNotFoundException(ProxyPoolException):
    """An ip:port pair is not present in the pool (404)."""

    def __init__(self, ip: str, port: int):
        detail = f"Proxy {ip}:{port} not found"
        super().__init__(detail, 404)

class ValidationException(ProxyPoolException):
    """Invalid client input (400)."""

    def __init__(self, message: str):
        super().__init__(message, 400)

47
app/core/log.py Normal file
View File

@@ -0,0 +1,47 @@
import logging
import os
from logging.handlers import RotatingFileHandler
from datetime import datetime
class LogHandler(logging.Logger):
    """Project logger writing to console and a size-rotated, date-named file.

    Each instance attaches its own handlers; the module-level `logger`
    below is the single shared instance the rest of the app imports.
    """

    def __init__(self, name='ProxyPool', level=logging.INFO):
        super().__init__(name, level)
        # Resolve <project root>/logs relative to this file; exist_ok avoids
        # the exists()+makedirs TOCTOU race on concurrent startups.
        base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        log_dir = os.path.join(base_dir, 'logs')
        os.makedirs(log_dir, exist_ok=True)
        # One file per calendar day; rotation below caps each file's size.
        log_filename = f"{datetime.now().strftime('%Y-%m-%d')}.log"
        log_file = os.path.join(log_dir, log_filename)
        formatter = logging.Formatter(
            '[%(asctime)s] %(name)s [%(levelname)s] %(filename)s[line:%(lineno)d]: %(message)s'
        )
        # Rotate at 10 MB, keep 5 backups, so one chatty day cannot fill disk.
        file_handler = RotatingFileHandler(
            log_file,
            maxBytes=10*1024*1024,
            backupCount=5,
            encoding='utf-8'
        )
        file_handler.setFormatter(formatter)
        self.addHandler(file_handler)
        # Mirror everything to the console.
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        self.addHandler(console_handler)

# Default shared logger instance for the whole application.
logger = LogHandler()

if __name__ == '__main__':
    logger.info('这是一条按日期存储的日志测试')

View File

@@ -0,0 +1,5 @@
"""插件系统包"""
from .base import BaseCrawlerPlugin, ProxyRaw
from .registry import registry
__all__ = ["BaseCrawlerPlugin", "ProxyRaw", "registry"]

View File

@@ -0,0 +1,55 @@
"""插件基类 - 所有爬虫插件必须继承此基类"""
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Dict, Any
@dataclass
class ProxyRaw:
    """Raw proxy record emitted by a crawler plugin (not yet validated)."""
    ip: str
    port: int
    protocol: str = "http"

    def __post_init__(self):
        # Normalize; anything outside the known set falls back to http.
        cleaned = self.protocol.lower().strip()
        self.protocol = cleaned if cleaned in ("http", "https", "socks4", "socks5") else "http"

class BaseCrawlerPlugin(ABC):
    """Abstract base class every crawler plugin inherits from.

    Adding a new crawler:
      1. subclass BaseCrawlerPlugin,
      2. implement crawl() returning List[ProxyRaw],
      3. register via @registry.register or explicitly in __init__.
    """

    name: str = ""
    display_name: str = ""
    description: str = ""
    enabled: bool = True
    default_config: Dict[str, Any] = {}

    def __init__(self):
        # Copy the class-level default so per-instance edits never leak back.
        self._config: Dict[str, Any] = dict(self.default_config or {})

    @property
    def config(self) -> Dict[str, Any]:
        return self._config

    def update_config(self, updates: Dict[str, Any]) -> None:
        """Merge updates into the config; keys not already present are ignored."""
        current = self._config
        for key in updates.keys() & current.keys():
            current[key] = updates[key]

    @abstractmethod
    async def crawl(self) -> List[ProxyRaw]:
        """Fetch proxies. Crawling only — validation happens elsewhere."""
        raise NotImplementedError

    async def health_check(self) -> bool:
        """Optional liveness probe; healthy by default."""
        return True

View File

@@ -0,0 +1,77 @@
"""插件注册中心 - 显式注册,类型安全,测试友好"""
import importlib
import inspect
import os
from typing import Dict, List, Type, Optional
from app.core.plugin_system.base import BaseCrawlerPlugin
from app.core.log import logger
class PluginRegistry:
    """Plugin registry: explicit registration, lazy singleton instantiation.

    Holds plugin classes keyed by their `name` attribute and creates one
    cached instance per class on first access.
    """
    def __init__(self):
        # name -> plugin class
        self._plugins: Dict[str, Type[BaseCrawlerPlugin]] = {}
        # name -> lazily created cached instance
        self._instances: Dict[str, BaseCrawlerPlugin] = {}

    def register(self, plugin_cls: Type[BaseCrawlerPlugin]) -> Type[BaseCrawlerPlugin]:
        """Register a plugin class; usable as a decorator (returns the class).

        Raises:
            ValueError: if plugin_cls is not a BaseCrawlerPlugin subclass,
                or has no non-empty `name`.
        """
        if not inspect.isclass(plugin_cls) or not issubclass(plugin_cls, BaseCrawlerPlugin):
            raise ValueError("Plugin must be a subclass of BaseCrawlerPlugin")
        if not plugin_cls.name:
            raise ValueError(f"Plugin {plugin_cls.__name__} must have a 'name' attribute")
        self._plugins[plugin_cls.name] = plugin_cls
        logger.info(f"Plugin registered: {plugin_cls.name} ({plugin_cls.__name__})")
        return plugin_cls

    def get(self, name: str) -> Optional[BaseCrawlerPlugin]:
        """Return the cached instance for `name`, creating it on first use."""
        if name not in self._instances:
            cls = self._plugins.get(name)
            if cls:
                self._instances[name] = cls()
        return self._instances.get(name)

    def list_plugins(self) -> List[BaseCrawlerPlugin]:
        """Instances of all registered plugins, in registration order."""
        result = []
        for name in self._plugins:
            instance = self.get(name)
            if instance:
                result.append(instance)
        return result

    def get_plugin_names(self) -> List[str]:
        # Registered plugin names, in registration order.
        return list(self._plugins.keys())

    def auto_discover(self, package_name: str):
        """Scan a package's modules and register every plugin class found.

        Explicit registration is preferred for type safety and control;
        this exists for backward compatibility. Errors are logged, never
        raised, so one broken module cannot break discovery.
        """
        try:
            package = importlib.import_module(package_name)
            package_dir = os.path.dirname(package.__file__)
        except Exception as e:
            logger.error(f"Auto discover failed for package {package_name}: {e}")
            return
        for filename in os.listdir(package_dir):
            if filename.endswith(".py") and not filename.startswith("__"):
                module_name = f"{package_name}.{filename[:-3]}"
                try:
                    module = importlib.import_module(module_name)
                    for attr_name in dir(module):
                        obj = getattr(module, attr_name)
                        # Register concrete subclasses exactly once.
                        if (
                            inspect.isclass(obj)
                            and issubclass(obj, BaseCrawlerPlugin)
                            and obj is not BaseCrawlerPlugin
                            and obj not in self._plugins.values()
                        ):
                            self.register(obj)
                except Exception as e:
                    logger.error(f"Failed to load module {module_name}: {e}")

# Global registry instance shared by the application.
registry = PluginRegistry()

View File

@@ -0,0 +1,4 @@
"""任务队列包"""
from .queue import ValidationQueue
__all__ = ["ValidationQueue"]

149
app/core/tasks/queue.py Normal file
View File

@@ -0,0 +1,149 @@
"""验证任务队列 - 解耦爬取与验证,支持背压控制和持久化"""
import asyncio
from typing import Optional
from app.models.domain import ProxyRaw
from app.repositories.task_repo import ValidationTaskRepository
from app.core.db import get_db
from app.core.log import logger
class ValidationQueue:
"""代理验证队列(支持持久化到 SQLite
工作流程:
1. 爬虫将原始代理 submit() 到队列(写入数据库 + 内存信号)
2. Worker 池从数据库消费并验证
3. 验证通过的代理写入数据库
4. 服务重启时自动恢复未完成的 pending 任务
"""
def __init__(
self,
validator,
proxy_repo,
worker_count: int = 50,
score_valid: int = 10,
score_invalid: int = -5,
score_min: int = 0,
score_max: int = 100,
):
self.validator = validator
self.proxy_repo = proxy_repo
self.task_repo = ValidationTaskRepository()
self.worker_count = worker_count
self.score_valid = score_valid
self.score_invalid = score_invalid
self.score_min = score_min
self.score_max = score_max
self._signal: asyncio.Queue[None] = asyncio.Queue()
self._workers: list[asyncio.Task] = []
self._running = False
self._db_lock = asyncio.Lock()
# 统计
self.valid_count = 0
self.invalid_count = 0
async def start(self):
if self._running:
return
self._running = True
# 恢复之前中断的 processing 任务
async with get_db() as db:
recovered = await self.task_repo.reset_processing(db)
pending = await self.task_repo.get_pending_count(db)
if recovered:
logger.info(f"ValidationQueue recovered {recovered} interrupted tasks")
if pending:
logger.info(f"ValidationQueue has {pending} pending tasks to process")
for i in range(self.worker_count):
self._workers.append(asyncio.create_task(self._worker_loop(i)))
# 唤醒 Worker 处理恢复的 pending 任务
if pending:
for _ in range(min(pending, self.worker_count)):
self._signal.put_nowait(None)
logger.info(f"ValidationQueue started with {self.worker_count} workers")
async def stop(self):
if not self._running:
return
self._running = False
for _ in self._workers:
self._signal.put_nowait(None) # sentinel
if self._workers:
await asyncio.gather(*self._workers, return_exceptions=True)
self._workers.clear()
logger.info("ValidationQueue stopped")
async def submit(self, proxies: list[ProxyRaw]):
"""提交代理到验证队列(持久化 + 唤醒 Worker"""
async with self._db_lock:
async with get_db() as db:
inserted = await self.task_repo.insert_batch(db, proxies)
if inserted:
for _ in range(min(inserted, self.worker_count)):
self._signal.put_nowait(None)
async def submit_one(self, proxy: ProxyRaw):
await self.submit([proxy])
async def drain(self):
"""等待队列中当前所有 pending 任务处理完毕"""
while True:
async with get_db() as db:
count = await self.task_repo.get_pending_count(db)
if count == 0:
break
await asyncio.sleep(0.5)
async def _worker_loop(self, worker_id: int):
while True:
await self._signal.get()
self._signal.task_done()
if not self._running:
break
await self._process_one_task(worker_id)
async def _process_one_task(self, worker_id: int) -> bool:
    """Acquire one pending task from the DB, validate it, persist the outcome.

    Returns True if a task was processed, False if the queue was empty
    (backward-compatible: the original implicitly returned None, which no
    caller inspected).
    """
    # The DB lock serializes acquire_pending's SELECT+UPDATE across workers.
    async with self._db_lock:
        async with get_db() as db:
            task = await self.task_repo.acquire_pending(db)
    if not task:
        return False
    proxy = ProxyRaw(task["ip"], task["port"], task["protocol"])
    try:
        is_valid, latency = await self.validator.validate(
            proxy.ip, proxy.port, proxy.protocol
        )
    except Exception as e:
        # A validator crash counts as an invalid proxy, not a worker death.
        logger.error(f"Worker {worker_id} validation error: {e}")
        is_valid, latency = False, 0.0
    async with self._db_lock:
        async with get_db() as db:
            if is_valid:
                await self.proxy_repo.insert_or_update(
                    db, proxy.ip, proxy.port, proxy.protocol, score=self.score_valid
                )
                if latency:
                    await self.proxy_repo.update_response_time(
                        db, proxy.ip, proxy.port, latency
                    )
                await self.task_repo.complete_task(db, task["id"], True, latency)
                self.valid_count += 1
                logger.debug(f"ValidationQueue: valid {proxy.ip}:{proxy.port}")
            else:
                await self.task_repo.complete_task(db, task["id"], False, 0.0)
                self.invalid_count += 1
                logger.debug(f"ValidationQueue: invalid {proxy.ip}:{proxy.port}")
    return True
def reset_stats(self):
    """Zero the valid/invalid counters (e.g. before a fresh crawl run)."""
    self.valid_count = self.invalid_count = 0

30
app/models/__init__.py Normal file
View File

@@ -0,0 +1,30 @@
"""数据模型包"""
from .domain import ProxyRaw, Proxy, PluginInfo
from .schemas import (
ProxyCreate,
ProxyResponse,
PluginResponse,
SettingsSchema,
CrawlResult,
ProxyListRequest,
ProxyDeleteItem,
BatchDeleteRequest,
PluginToggleRequest,
ExportRequest,
)
__all__ = [
"ProxyRaw",
"Proxy",
"PluginInfo",
"ProxyCreate",
"ProxyResponse",
"PluginResponse",
"SettingsSchema",
"CrawlResult",
"ProxyListRequest",
"ProxyDeleteItem",
"BatchDeleteRequest",
"PluginToggleRequest",
"ExportRequest",
]

42
app/models/domain.py Normal file
View File

@@ -0,0 +1,42 @@
"""领域模型 - 纯数据结构,不依赖任何框架"""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
@dataclass
class ProxyRaw:
    """Raw proxy record as produced by a crawler plugin."""

    ip: str
    port: int
    protocol: str = "http"

    def __post_init__(self):
        # Normalize the protocol; anything unrecognized falls back to plain HTTP.
        normalized = self.protocol.strip().lower()
        if normalized in {"http", "https", "socks4", "socks5"}:
            self.protocol = normalized
        else:
            self.protocol = "http"
@dataclass
class Proxy:
    """Proxy entity as stored in the database (`proxies` table)."""
    ip: str
    port: int
    protocol: str  # "http" | "https" | "socks4" | "socks5"
    score: int  # health score; repository clamps to 0-100 and purges <= 0
    response_time_ms: Optional[float] = None  # last measured latency, if any
    last_check: Optional[datetime] = None  # when the proxy was last validated
    created_at: Optional[datetime] = None  # row creation timestamp
@dataclass
class PluginInfo:
    """Plugin metadata merged from the registry and persisted settings."""
    id: str
    name: str
    display_name: str
    description: str
    enabled: bool
    last_run: Optional[datetime] = None  # None if the plugin never ran
    success_count: int = 0  # proxies that validated OK in past runs
    failure_count: int = 0  # proxies that failed validation in past runs

105
app/models/schemas.py Normal file
View File

@@ -0,0 +1,105 @@
"""Pydantic 模型 - 用于 API 请求/响应校验"""
from pydantic import BaseModel, Field, field_validator
from typing import Optional, List
class ProxyCreate(BaseModel):
    """Request body for manually adding a proxy."""
    ip: str
    port: int = Field(ge=1, le=65535)
    protocol: str = "http"
    score: int = Field(default=10, ge=0, le=100)

    @field_validator("protocol")
    @classmethod
    def validate_protocol(cls, v: str):
        # Normalize and restrict to the supported protocol set.
        v = v.lower().strip()
        if v not in ("http", "https", "socks4", "socks5"):
            raise ValueError("protocol must be http, https, socks4 or socks5")
        return v
class ProxyResponse(BaseModel):
    """Single proxy as returned by the API (last_check serialized to ISO string)."""
    ip: str
    port: int
    protocol: str
    score: int
    last_check: Optional[str] = None
class PluginResponse(BaseModel):
    """Plugin metadata as returned by the API."""
    id: str
    name: str
    display_name: str
    description: str
    enabled: bool
    last_run: Optional[str] = None  # ISO timestamp, None if never run
    success_count: int = 0
    failure_count: int = 0
class SettingsSchema(BaseModel):
    """System settings payload; bounds mirror the repository defaults."""
    crawl_timeout: int = Field(default=30, ge=5, le=120)  # seconds
    validation_timeout: int = Field(default=10, ge=3, le=60)  # seconds
    max_retries: int = Field(default=3, ge=0, le=10)
    default_concurrency: int = Field(default=50, ge=10, le=200)
    min_proxy_score: int = Field(default=0, ge=0, le=100)
    proxy_expiry_days: int = Field(default=7, ge=1, le=30)
    auto_validate: bool = True
    validate_interval_minutes: int = Field(default=30, ge=5, le=1440)
class CrawlResult(BaseModel):
    """Summary of one plugin crawl run."""
    plugin_id: str
    proxy_count: int  # proxies scraped
    valid_count: int  # proxies that passed validation
    invalid_count: int = 0
class ProxyListRequest(BaseModel):
    """Query parameters for the paginated proxy-listing endpoint."""
    page: int = Field(default=1, ge=1)
    page_size: int = Field(default=20, ge=1, le=100)
    protocol: Optional[str] = None
    min_score: int = Field(default=0, ge=0)
    max_score: Optional[int] = Field(default=None, ge=0)
    # sort_by / sort_order are interpolated into ORDER BY downstream,
    # so the validators below whitelist their values.
    sort_by: str = "last_check"
    sort_order: str = "DESC"

    @field_validator("protocol")
    @classmethod
    def validate_protocol(cls, v):
        if v is not None and v.lower() not in ("http", "https", "socks4", "socks5"):
            raise ValueError("协议类型必须是 http, https, socks4 或 socks5")
        return v.lower() if v else v

    @field_validator("sort_by")
    @classmethod
    def validate_sort_by(cls, v):
        if v not in ("ip", "port", "protocol", "score", "last_check"):
            raise ValueError("排序字段必须是 ip, port, protocol, score 或 last_check")
        return v

    @field_validator("sort_order")
    @classmethod
    def validate_sort_order(cls, v):
        if v.upper() not in ("ASC", "DESC"):
            raise ValueError("排序方式必须是 ASC 或 DESC")
        return v.upper()
class ProxyDeleteItem(BaseModel):
    """One (ip, port) pair to delete."""
    ip: str
    port: int = Field(ge=1, le=65535)
class BatchDeleteRequest(BaseModel):
    """Batch proxy deletion request (capped at 1000 items per call)."""
    proxies: List[ProxyDeleteItem] = Field(max_length=1000)
class PluginToggleRequest(BaseModel):
    """Request body to enable or disable a plugin."""
    enabled: bool
class ExportRequest(BaseModel):
    """Proxy export request; format is restricted by regex to csv/txt/json."""
    format: str = Field(pattern=r"^(csv|txt|json)$")
    protocol: Optional[str] = None  # None = all protocols
    limit: int = Field(default=10000, ge=1, le=100000)

21
app/plugins/__init__.py Normal file
View File

@@ -0,0 +1,21 @@
"""插件包 - 在这里显式注册所有爬虫插件"""
from app.core.plugin_system import registry
from .fate0 import Fate0Plugin
from .proxylist_download import ProxyListDownloadPlugin
from .ip3366 import Ip3366Plugin
from .ip89 import Ip89Plugin
from .kuaidaili import KuaiDaiLiPlugin
from .speedx import SpeedXPlugin
from .yundaili import YunDaiLiPlugin
from .proxyscrape import ProxyScrapePlugin
# 显式注册所有插件
registry.register(Fate0Plugin)
registry.register(ProxyListDownloadPlugin)
registry.register(Ip3366Plugin)
registry.register(Ip89Plugin)
registry.register(KuaiDaiLiPlugin)
registry.register(SpeedXPlugin)
registry.register(YunDaiLiPlugin)
registry.register(ProxyScrapePlugin)

52
app/plugins/base.py Normal file
View File

@@ -0,0 +1,52 @@
"""通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装"""
import random
import asyncio
import aiohttp
from typing import List
from app.core.plugin_system import BaseCrawlerPlugin
class BaseHTTPPlugin(BaseCrawlerPlugin):
    """Base class for crawler plugins that fetch proxy lists over HTTP.

    Provides a rotating User-Agent header pool and a retrying `fetch()` helper.
    """

    def __init__(self):
        super().__init__()
        # Small UA pool; one is picked at random per request to look less bot-like.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
        ]
        self.urls: List[str] = []  # populated by subclasses
        self.current_url: str = ""  # NOTE(review): not referenced in this class — confirm subclasses use it

    def get_headers(self) -> dict:
        """Build request headers with a randomly chosen User-Agent."""
        return {
            "User-Agent": random.choice(self.user_agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
        }

    async def fetch(self, url: str, timeout: float = 10.0, retries: int = 3) -> str:
        """Fetch *url* and return decoded text, or "" after `retries` failures.

        Non-200 responses and network errors are retried with a random 1-3 s
        back-off between attempts. Fix: the back-off no longer runs after the
        final attempt — the original slept 1-3 s before returning "".
        """
        headers = self.get_headers()
        async with aiohttp.ClientSession(headers=headers) as session:
            for attempt in range(retries):
                try:
                    async with session.get(
                        url, timeout=aiohttp.ClientTimeout(total=timeout)
                    ) as response:
                        if response.status == 200:
                            content = await response.read()
                            encoding = response.get_encoding()
                            if encoding == "utf-8" or not encoding:
                                try:
                                    return content.decode("utf-8")
                                except UnicodeDecodeError:
                                    # Mislabelled GBK content is common on CN proxy sites.
                                    return content.decode("gbk", errors="ignore")
                            return content.decode(encoding, errors="ignore")
                except Exception:
                    pass  # best-effort: fall through to retry
                if attempt < retries - 1:
                    await asyncio.sleep(random.uniform(1, 3))
        return ""

38
app/plugins/fate0.py Normal file
View File

@@ -0,0 +1,38 @@
import json
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class Fate0Plugin(BaseHTTPPlugin):
    """Crawler for the fate0/proxylist GitHub feed (one JSON object per line)."""
    name = "fate0"
    display_name = "Fate0聚合源"
    description = "从 GitHub 持续更新的高质量代理聚合列表"

    def __init__(self):
        super().__init__()
        self.urls = ["https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list"]

    async def crawl(self) -> List[ProxyRaw]:
        """Fetch the list and parse each JSON line into a ProxyRaw."""
        results = []
        for url in self.urls:
            html = await self.fetch(url, timeout=30)
            if not html:
                continue
            for line in html.split("\n"):
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                    ip = data.get("host")
                    port = data.get("port")
                    protocol = data.get("type", "http")
                    if ip and port:
                        results.append(ProxyRaw(ip, int(port), protocol))
                except Exception:
                    # Skip malformed lines instead of failing the whole crawl.
                    continue
        if results:
            logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
        return results

56
app/plugins/ip3366.py Normal file
View File

@@ -0,0 +1,56 @@
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
# Protocol whitelist; anything else is coerced to "http".
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")


class Ip3366Plugin(BaseHTTPPlugin):
    """Crawler for the ip3366.net free-proxy listing pages."""
    name = "ip3366"
    display_name = "IP3366"
    description = "从 IP3366 网站爬取免费代理"
    # Plugin-level default config: how many listing pages to fetch per section.
    default_config = {"max_pages": 5}

    def __init__(self):
        super().__init__()
        self._update_urls()

    def _update_urls(self):
        """Rebuild the URL list for both sections (stype=1/2) from config."""
        # NOTE(review): assumes self.config is initialized by the base class
        # before this runs — confirm in BaseCrawlerPlugin.
        max_pages = self.config.get("max_pages", 5)
        self.urls = [
            f"http://www.ip3366.net/free/?stype=1&page={i}" for i in range(1, max_pages + 1)
        ] + [
            f"http://www.ip3366.net/free/?stype=2&page={i}" for i in range(1, max_pages + 1)
        ]

    async def crawl(self) -> List[ProxyRaw]:
        """Scrape each listing page's proxy table into ProxyRaw entries."""
        results = []
        for url in self.urls:
            html = await self.fetch(url, timeout=15)
            if not html:
                continue
            soup = BeautifulSoup(html, "lxml")
            list_div = soup.find("div", id="list")
            if not list_div:
                continue
            table = list_div.find("table")
            if not table:
                continue
            for row in table.find_all("tr"):
                tds = row.find_all("td")
                if len(tds) >= 5:
                    ip = tds[0].get_text(strip=True)
                    port = tds[1].get_text(strip=True)
                    # Fifth column carries the protocol; default to http.
                    protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else "http"
                    if protocol not in VALID_PROTOCOLS:
                        protocol = "http"
                    if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit():
                        results.append(ProxyRaw(ip, int(port), protocol))
        if results:
            logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
        return results

39
app/plugins/ip89.py Normal file
View File

@@ -0,0 +1,39 @@
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class Ip89Plugin(BaseHTTPPlugin):
    """Crawler for the 89ip.cn free-proxy pages (HTTP proxies only)."""
    name = "ip89"
    display_name = "89免费代理"
    description = "从 89ip.cn 爬取免费代理"

    def __init__(self):
        super().__init__()
        # First five listing pages.
        self.urls = [f"https://www.89ip.cn/index_{i}.html" for i in range(1, 6)]

    async def crawl(self) -> List[ProxyRaw]:
        """Scrape the layui table rows; the site does not list a protocol, so http is assumed."""
        results = []
        for url in self.urls:
            html = await self.fetch(url, timeout=15)
            if not html:
                continue
            soup = BeautifulSoup(html, "lxml")
            table = soup.find("table", class_="layui-table")
            if not table:
                continue
            for row in table.find_all("tr"):
                tds = row.find_all("td")
                if len(tds) >= 2:
                    ip = tds[0].get_text(strip=True)
                    port = tds[1].get_text(strip=True)
                    if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit():
                        results.append(ProxyRaw(ip, int(port), "http"))
        if results:
            logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
        return results

49
app/plugins/kuaidaili.py Normal file
View File

@@ -0,0 +1,49 @@
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
# Protocol whitelist; anything else is coerced to "http".
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")


class KuaiDaiLiPlugin(BaseHTTPPlugin):
    """Crawler for the kuaidaili.com free-proxy listing pages."""
    name = "kuaidaili"
    display_name = "快代理"
    description = "从快代理网站爬取免费代理"

    def __init__(self):
        super().__init__()
        # Ten pages each of the "inha" (domestic anonymous) and "intr"
        # (domestic transparent) sections.
        self.urls = [
            f"https://www.kuaidaili.com/free/inha/{i}/" for i in range(1, 11)
        ] + [
            f"https://www.kuaidaili.com/free/intr/{i}/" for i in range(1, 11)
        ]

    async def crawl(self) -> List[ProxyRaw]:
        """Scrape each page's proxy table; a missing table suggests anti-bot blocking."""
        results = []
        for url in self.urls:
            html = await self.fetch(url, timeout=15)
            if not html:
                continue
            soup = BeautifulSoup(html, "lxml")
            table = soup.find("table")
            if not table:
                logger.warning(f"{self.display_name} 未能找到表格,可能是触发了反爬")
                continue
            for row in table.find_all("tr"):
                tds = row.find_all("td")
                if len(tds) >= 5:
                    ip = tds[0].get_text(strip=True)
                    port = tds[1].get_text(strip=True)
                    # Fifth column carries the protocol; default to http.
                    protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else "http"
                    if protocol not in VALID_PROTOCOLS:
                        protocol = "http"
                    if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit():
                        results.append(ProxyRaw(ip, int(port), protocol))
        if results:
            logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
        return results

View File

@@ -0,0 +1,55 @@
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class ProxyListDownloadPlugin(BaseHTTPPlugin):
    """Crawler for the proxy-list.download plain-text API (all four protocols)."""
    name = "proxylist_download"
    display_name = "ProxyListDownload"
    description = "从 ProxyListDownload API 获取代理"

    def __init__(self):
        super().__init__()
        self.urls = [
            "https://www.proxy-list.download/api/v1/get?type=http",
            "https://www.proxy-list.download/api/v1/get?type=https",
            "https://www.proxy-list.download/api/v1/get?type=socks4",
            "https://www.proxy-list.download/api/v1/get?type=socks5",
        ]

    async def crawl(self) -> List[ProxyRaw]:
        """Fetch each endpoint and parse `ip:port` lines."""
        results = []
        for url in self.urls:
            html = await self.fetch(url, timeout=30)
            if not html:
                continue
            # Infer the protocol from the URL's `type` query parameter.
            if "type=socks4" in url:
                protocol = "socks4"
            elif "type=socks5" in url:
                protocol = "socks5"
            elif "type=https" in url:
                protocol = "https"
            else:
                protocol = "http"
            # The API uses CRLF line endings; fall back to LF just in case.
            lines = html.split("\r\n")
            if len(lines) <= 1:
                lines = html.split("\n")
            for line in lines:
                line = line.strip()
                if not line or ":" not in line:
                    continue
                parts = line.split(":")
                if len(parts) >= 2:
                    ip = parts[0].strip()
                    port = parts[1].strip()
                    if ip and port.isdigit():
                        results.append(ProxyRaw(ip, int(port), protocol))
        if results:
            logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
        return results

View File

@@ -0,0 +1,75 @@
"""ProxyScrape 测试爬虫 - 用于验证架构,支持全协议类型"""
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class ProxyScrapePlugin(BaseHTTPPlugin):
    """
    Fetch proxies from public GitHub lists (monosans/proxy-list).

    Covers all four protocols (http/https/socks4/socks5); also serves as a
    test of the plugin system's extensibility.
    """
    name = "proxyscrape"
    display_name = "ProxyScrape测试源"
    description = "从 ProxyScrape API 获取各类型代理HTTP/HTTPS/SOCKS4/SOCKS5用于测试架构扩展"
    enabled = True

    def __init__(self):
        super().__init__()
        # (protocol, url) pairs; GitHub raw lists are relatively stable sources.
        self.urls = [
            ("http", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt"),
            ("https", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/https.txt"),
            ("socks4", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt"),
            ("socks5", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt"),
        ]

    async def crawl(self) -> List[ProxyRaw]:
        """Download each list; if every source fails, fall back to synthetic test proxies."""
        results: List[ProxyRaw] = []
        for protocol, url in self.urls:
            try:
                html = await self.fetch(url, timeout=30)
                if not html:
                    logger.warning(f"ProxyScrape {protocol.upper()} 返回空内容")
                    continue
                count = 0
                for line in html.splitlines():
                    line = line.strip()
                    if not line or ":" not in line:
                        continue
                    parts = line.split(":")
                    if len(parts) >= 2:
                        ip = parts[0].strip()
                        port_str = parts[1].strip()
                        if port_str.isdigit():
                            results.append(ProxyRaw(ip, int(port_str), protocol))
                            count += 1
                logger.info(f"ProxyScrape {protocol.upper()} 获取 {count} 个代理")
            except Exception as e:
                logger.error(f"ProxyScrape {protocol.upper()} 爬取失败: {e}")
        if results:
            logger.info(f"ProxyScrape 总计获取 {len(results)} 个代理")
        else:
            # Fallback: generate synthetic proxies so the full pipeline can
            # still be exercised in offline/test environments.
            logger.warning("ProxyScrape 所有真实源均不可用,生成测试代理用于架构验证")
            results = self._generate_test_proxies()
        return results

    def _generate_test_proxies(self) -> List[ProxyRaw]:
        """Generate random test proxies (3 per protocol) to validate the plugin pipeline."""
        import random
        test_proxies = []
        protocols = ["http", "https", "socks4", "socks5"]
        for protocol in protocols:
            for _ in range(3):
                # Random public-looking IPs — for exercising the flow only.
                ip = f"{random.randint(1, 223)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 254)}"
                port = random.randint(1024, 65535)
                test_proxies.append(ProxyRaw(ip, port, protocol))
        logger.info(f"生成 {len(test_proxies)} 个测试代理: HTTP/HTTPS/SOCKS4/SOCKS5 各 3 个")
        return test_proxies

51
app/plugins/speedx.py Normal file
View File

@@ -0,0 +1,51 @@
import re
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class SpeedXPlugin(BaseHTTPPlugin):
    """Crawler for the TheSpeedX/SOCKS-List GitHub proxy lists."""
    name = "speedx"
    display_name = "SpeedX代理源"
    description = "从 SpeedX GitHub 仓库获取 SOCKS 代理列表"

    def __init__(self):
        super().__init__()
        self.urls = [
            "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
            "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt",
            "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt",
        ]

    async def crawl(self) -> List[ProxyRaw]:
        """Download each list and parse valid `ip:port` lines into ProxyRaw items."""
        found = []
        ip_pattern = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")
        for source in self.urls:
            body = await self.fetch(source, timeout=30)
            if not body:
                continue
            # The file name embedded in the URL determines the protocol.
            if "socks5" in source:
                proto = "socks5"
            elif "socks4" in source:
                proto = "socks4"
            else:
                proto = "http"
            for raw_line in body.split("\n"):
                raw_line = raw_line.strip()
                if not raw_line or ":" not in raw_line:
                    continue
                pieces = raw_line.split(":")
                if len(pieces) < 2:
                    continue
                host = pieces[0].strip()
                port_text = pieces[1].strip()
                if not ip_pattern.match(host):
                    continue
                if not port_text.isdigit() or not (1 <= int(port_text) <= 65535):
                    continue
                found.append(ProxyRaw(host, int(port_text), proto))
        if found:
            logger.info(f"{self.display_name} 解析完成,获得 {len(found)} 个潜在代理")
        return found

51
app/plugins/yundaili.py Normal file
View File

@@ -0,0 +1,51 @@
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
# Protocol whitelist; anything else is coerced to "http".
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")


class YunDaiLiPlugin(BaseHTTPPlugin):
    """Crawler advertised as "YunDaiLi" (cloud proxy)."""
    name = "yundaili"
    display_name = "云代理"
    description = "从云代理网站爬取免费代理"

    def __init__(self):
        super().__init__()
        # NOTE(review): these URLs point at ip3366.net and duplicate
        # Ip3366Plugin exactly — looks like a copy-paste placeholder;
        # confirm the intended yundaili source.
        self.urls = [
            f"http://www.ip3366.net/free/?stype=1&page={i}" for i in range(1, 6)
        ] + [
            f"http://www.ip3366.net/free/?stype=2&page={i}" for i in range(1, 6)
        ]

    async def crawl(self) -> List[ProxyRaw]:
        """Scrape each listing page's proxy table into ProxyRaw entries."""
        results = []
        for url in self.urls:
            html = await self.fetch(url, timeout=15)
            if not html:
                continue
            soup = BeautifulSoup(html, "lxml")
            list_table = soup.find("div", id="list")
            if not list_table:
                continue
            table = list_table.find("table")
            if not table:
                continue
            for row in table.find_all("tr"):
                tds = row.find_all("td")
                if len(tds) >= 5:
                    ip = tds[0].get_text(strip=True)
                    port = tds[1].get_text(strip=True)
                    # Fifth column carries the protocol; default to http.
                    protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else "http"
                    if protocol not in VALID_PROTOCOLS:
                        protocol = "http"
                    if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit():
                        results.append(ProxyRaw(ip, int(port), protocol))
        if results:
            logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
        return results

View File

@@ -0,0 +1,11 @@
"""数据访问层包"""
from .proxy_repo import ProxyRepository
from .settings_repo import SettingsRepository, PluginSettingsRepository
from .task_repo import ValidationTaskRepository
__all__ = [
"ProxyRepository",
"SettingsRepository",
"PluginSettingsRepository",
"ValidationTaskRepository",
]

View File

@@ -0,0 +1,277 @@
"""代理数据访问层 - 所有 SQL 操作收敛于此"""
import aiosqlite
from datetime import datetime, timedelta
from typing import List, Optional, Tuple, Union
from app.models.domain import Proxy
from app.core.log import logger
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")
def _to_datetime(value: Union[str, datetime, None]) -> Optional[datetime]:
if value is None:
return None
if isinstance(value, datetime):
return value
if isinstance(value, str):
for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f"):
try:
return datetime.strptime(value, fmt)
except ValueError:
continue
return None
def _row_to_proxy(row: Tuple) -> Proxy:
    """Map a SELECT row to a Proxy.

    Expected column order: (ip, port, protocol, score, response_time_ms,
    last_check, created_at) — matches every SELECT in this module.
    """
    return Proxy(
        ip=row[0],
        port=row[1],
        protocol=row[2],
        score=row[3],
        response_time_ms=row[4],
        last_check=_to_datetime(row[5]),
        created_at=_to_datetime(row[6]),
    )
class ProxyRepository:
    """Data-access layer for the `proxies` table.

    All methods are static, take an open aiosqlite connection, and commit
    before returning on write paths.
    """

    @staticmethod
    async def insert_or_update(
        db: aiosqlite.Connection,
        ip: str,
        port: int,
        protocol: str = "http",
        score: int = 10,
    ) -> bool:
        """Upsert a proxy keyed on (ip, port); refreshes protocol/score/last_check.

        Returns True on success, False on any DB error (logged).
        """
        if protocol not in VALID_PROTOCOLS:
            protocol = "http"  # defensive fallback for unknown protocols
        try:
            await db.execute(
                """
                INSERT INTO proxies (ip, port, protocol, score, last_check, created_at)
                VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
                ON CONFLICT(ip, port) DO UPDATE SET
                    protocol = excluded.protocol,
                    score = excluded.score,
                    last_check = CURRENT_TIMESTAMP
                """,
                (ip, port, protocol, score),
            )
            await db.commit()
            return True
        except Exception as e:
            logger.error(f"insert_or_update proxy failed: {e}")
            return False

    @staticmethod
    async def update_score(
        db: aiosqlite.Connection,
        ip: str,
        port: int,
        delta: int,
        min_score: int = 0,
        max_score: int = 100,
    ) -> bool:
        """Adjust a proxy's score by `delta`, clamped to [min_score, max_score].

        Returns False if the proxy does not exist or the update fails.
        """
        try:
            async with db.execute(
                "SELECT score FROM proxies WHERE ip = ? AND port = ?", (ip, port)
            ) as cursor:
                row = await cursor.fetchone()
            if not row:
                return False
            current_score = row[0]
            new_score = max(min_score, min(max_score, current_score + delta))
            await db.execute(
                "UPDATE proxies SET score = ?, last_check = CURRENT_TIMESTAMP WHERE ip = ? AND port = ?",
                (new_score, ip, port),
            )
            if new_score <= 0:
                # NOTE(review): this purges *every* zero-score proxy, not only
                # this one — presumably opportunistic cleanup; confirm intent.
                await db.execute("DELETE FROM proxies WHERE score <= 0")
            await db.commit()
            return True
        except Exception as e:
            logger.error(f"update_score failed: {e}")
            return False

    @staticmethod
    async def update_response_time(
        db: aiosqlite.Connection,
        ip: str,
        port: int,
        response_time_ms: float,
    ) -> bool:
        """Record the latest measured latency for a proxy."""
        try:
            await db.execute(
                "UPDATE proxies SET response_time_ms = ? WHERE ip = ? AND port = ?",
                (response_time_ms, ip, port),
            )
            await db.commit()
            return True
        except Exception as e:
            logger.error(f"update_response_time failed: {e}")
            return False

    @staticmethod
    async def delete(db: aiosqlite.Connection, ip: str, port: int) -> None:
        """Delete a single proxy (no-op if it does not exist)."""
        await db.execute("DELETE FROM proxies WHERE ip = ? AND port = ?", (ip, port))
        await db.commit()

    @staticmethod
    async def batch_delete(db: aiosqlite.Connection, proxies: List[Tuple[str, int]]) -> int:
        """Delete many (ip, port) pairs; returns the number of pairs requested."""
        if not proxies:
            return 0
        await db.executemany("DELETE FROM proxies WHERE ip = ? AND port = ?", proxies)
        await db.commit()
        return len(proxies)

    @staticmethod
    async def get_by_ip_port(
        db: aiosqlite.Connection, ip: str, port: int
    ) -> Optional[Proxy]:
        """Fetch one proxy by its (ip, port) key, or None."""
        async with db.execute(
            "SELECT ip, port, protocol, score, response_time_ms, last_check, created_at FROM proxies WHERE ip = ? AND port = ?",
            (ip, port),
        ) as cursor:
            row = await cursor.fetchone()
        if row:
            return _row_to_proxy(row)
        return None

    @staticmethod
    async def get_random(db: aiosqlite.Connection) -> Optional[Proxy]:
        """Return a random proxy with a positive score, or None if none exist."""
        async with db.execute(
            "SELECT ip, port, protocol, score, response_time_ms, last_check, created_at FROM proxies WHERE score > 0 ORDER BY RANDOM() LIMIT 1"
        ) as cursor:
            row = await cursor.fetchone()
        if row:
            return _row_to_proxy(row)
        return None

    @staticmethod
    async def list_all(
        db: aiosqlite.Connection,
        protocol: Optional[str] = None,
        limit: int = 100000,
    ) -> List[Proxy]:
        """List proxies (optionally filtered by protocol) up to `limit` rows."""
        query = "SELECT ip, port, protocol, score, response_time_ms, last_check, created_at FROM proxies"
        params: List = []
        if protocol:
            query += " WHERE protocol = ?"
            params.append(protocol.lower())
        query += " LIMIT ?"
        params.append(limit)
        async with db.execute(query, params) as cursor:
            rows = await cursor.fetchall()
        return [_row_to_proxy(row) for row in rows]

    @staticmethod
    async def list_paginated(
        db: aiosqlite.Connection,
        page: int = 1,
        page_size: int = 20,
        protocol: Optional[str] = None,
        min_score: int = 0,
        max_score: Optional[int] = None,
        sort_by: str = "last_check",
        sort_order: str = "DESC",
    ) -> Tuple[List[Proxy], int]:
        """Return one page of proxies plus the total row count for the filter.

        `sort_by`/`sort_order` are interpolated into ORDER BY — callers must
        pass whitelisted values (the API layer validates them).
        """
        conditions = ["score >= ?"]
        params: List = [min_score]
        if protocol:
            conditions.append("protocol = ?")
            params.append(protocol)
        if max_score is not None:
            conditions.append("score <= ?")
            params.append(max_score)
        where_clause = " AND ".join(conditions)
        order_clause = f"{sort_by} {sort_order}"
        offset = (page - 1) * page_size
        count_query = f"SELECT COUNT(*) FROM proxies WHERE {where_clause}"
        async with db.execute(count_query, list(params)) as cursor:
            row = await cursor.fetchone()
        total = row[0] if row else 0
        data_query = f"""
            SELECT ip, port, protocol, score, response_time_ms, last_check, created_at
            FROM proxies
            WHERE {where_clause}
            ORDER BY {order_clause}
            LIMIT ? OFFSET ?
        """
        params.extend([page_size, offset])
        async with db.execute(data_query, params) as cursor:
            rows = await cursor.fetchall()
        proxies = [_row_to_proxy(row) for row in rows]
        return proxies, total

    @staticmethod
    async def get_stats(db: aiosqlite.Connection) -> dict:
        """Aggregate counts and average score across all proxies (one query)."""
        query = """
            SELECT
                COUNT(*) as total,
                COUNT(CASE WHEN score > 0 THEN 1 END) as available,
                AVG(score) as avg_score,
                COUNT(CASE WHEN protocol = 'http' THEN 1 END) as http_count,
                COUNT(CASE WHEN protocol = 'https' THEN 1 END) as https_count,
                COUNT(CASE WHEN protocol = 'socks4' THEN 1 END) as socks4_count,
                COUNT(CASE WHEN protocol = 'socks5' THEN 1 END) as socks5_count
            FROM proxies
        """
        async with db.execute(query) as cursor:
            row = await cursor.fetchone()
        if row:
            return {
                "total": row[0] or 0,
                "available": row[1] or 0,
                "avg_score": round(row[2], 2) if row[2] else 0,
                "http_count": row[3] or 0,
                "https_count": row[4] or 0,
                "socks4_count": row[5] or 0,
                "socks5_count": row[6] or 0,
            }
        return {
            "total": 0,
            "available": 0,
            "avg_score": 0,
            "http_count": 0,
            "https_count": 0,
            "socks4_count": 0,
            "socks5_count": 0,
        }

    @staticmethod
    async def get_today_new_count(db: aiosqlite.Connection) -> int:
        """Count proxies whose last_check falls on today (local time).

        NOTE(review): despite the name this filters on last_check, not
        created_at — i.e. "active today", not "created today". Confirm intent.
        """
        try:
            async with db.execute(
                "SELECT COUNT(*) FROM proxies WHERE DATE(last_check) = DATE('now', 'localtime')"
            ) as cursor:
                row = await cursor.fetchone()
            return row[0] if row else 0
        except Exception as e:
            logger.error(f"get_today_new_count failed: {e}")
            return 0

    @staticmethod
    async def clean_invalid(db: aiosqlite.Connection) -> int:
        """Delete proxies whose score dropped to 0 or below; returns rows removed."""
        cursor = await db.execute("DELETE FROM proxies WHERE score <= 0")
        await db.commit()
        # Fix: cursor.rowcount counts only this DELETE; db.total_changes is the
        # connection-lifetime change counter and over-reports.
        return cursor.rowcount

    @staticmethod
    async def clean_expired(db: aiosqlite.Connection, days: int) -> int:
        """Delete proxies not checked within `days` days; returns rows removed."""
        try:
            cursor = await db.execute(
                # Fix: bind the interval instead of formatting it into the SQL,
                # and report this statement's rowcount (not db.total_changes).
                "DELETE FROM proxies WHERE last_check < datetime('now', ?)",
                (f"-{int(days)} days",),
            )
            await db.commit()
            return cursor.rowcount
        except Exception as e:
            logger.error(f"clean_expired failed: {e}")
            return 0

View File

@@ -0,0 +1,140 @@
"""设置数据访问层"""
import json
import aiosqlite
from typing import Optional, Dict, Any
from app.core.log import logger
# Fallback values used when a key is missing from the `settings` table;
# also used to infer each value's type when reading rows back (stored as text).
DEFAULT_SETTINGS = {
    "crawl_timeout": 30,
    "validation_timeout": 10,
    "max_retries": 3,
    "default_concurrency": 50,
    "min_proxy_score": 0,
    "proxy_expiry_days": 7,
    "auto_validate": True,
    "validate_interval_minutes": 30,
}
class SettingsRepository:
    """Repository for global settings (stored in `settings` as text key/value rows)."""

    @staticmethod
    async def get_all(db: aiosqlite.Connection) -> Dict[str, Any]:
        """Return all settings, overlaying stored rows on DEFAULT_SETTINGS.

        Stored values are strings and are coerced to the type of the matching
        default. Fix: a single malformed row no longer aborts reading the
        remaining rows — it is logged and that key keeps its default.
        """
        settings = DEFAULT_SETTINGS.copy()
        try:
            async with db.execute("SELECT key, value FROM settings") as cursor:
                rows = await cursor.fetchall()
            for key, value in rows:
                default = DEFAULT_SETTINGS.get(key)
                try:
                    if isinstance(default, bool):
                        # bools are persisted via str(True/False)
                        settings[key] = value.lower() == "true"
                    elif isinstance(default, int):
                        settings[key] = int(value)
                    else:
                        settings[key] = value
                except (ValueError, AttributeError) as e:
                    logger.error(f"invalid setting value for {key!r}: {e}")
        except Exception as e:
            logger.error(f"get_all settings failed: {e}")
        return settings

    @staticmethod
    async def save(db: aiosqlite.Connection, settings: Dict[str, Any]) -> bool:
        """Upsert each setting as its string representation; single commit at the end."""
        try:
            for key, value in settings.items():
                await db.execute(
                    """
                    INSERT INTO settings (key, value, updated_at)
                    VALUES (?, ?, CURRENT_TIMESTAMP)
                    ON CONFLICT(key) DO UPDATE SET
                        value = excluded.value,
                        updated_at = CURRENT_TIMESTAMP
                    """,
                    (key, str(value)),
                )
            await db.commit()
            return True
        except Exception as e:
            logger.error(f"save settings failed: {e}")
            return False
class PluginSettingsRepository:
    """Repository for per-plugin persisted state (enabled flag + JSON config)."""

    @staticmethod
    async def get_enabled(db: aiosqlite.Connection, plugin_id: str) -> Optional[bool]:
        """Return the stored enabled flag, or None if the plugin has no row."""
        async with db.execute(
            "SELECT enabled FROM plugin_settings WHERE plugin_id = ?", (plugin_id,)
        ) as cursor:
            row = await cursor.fetchone()
        if row:
            return bool(row[0])
        return None

    @staticmethod
    async def set_enabled(db: aiosqlite.Connection, plugin_id: str, enabled: bool) -> bool:
        """Upsert the enabled flag (stored as 0/1). Returns False on DB error."""
        try:
            await db.execute(
                """
                INSERT INTO plugin_settings (plugin_id, enabled, created_at, updated_at)
                VALUES (?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
                ON CONFLICT(plugin_id) DO UPDATE SET
                    enabled = excluded.enabled,
                    updated_at = CURRENT_TIMESTAMP
                """,
                (plugin_id, int(enabled)),
            )
            await db.commit()
            return True
        except Exception as e:
            logger.error(f"set_enabled failed for {plugin_id}: {e}")
            return False

    @staticmethod
    async def get_config(db: aiosqlite.Connection, plugin_id: str) -> Optional[Dict[str, Any]]:
        """Return the plugin's JSON config as a dict, or None if absent/corrupt."""
        async with db.execute(
            "SELECT config_json FROM plugin_settings WHERE plugin_id = ?", (plugin_id,)
        ) as cursor:
            row = await cursor.fetchone()
        if row and row[0]:
            try:
                return json.loads(row[0])
            except json.JSONDecodeError:
                return None
        return None

    @staticmethod
    async def set_config(db: aiosqlite.Connection, plugin_id: str, config: Dict[str, Any]) -> bool:
        """Upsert the plugin's JSON config. Returns False on DB error.

        NOTE(review): unlike set_enabled, this INSERT omits `enabled` and
        `created_at` — if those columns are NOT NULL without defaults the
        first insert for a plugin will fail; confirm the table schema.
        """
        try:
            await db.execute(
                """
                INSERT INTO plugin_settings (plugin_id, config_json, updated_at)
                VALUES (?, ?, CURRENT_TIMESTAMP)
                ON CONFLICT(plugin_id) DO UPDATE SET
                    config_json = excluded.config_json,
                    updated_at = CURRENT_TIMESTAMP
                """,
                (plugin_id, json.dumps(config, ensure_ascii=False)),
            )
            await db.commit()
            return True
        except Exception as e:
            logger.error(f"set_config failed for {plugin_id}: {e}")
            return False

    @staticmethod
    async def list_all(db: aiosqlite.Connection) -> Dict[str, Dict[str, Any]]:
        """Return {plugin_id: {"enabled": bool, "config": dict}} for all rows.

        Corrupt config JSON degrades to an empty dict rather than raising.
        """
        result = {}
        async with db.execute("SELECT plugin_id, enabled, config_json FROM plugin_settings") as cursor:
            rows = await cursor.fetchall()
        for plugin_id, enabled, config_json in rows:
            config = {}
            if config_json:
                try:
                    config = json.loads(config_json)
                except json.JSONDecodeError:
                    pass
            result[plugin_id] = {"enabled": bool(enabled), "config": config}
        return result

View File

@@ -0,0 +1,135 @@
"""验证任务队列持久化层"""
import aiosqlite
from typing import List, Optional
from app.models.domain import ProxyRaw
from app.core.log import logger
class ValidationTaskRepository:
    """Repository for the persistent validation queue (`validation_tasks` table)."""

    @staticmethod
    async def insert_batch(db: aiosqlite.Connection, proxies: List[ProxyRaw]) -> int:
        """Insert proxies as 'pending' tasks; returns the number inserted (0 on error)."""
        if not proxies:
            return 0
        try:
            rows = [(p.ip, p.port, p.protocol) for p in proxies]
            await db.executemany(
                """
                INSERT INTO validation_tasks (ip, port, protocol, status, created_at)
                VALUES (?, ?, ?, 'pending', CURRENT_TIMESTAMP)
                """,
                rows,
            )
            await db.commit()
            return len(rows)
        except Exception as e:
            logger.error(f"insert_batch validation tasks failed: {e}")
            return 0

    @staticmethod
    async def acquire_pending(db: aiosqlite.Connection) -> Optional[dict]:
        """Fetch the oldest pending task and mark it 'processing'.

        Returns {"id", "ip", "port", "protocol"} or None when the queue is
        empty. NOTE: the SELECT + UPDATE pair is not atomic by itself —
        callers must serialize access (ValidationQueue holds a lock here).
        """
        try:
            async with db.execute(
                """
                SELECT id, ip, port, protocol FROM validation_tasks
                WHERE status = 'pending'
                ORDER BY id ASC
                LIMIT 1
                """
            ) as cursor:
                row = await cursor.fetchone()
            if not row:
                return None
            task_id = row[0]
            await db.execute(
                "UPDATE validation_tasks SET status = 'processing', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (task_id,),
            )
            await db.commit()
            return {"id": task_id, "ip": row[1], "port": row[2], "protocol": row[3]}
        except Exception as e:
            logger.error(f"acquire_pending failed: {e}")
            return None

    @staticmethod
    async def complete_task(
        db: aiosqlite.Connection,
        task_id: int,
        is_valid: bool,
        response_time_ms: Optional[float] = None,
    ) -> bool:
        """Mark a task 'completed' and store its valid/invalid result + latency."""
        try:
            await db.execute(
                """
                UPDATE validation_tasks
                SET status = 'completed',
                    result = ?,
                    response_time_ms = ?,
                    updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
                """,
                ("valid" if is_valid else "invalid", response_time_ms, task_id),
            )
            await db.commit()
            return True
        except Exception as e:
            logger.error(f"complete_task failed: {e}")
            return False

    @staticmethod
    async def fail_task(db: aiosqlite.Connection, task_id: int) -> bool:
        """Mark a task 'failed' (result 'invalid')."""
        try:
            await db.execute(
                """
                UPDATE validation_tasks
                SET status = 'failed',
                    result = 'invalid',
                    updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
                """,
                (task_id,),
            )
            await db.commit()
            return True
        except Exception as e:
            logger.error(f"fail_task failed: {e}")
            return False

    @staticmethod
    async def get_pending_count(db: aiosqlite.Connection) -> int:
        """Number of tasks still waiting in 'pending' state."""
        async with db.execute(
            "SELECT COUNT(*) FROM validation_tasks WHERE status = 'pending'"
        ) as cursor:
            row = await cursor.fetchone()
        return row[0] if row else 0

    @staticmethod
    async def reset_processing(db: aiosqlite.Connection) -> int:
        """Reset interrupted 'processing' tasks back to 'pending' (startup recovery)."""
        try:
            cursor = await db.execute(
                """
                UPDATE validation_tasks
                SET status = 'pending', updated_at = CURRENT_TIMESTAMP
                WHERE status = 'processing'
                """
            )
            await db.commit()
            # Fix: report this statement's row count; db.total_changes is the
            # connection-lifetime change counter and over-reports.
            return cursor.rowcount
        except Exception as e:
            logger.error(f"reset_processing failed: {e}")
            return 0

    @staticmethod
    async def cleanup_old(db: aiosqlite.Connection, days: int = 7) -> int:
        """Delete tasks last updated more than `days` days ago; returns rows removed."""
        try:
            cursor = await db.execute(
                # Fix: bind the interval instead of formatting it into the SQL,
                # and return this DELETE's rowcount (not db.total_changes).
                "DELETE FROM validation_tasks WHERE updated_at < datetime('now', ?)",
                (f"-{int(days)} days",),
            )
            await db.commit()
            return cursor.rowcount
        except Exception as e:
            logger.error(f"cleanup_old tasks failed: {e}")
            return 0

12
app/services/__init__.py Normal file
View File

@@ -0,0 +1,12 @@
"""业务逻辑层包"""
from .proxy_service import ProxyService
from .plugin_service import PluginService
from .scheduler_service import SchedulerService
from .validator_service import ValidatorService
__all__ = [
"ProxyService",
"PluginService",
"SchedulerService",
"ValidatorService",
]

View File

@@ -0,0 +1,139 @@
"""插件业务服务"""
from datetime import datetime
from typing import List, Optional
from app.core.db import get_db
from app.core.plugin_system.registry import registry
from app.core.plugin_system.base import BaseCrawlerPlugin
from app.repositories.settings_repo import PluginSettingsRepository
from app.models.domain import PluginInfo, ProxyRaw
from app.core.log import logger
class PluginService:
    """Plugin business service: lifecycle management, crawl execution and configuration.

    Persistent state (enabled flag, config overrides) lives in
    PluginSettingsRepository; run statistics are kept in memory only.
    """
    def __init__(self):
        # Repository for persisted per-plugin enabled/config state.
        self.plugin_settings_repo = PluginSettingsRepository()
        # In-memory crawl statistics keyed by plugin name. Not persisted:
        # counters and last_run reset when the process restarts.
        self._stats: dict[str, dict] = {}
    async def list_plugins(self) -> List[PluginInfo]:
        """Return info for all registered plugins (persisted state and config merged in)."""
        async with get_db() as db:
            db_states = await self.plugin_settings_repo.list_all(db)
        result = []
        for plugin in registry.list_plugins():
            # Merge persisted state into the in-memory plugin object.
            state = db_states.get(plugin.name, {})
            if "enabled" in state:
                plugin.enabled = state["enabled"]
            if "config" in state and isinstance(state["config"], dict):
                plugin.update_config(state["config"])
            # Fall back to zeroed stats for plugins that have never run.
            stat = self._stats.get(plugin.name, {
                "success_count": 0,
                "failure_count": 0,
                "last_run": None,
            })
            result.append(PluginInfo(
                id=plugin.name,
                name=plugin.name,
                display_name=plugin.display_name or plugin.name,
                description=plugin.description or f"{plugin.name} 爬取代理",
                enabled=plugin.enabled,
                last_run=stat.get("last_run"),
                success_count=stat.get("success_count", 0),
                failure_count=stat.get("failure_count", 0),
            ))
        return result
    async def toggle_plugin(self, plugin_id: str, enabled: bool) -> bool:
        """Enable/disable a plugin and persist the flag.

        Returns False for an unknown plugin or a failed write; the in-memory
        flag is only flipped after the persistence succeeds.
        """
        plugin = registry.get(plugin_id)
        if not plugin:
            return False
        async with get_db() as db:
            success = await self.plugin_settings_repo.set_enabled(db, plugin_id, enabled)
        if success:
            plugin.enabled = enabled
            logger.info(f"Plugin {plugin_id} toggled to {enabled}")
        return success
    async def get_plugin_config(self, plugin_id: str) -> Optional[dict]:
        """Return the plugin's effective config (defaults overlaid with persisted values).

        None for an unknown plugin.
        """
        plugin = registry.get(plugin_id)
        if not plugin:
            return None
        async with get_db() as db:
            saved = await self.plugin_settings_repo.get_config(db, plugin_id)
        # Copy defaults so callers cannot mutate plugin.default_config.
        config = dict(plugin.default_config)
        if saved:
            config.update(saved)
        return config
    async def update_plugin_config(self, plugin_id: str, config: dict) -> bool:
        """Update a plugin's config, keeping only keys present in default_config.

        Returns False for an unknown plugin or when nothing valid remains
        after filtering.
        """
        plugin = registry.get(plugin_id)
        if not plugin:
            return False
        # Drop unknown keys so callers cannot inject arbitrary settings.
        safe_config = {k: v for k, v in config.items() if k in plugin.default_config}
        if not safe_config:
            return False
        plugin.update_config(safe_config)
        async with get_db() as db:
            # Persist the merged config as the plugin now holds it.
            return await self.plugin_settings_repo.set_config(db, plugin_id, plugin.config)
    def get_plugin(self, plugin_id: str) -> Optional[BaseCrawlerPlugin]:
        """Look up a plugin instance by id; None when not registered."""
        return registry.get(plugin_id)
    async def run_plugin(self, plugin_id: str) -> List[ProxyRaw]:
        """Run a single plugin's crawl.

        Raises ValueError for an unknown plugin. Returns [] when the plugin
        is disabled or its crawl raises (failure recorded in stats, logged).
        """
        plugin = self.get_plugin(plugin_id)
        if not plugin:
            raise ValueError(f"Plugin {plugin_id} not found")
        if not plugin.enabled:
            logger.warning(f"Plugin {plugin_id} is disabled, skip crawl")
            return []
        try:
            results = await plugin.crawl()
            # NOTE: a crawl that returns 0 proxies still counts as a "run"
            # only via success=0 — last_run is then left unchanged (see
            # _record_stat's `if success or failure` guard).
            self._record_stat(plugin_id, success=len(results))
            logger.info(f"Plugin {plugin_id} crawled {len(results)} proxies")
            return results
        except Exception as e:
            self._record_stat(plugin_id, failure=1)
            logger.error(f"Plugin {plugin_id} crawl failed: {e}")
            return []
    async def run_all_plugins(self) -> List[ProxyRaw]:
        """Run every enabled plugin and return order-preserving de-duplicated results."""
        all_results: List[ProxyRaw] = []
        for plugin in registry.list_plugins():
            if not plugin.enabled:
                continue
            try:
                results = await self.run_plugin(plugin.name)
                all_results.extend(results)
            except Exception as e:
                # run_plugin already swallows crawl errors; this guards the
                # ValueError path and anything unexpected so one plugin
                # cannot abort the whole sweep.
                logger.error(f"Run all plugins error at {plugin.name}: {e}")
        # De-duplicate on (ip, port, protocol) while preserving first-seen order.
        seen = set()
        unique = []
        for p in all_results:
            key = (p.ip, p.port, p.protocol)
            if key not in seen:
                seen.add(key)
                unique.append(p)
        return unique
    def _record_stat(self, plugin_id: str, success: int = 0, failure: int = 0):
        # Lazily create the stats slot for this plugin.
        if plugin_id not in self._stats:
            self._stats[plugin_id] = {
                "success_count": 0,
                "failure_count": 0,
                "last_run": None,
            }
        self._stats[plugin_id]["success_count"] += success
        self._stats[plugin_id]["failure_count"] += failure
        # Only stamp last_run when something actually happened.
        if success or failure:
            self._stats[plugin_id]["last_run"] = datetime.now()

View File

@@ -0,0 +1,93 @@
"""代理业务服务"""
import csv
import json
import io
from datetime import datetime
from typing import List, Optional, Tuple, AsyncIterator
from app.core.db import get_db
from app.repositories.proxy_repo import ProxyRepository
from app.models.domain import Proxy
from app.core.log import logger
class ProxyService:
    """Proxy business service: statistics, listing, deletion, cleanup and export."""

    def __init__(self, proxy_repo: Optional[ProxyRepository] = None):
        # Bug fix: the previous signature `proxy_repo: ProxyRepository =
        # ProxyRepository()` evaluated the default ONCE at class-definition
        # time, so every ProxyService() silently shared a single repository
        # instance (mutable-default pitfall). Default to None and construct
        # a fresh repository per instance; callers passing their own repo
        # are unaffected.
        self.proxy_repo = proxy_repo if proxy_repo is not None else ProxyRepository()

    async def get_stats(self) -> dict:
        """Return aggregate proxy stats, augmented with today's new-proxy count."""
        async with get_db() as db:
            stats = await self.proxy_repo.get_stats(db)
            stats["today_new"] = await self.proxy_repo.get_today_new_count(db)
            return stats

    async def list_proxies(
        self,
        page: int = 1,
        page_size: int = 20,
        protocol: Optional[str] = None,
        min_score: int = 0,
        max_score: Optional[int] = None,
        sort_by: str = "last_check",
        sort_order: str = "DESC",
    ) -> Tuple[List[Proxy], int]:
        """Paginated listing with optional protocol/score filters and sorting.

        Returns (proxies_on_page, total_count).
        """
        async with get_db() as db:
            return await self.proxy_repo.list_paginated(
                db, page, page_size, protocol, min_score, max_score, sort_by, sort_order
            )

    async def get_random_proxy(self) -> Optional[Proxy]:
        """Return one random proxy, or None when none are available."""
        async with get_db() as db:
            return await self.proxy_repo.get_random(db)

    async def delete_proxy(self, ip: str, port: int) -> None:
        """Delete the proxy identified by (ip, port)."""
        async with get_db() as db:
            await self.proxy_repo.delete(db, ip, port)

    async def batch_delete(self, proxies: List[Tuple[str, int]]) -> int:
        """Delete many proxies by (ip, port); returns the number removed."""
        async with get_db() as db:
            return await self.proxy_repo.batch_delete(db, proxies)

    async def clean_invalid(self) -> int:
        """Remove invalid proxies; returns the number removed."""
        async with get_db() as db:
            return await self.proxy_repo.clean_invalid(db)

    async def clean_expired(self, days: int) -> int:
        """Remove proxies older than ``days`` days; returns the number removed."""
        async with get_db() as db:
            return await self.proxy_repo.clean_expired(db, days)

    async def export_proxies(
        self,
        fmt: str,
        protocol: Optional[str] = None,
        limit: int = 10000,
    ) -> AsyncIterator[str]:
        """Stream proxies as text chunks in 'csv', 'txt' or 'json' format.

        An unrecognized ``fmt`` yields nothing (empty export); callers are
        expected to validate the format beforehand. The DB connection is
        released before streaming begins.
        """
        async with get_db() as db:
            proxies = await self.proxy_repo.list_all(db, protocol=protocol, limit=limit)
        if fmt == "csv":
            yield "IP,Port,Protocol,Score,Last Check\n"
            for p in proxies:
                yield f"{p.ip},{p.port},{p.protocol},{p.score},{self._fmt_time(p.last_check)}\n"
        elif fmt == "txt":
            for p in proxies:
                yield f"{p.ip}:{p.port}\n"
        elif fmt == "json":
            data = [
                {
                    "ip": p.ip,
                    "port": p.port,
                    "protocol": p.protocol,
                    "score": p.score,
                    "last_check": self._fmt_time(p.last_check),
                }
                for p in proxies
            ]
            yield json.dumps(data, ensure_ascii=False, indent=2)

    @staticmethod
    def _fmt_time(dt: Optional[datetime]) -> str:
        """Render a timestamp as ISO-8601; strings pass through, None becomes ''."""
        if not dt:
            return ""
        if isinstance(dt, str):
            return dt
        return dt.isoformat()

View File

@@ -0,0 +1,88 @@
"""调度器服务 - 定时验证存量代理"""
import asyncio
from datetime import datetime
from app.core.db import get_db
from app.repositories.proxy_repo import ProxyRepository
from app.core.tasks.queue import ValidationQueue
from app.core.config import settings as app_settings
from app.core.log import logger
class SchedulerService:
    """Scheduler that periodically re-validates every stored proxy."""

    def __init__(
        self,
        validation_queue: ValidationQueue,
        proxy_repo: ProxyRepository | None = None,
    ):
        self.validation_queue = validation_queue
        # Bug fix: the previous default `ProxyRepository()` was created once
        # at class-definition time and shared across all scheduler instances
        # (mutable-default pitfall). Build a fresh one per instance instead.
        self.proxy_repo = proxy_repo if proxy_repo is not None else ProxyRepository()
        # Minutes to wait between full validation sweeps.
        self.interval_minutes = 30
        self.running = False
        self._task: asyncio.Task | None = None
        # Most recent manually-triggered validation task; kept so it is not
        # garbage-collected while still running.
        self._manual_task: asyncio.Task | None = None

    async def start(self):
        """Start the validation queue and the periodic loop (no-op if running)."""
        if self.running:
            logger.warning("Scheduler already running")
            return
        self.running = True
        await self.validation_queue.start()
        self._task = asyncio.create_task(self._run_loop())
        logger.info("Scheduler started")

    async def stop(self):
        """Cancel the loop, wait for it to unwind, then stop the queue."""
        self.running = False
        if self._task:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            self._task = None
        await self.validation_queue.stop()
        logger.info("Scheduler stopped")

    async def validate_all_now(self):
        """Trigger one full validation immediately in the background (non-blocking)."""
        # Bug fix: asyncio holds only weak references to tasks, so the bare
        # create_task() result could be garbage-collected before finishing.
        # Keep a reference on the instance.
        self._manual_task = asyncio.create_task(self._do_validate_all())

    async def _run_loop(self):
        """Periodic loop: run one full validation, then sleep interval_minutes."""
        while self.running:
            try:
                await self._do_validate_all()
            except Exception as e:
                logger.error(f"Scheduler loop error: {e}")
            # Sleep in 1-second slices so stop() takes effect promptly.
            for _ in range(self.interval_minutes * 60):
                if not self.running:
                    break
                await asyncio.sleep(1)

    async def _do_validate_all(self):
        """Validate every proxy currently stored in the database."""
        logger.info("Starting scheduled validation for all proxies")
        async with get_db() as db:
            proxies = await self.proxy_repo.list_all(db)
        if not proxies:
            logger.info("No proxies to validate")
            return
        logger.info(f"Validating {len(proxies)} proxies from database")
        from app.models.domain import ProxyRaw
        # Submit in batches and drain between them so the queue's memory
        # footprint stays bounded.
        # NOTE(review): batches are skipped when self.running is False —
        # confirm manual runs are only expected while the scheduler is started.
        batch_size = 100
        for i in range(0, len(proxies), batch_size):
            if not self.running:
                break
            batch = proxies[i : i + batch_size]
            await self.validation_queue.submit([
                ProxyRaw(p.ip, p.port, p.protocol) for p in batch
            ])
            # Wait for the current batch to finish before submitting the next.
            await self.validation_queue.drain()
            logger.info(f"Validated batch {i//batch_size + 1}/{(len(proxies)-1)//batch_size + 1}")
        logger.info("Scheduled validation completed")

View File

@@ -0,0 +1,97 @@
"""代理验证服务 - 支持 HTTP/HTTPS/SOCKS4/SOCKS5"""
import asyncio
import random
import time
import aiohttp
import aiohttp_socks
from typing import Tuple
from app.core.log import logger
class ValidatorService:
    """Validates proxies over HTTP/HTTPS/SOCKS4/SOCKS5."""

    # Candidate probe endpoints, keyed by protocol. Unknown protocols fall
    # back to the plain-HTTP list.
    TEST_URLS = {
        "http": ["http://httpbin.org/ip", "http://api.ipify.org"],
        "https": ["https://httpbin.org/ip", "https://api.ipify.org"],
    }

    def __init__(
        self,
        timeout: float = 5.0,
        connect_timeout: float = 3.0,
        max_concurrency: int = 50,
    ):
        self.timeout = timeout
        self.connect_timeout = connect_timeout
        # Caps the number of in-flight validations.
        self.semaphore = asyncio.Semaphore(max_concurrency)

    def _get_test_url(self, protocol: str) -> str:
        """Pick a random probe URL for the given protocol."""
        candidates = self.TEST_URLS.get(protocol.lower(), self.TEST_URLS["http"])
        return random.choice(candidates)

    async def validate(self, ip: str, port: int, protocol: str = "http") -> Tuple[bool, float]:
        """Check a single proxy; returns (is_valid, latency_ms).

        Timeouts and any other errors are logged at debug level and
        reported as (False, 0.0).
        """
        protocol = protocol.lower()
        async with self.semaphore:
            started = time.time()
            probe = (
                self._validate_socks
                if protocol in ("socks4", "socks5")
                else self._validate_http
            )
            try:
                return await probe(ip, port, protocol, started)
            except asyncio.TimeoutError:
                logger.debug(f"Validation timeout: {ip}:{port} ({protocol})")
            except Exception as e:
                logger.debug(f"Validation error {ip}:{port} ({protocol}): {e}")
            return False, 0.0

    async def _validate_http(self, ip: str, port: int, protocol: str, start: float) -> Tuple[bool, float]:
        """Probe an HTTP/HTTPS proxy via aiohttp's proxy= support."""
        connector = aiohttp.TCPConnector(ssl=False, limit=0, force_close=True)
        try:
            client_timeout = aiohttp.ClientTimeout(total=self.timeout, connect=self.connect_timeout)
            async with aiohttp.ClientSession(connector=connector, timeout=client_timeout) as session:
                async with session.get(
                    self._get_test_url(protocol),
                    proxy=f"http://{ip}:{port}",
                    allow_redirects=True,
                ) as response:
                    if response.status not in (200, 301, 302):
                        return False, 0.0
                    latency = round((time.time() - start) * 1000, 2)
                    logger.info(f"HTTP valid: {ip}:{port} ({protocol}) {latency}ms")
                    return True, latency
        finally:
            await connector.close()

    async def _validate_socks(self, ip: str, port: int, protocol: str, start: float) -> Tuple[bool, float]:
        """Probe a SOCKS4/SOCKS5 proxy via aiohttp_socks."""
        if protocol == "socks4":
            proxy_type = aiohttp_socks.ProxyType.SOCKS4
        else:
            proxy_type = aiohttp_socks.ProxyType.SOCKS5
        connector = aiohttp_socks.ProxyConnector(
            proxy_type=proxy_type,
            host=ip,
            port=port,
            rdns=True,
            ssl=False,
        )
        try:
            client_timeout = aiohttp.ClientTimeout(total=self.timeout, connect=self.connect_timeout)
            async with aiohttp.ClientSession(connector=connector, timeout=client_timeout) as session:
                async with session.get(self._get_test_url("http"), allow_redirects=True) as response:
                    if response.status not in (200, 301, 302):
                        return False, 0.0
                    latency = round((time.time() - start) * 1000, 2)
                    logger.info(f"SOCKS valid: {ip}:{port} ({protocol}) {latency}ms")
                    return True, latency
        finally:
            await connector.close()