diff --git a/DESIGN.md b/DESIGN.md
new file mode 100644
index 0000000..576c75c
--- /dev/null
+++ b/DESIGN.md
@@ -0,0 +1,470 @@
+# ProxyPool Architecture Refactoring Design Document
+
+> Goal: build a proxy-pool system that is highly extensible, cleanly layered, and easy to maintain. The single most important goal is to **make adding a new crawler extremely simple**.
+
+---
+
+## 1. Architecture Overview
+
+A classic layered architecture:
+
+```
+┌─────────────────────────────────────────┐
+│ Frontend (Vue3 + Vite + Element Plus)   │
+└─────────────┬───────────────────────────┘
+              │ HTTP/REST
+┌─────────────▼───────────────────────────┐
+│ API Layer (FastAPI Routers)             │ ← Only validates input, calls Services, formats output
+├─────────────────────────────────────────┤
+│ Service Layer                           │ ← Business orchestration: crawl strategy, validation scheduling, export logic
+├─────────────────────────────────────────┤
+│ Plugin System (Crawlers)                │ ← Crawler plugins: implement a common interface, return raw proxy data
+├─────────────────────────────────────────┤
+│ Task Queue & Workers                    │ ← Validation queue: backpressure, worker pool, load leveling
+├─────────────────────────────────────────┤
+│ Repository Layer                        │ ← Data access: all SQL converges here
+├─────────────────────────────────────────┤
+│ Infrastructure (DB / Config / Log)      │ ← Infrastructure: connections, config, logging
+└─────────────────────────────────────────┘
+```
+
+---
+
+## 2. Core Backend Design Principles
+
+### 2.1 Eliminate global singletons; use dependency injection (DI) throughout
+Today `scheduler = ValidationScheduler()` is a module-level global, which makes testing hard and creates implicit dependencies.
+
+After the refactoring:
+- All core components (DB, Scheduler, PluginManager) are injected via FastAPI `Depends`
+- A `contextlib.asynccontextmanager` lifespan initializes them and mounts them on `app.state`
+- Unit tests can trivially mock any layer
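+
+A minimal sketch of this wiring, reusing `create_scheduler_service` from `api/deps.py` below; the example route is illustrative:
+
+```python
+from contextlib import asynccontextmanager
+from fastapi import Depends, FastAPI, Request
+from api.deps import create_scheduler_service
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Long-lived components are built once and mounted on app.state.
+    app.state.scheduler_service = create_scheduler_service()
+    yield
+    await app.state.scheduler_service.stop()
+
+def get_scheduler_service(request: Request):
+    # Provider function: routes depend on this instead of a module-level global.
+    return request.app.state.scheduler_service
+
+app = FastAPI(lifespan=lifespan)
+
+@app.get("/api/scheduler/status")
+async def scheduler_status(scheduler=Depends(get_scheduler_service)):
+    # Tests can swap in a mock via app.dependency_overrides[get_scheduler_service].
+    return {"running": scheduler.running}
+```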
+
+### 2.2 The Repository pattern converges all SQL
+All database operations are pulled out of `api_server.py` and `scheduler.py` and moved into `repositories/proxy_repo.py`.
+
+Benefits:
+- Switching databases only touches the Repository
+- Unit tests simply mock the Repository
+- SQL statements are managed in one place instead of scattered everywhere
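+
+A minimal sketch of what such a repository could look like; the method names and SQL here are illustrative assumptions, not the final interface of `repositories/proxy_repo.py`:
+
+```python
+from typing import Optional
+import aiosqlite
+from core.db import get_db  # context manager defined in core/db.py below
+
+class ProxyRepository:
+    async def insert(self, ip: str, port: int, protocol: str, score: int) -> bool:
+        async with get_db() as db:
+            cur = await db.execute(
+                "INSERT OR IGNORE INTO proxies (ip, port, protocol, score) VALUES (?, ?, ?, ?)",
+                (ip, port, protocol, score),
+            )
+            await db.commit()
+            return cur.rowcount > 0
+
+    async def get_random(self) -> Optional[aiosqlite.Row]:
+        async with get_db() as db:
+            db.row_factory = aiosqlite.Row
+            cur = await db.execute(
+                "SELECT ip, port, protocol, score, last_check FROM proxies "
+                "WHERE score > 0 ORDER BY RANDOM() LIMIT 1"
+            )
+            return await cur.fetchone()
+```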
+
+### 2.3 A task queue decouples crawling from validation
+Today a plugin crawls and then validates everything at once via `asyncio.gather(*tasks)` (potentially 10,000 tasks), which risks memory blow-ups and unbounded concurrency.
+
+The refactoring introduces a lightweight in-memory queue:
+- `ValidationQueue`: built on `asyncio.Queue`
+- `ValidationWorkerPool`: a fixed number of workers consuming from the queue
+- Crawl results are `put` on the queue and the crawl returns immediately; validation runs in the background
+- Backpressure comes for free, preventing memory explosions
+
+---
+
+## 3. Plugin System Design (the core)
+
+### 3.1 Design goals
+**Adding a new crawler should take exactly two steps:**
+1. Create a class that inherits `BaseCrawlerPlugin`
+2. Implement `crawl()` and return `list[ProxyRaw]`
+
+### 3.2 Plugin interface
+
+```python
+from dataclasses import dataclass
+from typing import List
+
+@dataclass
+class ProxyRaw:
+    ip: str
+    port: int
+    protocol: str  # http | https | socks4 | socks5
+
+class BaseCrawlerPlugin:
+    """Base class every crawler plugin must inherit."""
+
+    name: str = ""          # unique plugin identifier
+    display_name: str = ""  # name shown in the UI
+    description: str = ""   # description
+    enabled: bool = True    # enabled by default?
+
+    async def crawl(self) -> List[ProxyRaw]:
+        """
+        The core crawling method.
+        It may be pure synchronous logic or include async HTTP requests.
+        Return the raw proxy list; do not validate here.
+        """
+        raise NotImplementedError
+
+    async def health_check(self) -> bool:
+        """Optional: check whether the plugin is currently usable (e.g. is the target site reachable)."""
+        return True
+```
+
+### 3.3 Plugin registration
+Use **explicit registration + a decorator**, and drop runtime directory scanning.
+
+```python
+from core.plugin_system import registry
+
+@registry.register
+class MyNewPlugin(BaseCrawlerPlugin):
+    name = "my_new_plugin"
+    display_name = "我的新代理源"
+
+    async def crawl(self):
+        return [ProxyRaw("1.2.3.4", 8080, "http")]
+```
+
+Advantages:
+- Type safety: IDE autocompletion and static checking work
+- Predictability: no risk of accidentally loading unexpected modules
+- Test friendliness: tests register only mock plugins
+
+A compatibility entry point `registry.auto_discover("plugins")` is also provided to support the existing workflow.
+
+### 3.4 Persisting plugin metadata
+A plugin's `enabled` state should be persisted to the database (or a settings JSON) rather than living only in memory.
+
+Add a `plugin_settings` table:
+```sql
+CREATE TABLE plugin_settings (
+    plugin_id TEXT PRIMARY KEY,
+    enabled INTEGER DEFAULT 1,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+```
+
+On startup:
+1. Load all registered plugins
+2. Read the persisted state from `plugin_settings`
+3. Merge it into the plugin instances (see the sketch below)
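+
+A minimal sketch of the startup merge. `registry.list_plugins()` exists in `core/plugin_system/registry.py` below; `plugin_repo.get_enabled_map()` is an illustrative assumption for a repository helper reading the `plugin_settings` table:
+
+```python
+from core.plugin_system import registry
+
+async def apply_persisted_plugin_state(plugin_repo) -> None:
+    # {"plugin_id": bool} as read from the plugin_settings table (assumed helper).
+    enabled_map = await plugin_repo.get_enabled_map()
+    for plugin in registry.list_plugins():
+        # Fall back to the plugin's own default when no row exists yet.
+        plugin.enabled = enabled_map.get(plugin.name, plugin.enabled)
+```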
+
+---
+
+## 4. Task Scheduling and the Validation Queue
+
+### 4.1 Validation queue design
+
+```python
+class ValidationQueue:
+    def __init__(self, worker_count: int = 50):
+        self.queue: asyncio.Queue[ProxyRaw | None] = asyncio.Queue()
+        self.worker_count = worker_count
+        self.workers: list[asyncio.Task] = []
+        self._running = False
+
+    async def start(self):
+        self._running = True
+        for _ in range(self.worker_count):
+            self.workers.append(asyncio.create_task(self._worker_loop()))
+
+    async def stop(self):
+        self._running = False
+        for _ in self.workers:
+            self.queue.put_nowait(None)  # sentinel
+        await asyncio.gather(*self.workers, return_exceptions=True)
+
+    async def submit(self, proxies: list[ProxyRaw]):
+        for p in proxies:
+            await self.queue.put(p)
+
+    async def _worker_loop(self):
+        while True:
+            item = await self.queue.get()
+            if item is None:
+                break
+            await self._validate_and_save(item)
+            self.queue.task_done()
+```
+
+### 4.2 Scheduler design
+`SchedulerService` is responsible for:
+- Starting/stopping the validation queue
+- Periodically pulling stored proxies from the database and re-submitting them for validation
+- Coordinating the validate-after-crawl flow for plugins
+
+```python
+class SchedulerService:
+    def __init__(self, queue: ValidationQueue, proxy_repo: ProxyRepository):
+        self.queue = queue
+        self.proxy_repo = proxy_repo
+        self.interval_minutes = 30
+        self._task: asyncio.Task | None = None
+```
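+
+A sketch of the periodic loop (the constructor from above is repeated for completeness); `proxy_repo.list_all()` is an illustrative method name, not a committed interface:
+
+```python
+import asyncio
+
+class SchedulerService:
+    def __init__(self, queue: ValidationQueue, proxy_repo: ProxyRepository):
+        self.queue = queue
+        self.proxy_repo = proxy_repo
+        self.interval_minutes = 30
+        self._task: asyncio.Task | None = None
+
+    async def start(self) -> None:
+        await self.queue.start()
+        self._task = asyncio.create_task(self._run_loop())
+
+    async def stop(self) -> None:
+        if self._task:
+            self._task.cancel()
+            try:
+                await self._task
+            except asyncio.CancelledError:
+                pass
+        await self.queue.stop()
+
+    async def _run_loop(self) -> None:
+        while True:
+            # Re-submit every stored proxy for validation, then sleep.
+            await self.queue.submit(await self.proxy_repo.list_all())
+            await asyncio.sleep(self.interval_minutes * 60)
+```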
+
+---
+
+## 5. Database Design
+
+Keep SQLite + aiosqlite, but improve connection management.
+
+### 5.1 Schema
+
+```sql
+-- proxies table
+CREATE TABLE proxies (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    ip TEXT NOT NULL,
+    port INTEGER NOT NULL,
+    protocol TEXT DEFAULT 'http',
+    score INTEGER DEFAULT 10,
+    response_time_ms REAL,
+    last_check TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    UNIQUE(ip, port)
+);
+
+-- plugin settings table
+CREATE TABLE plugin_settings (
+    plugin_id TEXT PRIMARY KEY,
+    enabled INTEGER DEFAULT 1,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+
+-- system settings table (JSON storage)
+CREATE TABLE settings (
+    key TEXT PRIMARY KEY,
+    value TEXT NOT NULL,
+    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+);
+```
+
+### 5.2 Connection management
+- Use an `asynccontextmanager` to manage connection lifetimes (see the sketch after this list)
+- Each HTTP request acquires its own connection and closes it when the request ends
+- Long-lived components (scheduler, queue) also rebuild their connections periodically (e.g. every 1,000 operations)
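+
+This mirrors the `get_db()` context manager implemented in `core/db.py` later in this diff, slightly simplified here:
+
+```python
+from contextlib import asynccontextmanager
+from typing import AsyncIterator
+import aiosqlite
+
+DB_PATH = "db/proxies.sqlite"
+
+@asynccontextmanager
+async def get_db() -> AsyncIterator[aiosqlite.Connection]:
+    # One connection per consumer; always closed, even on error.
+    db = await aiosqlite.connect(DB_PATH)
+    try:
+        await db.execute("PRAGMA journal_mode=WAL")
+        yield db
+    finally:
+        await db.close()
+```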
+
+---
+
+## 6. API Design Adjustments
+
+Keep existing API paths essentially unchanged, but split the routes by resource.
+
+### 6.1 Route layout
+```
+api/routes/
+├── __init__.py
+├── stats.py      # /api/stats
+├── proxies.py    # /api/proxies/*
+├── plugins.py    # /api/plugins/*
+├── scheduler.py  # /api/scheduler/*
+└── settings.py   # /api/settings
+```
+
+### 6.2 New/changed APIs
+
+#### Plugins
+- `GET /api/plugins` — list plugins (including persisted state)
+- `PUT /api/plugins/{plugin_id}/toggle` — toggle enabled state (persisted to the DB)
+- `POST /api/plugins/{plugin_id}/crawl` — trigger a crawl (async, returns a task ID)
+- `POST /api/plugins/crawl-all` — crawl all plugins
+
+**Key change**: the crawl endpoints become **asynchronously triggered** instead of synchronously awaited. A new crawler may fetch tens of thousands of proxies, and a synchronous HTTP request would time out.
+
+Example response:
+```json
+{
+  "code": 200,
+  "message": "爬取任务已启动",
+  "data": {
+    "task_id": "crawl-20250402-001",
+    "queued": 150
+  }
+}
+```
+
+To keep the frontend simple, phase one may keep the synchronous API but wrap the work internally with `asyncio` and a reasonable timeout (30 seconds), as sketched below. Once usage truly scales, migrate to WebSocket/SSE progress pushes.
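+
+A sketch of the phase-one compromise, following the pattern used in `api/routes/plugins.py` below (`queue.drain()` and `queue.valid_count` come from that code; the function signature is illustrative): kick validation off through the queue, wait up to 30 seconds, and answer with whatever has finished by then.
+
+```python
+import asyncio
+
+async def crawl_endpoint(plugin_service, queue, plugin_id: str) -> dict:
+    results = await plugin_service.run_plugin(plugin_id)
+    await queue.submit(results)
+    try:
+        # drain() resolves once the queue is empty; give up waiting after 30 s.
+        await asyncio.wait_for(queue.drain(), timeout=30.0)
+    except asyncio.TimeoutError:
+        pass  # validation keeps running in the background
+    return {"queued": len(results), "valid_so_far": queue.valid_count}
+```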
+
+---
+
+## 7. Frontend Architecture Adjustments
+
+### 7.1 Add a service layer
+Pull API-calling logic out of the stores:
+
+```
+frontend/src/
+├── services/
+│   ├── proxyService.js      # proxy-related API calls
+│   ├── pluginService.js     # plugin-related API calls
+│   ├── schedulerService.js
+│   └── settingService.js
+├── stores/
+│   ├── proxy.js             # pure state management
+│   └── plugin.js
+```
+
+### 7.2 Narrow store responsibilities
+Stores only:
+- Hold state (`ref/reactive`)
+- Provide computed properties
+- Call services, then update state
+
+### 7.3 API compatibility
+Since backend API paths stay the same, frontend changes are mostly code organization; URLs and response shapes remain compatible.
+
+---
+
+## 8. Directory Layout (After Refactoring)
+
+```
+ProxyPool/
+├── api/                       # FastAPI entry point and routes
+│   ├── __init__.py
+│   ├── main.py                # application factory
+│   ├── lifespan.py            # lifecycle management
+│   ├── deps.py                # dependency injection
+│   ├── errors.py              # unified exceptions
+│   └── routes/
+│       ├── __init__.py
+│       ├── stats.py
+│       ├── proxies.py
+│       ├── plugins.py
+│       ├── scheduler.py
+│       └── settings.py
+│
+├── core/                      # infrastructure, plugin system, tasks
+│   ├── __init__.py
+│   ├── config.py              # Pydantic Settings
+│   ├── log.py                 # logging
+│   ├── db.py                  # DB connections/contexts
+│   ├── exceptions.py          # business exceptions
+│   ├── plugin_system/
+│   │   ├── __init__.py
+│   │   ├── base.py            # BaseCrawlerPlugin
+│   │   └── registry.py        # plugin registry
+│   └── tasks/
+│       ├── __init__.py
+│       ├── queue.py           # ValidationQueue
+│       └── workers.py         # worker pool
+│
+├── models/                    # data models
+│   ├── __init__.py
+│   ├── schemas.py             # Pydantic models
+│   └── domain.py              # domain models (ProxyRaw, PluginInfo, ...)
+│
+├── repositories/              # data access layer
+│   ├── __init__.py
+│   └── proxy_repo.py          # ProxyRepository
+│
+├── services/                  # business logic layer
+│   ├── __init__.py
+│   ├── proxy_service.py
+│   ├── plugin_service.py
+│   ├── scheduler_service.py
+│   └── validator_service.py
+│
+├── plugins/                   # crawler plugins
+│   ├── __init__.py
+│   ├── base.py                # shared crawling base class (HTTP request wrapper)
+│   ├── fate0.py
+│   ├── proxylist_download.py
+│   └── ...
+│
+├── frontend/                  # Vue3 frontend
+│   └── src/
+│       ├── services/          # new
+│       ├── stores/
+│       ├── api/
+│       └── ...
+│
+├── tests/
+│   ├── conftest.py
+│   ├── unit/
+│   └── integration/
+│
+├── script/
+├── data/
+├── db/
+├── logs/
+├── requirements.txt
+├── .env.example
+└── DESIGN.md                  # this document
+```
+
+---
+
+## 9. Migration Plan
+
+### Phase 1: Infrastructure (complete today)
+1. Rewrite `core/config.py` → Pydantic Settings
+2. Rewrite `core/db.py` → context-managed connections
+3. Create the `models/` layer
+
+### Phase 2: Repository + Service (complete today)
+1. Create `repositories/proxy_repo.py`
+2. Create the business classes under `services/`
+3. Migrate the existing logic
+
+### Phase 3: Plugin system (complete today, the core)
+1. Create `core/plugin_system/base.py` and `registry.py`
+2. Build the explicit registration mechanism
+3. Migrate all existing plugins to the new base class
+
+### Phase 4: Task queue (complete today)
+1. Create `ValidationQueue` and the worker pool
+2. Rewrite `SchedulerService`
+
+### Phase 5: API routes (complete today)
+1. Split `api_server.py` into `api/routes/`
+2. Assemble the new `api/main.py`
+
+### Phase 6: Frontend (complete today)
+1. Split out the service layer
+2. Adapt the stores
+3. Keep existing pages; only reorganize code
+
+### Phase 7: Cleanup and verification
+1. Delete the old `api_server.py`, `core/scheduler.py`, `core/sqlite.py`, etc.
+2. Run the tests and make sure everything works
+3. Commit
+
+---
+
+## 10. Standard Workflow for Adding a New Crawler (target experience)
+
+Suppose we want to add a crawler named `mynewsource`:
+
+**Step 1**: create `plugins/mynewsource.py`
+
+```python
+from core.plugin_system import BaseCrawlerPlugin, ProxyRaw
+from plugins.base import BaseHTTPPlugin  # optional: for HTTP-based crawling
+
+class MyNewSourcePlugin(BaseHTTPPlugin):
+    name = "mynewsource"
+    display_name = "我的新代理源"
+    description = "从 example.com 爬取免费代理"
+
+    def __init__(self):
+        super().__init__()
+        self.urls = ["https://example.com/proxies"]
+
+    async def crawl(self) -> list[ProxyRaw]:
+        results = []
+        for url in self.urls:
+            html = await self.fetch(url)
+            # ... parse html ...
+            results.append(ProxyRaw(ip="1.2.3.4", port=8080, protocol="http"))
+        return results
+```
+
+**Step 2**: register it in `plugins/__init__.py`
+
+```python
+from .mynewsource import MyNewSourcePlugin
+from core.plugin_system import registry
+
+registry.register(MyNewSourcePlugin)
+```
+
+**Step 3**: restart the backend; the frontend shows the new plugin automatically.
+
+No routes, services, or database tables need to change.
+
+---
+
+*Document version: 1.0*
+*Author: Kimi Code*
+*Date: 2026-04-02*
diff --git a/api/__init__.py b/api/__init__.py
new file mode 100644
index 0000000..813f14c
--- /dev/null
+++ b/api/__init__.py
@@ -0,0 +1,3 @@
+from .main import create_app
+
+__all__ = ["create_app"]
diff --git a/api/deps.py b/api/deps.py
new file mode 100644
index 0000000..4aa20f9
--- /dev/null
+++ b/api/deps.py
@@ -0,0 +1,55 @@
+"""Dependency-injection providers."""
+from fastapi import Request
+from services.proxy_service import ProxyService
+from services.plugin_service import PluginService
+from services.settings_service import SettingsService
+from services.scheduler_service import SchedulerService
+from services.validator_service import ValidatorService
+from repositories.proxy_repo import ProxyRepository
+from core.tasks.queue import ValidationQueue
+from core.config import settings as app_settings
+
+
+def get_proxy_service() -> ProxyService:
+    return ProxyService()
+
+
+def get_plugin_service() -> PluginService:
+    return PluginService()
+
+
+def get_settings_service() -> SettingsService:
+    return SettingsService()
+
+
+def get_scheduler_service(request: Request) -> SchedulerService:
+    return request.app.state.scheduler_service
+
+
+def get_validation_queue(request: Request) -> ValidationQueue:
+    return request.app.state.validation_queue
+
+
+def create_scheduler_service() -> SchedulerService:
+    """Create the SchedulerService at application startup (outside any request context)."""
+    validator = ValidatorService(
+        timeout=app_settings.validator_timeout,
+        connect_timeout=app_settings.validator_connect_timeout,
+        max_concurrency=app_settings.validator_max_concurrency,
+    )
+    proxy_repo = ProxyRepository()
+    queue = ValidationQueue(
+        validator=validator,
+        proxy_repo=proxy_repo,
+        db_ctx=get_db,
+        worker_count=app_settings.validator_max_concurrency,
+        score_valid=app_settings.score_valid,
+        score_invalid=app_settings.score_invalid,
+        score_min=app_settings.score_min,
+        score_max=app_settings.score_max,
+    )
+    return SchedulerService(validation_queue=queue, proxy_repo=proxy_repo)
+
+
+# Imported at the bottom to avoid a circular import.
+from core.db import get_db
diff --git a/api/errors.py b/api/errors.py
new file mode 100644
index 0000000..0c38b56
--- /dev/null
+++ b/api/errors.py
@@ -0,0 +1,33 @@
+"""Unified exception handlers."""
+from fastapi import Request
+from fastapi.responses import JSONResponse
+from pydantic import ValidationError
+from core.exceptions import ProxyPoolException
+from core.log import logger
+
+
+async def proxy_pool_exception_handler(request: Request, exc: ProxyPoolException):
+    return JSONResponse(
+        status_code=exc.code,
+        content={"code": exc.code, "message": exc.message, "data": None},
+    )
+
+
+async def pydantic_validation_handler(request: Request, exc: ValidationError):
+    logger.error(f"Validation error: {exc}")
+    return JSONResponse(
+        status_code=422,
+        content={
+            "code": 422,
+            "message": "参数验证失败",
+            "data": exc.errors(),
+        },
+    )
+
+
+async def general_exception_handler(request: Request, exc: Exception):
+    logger.error(f"Unhandled exception: {exc}", exc_info=True)
+    return JSONResponse(
+        status_code=500,
+        content={"code": 500, "message": "服务器内部错误", "data": None},
+    )
diff --git a/api/lifespan.py b/api/lifespan.py
new file mode 100644
index 0000000..358f859
--- /dev/null
+++ b/api/lifespan.py
@@ -0,0 +1,39 @@
+"""Application lifecycle management."""
+from contextlib import asynccontextmanager
+from fastapi import FastAPI
+from core.db import init_db
+from core.log import logger
+from api.deps import create_scheduler_service
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Startup and shutdown lifecycle for the application."""
+    # Initialize the database
+    await init_db()
+
+    # Create the scheduler and mount it on app.state
+    scheduler_service = create_scheduler_service()
+    app.state.scheduler_service = scheduler_service
+    app.state.validation_queue = scheduler_service.validation_queue
+
+    # Load settings and decide whether to start the scheduler
+    from services.settings_service import SettingsService
+    settings_service = SettingsService()
+    try:
+        settings = await settings_service.get_settings()
+        scheduler_service.interval_minutes = settings.get(
+            "validate_interval_minutes", 30
+        )
+        if settings.get("auto_validate", True):
+            await scheduler_service.start()
+    except Exception as e:
+        logger.error(f"Failed to load settings on startup: {e}")
+
+    logger.info("API server started")
+    yield
+
+    # Shut down the scheduler
+    await scheduler_service.stop()
+    logger.info("API server shutdown")
diff --git a/api/main.py b/api/main.py
new file mode 100644
index 0000000..4690da7
--- /dev/null
+++ b/api/main.py
@@ -0,0 +1,55 @@
+"""FastAPI application factory."""
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from api.lifespan import lifespan
+from api.routes import api_router
+from api.errors import proxy_pool_exception_handler, pydantic_validation_handler, general_exception_handler
+from core.exceptions import ProxyPoolException
+from pydantic import ValidationError
+from core.config import settings as app_settings
+
+# Import and thereby register all plugins (explicit registration mode)
+import plugins
+
+
+def create_app() -> FastAPI:
+    app = FastAPI(
+        title="代理池API",
+        version="2.0.0",
+        lifespan=lifespan,
+    )
+
+    # CORS
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=app_settings.cors_origins_list,
+        allow_credentials=True,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
+
+    # Exception handlers
+    app.add_exception_handler(ProxyPoolException, proxy_pool_exception_handler)
+    app.add_exception_handler(ValidationError, pydantic_validation_handler)
+    app.add_exception_handler(Exception, general_exception_handler)
+
+    # Routes
+    app.include_router(api_router)
+
+    @app.get("/")
+    async def root():
+        return {"message": "欢迎使用代理池API", "status": "running", "data": None}
+
+    @app.get("/health")
+    async def health_check():
+        from datetime import datetime
+        scheduler = app.state.scheduler_service
+        return {
+            "status": "healthy",
+            "timestamp": datetime.now().isoformat(),
+            "database": "connected",
+            "scheduler": "running" if scheduler.running else "stopped",
+            "version": "2.0.0",
+        }
+
+    return app
diff --git a/api/routes/__init__.py b/api/routes/__init__.py
new file mode 100644
index 0000000..969f8a3
--- /dev/null
+++ b/api/routes/__init__.py
@@ -0,0 +1,9 @@
+from fastapi import APIRouter
+from . import stats, proxies, plugins, scheduler, settings
+
+api_router = APIRouter()
+api_router.include_router(stats.router)
+api_router.include_router(proxies.router)
+api_router.include_router(plugins.router)
+api_router.include_router(scheduler.router)
+api_router.include_router(settings.router)
diff --git a/api/routes/plugins.py b/api/routes/plugins.py
new file mode 100644
index 0000000..8f160d5
--- /dev/null
+++ b/api/routes/plugins.py
@@ -0,0 +1,139 @@
+"""Plugin routes."""
+import asyncio
+
+from fastapi import APIRouter, Depends
+from services.plugin_service import PluginService
+from services.scheduler_service import SchedulerService
+from models.schemas import PluginToggleRequest
+from api.deps import get_plugin_service, get_scheduler_service
+from core.log import logger
+
+router = APIRouter(prefix="/api/plugins", tags=["plugins"])
+
+
+def success_response(message: str, data=None):
+    return {"code": 200, "message": message, "data": data}
+
+
+def error_response(message: str, code: int = 500):
+    return {"code": code, "message": message, "data": None}
+
+
+@router.get("")
+async def list_plugins(service: PluginService = Depends(get_plugin_service)):
+    plugins = await service.list_plugins()
+    return success_response(
+        "获取插件列表成功",
+        {
+            "plugins": [
+                {
+                    "id": p.id,
+                    "name": p.display_name,  # legacy compatibility: "name" is used for display
+                    "display_name": p.display_name,
+                    "description": p.description,
+                    "enabled": p.enabled,
+                    "last_run": p.last_run.isoformat() if p.last_run else None,
+                    "success_count": p.success_count,
+                    "failure_count": p.failure_count,
+                }
+                for p in plugins
+            ]
+        },
+    )
+
+
+@router.put("/{plugin_id}/toggle")
+async def toggle_plugin(
+    plugin_id: str,
+    request: PluginToggleRequest,
+    service: PluginService = Depends(get_plugin_service),
+):
+    success = await service.toggle_plugin(plugin_id, request.enabled)
+    if not success:
+        return error_response("插件不存在", 404)
+    return success_response(
+        f"插件 {plugin_id} 已{'启用' if request.enabled else '禁用'}",
+        {"plugin_id": plugin_id, "enabled": request.enabled},
+    )
+
+
+@router.post("/{plugin_id}/crawl")
+async def crawl_plugin(
+    plugin_id: str,
+    plugin_service: PluginService = Depends(get_plugin_service),
+    scheduler_service: SchedulerService = Depends(get_scheduler_service),
+):
+    plugin = plugin_service.get_plugin(plugin_id)
+    if not plugin:
+        return error_response("插件不存在", 404)
+
+    try:
+        results = await plugin_service.run_plugin(plugin_id)
+        if not results:
+            return success_response(
+                f"插件 {plugin_id} 爬取完成,未获取到代理",
+                {"plugin_id": plugin_id, "proxy_count": 0, "valid_count": 0},
+            )
+
+        logger.info(f"Plugin {plugin_id} crawled {len(results)} proxies, sending to validation queue")
+        scheduler_service.validation_queue.reset_stats()
+        await scheduler_service.validation_queue.submit(results)
+        # Wait for the queue to drain (at most 30 s, to avoid frontend timeouts)
+        try:
+            await asyncio.wait_for(scheduler_service.validation_queue.drain(), timeout=30.0)
+        except asyncio.TimeoutError:
+            pass
+
+        valid_count = scheduler_service.validation_queue.valid_count
+        invalid_count = scheduler_service.validation_queue.invalid_count
+
+        return success_response(
+            f"插件 {plugin_id} 爬取并验证完成",
+            {
+                "plugin_id": plugin_id,
+                "proxy_count": len(results),
+                "valid_count": valid_count,
+                "invalid_count": invalid_count,
+            },
+        )
+    except Exception as e:
+        logger.error(f"Crawl plugin {plugin_id} failed: {e}")
+        return error_response(f"插件爬取失败: {str(e)}")
+
+
+@router.post("/crawl-all")
+async def crawl_all(
+    plugin_service: PluginService = Depends(get_plugin_service),
+    scheduler_service: SchedulerService = Depends(get_scheduler_service),
+):
+    try:
+        results = await plugin_service.run_all_plugins()
+        if not results:
+            return success_response(
+                "所有插件爬取完成,未获取到代理",
+                {"total_crawled": 0, "valid_count": 0, "invalid_count": 0},
+            )
+
+        logger.info(f"All plugins crawled {len(results)} unique proxies, sending to validation queue")
+        scheduler_service.validation_queue.reset_stats()
+        await scheduler_service.validation_queue.submit(results)
+        try:
+            await asyncio.wait_for(scheduler_service.validation_queue.drain(), timeout=60.0)
+        except asyncio.TimeoutError:
+            pass
+
+        valid_count = scheduler_service.validation_queue.valid_count
+        invalid_count = scheduler_service.validation_queue.invalid_count
+
+        return success_response(
+            "所有插件爬取并验证完成",
+            {
+                "total_crawled": len(results),
+                "valid_count": valid_count,
+                "invalid_count": invalid_count,
+            },
+        )
+    except Exception as e:
+        logger.error(f"Crawl all failed: {e}")
+        return error_response(f"批量爬取失败: {str(e)}")
diff --git a/api/routes/proxies.py b/api/routes/proxies.py
new file mode 100644
index 0000000..d803be0
--- /dev/null
+++ b/api/routes/proxies.py
@@ -0,0 +1,114 @@
+"""Proxy routes."""
+from typing import Optional
+from fastapi import APIRouter, Depends, Query
+from services.proxy_service import ProxyService
+from models.schemas import ProxyListRequest, BatchDeleteRequest
+from api.deps import get_proxy_service
+
+router = APIRouter(prefix="/api/proxies", tags=["proxies"])
+
+
+def success_response(message: str, data=None):
+    return {"code": 200, "message": message, "data": data}
+
+
+def error_response(message: str, code: int = 500):
+    return {"code": code, "message": message, "data": None}
+
+
+@router.post("")
+async def list_proxies(
+    request: ProxyListRequest,
+    service: ProxyService = Depends(get_proxy_service),
+):
+    proxies, total = await service.list_proxies(
+        page=request.page,
+        page_size=request.page_size,
+        protocol=request.protocol,
+        min_score=request.min_score,
+        max_score=request.max_score,
+        sort_by=request.sort_by,
+        sort_order=request.sort_order,
+    )
+    return success_response(
+        "获取代理列表成功",
+        {
+            "list": [
+                {
+                    "ip": p.ip,
+                    "port": p.port,
+                    "protocol": p.protocol,
+                    "score": p.score,
+                    "last_check": p.last_check.isoformat() if p.last_check else None,
+                }
+                for p in proxies
+            ],
+            "total": total,
+            "page": request.page,
+            "page_size": request.page_size,
+        },
+    )
+
+
+@router.get("/random")
+async def get_random_proxy(service: ProxyService = Depends(get_proxy_service)):
+    proxy = await service.get_random_proxy()
+    if not proxy:
+        return error_response("没有找到可用的代理", 404)
+    return success_response(
+        "获取随机代理成功",
+        {
+            "ip": proxy.ip,
+            "port": proxy.port,
+            "protocol": proxy.protocol,
+            "score": proxy.score,
+            "last_check": proxy.last_check.isoformat() if proxy.last_check else None,
+        },
+    )
+
+
+@router.get("/export/{fmt}")
+async def export_proxies(
+    fmt: str,
+    protocol: Optional[str] = None,
+    limit: int = Query(default=10000, ge=1, le=100000),
+    service: ProxyService = Depends(get_proxy_service),
+):
+    if fmt not in ("csv", "txt", "json"):
+        return error_response("不支持的导出格式", 400)
+
+    from fastapi.responses import StreamingResponse
+
+    media_types = {"csv": "text/csv", "txt": "text/plain", "json": "application/json"}
+
+    async def generate():
+        async for chunk in service.export_proxies(fmt, protocol, limit):
+            yield chunk
+
+    return StreamingResponse(
+        generate(),
+        media_type=media_types[fmt],
+        headers={"Content-Disposition": f"attachment; filename=proxies.{fmt}"},
+    )
+
+
+@router.delete("/{ip}/{port}")
+async def delete_proxy(ip: str, port: int, service: ProxyService = Depends(get_proxy_service)): + await service.delete_proxy(ip, port) + return success_response("删除代理成功") + + +@router.post("/batch-delete") +async def batch_delete( + request: BatchDeleteRequest, + service: ProxyService = Depends(get_proxy_service), +): + proxies = [(item.ip, item.port) for item in request.proxies] + deleted = await service.batch_delete(proxies) + return success_response(f"批量删除 {deleted} 个代理成功", {"deleted_count": deleted}) + + +@router.delete("/clean-invalid") +async def clean_invalid(service: ProxyService = Depends(get_proxy_service)): + count = await service.clean_invalid() + return success_response(f"清理了 {count} 个无效代理", {"deleted_count": count}) diff --git a/api/routes/scheduler.py b/api/routes/scheduler.py new file mode 100644 index 0000000..c9fe46e --- /dev/null +++ b/api/routes/scheduler.py @@ -0,0 +1,78 @@ +"""调度器相关路由""" +from fastapi import APIRouter, Depends +from services.scheduler_service import SchedulerService +from services.settings_service import SettingsService +from api.deps import get_scheduler_service +from core.log import logger + +router = APIRouter(prefix="/api/scheduler", tags=["scheduler"]) + + +def success_response(message: str, data=None): + return {"code": 200, "message": message, "data": data} + + +def error_response(message: str, code: int = 500): + return {"code": code, "message": message, "data": None} + + +@router.post("/start") +async def start_scheduler( + scheduler: SchedulerService = Depends(get_scheduler_service), +): + try: + if scheduler.running: + return success_response("验证调度器已在运行", {"running": True}) + await scheduler.start() + # 持久化设置 + settings_service = SettingsService() + settings = await settings_service.get_settings() + settings["auto_validate"] = True + from models.schemas import SettingsSchema + await settings_service.save_settings(SettingsSchema(**settings)) + return success_response("验证调度器已启动", {"running": True}) + except Exception as e: + logger.error(f"Start scheduler failed: {e}") + return error_response(f"启动调度器失败: {str(e)}") + + +@router.post("/stop") +async def stop_scheduler( + scheduler: SchedulerService = Depends(get_scheduler_service), +): + try: + if not scheduler.running: + return success_response("验证调度器未运行", {"running": False}) + await scheduler.stop() + # 持久化设置 + settings_service = SettingsService() + settings = await settings_service.get_settings() + settings["auto_validate"] = False + from models.schemas import SettingsSchema + await settings_service.save_settings(SettingsSchema(**settings)) + return success_response("验证调度器已停止", {"running": False}) + except Exception as e: + logger.error(f"Stop scheduler failed: {e}") + return error_response(f"停止调度器失败: {str(e)}") + + +@router.post("/validate-now") +async def validate_now( + scheduler: SchedulerService = Depends(get_scheduler_service), +): + try: + scheduler.validate_all_now() + return success_response("已开始全量验证", {"started": True}) + except Exception as e: + logger.error(f"Validate now failed: {e}") + return error_response(f"启动验证失败: {str(e)}") + + +@router.get("/status") +async def scheduler_status( + scheduler: SchedulerService = Depends(get_scheduler_service), +): + return success_response( + "获取状态成功", + {"running": scheduler.running, "interval_minutes": scheduler.interval_minutes}, + ) diff --git a/api/routes/settings.py b/api/routes/settings.py new file mode 100644 index 0000000..9c538a9 --- /dev/null +++ b/api/routes/settings.py @@ -0,0 +1,41 @@ +"""设置相关路由""" +from fastapi import 
APIRouter, Depends +from services.settings_service import SettingsService +from models.schemas import SettingsSchema +from api.deps import get_settings_service +from core.log import logger + +router = APIRouter(prefix="/api/settings", tags=["settings"]) + + +def success_response(message: str, data=None): + return {"code": 200, "message": message, "data": data} + + +def error_response(message: str, code: int = 500): + return {"code": code, "message": message, "data": None} + + +@router.get("") +async def get_settings(service: SettingsService = Depends(get_settings_service)): + try: + settings = await service.get_settings() + return success_response("获取设置成功", settings) + except Exception as e: + logger.error(f"Get settings failed: {e}") + return error_response("获取设置失败") + + +@router.post("") +async def save_settings( + request: SettingsSchema, + service: SettingsService = Depends(get_settings_service), +): + try: + success = await service.save_settings(request) + if not success: + return error_response("保存设置失败") + return success_response("保存设置成功", request.model_dump()) + except Exception as e: + logger.error(f"Save settings failed: {e}") + return error_response(f"保存设置失败: {str(e)}") diff --git a/api/routes/stats.py b/api/routes/stats.py new file mode 100644 index 0000000..0e44013 --- /dev/null +++ b/api/routes/stats.py @@ -0,0 +1,30 @@ +"""统计信息路由""" +from fastapi import APIRouter, Depends +from services.proxy_service import ProxyService +from services.scheduler_service import SchedulerService +from api.deps import get_proxy_service, get_scheduler_service +from core.log import logger + +router = APIRouter(prefix="/api/stats", tags=["stats"]) + + +def success_response(message: str, data=None): + return {"code": 200, "message": message, "data": data} + + +def error_response(message: str, code: int = 500): + return {"code": code, "message": message, "data": None} + + +@router.get("") +async def get_stats( + proxy_service: ProxyService = Depends(get_proxy_service), + scheduler_service: SchedulerService = Depends(get_scheduler_service), +): + try: + stats = await proxy_service.get_stats() + stats["scheduler_running"] = scheduler_service.running + return success_response("获取统计信息成功", stats) + except Exception as e: + logger.error(f"Get stats failed: {e}") + return error_response("获取统计信息失败") diff --git a/api_server.py b/api_server.py deleted file mode 100644 index e3ae4be..0000000 --- a/api_server.py +++ /dev/null @@ -1,698 +0,0 @@ -from fastapi import FastAPI, HTTPException, Request, status -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import StreamingResponse, JSONResponse -from pydantic import BaseModel, Field, field_validator, ValidationError -from typing import Optional, List -import asyncio -import json -from datetime import datetime -import re -import os -from contextlib import asynccontextmanager - -from core.sqlite import SQLiteManager -from core.plugin_manager import PluginManager -from core.scheduler import ValidationScheduler -from core.log import logger -from config import config - -# 全局调度器实例 -scheduler = ValidationScheduler() - -# 设置文件路径 -SETTINGS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'settings.json') - -# 默认设置 -DEFAULT_SETTINGS = { - "crawl_timeout": 30, - "validation_timeout": config.VALIDATOR_TIMEOUT, - "max_retries": 3, - "default_concurrency": config.VALIDATOR_MAX_CONCURRENCY, - "min_proxy_score": config.SCORE_MIN, - "proxy_expiry_days": 7, - "auto_validate": True, - "validate_interval_minutes": 30 -} - - -def 
load_settings(): - """从文件加载设置""" - try: - if os.path.exists(SETTINGS_FILE): - with open(SETTINGS_FILE, 'r', encoding='utf-8') as f: - saved_settings = json.load(f) - # 合并默认设置和保存的设置 - settings = DEFAULT_SETTINGS.copy() - settings.update(saved_settings) - return settings - except Exception as e: - logger.error(f"加载设置失败: {e}") - return DEFAULT_SETTINGS.copy() - - -def save_settings_to_file(settings: dict): - """保存设置到文件""" - try: - # 确保目录存在 - os.makedirs(os.path.dirname(SETTINGS_FILE), exist_ok=True) - with open(SETTINGS_FILE, 'w', encoding='utf-8') as f: - json.dump(settings, f, ensure_ascii=False, indent=2) - return True - except Exception as e: - logger.error(f"保存设置失败: {e}") - return False - - -@asynccontextmanager -async def lifespan(app: FastAPI): - """应用生命周期管理""" - db = SQLiteManager() - await db.init_db() - - # 加载设置并应用到调度器 - settings = load_settings() - scheduler.interval_minutes = settings.get('validate_interval_minutes', 30) - - # 如果启用了自动验证,启动调度器 - if settings.get('auto_validate', True): - await scheduler.start() - - logger.info("API服务器启动") - yield - - # 关闭调度器 - await scheduler.stop() - logger.info("API服务器关闭") - - -app = FastAPI(title="代理池API", version="1.3.0", lifespan=lifespan) - - -def format_datetime(datetime_str: str) -> str: - """将数据库时间格式统一转换为ISO 8601格式""" - if not datetime_str: - return None - - if isinstance(datetime_str, str): - if 'T' in datetime_str: - return datetime_str - - if re.match(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', datetime_str): - return datetime_str.replace(' ', 'T') + '.000Z' - - return datetime_str - - -@app.exception_handler(ValidationError) -async def validation_exception_handler(request: Request, exc: ValidationError): - logger.error(f"参数验证失败: {exc}") - return JSONResponse( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - content={"code": 422, "message": "参数验证失败", "data": exc.errors()} - ) - - -@app.exception_handler(HTTPException) -async def http_exception_handler(request: Request, exc: HTTPException): - logger.error(f"HTTP异常: {exc.status_code} - {exc.detail}") - return JSONResponse( - status_code=exc.status_code, - content={"code": exc.status_code, "message": exc.detail, "data": None} - ) - - -@app.exception_handler(Exception) -async def general_exception_handler(request: Request, exc: Exception): - logger.error(f"未处理的异常: {exc}", exc_info=True) - return JSONResponse( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - content={"code": 500, "message": "服务器内部错误", "data": None} - ) - - -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -plugin_manager = PluginManager() - - -class ProxyRequest(BaseModel): - page: int = Field(default=1, ge=1, description="页码,必须大于等于1") - page_size: int = Field(default=20, ge=1, le=100, description="每页数量,必须在1-100之间") - protocol: Optional[str] = None - min_score: int = Field(default=0, ge=0, description="最低分数") - max_score: Optional[int] = Field(default=None, ge=0, description="最高分数") - sort_by: str = 'last_check' - sort_order: str = 'DESC' - - @field_validator('protocol') - @classmethod - def validate_protocol(cls, v): - if v is not None and v.lower() not in ['http', 'https', 'socks4', 'socks5']: - raise ValueError('协议类型必须是 http, https, socks4 或 socks5') - return v.lower() if v else v - - @field_validator('sort_by') - @classmethod - def validate_sort_by(cls, v): - if v not in ['ip', 'port', 'protocol', 'score', 'last_check']: - raise ValueError('排序字段必须是 ip, port, protocol, score 或 last_check') - return v - - 
@field_validator('sort_order') - @classmethod - def validate_sort_order(cls, v): - if v.upper() not in ['ASC', 'DESC']: - raise ValueError('排序方式必须是 ASC 或 DESC') - return v.upper() - - -class ProxyDeleteItem(BaseModel): - ip: str - port: int - - @field_validator('port') - @classmethod - def validate_port(cls, v): - if not 1 <= v <= 65535: - raise ValueError('端口号必须在1-65535范围内') - return v - - -class DeleteProxiesRequest(BaseModel): - proxies: List[ProxyDeleteItem] - - @field_validator('proxies') - @classmethod - def validate_proxies_count(cls, v): - if len(v) > 1000: - raise ValueError('单次最多删除1000个代理') - return v - - -@app.get("/") -async def root(): - return {"message": "欢迎使用代理池API", "status": "running", "data": None} - - -@app.get("/health") -async def health_check(): - try: - db = SQLiteManager() - await db.count_proxies() - return { - "status": "healthy", - "timestamp": datetime.now().isoformat(), - "database": "connected", - "scheduler": "running" if scheduler.running else "stopped", - "version": "1.3.0" - } - except Exception as e: - logger.error(f"健康检查失败: {e}") - return { - "status": "unhealthy", - "timestamp": datetime.now().isoformat(), - "database": "disconnected", - "error": str(e) - } - - -@app.get("/api/stats") -async def get_stats(): - try: - db = SQLiteManager() - stats = await db.get_stats() - today_new = await db.get_today_new_count() - stats['today_new'] = today_new - stats['scheduler_running'] = scheduler.running - return {"code": 200, "message": "获取统计信息成功", "data": stats} - except Exception as e: - logger.error(f"获取统计信息失败: {e}") - return {"code": 500, "message": "获取统计信息失败", "data": None} - - -@app.post("/api/proxies") -async def get_proxies(request: ProxyRequest): - try: - db = SQLiteManager() - proxies = await db.get_proxies_paginated( - page=request.page, - page_size=request.page_size, - protocol=request.protocol, - min_score=request.min_score, - max_score=request.max_score, - sort_by=request.sort_by, - sort_order=request.sort_order - ) - total = await db.get_proxies_total( - protocol=request.protocol, - min_score=request.min_score, - max_score=request.max_score - ) - - proxy_list = [] - for proxy in proxies: - proxy_list.append({ - "ip": proxy[0], - "port": proxy[1], - "protocol": proxy[2], - "score": proxy[3], - "last_check": format_datetime(proxy[4]) - }) - - return { - "code": 200, - "message": "获取代理列表成功", - "data": { - "list": proxy_list, - "total": total, - "page": request.page, - "page_size": request.page_size - } - } - except Exception as e: - logger.error(f"获取代理列表失败: {e}") - return {"code": 500, "message": "获取代理列表失败", "data": None} - - -@app.get("/api/proxies/random") -async def get_random_proxy(): - db = SQLiteManager() - proxy = await db.get_random_proxy() - if proxy: - return { - "code": 200, - "message": "获取随机代理成功", - "data": { - "ip": proxy[0], - "port": proxy[1], - "protocol": proxy[2], - "score": proxy[3], - "last_check": format_datetime(proxy[4]) - } - } - return {"code": 404, "message": "没有找到可用的代理", "data": None} - - -@app.get("/api/proxies/export/{format}") -async def export_proxies(format: str, protocol: Optional[str] = None, limit: int = 10000): - try: - db = SQLiteManager() - - if format not in ['csv', 'txt', 'json']: - raise HTTPException(status_code=400, detail="不支持的导出格式") - - if limit > 100000: - raise HTTPException(status_code=400, detail="导出数量不能超过100000条") - - async def generate_csv(): - proxies = await db.get_all_proxies() - if protocol: - proxies = [p for p in proxies if p[2].lower() == protocol.lower()] - - proxies = proxies[:limit] - - 
output = [] - output.append('IP,Port,Protocol,Score,Last Check') - for proxy in proxies: - output.append(f"{proxy[0]},{proxy[1]},{proxy[2]},{proxy[3]},{format_datetime(proxy[4])}") - - for line in output: - yield line + '\n' - - async def generate_txt(): - proxies = await db.get_all_proxies() - if protocol: - proxies = [p for p in proxies if p[2].lower() == protocol.lower()] - - proxies = proxies[:limit] - - for proxy in proxies: - yield f"{proxy[0]}:{proxy[1]}\n" - - async def generate_json(): - proxies = await db.get_all_proxies() - if protocol: - proxies = [p for p in proxies if p[2].lower() == protocol.lower()] - - proxies = proxies[:limit] - - proxy_list = [] - for proxy in proxies: - proxy_list.append({'ip': proxy[0], 'port': proxy[1], 'protocol': proxy[2], 'score': proxy[3], 'last_check': format_datetime(proxy[4])}) - - yield '[\n' - for i, item in enumerate(proxy_list): - if i > 0: - yield ',\n' - yield json.dumps(item, ensure_ascii=False, indent=2) - yield '\n]' - - if format == 'csv': - return StreamingResponse( - generate_csv(), - media_type='text/csv', - headers={'Content-Disposition': 'attachment; filename=proxies.csv'} - ) - - elif format == 'txt': - return StreamingResponse( - generate_txt(), - media_type='text/plain', - headers={'Content-Disposition': 'attachment; filename=proxies.txt'} - ) - - elif format == 'json': - return StreamingResponse( - generate_json(), - media_type='application/json', - headers={'Content-Disposition': 'attachment; filename=proxies.json'} - ) - except HTTPException: - raise - except Exception as e: - logger.error(f"导出代理失败: {e}") - raise HTTPException(status_code=500, detail="导出代理失败") - - -@app.get("/api/proxies/{ip}/{port}") -async def get_proxy_detail(ip: str, port: int): - db = SQLiteManager() - proxy = await db.get_proxy_detail(ip, port) - if proxy: - return { - "code": 200, - "message": "获取代理详情成功", - "data": { - "ip": proxy[0], - "port": proxy[1], - "protocol": proxy[2], - "score": proxy[3], - "last_check": format_datetime(proxy[4]) - } - } - raise HTTPException(status_code=404, detail="代理不存在") - - -@app.delete("/api/proxies/{ip}/{port}") -async def delete_proxy(ip: str, port: int): - db = SQLiteManager() - await db.delete_proxy(ip, port) - return {"code": 200, "message": "删除代理成功", "data": None} - - -@app.post("/api/proxies/batch-delete") -async def batch_delete_proxies(request: DeleteProxiesRequest): - db = SQLiteManager() - proxy_tuples = [(item.ip, item.port) for item in request.proxies] - deleted_count = await db.batch_delete_proxies(proxy_tuples) - return {"code": 200, "message": f"批量删除 {deleted_count} 个代理成功", "data": {"deleted_count": deleted_count}} - - -@app.delete("/api/proxies/clean-invalid") -async def clean_invalid_proxies(): - db = SQLiteManager() - deleted_count = await db.clean_invalid_proxies() - return {"code": 200, "message": f"清理了 {deleted_count} 个无效代理", "data": {"deleted_count": deleted_count}} - - -@app.get("/api/plugins") -async def get_plugins(): - try: - plugins_info = plugin_manager.get_all_plugin_info() - return { - "code": 200, - "message": "获取插件列表成功", - "data": { - "plugins": plugins_info - } - } - except Exception as e: - logger.error(f"获取插件列表失败: {e}") - return {"code": 500, "message": "获取插件列表失败", "data": None} - - -class PluginToggleRequest(BaseModel): - enabled: bool - - -@app.put("/api/plugins/{plugin_id}/toggle") -async def toggle_plugin(plugin_id: str, request: PluginToggleRequest): - try: - success = plugin_manager.toggle_plugin(plugin_id, request.enabled) - if success: - return { - "code": 200, - "message": 
f"插件 {plugin_id} 已{'启用' if request.enabled else '禁用'}", - "data": { - "plugin_id": plugin_id, - "enabled": request.enabled - } - } - else: - return {"code": 404, "message": "插件不存在", "data": None} - except Exception as e: - logger.error(f"切换插件状态失败: {e}") - return {"code": 500, "message": "切换插件状态失败", "data": None} - - -@app.post("/api/plugins/{plugin_id}/crawl") -async def crawl_plugin(plugin_id: str): - try: - # 1. 执行爬取 - results = await plugin_manager.run_plugin(plugin_id) - - if not results: - return { - "code": 200, - "message": f"插件 {plugin_id} 爬取完成,未获取到代理", - "data": { - "plugin_id": plugin_id, - "proxy_count": 0, - "valid_count": 0 - } - } - - logger.info(f"插件 {plugin_id} 爬取完成,获取 {len(results)} 个代理,开始验证...") - - # 2. 验证新抓取的代理 - valid_proxies, invalid_proxies = await scheduler.validate_proxies_batch(results) - - # 3. 只将有效代理存入数据库 - db = SQLiteManager() - inserted_count = 0 - for ip, port, protocol in valid_proxies: - success = await db.insert_proxy(ip, port, protocol, score=config.SCORE_VALID) - if success: - inserted_count += 1 - - logger.info(f"插件 {plugin_id} 处理完成: 有效 {inserted_count}, 无效 {len(invalid_proxies)}") - - return { - "code": 200, - "message": f"插件 {plugin_id} 爬取并验证完成", - "data": { - "plugin_id": plugin_id, - "proxy_count": len(results), - "valid_count": inserted_count, - "invalid_count": len(invalid_proxies) - } - } - except Exception as e: - logger.error(f"插件爬取失败: {e}") - return {"code": 500, "message": f"插件爬取失败: {str(e)}", "data": None} - - -@app.post("/api/plugins/crawl-all") -async def crawl_all_plugins(): - """运行所有插件并验证""" - try: - all_results = [] - all_valid = [] - all_invalid = [] - - for plugin in plugin_manager.plugins: - if not plugin.enabled: - continue - - try: - results = await plugin_manager.run_plugin(plugin.name) - if results: - all_results.extend(results) - except Exception as e: - logger.error(f"插件 {plugin.name} 执行失败: {e}") - continue - - if all_results: - # 去重 - unique_proxies = list(set(all_results)) - logger.info(f"所有插件爬取完成,共 {len(unique_proxies)} 个唯一代理,开始验证...") - - # 验证 - valid_proxies, invalid_proxies = await scheduler.validate_proxies_batch(unique_proxies) - - # 保存有效代理 - db = SQLiteManager() - inserted_count = 0 - for ip, port, protocol in valid_proxies: - success = await db.insert_proxy(ip, port, protocol, score=config.SCORE_VALID) - if success: - inserted_count += 1 - - return { - "code": 200, - "message": "所有插件爬取并验证完成", - "data": { - "total_crawled": len(unique_proxies), - "valid_count": inserted_count, - "invalid_count": len(invalid_proxies) - } - } - - return { - "code": 200, - "message": "所有插件爬取完成,未获取到代理", - "data": { - "total_crawled": 0, - "valid_count": 0, - "invalid_count": 0 - } - } - - except Exception as e: - logger.error(f"批量爬取失败: {e}") - return {"code": 500, "message": f"批量爬取失败: {str(e)}", "data": None} - - -# 验证调度器控制 -@app.post("/api/scheduler/start") -async def start_scheduler(): - """启动验证调度器""" - try: - if scheduler.running: - return {"code": 200, "message": "验证调度器已在运行", "data": {"running": True}} - - await scheduler.start() - - # 更新设置 - settings = load_settings() - settings['auto_validate'] = True - save_settings_to_file(settings) - - return {"code": 200, "message": "验证调度器已启动", "data": {"running": True}} - except Exception as e: - logger.error(f"启动调度器失败: {e}") - return {"code": 500, "message": f"启动调度器失败: {str(e)}", "data": None} - - -@app.post("/api/scheduler/stop") -async def stop_scheduler(): - """停止验证调度器""" - try: - if not scheduler.running: - return {"code": 200, "message": "验证调度器未运行", "data": {"running": False}} - - await 
scheduler.stop() - - # 更新设置 - settings = load_settings() - settings['auto_validate'] = False - save_settings_to_file(settings) - - return {"code": 200, "message": "验证调度器已停止", "data": {"running": False}} - except Exception as e: - logger.error(f"停止调度器失败: {e}") - return {"code": 500, "message": f"停止调度器失败: {str(e)}", "data": None} - - -@app.post("/api/scheduler/validate-now") -async def validate_now(): - """立即执行一次全量验证""" - try: - # 在后台运行验证,不阻塞响应 - asyncio.create_task(scheduler.validate_all_proxies()) - return {"code": 200, "message": "已开始全量验证", "data": {"started": True}} - except Exception as e: - logger.error(f"启动验证失败: {e}") - return {"code": 500, "message": f"启动验证失败: {str(e)}", "data": None} - - -@app.get("/api/scheduler/status") -async def get_scheduler_status(): - """获取调度器状态""" - return { - "code": 200, - "message": "获取状态成功", - "data": { - "running": scheduler.running, - "interval_minutes": scheduler.interval_minutes - } - } - - -# 设置管理 -class SettingsRequest(BaseModel): - crawl_timeout: int = Field(default=30, ge=5, le=120) - validation_timeout: int = Field(default=10, ge=3, le=60) - max_retries: int = Field(default=3, ge=0, le=10) - default_concurrency: int = Field(default=50, ge=10, le=200) - min_proxy_score: int = Field(default=0, ge=0, le=100) - proxy_expiry_days: int = Field(default=7, ge=1, le=30) - auto_validate: bool = True - validate_interval_minutes: int = Field(default=30, ge=5, le=1440) - - -@app.get("/api/settings") -async def get_settings(): - """获取系统设置""" - try: - settings = load_settings() - return {"code": 200, "message": "获取设置成功", "data": settings} - except Exception as e: - logger.error(f"获取设置失败: {e}") - return {"code": 500, "message": "获取设置失败", "data": None} - - -@app.post("/api/settings") -async def save_settings(request: SettingsRequest): - """保存系统设置""" - try: - settings = { - "crawl_timeout": request.crawl_timeout, - "validation_timeout": request.validation_timeout, - "max_retries": request.max_retries, - "default_concurrency": request.default_concurrency, - "min_proxy_score": request.min_proxy_score, - "proxy_expiry_days": request.proxy_expiry_days, - "auto_validate": request.auto_validate, - "validate_interval_minutes": request.validate_interval_minutes - } - - # 保存到文件 - if save_settings_to_file(settings): - # 更新调度器配置 - scheduler.interval_minutes = request.validate_interval_minutes - - # 如果自动验证状态改变,启动或停止调度器 - if request.auto_validate and not scheduler.running: - await scheduler.start() - elif not request.auto_validate and scheduler.running: - await scheduler.stop() - - return {"code": 200, "message": "保存设置成功", "data": settings} - else: - return {"code": 500, "message": "保存设置失败", "data": None} - - except Exception as e: - logger.error(f"保存设置失败: {e}") - return {"code": 500, "message": f"保存设置失败: {str(e)}", "data": None} - - -if __name__ == "__main__": - import uvicorn - uvicorn.run(app, host=config.HOST, port=config.PORT) diff --git a/core/config.py b/core/config.py new file mode 100644 index 0000000..855f4fa --- /dev/null +++ b/core/config.py @@ -0,0 +1,59 @@ +"""全局配置 - 使用 Pydantic Settings 支持环境变量和 .env 文件""" +import os +from typing import List +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + extra="ignore", + ) + + # 数据库配置 + db_path: str = "db/proxies.sqlite" + + # API 服务配置 + host: str = "0.0.0.0" + port: int = 9949 + + # 验证器配置 + validator_timeout: int = 5 + validator_max_concurrency: int = 200 + validator_connect_timeout: int = 
3 + + # 爬虫配置 + crawler_num_validators: int = 50 + crawler_max_queue_size: int = 500 + + # 日志配置 + log_level: str = "INFO" + log_dir: str = "logs" + + # 导出配置 + export_max_records: int = 10000 + + # 代理评分配置 + score_valid: int = 10 + score_invalid: int = -5 + score_min: int = 0 + score_max: int = 100 + + # 插件配置 + plugins_dir: str = "plugins" + + # CORS 配置 + cors_origins: str = "http://localhost:8080,http://localhost:5173" + + @property + def cors_origins_list(self) -> List[str]: + return [origin.strip() for origin in self.cors_origins.split(",") if origin.strip()] + + @property + def base_dir(self) -> str: + return os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + + +# 全局配置实例(启动时加载一次) +settings = Settings() diff --git a/core/crawler.py b/core/crawler.py deleted file mode 100644 index 5521a51..0000000 --- a/core/crawler.py +++ /dev/null @@ -1,86 +0,0 @@ -import aiohttp -import asyncio -import random -from core.log import logger - -class BaseCrawler: - def __init__(self): - self.user_agents = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0", - "Mozilla/5.0 (iPhone; CPU iPhone OS 17_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1.2 Mobile/15E148 Safari/604.1" - ] - - def get_headers(self): - return { - 'User-Agent': random.choice(self.user_agents), - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', - 'Connection': 'keep-alive', - } - - async def fetch(self, url, method='GET', params=None, data=None, proxies=None, timeout=10, retry_count=3): - """异步抓取方法""" - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' - } - async with aiohttp.ClientSession(headers=headers) as session: - for i in range(retry_count): - try: - # 注意:aiohttp 的代理格式与 requests 不同,通常为 http://user:pass@host:port - async with session.request( - method=method, - url=url, - params=params, - data=data, - proxy=proxies, - timeout=aiohttp.ClientTimeout(total=timeout) - ) as response: - if response.status == 200: - # 先读取内容,再处理编码 - content = await response.read() - - # 尝试获取编码 - encoding = response.get_encoding() - if encoding == 'utf-8' or not encoding: - try: - return content.decode('utf-8') - except UnicodeDecodeError: - # 尝试从内容中检测编码或手动设置为 gbk (国内网站常见) - return content.decode('gbk', errors='ignore') - - return content.decode(encoding, errors='ignore') - else: - logger.warning(f"请求失败 [{response.status}]: {url}, 正在进行第 {i+1} 次重试...") - except Exception as e: - logger.error(f"请求异常: {url}, 错误: {e}, 正在进行第 {i+1} 次重试...") - - await asyncio.sleep(random.uniform(1, 3)) - - return None - -class BasePlugin(BaseCrawler): - def __init__(self): - super().__init__() - self.name = "BasePlugin" - self.urls = [] - self.enabled = True - - async def parse(self, html): - """异步解析网页内容,需在子类中实现""" - raise NotImplementedError("Please implement parse method") - - async def run(self): - """异步运行插件""" - logger.info(f"正在运行插件: {self.name}") - results = [] - for url in self.urls: - self.current_url = url # 记录当前正在抓取的 URL,供 parse 使用 - html = await 
self.fetch(url) - if html: - async for proxy in self.parse(html): - results.append(proxy) - await asyncio.sleep(random.uniform(1, 2)) - return results diff --git a/core/db.py b/core/db.py new file mode 100644 index 0000000..b0e6ea9 --- /dev/null +++ b/core/db.py @@ -0,0 +1,95 @@ +"""数据库连接管理 - 使用上下文管理器,避免全局单例连接泄漏""" +import os +import aiosqlite +from contextlib import asynccontextmanager +from typing import AsyncIterator +from core.config import settings +from core.log import logger + + +DB_PATH = os.path.join(settings.base_dir, settings.db_path) + + +def ensure_db_dir(): + db_dir = os.path.dirname(DB_PATH) + if db_dir and not os.path.exists(db_dir): + os.makedirs(db_dir, exist_ok=True) + + +async def init_db(): + """初始化数据库表结构(支持迁移)""" + ensure_db_dir() + async with aiosqlite.connect(DB_PATH) as db: + await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA synchronous=NORMAL") + await db.execute("PRAGMA cache_size=-64000") + await db.execute("PRAGMA temp_store=MEMORY") + + await db.execute(""" + CREATE TABLE IF NOT EXISTS proxies ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + ip TEXT NOT NULL, + port INTEGER NOT NULL, + protocol TEXT DEFAULT 'http', + score INTEGER DEFAULT 10, + response_time_ms REAL, + last_check TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + UNIQUE(ip, port) + ) + """) + + # 迁移:如果旧表缺少 response_time_ms 列,则添加 + try: + await db.execute("SELECT response_time_ms FROM proxies LIMIT 1") + except Exception: + await db.execute("ALTER TABLE proxies ADD COLUMN response_time_ms REAL") + logger.info("Migrated: added response_time_ms column") + + # 迁移:如果旧表缺少 created_at 列,则添加 + try: + await db.execute("SELECT created_at FROM proxies LIMIT 1") + except Exception: + await db.execute("ALTER TABLE proxies ADD COLUMN created_at TIMESTAMP") + await db.execute("UPDATE proxies SET created_at = CURRENT_TIMESTAMP WHERE created_at IS NULL") + logger.info("Migrated: added created_at column") + + await db.execute("CREATE INDEX IF NOT EXISTS idx_score ON proxies(score)") + await db.execute("CREATE INDEX IF NOT EXISTS idx_protocol ON proxies(protocol)") + await db.execute("CREATE INDEX IF NOT EXISTS idx_last_check ON proxies(last_check)") + await db.execute("CREATE INDEX IF NOT EXISTS idx_ip_port ON proxies(ip, port)") + + # 插件设置表 + await db.execute(""" + CREATE TABLE IF NOT EXISTS plugin_settings ( + plugin_id TEXT PRIMARY KEY, + enabled INTEGER DEFAULT 1, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """) + + # 系统设置表 + await db.execute(""" + CREATE TABLE IF NOT EXISTS settings ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """) + + await db.commit() + logger.info("Database initialized") + + +@asynccontextmanager +async def get_db() -> AsyncIterator[aiosqlite.Connection]: + """获取数据库连接的异步上下文管理器""" + ensure_db_dir() + db = await aiosqlite.connect(DB_PATH) + try: + await db.execute("PRAGMA journal_mode=WAL") + await db.execute("PRAGMA synchronous=NORMAL") + yield db + finally: + await db.close() diff --git a/core/exceptions.py b/core/exceptions.py new file mode 100644 index 0000000..2ed053d --- /dev/null +++ b/core/exceptions.py @@ -0,0 +1,24 @@ +"""业务异常定义""" + + +class ProxyPoolException(Exception): + """基础业务异常""" + def __init__(self, message: str, code: int = 500): + self.message = message + self.code = code + super().__init__(self.message) + + +class PluginNotFoundException(ProxyPoolException): + def __init__(self, plugin_id: str): + 
super().__init__(f"Plugin '{plugin_id}' not found", 404) + + +class ProxyNotFoundException(ProxyPoolException): + def __init__(self, ip: str, port: int): + super().__init__(f"Proxy {ip}:{port} not found", 404) + + +class ValidationException(ProxyPoolException): + def __init__(self, message: str): + super().__init__(message, 400) diff --git a/core/plugin_manager.py b/core/plugin_manager.py deleted file mode 100644 index d580943..0000000 --- a/core/plugin_manager.py +++ /dev/null @@ -1,125 +0,0 @@ -import os -import importlib -import inspect -import asyncio -from typing import List, Dict, Optional -from core.crawler import BasePlugin -from core.log import logger - -class PluginManager: - def __init__(self, plugin_dir='plugins'): - self.plugin_dir = plugin_dir - self.plugins = [] - self.plugin_stats = {} - self._load_plugins() - self._init_stats() - - def _init_stats(self): - for plugin in self.plugins: - self.plugin_stats[plugin.name] = { - 'success_count': 0, - 'failure_count': 0, - 'last_run': None - } - - def _load_plugins(self): - base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - full_plugin_path = os.path.join(base_dir, self.plugin_dir) - - if not os.path.exists(full_plugin_path): - logger.error(f"插件目录不存在: {full_plugin_path}") - return - - for filename in os.listdir(full_plugin_path): - if filename.endswith('.py') and not filename.startswith('__'): - module_name = f"{self.plugin_dir}.{filename[:-3]}" - try: - module = importlib.import_module(module_name) - for name, obj in inspect.getmembers(module): - if inspect.isclass(obj) and issubclass(obj, BasePlugin) and obj is not BasePlugin: - plugin_instance = obj() - if plugin_instance.enabled: - logger.info(f"成功加载插件: {name} 来自 {module_name}") - self.plugins.append(plugin_instance) - else: - logger.info(f"插件已禁用,跳过加载: {name} 来自 {module_name}") - except Exception as e: - logger.error(f"加载插件失败 {module_name}: {e}") - - def get_plugin_by_name(self, plugin_name: str) -> Optional[BasePlugin]: - for plugin in self.plugins: - if plugin.name == plugin_name: - return plugin - return None - - def get_all_plugin_info(self) -> List[Dict]: - plugins_info = [] - for plugin in self.plugins: - stats = self.plugin_stats.get(plugin.name, { - 'success_count': 0, - 'failure_count': 0, - 'last_run': None - }) - plugins_info.append({ - 'id': plugin.name, - 'name': plugin.name, - 'enabled': plugin.enabled, - 'description': getattr(plugin, 'description', f'从{plugin.name}网站爬取代理'), - 'last_run': stats['last_run'], - 'success_count': stats['success_count'], - 'failure_count': stats['failure_count'] - }) - return plugins_info - - def toggle_plugin(self, plugin_name: str, enabled: bool) -> bool: - plugin = self.get_plugin_by_name(plugin_name) - if plugin: - plugin.enabled = enabled - logger.info(f"插件 {plugin_name} 已{'启用' if enabled else '禁用'}") - return True - return False - - async def run_plugin(self, plugin_name: str): - plugin = self.get_plugin_by_name(plugin_name) - if not plugin: - logger.error(f"插件不存在: {plugin_name}") - return [] - - if not plugin.enabled: - logger.warning(f"插件已禁用: {plugin_name}") - return [] - - try: - results = await plugin.run() - success_count = len(results) - failure_count = 0 - - from datetime import datetime - self.plugin_stats[plugin.name] = { - 'success_count': self.plugin_stats[plugin.name]['success_count'] + success_count, - 'failure_count': self.plugin_stats[plugin.name]['failure_count'] + failure_count, - 'last_run': datetime.now().isoformat() - } - - logger.info(f"插件 {plugin_name} 执行完成,成功: {success_count}") - 
return results - except Exception as e: - logger.error(f"插件 {plugin_name} 执行失败: {e}") - from datetime import datetime - self.plugin_stats[plugin.name] = { - 'success_count': self.plugin_stats[plugin.name]['success_count'], - 'failure_count': self.plugin_stats[plugin.name]['failure_count'] + 1, - 'last_run': datetime.now().isoformat() - } - return [] - - async def run_all(self): - """并发运行所有插件""" - tasks = [plugin.run() for plugin in self.plugins] - # 并发执行并收集结果 - results_list = await asyncio.gather(*tasks) - - # 将嵌套列表扁平化并产出结果 - for results in results_list: - for proxy in results: - yield proxy diff --git a/core/plugin_system/__init__.py b/core/plugin_system/__init__.py new file mode 100644 index 0000000..c50e13a --- /dev/null +++ b/core/plugin_system/__init__.py @@ -0,0 +1,4 @@ +from .base import BaseCrawlerPlugin, ProxyRaw +from .registry import registry + +__all__ = ["BaseCrawlerPlugin", "ProxyRaw", "registry"] diff --git a/core/plugin_system/base.py b/core/plugin_system/base.py new file mode 100644 index 0000000..37aff7e --- /dev/null +++ b/core/plugin_system/base.py @@ -0,0 +1,41 @@ +"""插件基类 - 所有爬虫插件必须继承此基类""" +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import List + + +@dataclass +class ProxyRaw: + """爬虫产出的原始代理数据""" + ip: str + port: int + protocol: str = "http" + + def __post_init__(self): + self.protocol = self.protocol.lower().strip() + if self.protocol not in ("http", "https", "socks4", "socks5"): + self.protocol = "http" + + +class BaseCrawlerPlugin(ABC): + """爬虫插件基类 + + 添加新爬虫只需: + 1. 继承 BaseCrawlerPlugin + 2. 实现 crawl() 方法返回 List[ProxyRaw] + 3. 用 @registry.register 装饰或在 __init__ 中显式注册 + """ + + name: str = "" + display_name: str = "" + description: str = "" + enabled: bool = True + + @abstractmethod + async def crawl(self) -> List[ProxyRaw]: + """爬取代理的核心方法。只负责爬取,不要在这里验证。""" + raise NotImplementedError + + async def health_check(self) -> bool: + """可选:检查插件健康状态""" + return True diff --git a/core/plugin_system/registry.py b/core/plugin_system/registry.py new file mode 100644 index 0000000..fe40a30 --- /dev/null +++ b/core/plugin_system/registry.py @@ -0,0 +1,77 @@ +"""插件注册中心 - 显式注册,类型安全,测试友好""" +import importlib +import inspect +import os +from typing import Dict, List, Type, Optional +from core.plugin_system.base import BaseCrawlerPlugin +from core.log import logger + + +class PluginRegistry: + """插件注册中心""" + + def __init__(self): + self._plugins: Dict[str, Type[BaseCrawlerPlugin]] = {} + self._instances: Dict[str, BaseCrawlerPlugin] = {} + + def register(self, plugin_cls: Type[BaseCrawlerPlugin]) -> Type[BaseCrawlerPlugin]: + """注册一个插件类。支持装饰器语法。""" + if not inspect.isclass(plugin_cls) or not issubclass(plugin_cls, BaseCrawlerPlugin): + raise ValueError("Plugin must be a subclass of BaseCrawlerPlugin") + if not plugin_cls.name: + raise ValueError(f"Plugin {plugin_cls.__name__} must have a 'name' attribute") + + self._plugins[plugin_cls.name] = plugin_cls + logger.info(f"Plugin registered: {plugin_cls.name} ({plugin_cls.__name__})") + return plugin_cls + + def get(self, name: str) -> Optional[BaseCrawlerPlugin]: + """获取插件实例(懒加载)""" + if name not in self._instances: + cls = self._plugins.get(name) + if cls: + self._instances[name] = cls() + return self._instances.get(name) + + def list_plugins(self) -> List[BaseCrawlerPlugin]: + """获取所有已注册插件的实例列表""" + result = [] + for name in self._plugins: + instance = self.get(name) + if instance: + result.append(instance) + return result + + def get_plugin_names(self) -> List[str]: + return 
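With the registry in place, the "add a crawler in two steps" promise is checkable end to end. A minimal sketch, using a made-up plugin name and fixed test data instead of a real source:

```python
import asyncio

from core.plugin_system import BaseCrawlerPlugin, ProxyRaw, registry

@registry.register
class MockPlugin(BaseCrawlerPlugin):
    name = "mock"
    display_name = "Mock 源"
    description = "Fixed test data, no network access"

    async def crawl(self):
        # Protocol is normalized by ProxyRaw.__post_init__ ("HTTP " -> "http").
        return [ProxyRaw("10.0.0.1", 8080, "HTTP ")]

async def main():
    plugin = registry.get("mock")       # lazily instantiated and cached
    proxies = await plugin.crawl()
    assert proxies[0].protocol == "http"
    print(registry.get_plugin_names())  # includes 'mock'

asyncio.run(main())
```

This is also exactly how tests can swap in mock sources: register only the mock, never touch the real plugins package.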
list(self._plugins.keys()) + + def auto_discover(self, package_name: str): + """自动扫描指定包下的所有模块并注册其中的插件类。 + 注意:为了类型安全和可控性,推荐显式注册。auto_discover 仅作为兼容。""" + try: + package = importlib.import_module(package_name) + package_dir = os.path.dirname(package.__file__) + except Exception as e: + logger.error(f"Auto discover failed for package {package_name}: {e}") + return + + for filename in os.listdir(package_dir): + if filename.endswith(".py") and not filename.startswith("__"): + module_name = f"{package_name}.{filename[:-3]}" + try: + module = importlib.import_module(module_name) + for attr_name in dir(module): + obj = getattr(module, attr_name) + if ( + inspect.isclass(obj) + and issubclass(obj, BaseCrawlerPlugin) + and obj is not BaseCrawlerPlugin + and obj not in self._plugins.values() + ): + self.register(obj) + except Exception as e: + logger.error(f"Failed to load module {module_name}: {e}") + + +# 全局注册中心实例 +registry = PluginRegistry() diff --git a/core/scheduler.py b/core/scheduler.py deleted file mode 100644 index 94006da..0000000 --- a/core/scheduler.py +++ /dev/null @@ -1,206 +0,0 @@ -""" -代理验证调度器 -负责定期验证数据库中的代理,并更新分数 -""" -import asyncio -from datetime import datetime, timedelta -from typing import Optional -from core.sqlite import SQLiteManager -from core.validator import ProxyValidator -from core.log import logger -from config import config - - -class ValidationScheduler: - """代理验证调度器""" - - def __init__(self): - self.db = SQLiteManager() - self.validator: Optional[ProxyValidator] = None - self.running = False - self.task: Optional[asyncio.Task] = None - self.interval_minutes = 30 # 默认每30分钟验证一次 - self.batch_size = 100 # 每批验证数量 - - async def start(self): - """启动验证调度器""" - if self.running: - logger.warning("验证调度器已在运行") - return - - self.running = True - self.validator = ProxyValidator( - max_concurrency=config.VALIDATOR_MAX_CONCURRENCY, - timeout=config.VALIDATOR_TIMEOUT - ) - self.task = asyncio.create_task(self._run_loop()) - logger.info("代理验证调度器已启动") - - async def stop(self): - """停止验证调度器""" - self.running = False - if self.task: - self.task.cancel() - try: - await self.task - except asyncio.CancelledError: - pass - if self.validator: - await self.validator.__aexit__(None, None, None) - logger.info("代理验证调度器已停止") - - async def _run_loop(self): - """运行循环""" - while self.running: - try: - await self.validate_all_proxies() - except Exception as e: - logger.error(f"验证循环出错: {e}") - - # 等待下一次验证 - await asyncio.sleep(self.interval_minutes * 60) - - async def validate_all_proxies(self): - """验证所有代理""" - logger.info("开始批量验证代理...") - - try: - # 获取所有代理 - proxies = await self.db.get_all_proxies() - if not proxies: - logger.info("数据库中没有代理需要验证") - return - - logger.info(f"需要验证 {len(proxies)} 个代理") - - # 分批验证 - validated_count = 0 - valid_count = 0 - invalid_count = 0 - - async with self.validator: - for i in range(0, len(proxies), self.batch_size): - if not self.running: - break - - batch = proxies[i:i + self.batch_size] - tasks = [] - - for proxy in batch: - ip, port, protocol, score, last_check = proxy - task = self._validate_and_update(ip, port, protocol) - tasks.append(task) - - # 并发验证一批 - results = await asyncio.gather(*tasks, return_exceptions=True) - - for result in results: - validated_count += 1 - if isinstance(result, Exception): - logger.error(f"验证过程出错: {result}") - continue - if result: - valid_count += 1 - else: - invalid_count += 1 - - logger.info(f"已验证 {validated_count}/{len(proxies)} 个代理") - - # 批次间短暂延迟,避免过载 - if i + self.batch_size < len(proxies): - await asyncio.sleep(1) - - 
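The compatibility path is a single call at startup; explicit imports in `plugins/__init__.py` remain the recommended route. Assuming the call happens once before the app starts serving:

```python
from core.plugin_system import registry

# Fallback for code that still relies on directory scanning. It is safe to
# combine with explicit registration: already-registered classes are skipped.
registry.auto_discover("plugins")
print(registry.get_plugin_names())
```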
logger.info(f"验证完成: 总计 {validated_count}, 有效 {valid_count}, 无效 {invalid_count}") - - except Exception as e: - logger.error(f"批量验证代理失败: {e}", exc_info=True) - - async def _validate_and_update(self, ip: str, port: int, protocol: str) -> bool: - """验证单个代理并更新分数""" - try: - is_valid, latency = await self.validator.validate(ip, port, protocol) - - if is_valid: - # 验证成功,增加分数 - await self.db.update_score( - ip, port, - config.SCORE_VALID, - min_score=config.SCORE_MIN, - max_score=config.SCORE_MAX - ) - logger.debug(f"代理验证成功 {ip}:{port} ({protocol}) - 延迟 {latency}ms") - return True - else: - # 验证失败,减少分数 - await self.db.update_score( - ip, port, - config.SCORE_INVALID, - min_score=config.SCORE_MIN, - max_score=config.SCORE_MAX - ) - logger.debug(f"代理验证失败 {ip}:{port} ({protocol})") - return False - - except Exception as e: - logger.error(f"验证代理 {ip}:{port} 时出错: {e}") - # 出错也视为失败 - await self.db.update_score( - ip, port, - config.SCORE_INVALID, - min_score=config.SCORE_MIN, - max_score=config.SCORE_MAX - ) - return False - - async def validate_proxies_batch(self, proxies: list) -> tuple: - """ - 验证一批新抓取的代理 - - Args: - proxies: [(ip, port, protocol), ...] - - Returns: - (有效代理列表, 无效代理列表) - """ - if not proxies: - return [], [] - - valid_proxies = [] - invalid_proxies = [] - - logger.info(f"开始验证 {len(proxies)} 个新抓取代理...") - - try: - validator = ProxyValidator( - max_concurrency=min(config.VALIDATOR_MAX_CONCURRENCY, 50), - timeout=config.VALIDATOR_TIMEOUT - ) - - async with validator: - tasks = [] - for ip, port, protocol in proxies: - task = validator.validate(ip, port, protocol) - tasks.append((ip, port, protocol, task)) - - for ip, port, protocol, task in tasks: - try: - is_valid, latency = await task - if is_valid: - valid_proxies.append((ip, port, protocol)) - logger.debug(f"新代理有效: {ip}:{port} ({protocol}) - {latency}ms") - else: - invalid_proxies.append((ip, port, protocol)) - except Exception as e: - logger.warning(f"验证新代理 {ip}:{port} 失败: {e}") - invalid_proxies.append((ip, port, protocol)) - - logger.info(f"新代理验证完成: 有效 {len(valid_proxies)}, 无效 {len(invalid_proxies)}") - - except Exception as e: - logger.error(f"批量验证新代理失败: {e}") - - return valid_proxies, invalid_proxies - - -# 全局调度器实例 -scheduler = ValidationScheduler() diff --git a/core/sqlite.py b/core/sqlite.py deleted file mode 100644 index a172c1e..0000000 --- a/core/sqlite.py +++ /dev/null @@ -1,331 +0,0 @@ -import aiosqlite -import os -import asyncio -from core.log import logger - -VALID_PROTOCOLS = ['http', 'https', 'socks4', 'socks5'] - -class SQLiteManager: - _instance = None - _connection = None - _lock = asyncio.Lock() - - def __new__(cls, *args, **kwargs): - if cls._instance is None: - cls._instance = super(SQLiteManager, cls).__new__(cls) - return cls._instance - - def __init__(self, db_path=None): - if hasattr(self, 'initialized') and self.initialized: - return - - if db_path is None: - base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - db_dir = os.path.join(base_dir, 'db') - if not os.path.exists(db_dir): - os.makedirs(db_dir) - self.db_path = os.path.join(db_dir, 'proxies.sqlite') - else: - self.db_path = db_path - - self.initialized = True - - async def get_connection(self): - async with self._lock: - if self._connection is None: - self._connection = await aiosqlite.connect(self.db_path) - await self._connection.execute("PRAGMA journal_mode=WAL") - await self._connection.execute("PRAGMA synchronous=NORMAL") - await self._connection.execute("PRAGMA cache_size=-64000") - await self._connection.execute("PRAGMA 
temp_store=MEMORY") - return self._connection - - async def close_connection(self): - async with self._lock: - if self._connection is not None: - await self._connection.close() - self._connection = None - - async def init_db(self): - """初始化数据库和表结构""" - db = await self.get_connection() - await db.execute(''' - CREATE TABLE IF NOT EXISTS proxies ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - ip TEXT NOT NULL, - port INTEGER NOT NULL, - protocol TEXT DEFAULT 'http', - score INTEGER DEFAULT 10, - last_check TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - UNIQUE(ip, port) - ) - ''') - - await db.execute('CREATE INDEX IF NOT EXISTS idx_score ON proxies(score)') - await db.execute('CREATE INDEX IF NOT EXISTS idx_protocol ON proxies(protocol)') - await db.execute('CREATE INDEX IF NOT EXISTS idx_last_check ON proxies(last_check)') - await db.execute('CREATE INDEX IF NOT EXISTS idx_ip_port ON proxies(ip, port)') - - await db.commit() - - async def insert_proxy(self, ip, port, protocol='http', score=10): - """异步插入或更新代理""" - try: - # 验证协议类型 - if protocol not in VALID_PROTOCOLS: - protocol = 'http' - logger.warning(f"无效的协议类型 {protocol},默认使用 http") - - db = await self.get_connection() - # 先检查是否存在 - async with db.execute('SELECT score FROM proxies WHERE ip = ? AND port = ?', (ip, port)) as cursor: - row = await cursor.fetchone() - if row: - # 如果存在,则更新最后检查时间和分数 - await db.execute(''' - UPDATE proxies SET last_check = CURRENT_TIMESTAMP, score = ?, protocol = ? WHERE ip = ? AND port = ? - ''', (score, protocol, ip, port)) - else: - # 如果不存在,则插入新记录 - await db.execute(''' - INSERT INTO proxies (ip, port, protocol, score, last_check) - VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP) - ''', (ip, port, protocol, score)) - await db.commit() - return True - except aiosqlite.IntegrityError as e: - # 处理唯一性约束冲突 - if "UNIQUE" in str(e): - # 代理已存在,更新它 - if protocol not in VALID_PROTOCOLS: - protocol = 'http' - db = await self.get_connection() - await db.execute(''' - UPDATE proxies SET last_check = CURRENT_TIMESTAMP, score = ?, protocol = ? WHERE ip = ? AND port = ? - ''', (score, protocol, ip, port)) - await db.commit() - return True - else: - logger.error(f"数据库完整性错误: {e}") - return False - except Exception as e: - logger.error(f"插入代理失败 {ip}:{port} - {e}") - return False - - async def get_all_proxies(self): - """异步获取所有代理""" - db = await self.get_connection() - async with db.execute('SELECT ip, port, protocol, score, last_check FROM proxies') as cursor: - return await cursor.fetchall() - - async def get_random_proxy(self): - """异步随机获取一个高分代理""" - db = await self.get_connection() - async with db.execute('SELECT ip, port, protocol, score, last_check FROM proxies WHERE score > 0 ORDER BY RANDOM() LIMIT 1') as cursor: - return await cursor.fetchone() - - async def update_score(self, ip, port, delta, min_score=0, max_score=100): - """异步更新代理分数(增量更新,带分数限制)""" - try: - db = await self.get_connection() - # 获取当前分数 - async with db.execute('SELECT score FROM proxies WHERE ip = ? AND port = ?', (ip, port)) as cursor: - row = await cursor.fetchone() - if row: - current_score = row[0] - new_score = max(min_score, min(max_score, current_score + delta)) - await db.execute(''' - UPDATE proxies SET score = ?, last_check = CURRENT_TIMESTAMP WHERE ip = ? AND port = ? 
- ''', (new_score, ip, port)) - if new_score <= 0: - await db.execute('DELETE FROM proxies WHERE score <= 0') - await db.commit() - return True - return False - except Exception as e: - logger.error(f"更新代理分数失败 {ip}:{port} - {e}") - return False - - async def delete_proxy(self, ip, port): - """异步删除指定代理""" - db = await self.get_connection() - await db.execute('DELETE FROM proxies WHERE ip = ? AND port = ?', (ip, port)) - await db.commit() - - async def count_proxies(self): - """异步统计代理数量""" - db = await self.get_connection() - async with db.execute('SELECT COUNT(*) FROM proxies') as cursor: - row = await cursor.fetchone() - return row[0] if row else 0 - - async def get_proxies_paginated_with_total(self, page: int = 1, page_size: int = 20, - protocol: str = None, min_score: int = 0, - max_score: int = None, - sort_by: str = 'last_check', - sort_order: str = 'DESC'): - """分页获取代理列表(一次查询返回数据和总数)""" - db = await self.get_connection() - conditions = ['score >= ?'] - params = [min_score] - - if protocol: - conditions.append('protocol = ?') - params.append(protocol) - - if max_score is not None: - conditions.append('score <= ?') - params.append(max_score) - - where_clause = ' AND '.join(conditions) - - order_by_clause = f'{sort_by} {sort_order}' - - offset = (page - 1) * page_size - query = f''' - SELECT ip, port, protocol, score, last_check, - COUNT(*) OVER() as total_count - FROM proxies - WHERE {where_clause} - ORDER BY {order_by_clause} - LIMIT ? OFFSET ? - ''' - params.extend([page_size, offset]) - - async with db.execute(query, params) as cursor: - rows = await cursor.fetchall() - total = rows[0][5] if rows else 0 - proxies = [(row[0], row[1], row[2], row[3], row[4]) for row in rows] - return proxies, total - - async def get_proxies_paginated(self, page: int = 1, page_size: int = 20, - protocol: str = None, min_score: int = 0, - max_score: int = None, - sort_by: str = 'last_check', - sort_order: str = 'DESC'): - """分页获取代理列表""" - db = await self.get_connection() - conditions = ['score >= ?'] - params = [min_score] - - if protocol: - conditions.append('protocol = ?') - params.append(protocol) - - if max_score is not None: - conditions.append('score <= ?') - params.append(max_score) - - where_clause = ' AND '.join(conditions) - - order_by_clause = f'{sort_by} {sort_order}' - - offset = (page - 1) * page_size - query = f''' - SELECT ip, port, protocol, score, last_check - FROM proxies - WHERE {where_clause} - ORDER BY {order_by_clause} - LIMIT ? OFFSET ? - ''' - params.extend([page_size, offset]) - - async with db.execute(query, params) as cursor: - return await cursor.fetchall() - - async def get_proxies_total(self, protocol: str = None, min_score: int = 0, max_score: int = None): - """获取符合条件的代理总数""" - db = await self.get_connection() - conditions = ['score >= ?'] - params = [min_score] - - if protocol: - conditions.append('protocol = ?') - params.append(protocol) - - if max_score is not None: - conditions.append('score <= ?') - params.append(max_score) - - where_clause = ' AND '.join(conditions) - - query = f'SELECT COUNT(*) FROM proxies WHERE {where_clause}' - - async with db.execute(query, params) as cursor: - row = await cursor.fetchone() - return row[0] if row else 0 - - async def get_proxy_detail(self, ip: str, port: int): - """获取单个代理的详细信息""" - db = await self.get_connection() - async with db.execute( - 'SELECT ip, port, protocol, score, last_check FROM proxies WHERE ip = ? 
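One thing worth carrying over from the deleted manager is the single-query pagination trick: `COUNT(*) OVER()` evaluates against the filtered set before `LIMIT/OFFSET`, so rows and the total match count come back in one round trip (SQLite >= 3.25). As a standalone repository-style sketch:

```python
async def page_proxies(db, page: int = 1, page_size: int = 20, min_score: int = 0):
    # Every returned row also carries the total count of matching rows.
    query = """
        SELECT ip, port, protocol, score, last_check,
               COUNT(*) OVER() AS total_count
        FROM proxies
        WHERE score >= ?
        ORDER BY last_check DESC
        LIMIT ? OFFSET ?
    """
    async with db.execute(query, (min_score, page_size, (page - 1) * page_size)) as cur:
        rows = await cur.fetchall()
    total = rows[0][5] if rows else 0
    return [row[:5] for row in rows], total
```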
AND port = ?', - (ip, port) - ) as cursor: - row = await cursor.fetchone() - return row - - async def batch_delete_proxies(self, proxy_list: list): - """批量删除代理,返回实际删除的数量(使用executemany优化性能)""" - if not proxy_list: - return 0 - - db = await self.get_connection() - await db.executemany('DELETE FROM proxies WHERE ip = ? AND port = ?', proxy_list) - await db.commit() - return len(proxy_list) - - async def get_stats(self): - """获取统计信息(使用单个GROUP BY查询优化性能)""" - db = await self.get_connection() - stats = {} - - query = ''' - SELECT - COUNT(*) as total, - COUNT(CASE WHEN score > 0 THEN 1 END) as available, - AVG(score) as avg_score, - COUNT(CASE WHEN protocol = "http" THEN 1 END) as http_count, - COUNT(CASE WHEN protocol = "https" THEN 1 END) as https_count, - COUNT(CASE WHEN protocol = "socks4" THEN 1 END) as socks4_count, - COUNT(CASE WHEN protocol = "socks5" THEN 1 END) as socks5_count - FROM proxies - ''' - - async with db.execute(query) as cursor: - row = await cursor.fetchone() - if row: - stats = { - 'total': row[0] if row[0] else 0, - 'available': row[1] if row[1] else 0, - 'avg_score': round(row[2], 2) if row[2] else 0, - 'http_count': row[3] if row[3] else 0, - 'https_count': row[4] if row[4] else 0, - 'socks4_count': row[5] if row[5] else 0, - 'socks5_count': row[6] if row[6] else 0 - } - - return stats - - async def get_today_new_count(self): - """获取今日新增代理数量""" - try: - db = await self.get_connection() - query = ''' - SELECT COUNT(*) FROM proxies - WHERE DATE(last_check) = DATE('now', 'localtime') - ''' - async with db.execute(query) as cursor: - row = await cursor.fetchone() - return row[0] if row else 0 - except Exception as e: - logger.error(f"获取今日新增数量失败: {e}") - return 0 - - async def clean_invalid_proxies(self): - """清理无效代理(分数<=0)""" - db = await self.get_connection() - async with db.execute('DELETE FROM proxies WHERE score <= 0') as cursor: - deleted_count = cursor.rowcount - await db.commit() - return deleted_count diff --git a/core/tasks/__init__.py b/core/tasks/__init__.py new file mode 100644 index 0000000..0eadd5c --- /dev/null +++ b/core/tasks/__init__.py @@ -0,0 +1,3 @@ +from .queue import ValidationQueue + +__all__ = ["ValidationQueue"] diff --git a/core/tasks/queue.py b/core/tasks/queue.py new file mode 100644 index 0000000..cc8c82c --- /dev/null +++ b/core/tasks/queue.py @@ -0,0 +1,111 @@ +"""验证任务队列 - 解耦爬取与验证,支持背压控制""" +import asyncio +from typing import Optional +from models.domain import ProxyRaw +from core.log import logger + + +class ValidationQueue: + """代理验证队列 + + 工作流程: + 1. 爬虫将原始代理 submit() 到队列 + 2. Worker 池从队列消费并验证 + 3. 
验证通过的代理写入数据库 + """ + + def __init__( + self, + validator, + proxy_repo, + db_ctx, + worker_count: int = 50, + score_valid: int = 10, + score_invalid: int = -5, + score_min: int = 0, + score_max: int = 100, + ): + self.validator = validator + self.proxy_repo = proxy_repo + self.db_ctx = db_ctx + self.worker_count = worker_count + self.score_valid = score_valid + self.score_invalid = score_invalid + self.score_min = score_min + self.score_max = score_max + + self._queue: asyncio.Queue[Optional[ProxyRaw]] = asyncio.Queue() + self._workers: list[asyncio.Task] = [] + self._running = False + + # 统计 + self.valid_count = 0 + self.invalid_count = 0 + + async def start(self): + if self._running: + return + self._running = True + for i in range(self.worker_count): + self._workers.append(asyncio.create_task(self._worker_loop(i))) + logger.info(f"ValidationQueue started with {self.worker_count} workers") + + async def stop(self): + if not self._running: + return + self._running = False + for _ in self._workers: + self._queue.put_nowait(None) # sentinel + if self._workers: + await asyncio.gather(*self._workers, return_exceptions=True) + self._workers.clear() + logger.info("ValidationQueue stopped") + + async def submit(self, proxies: list[ProxyRaw]): + """提交代理到验证队列""" + for p in proxies: + await self._queue.put(p) + + async def submit_one(self, proxy: ProxyRaw): + await self._queue.put(proxy) + + async def drain(self): + """等待队列中当前所有任务处理完毕""" + await self._queue.join() + + async def _worker_loop(self, worker_id: int): + while True: + item = await self._queue.get() + if item is None: + self._queue.task_done() + break + try: + await self._validate_and_save(item) + except Exception as e: + logger.error(f"Worker {worker_id} validation error: {e}") + finally: + self._queue.task_done() + + async def _validate_and_save(self, proxy: ProxyRaw): + is_valid, latency = await self.validator.validate( + proxy.ip, proxy.port, proxy.protocol + ) + async with self.db_ctx() as db: + if is_valid: + await self.proxy_repo.insert_or_update( + db, proxy.ip, proxy.port, proxy.protocol, score=self.score_valid + ) + if latency: + await self.proxy_repo.update_response_time( + db, proxy.ip, proxy.port, latency + ) + self.valid_count += 1 + logger.debug(f"ValidationQueue: valid {proxy.ip}:{proxy.port}") + else: + # 对于新爬取的无效代理,不需要入库,直接丢弃 + self.invalid_count += 1 + logger.debug(f"ValidationQueue: invalid {proxy.ip}:{proxy.port}") + + def reset_stats(self): + self.valid_count = 0 + self.invalid_count = 0 diff --git a/core/validator.py b/core/validator.py deleted file mode 100644 index 3cbc32f..0000000 --- a/core/validator.py +++ /dev/null @@ -1,192 +0,0 @@ -import asyncio -import aiohttp -import aiohttp_socks -import random -import time -from core.log import logger - - -class ProxyValidator: - """代理验证器 - 支持 HTTP/HTTPS/SOCKS4/SOCKS5""" - - def __init__(self, max_concurrency=50, timeout=5): - # 验证目标源 - self.http_sources = [ - "http://httpbin.org/ip", - "http://api.ipify.org" - ] - self.https_sources = [ - "https://httpbin.org/ip", - "https://api.ipify.org" - ] - self.semaphore = asyncio.Semaphore(max_concurrency) - self.timeout = timeout - self.session = None - - async def __aenter__(self): - """异步上下文管理器入口""" - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - """异步上下文管理器出口""" - if self.session: - await self.session.close() - self.session = None - - def _get_test_url(self, protocol: str) -> str: - """根据协议获取测试 URL""" - protocol = protocol.lower() - if protocol == 'https': - return random.choice(self.https_sources) 
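How the pieces of `ValidationQueue` fit together, as a self-contained sketch with stubbed validator and repository (the stubs are illustrative; in the real app these come from dependency injection):

```python
import asyncio
from contextlib import asynccontextmanager

from core.tasks import ValidationQueue
from models.domain import ProxyRaw

class StubValidator:
    async def validate(self, ip, port, protocol):
        return port % 2 == 0, 42.0      # pretend even ports are alive

class StubRepo:
    async def insert_or_update(self, db, ip, port, protocol, score):
        print(f"save {ip}:{port} ({protocol}) score={score}")
    async def update_response_time(self, db, ip, port, latency):
        pass

@asynccontextmanager
async def null_db():
    yield None                           # stand-in for get_db()

async def main():
    queue = ValidationQueue(StubValidator(), StubRepo(), null_db, worker_count=4)
    await queue.start()
    await queue.submit([ProxyRaw("10.0.0.1", p) for p in (80, 81, 8080)])
    await queue.drain()                  # wait until the burst is processed
    await queue.stop()                   # sentinels shut the workers down
    print(queue.valid_count, queue.invalid_count)   # -> 2 1

asyncio.run(main())
```

Note the backpressure property: `submit()` merely enqueues and returns, so a crawler that yields tens of thousands of proxies never holds them all in flight at once.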
- return random.choice(self.http_sources) - - def _create_connector(self, ip: str, port: int, protocol: str): - """创建代理连接器""" - protocol = protocol.lower() - - if protocol == 'socks4': - return aiohttp_socks.ProxyConnector( - proxy_type=aiohttp_socks.ProxyType.SOCKS4, - host=ip, - port=port, - rdns=True - ) - elif protocol == 'socks5': - return aiohttp_socks.ProxyConnector( - proxy_type=aiohttp_socks.ProxyType.SOCKS5, - host=ip, - port=port, - rdns=True - ) - elif protocol in ('http', 'https'): - # HTTP/HTTPS 使用普通 connector,在请求时指定 proxy 参数 - return aiohttp.TCPConnector(ssl=False, limit=0, force_close=True) - else: - # 未知协议默认使用 HTTP - return aiohttp.TCPConnector(ssl=False, limit=0, force_close=True) - - async def validate(self, ip: str, port: int, protocol: str = 'http'): - """ - 验证单个代理是否可用 - - Args: - ip: 代理 IP - port: 代理端口 - protocol: 协议类型 (http/https/socks4/socks5) - - Returns: - (is_valid: bool, latency_ms: float) - """ - protocol = protocol.lower() - test_url = self._get_test_url(protocol) - - async with self.semaphore: - start_time = time.time() - - try: - if protocol in ('socks4', 'socks5'): - return await self._validate_socks(ip, port, protocol, test_url, start_time) - else: - return await self._validate_http(ip, port, protocol, test_url, start_time) - - except asyncio.TimeoutError: - logger.warning(f"验证超时: {ip}:{port} ({protocol})") - return False, 0 - except Exception as e: - logger.warning(f"验证失败: {ip}:{port} ({protocol}) - {e}") - return False, 0 - - async def _validate_http(self, ip: str, port: int, protocol: str, test_url: str, start_time: float): - """验证 HTTP/HTTPS 代理""" - proxy_url = f"http://{ip}:{port}" - - connector = aiohttp.TCPConnector(ssl=False, limit=0, force_close=True) - timeout = aiohttp.ClientTimeout(total=self.timeout, connect=3) - - async with aiohttp.ClientSession( - connector=connector, - timeout=timeout - ) as session: - async with session.get( - test_url, - proxy=proxy_url, - allow_redirects=True - ) as response: - if response.status in [200, 301, 302]: - try: - content = await response.text() - if 'ip' in content.lower() or 'origin' in content.lower(): - latency = round((time.time() - start_time) * 1000, 2) - logger.info(f"验证成功: {ip}:{port} ({protocol}) - 延迟: {latency}ms") - return True, latency - except: - pass - - # 内容解析失败但状态码正常,也算可用 - latency = round((time.time() - start_time) * 1000, 2) - logger.info(f"验证成功: {ip}:{port} ({protocol}) - 延迟: {latency}ms") - return True, latency - - return False, 0 - - async def _validate_socks(self, ip: str, port: int, protocol: str, test_url: str, start_time: float): - """验证 SOCKS4/SOCKS5 代理""" - proxy_type = ( - aiohttp_socks.ProxyType.SOCKS4 - if protocol == 'socks4' - else aiohttp_socks.ProxyType.SOCKS5 - ) - - connector = aiohttp_socks.ProxyConnector( - proxy_type=proxy_type, - host=ip, - port=port, - rdns=True, # 远程 DNS 解析,避免 DNS 泄漏 - ssl=False - ) - - timeout = aiohttp.ClientTimeout(total=self.timeout, connect=3) - - try: - async with aiohttp.ClientSession( - connector=connector, - timeout=timeout - ) as session: - async with session.get(test_url, allow_redirects=True) as response: - if response.status in [200, 301, 302]: - try: - content = await response.text() - if 'ip' in content.lower() or 'origin' in content.lower(): - latency = round((time.time() - start_time) * 1000, 2) - logger.info(f"验证成功: {ip}:{port} ({protocol}) - 延迟: {latency}ms") - return True, latency - except: - pass - - # 内容解析失败但状态码正常 - latency = round((time.time() - start_time) * 1000, 2) - logger.info(f"验证成功: {ip}:{port} ({protocol}) - 延迟: 
{latency}ms") - return True, latency - - return False, 0 - finally: - await connector.close() - - -class ProxyValidatorLegacy: - """ - 兼容旧版本的验证器 - 保持原有接口不变 - """ - def __init__(self, max_concurrency=50, timeout=5): - self.validator = ProxyValidator(max_concurrency, timeout) - - async def __aenter__(self): - await self.validator.__aenter__() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - await self.validator.__aexit__(exc_type, exc_val, exc_tb) - - async def validate(self, ip, port, protocol='http'): - return await self.validator.validate(ip, port, protocol) diff --git a/frontend/src/services/pluginService.js b/frontend/src/services/pluginService.js new file mode 100644 index 0000000..33c0272 --- /dev/null +++ b/frontend/src/services/pluginService.js @@ -0,0 +1,19 @@ +import { pluginsAPI } from '../api' + +export const pluginService = { + async getPlugins() { + return pluginsAPI.getPlugins() + }, + + async togglePlugin(pluginId, enabled) { + return pluginsAPI.togglePlugin(pluginId, enabled) + }, + + async crawlPlugin(pluginId) { + return pluginsAPI.crawlPlugin(pluginId) + }, + + async crawlAll() { + return pluginsAPI.crawlAll() + } +} diff --git a/frontend/src/services/proxyService.js b/frontend/src/services/proxyService.js new file mode 100644 index 0000000..85631bf --- /dev/null +++ b/frontend/src/services/proxyService.js @@ -0,0 +1,27 @@ +import { statsAPI, proxiesAPI } from '../api' + +export const proxyService = { + async getStats() { + return statsAPI.getStats() + }, + + async getProxies(params, signal) { + return proxiesAPI.getProxies(params, signal) + }, + + async deleteProxy(ip, port) { + return proxiesAPI.deleteProxy(ip, port) + }, + + async batchDelete(proxies) { + return proxiesAPI.batchDeleteProxies(proxies) + }, + + async cleanInvalid() { + return proxiesAPI.cleanInvalidProxies() + }, + + async export(format, protocol) { + return proxiesAPI.exportProxies(format, protocol) + } +} diff --git a/frontend/src/services/schedulerService.js b/frontend/src/services/schedulerService.js new file mode 100644 index 0000000..75c97fb --- /dev/null +++ b/frontend/src/services/schedulerService.js @@ -0,0 +1,19 @@ +import { schedulerAPI } from '../api' + +export const schedulerService = { + async start() { + return schedulerAPI.start() + }, + + async stop() { + return schedulerAPI.stop() + }, + + async validateNow() { + return schedulerAPI.validateNow() + }, + + async getStatus() { + return schedulerAPI.getStatus() + } +} diff --git a/frontend/src/services/settingService.js b/frontend/src/services/settingService.js new file mode 100644 index 0000000..7a73abb --- /dev/null +++ b/frontend/src/services/settingService.js @@ -0,0 +1,11 @@ +import { settingsAPI } from '../api' + +export const settingService = { + async getSettings() { + return settingsAPI.getSettings() + }, + + async saveSettings(data) { + return settingsAPI.saveSettings(data) + } +} diff --git a/frontend/src/stores/plugins.js b/frontend/src/stores/plugins.js index 7536983..892c440 100644 --- a/frontend/src/stores/plugins.js +++ b/frontend/src/stores/plugins.js @@ -1,6 +1,6 @@ import { defineStore } from 'pinia' import { ref, computed } from 'vue' -import { pluginsAPI } from '../api' +import { pluginService } from '../services/pluginService' /** * Plugins Store @@ -24,7 +24,7 @@ export const usePluginsStore = defineStore('plugins', () => { async function fetchPlugins() { loading.value = true try { - const response = await pluginsAPI.getPlugins() + const response = await pluginService.getPlugins() if 
(response.code === 200) { plugins.value = response.data.plugins || [] return true @@ -45,7 +45,7 @@ export const usePluginsStore = defineStore('plugins', () => { */ async function togglePlugin(pluginId, enabled) { try { - const response = await pluginsAPI.togglePlugin(pluginId, enabled) + const response = await pluginService.togglePlugin(pluginId, enabled) if (response.code === 200) { const plugin = plugins.value.find(p => p.id === pluginId) if (plugin) { @@ -66,7 +66,7 @@ export const usePluginsStore = defineStore('plugins', () => { */ async function crawlPlugin(pluginId) { try { - const response = await pluginsAPI.crawlPlugin(pluginId) + const response = await pluginService.crawlPlugin(pluginId) return response.code === 200 } catch (error) { console.error('触发插件爬取失败:', error) diff --git a/frontend/src/stores/proxy.js b/frontend/src/stores/proxy.js index 07bc2f9..aba3eaa 100644 --- a/frontend/src/stores/proxy.js +++ b/frontend/src/stores/proxy.js @@ -1,6 +1,6 @@ import { defineStore } from 'pinia' import { ref, computed } from 'vue' -import { proxiesAPI, statsAPI } from '../api' +import { proxyService } from '../services/proxyService' /** * 判断是否为用户取消的错误 @@ -34,7 +34,7 @@ export const useProxyStore = defineStore('proxy', () => { */ async function fetchStats() { try { - const response = await statsAPI.getStats() + const response = await proxyService.getStats() if (response.code === 200) { stats.value = response.data return true @@ -54,7 +54,7 @@ export const useProxyStore = defineStore('proxy', () => { async function fetchProxies(params, signal) { loading.value = true try { - const response = await proxiesAPI.getProxies(params, signal) + const response = await proxyService.getProxies(params, signal) if (response.code === 200) { proxies.value = response.data.list total.value = response.data.total @@ -79,7 +79,7 @@ export const useProxyStore = defineStore('proxy', () => { */ async function deleteProxy(ip, port) { try { - const response = await proxiesAPI.deleteProxy(ip, port) + const response = await proxyService.deleteProxy(ip, port) return response.code === 200 } catch (error) { console.error('删除代理失败:', error) @@ -96,7 +96,7 @@ export const useProxyStore = defineStore('proxy', () => { if (!proxyList?.length) return 0 try { - const response = await proxiesAPI.batchDeleteProxies(proxyList) + const response = await proxyService.batchDelete(proxyList) if (response.code === 200) { return response.data.deleted_count } @@ -112,7 +112,7 @@ export const useProxyStore = defineStore('proxy', () => { */ async function cleanInvalidProxies() { try { - const response = await proxiesAPI.cleanInvalidProxies() + const response = await proxyService.cleanInvalid() if (response.code === 200) { return response.data.deleted_count } @@ -130,7 +130,7 @@ export const useProxyStore = defineStore('proxy', () => { */ async function exportProxies(format, protocol = null) { try { - const response = await proxiesAPI.exportProxies(format, protocol) + const response = await proxyService.export(format, protocol) // 创建下载链接 const url = window.URL.createObjectURL(new Blob([response])) diff --git a/frontend/src/views/Plugins.vue b/frontend/src/views/Plugins.vue index 6af21d8..efcfa4b 100644 --- a/frontend/src/views/Plugins.vue +++ b/frontend/src/views/Plugins.vue @@ -138,7 +138,7 @@ import { Box } from '@element-plus/icons-vue' import { usePluginsStore } from '../stores/plugins' -import { pluginsAPI } from '../api' +import { pluginService } from '../services/pluginService' import { formatTime } from '../utils/format' import 
PageHeader from '../components/PageHeader.vue' @@ -168,7 +168,7 @@ async function handleCrawl(pluginId) { crawlingPlugin.value = pluginId lastCrawlResult.value = null - const response = await pluginsAPI.crawlPlugin(pluginId) + const response = await pluginService.crawlPlugin(pluginId) if (response.code === 200) { lastCrawlResult.value = { @@ -216,7 +216,7 @@ async function handleCrawlAll() { crawlingAll.value = true lastCrawlResult.value = null - const response = await pluginsAPI.crawlAll() + const response = await pluginService.crawlAll() if (response.code === 200) { lastCrawlResult.value = { diff --git a/frontend/src/views/Settings.vue b/frontend/src/views/Settings.vue index f71382d..3b33987 100644 --- a/frontend/src/views/Settings.vue +++ b/frontend/src/views/Settings.vue @@ -190,7 +190,8 @@ import { VideoPause, Refresh } from '@element-plus/icons-vue' -import { settingsAPI, schedulerAPI } from '../api' +import { settingService } from '../services/settingService' +import { schedulerService } from '../services/schedulerService' import PageHeader from '../components/PageHeader.vue' // ==================== 状态 ==================== @@ -237,7 +238,7 @@ const formRules = { async function fetchSettings() { loading.value = true try { - const response = await settingsAPI.getSettings() + const response = await settingService.getSettings() if (response.code === 200) { Object.assign(settings, response.data) } @@ -251,7 +252,7 @@ async function fetchSettings() { async function fetchSchedulerStatus() { try { - const response = await schedulerAPI.getStatus() + const response = await schedulerService.getStatus() if (response.code === 200) { schedulerRunning.value = response.data.running } @@ -264,7 +265,7 @@ async function fetchSchedulerStatus() { async function handleStartScheduler() { schedulerLoading.value = true try { - const response = await schedulerAPI.start() + const response = await schedulerService.start() if (response.code === 200) { schedulerRunning.value = true ElMessage.success('自动验证已启动') @@ -282,7 +283,7 @@ async function handleStartScheduler() { async function handleStopScheduler() { schedulerLoading.value = true try { - const response = await schedulerAPI.stop() + const response = await schedulerService.stop() if (response.code === 200) { schedulerRunning.value = false ElMessage.success('自动验证已停止') @@ -310,7 +311,7 @@ async function handleValidateNow() { ) validating.value = true - const response = await schedulerAPI.validateNow() + const response = await schedulerService.validateNow() if (response.code === 200) { ElMessage.success('全量验证已启动,请在日志中查看进度') } else { @@ -333,7 +334,7 @@ async function handleSave() { saving.value = true try { - const response = await settingsAPI.saveSettings(settings) + const response = await settingService.saveSettings(settings) if (response.code === 200) { ElMessage.success('配置保存成功') diff --git a/main.py b/main.py new file mode 100644 index 0000000..f2381bb --- /dev/null +++ b/main.py @@ -0,0 +1,9 @@ +"""项目入口""" +import uvicorn +from api import create_app +from core.config import settings + +app = create_app() + +if __name__ == "__main__": + uvicorn.run("main:app", host=settings.host, port=settings.port, reload=False) diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000..7302f2b --- /dev/null +++ b/models/__init__.py @@ -0,0 +1,13 @@ +from .domain import ProxyRaw, Proxy, PluginInfo +from .schemas import ProxyCreate, ProxyResponse, PluginResponse, SettingsSchema, CrawlResult + +__all__ = [ + "ProxyRaw", + "Proxy", + 
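`main.py` defers everything to `create_app()`, which this diff does not show. A plausible shape, assuming the lifespan-based wiring described in the design doc; `build_validator`/`build_proxy_repo` are hypothetical factories, since the new validator and repository modules are not part of this diff:

```python
from contextlib import asynccontextmanager

from fastapi import FastAPI

from core.db import init_db, get_db
from core.tasks import ValidationQueue

def build_validator(): ...      # hypothetical factory, not in this diff
def build_proxy_repo(): ...     # hypothetical factory, not in this diff

@asynccontextmanager
async def lifespan(app: FastAPI):
    await init_db()
    queue = ValidationQueue(build_validator(), build_proxy_repo(), get_db)
    await queue.start()
    app.state.queue = queue      # handlers reach it via request.app.state
    yield
    await queue.stop()

def create_app() -> FastAPI:
    app = FastAPI(lifespan=lifespan)
    # app.include_router(...) for apiv1/proxies, plugins, scheduler, settings
    return app
```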
"PluginInfo", + "ProxyCreate", + "ProxyResponse", + "PluginResponse", + "SettingsSchema", + "CrawlResult", +] diff --git a/models/domain.py b/models/domain.py new file mode 100644 index 0000000..b390b7e --- /dev/null +++ b/models/domain.py @@ -0,0 +1,42 @@ +"""领域模型 - 纯数据结构,不依赖任何框架""" +from dataclasses import dataclass, field +from datetime import datetime +from typing import Optional + + +@dataclass +class ProxyRaw: + """爬虫爬取的原始代理数据""" + ip: str + port: int + protocol: str = "http" + + def __post_init__(self): + self.protocol = self.protocol.lower().strip() + if self.protocol not in ("http", "https", "socks4", "socks5"): + self.protocol = "http" + + +@dataclass +class Proxy: + """数据库中的代理实体""" + ip: str + port: int + protocol: str + score: int + response_time_ms: Optional[float] = None + last_check: Optional[datetime] = None + created_at: Optional[datetime] = None + + +@dataclass +class PluginInfo: + """插件元数据""" + id: str + name: str + display_name: str + description: str + enabled: bool + last_run: Optional[datetime] = None + success_count: int = 0 + failure_count: int = 0 diff --git a/models/schemas.py b/models/schemas.py new file mode 100644 index 0000000..142ee8a --- /dev/null +++ b/models/schemas.py @@ -0,0 +1,105 @@ +"""Pydantic 模型 - 用于 API 请求/响应校验""" +from pydantic import BaseModel, Field, field_validator +from typing import Optional, List + + +class ProxyCreate(BaseModel): + ip: str + port: int = Field(ge=1, le=65535) + protocol: str = "http" + score: int = Field(default=10, ge=0, le=100) + + @field_validator("protocol") + @classmethod + def validate_protocol(cls, v: str): + v = v.lower().strip() + if v not in ("http", "https", "socks4", "socks5"): + raise ValueError("protocol must be http, https, socks4 or socks5") + return v + + +class ProxyResponse(BaseModel): + ip: str + port: int + protocol: str + score: int + last_check: Optional[str] = None + + +class PluginResponse(BaseModel): + id: str + name: str + display_name: str + description: str + enabled: bool + last_run: Optional[str] = None + success_count: int = 0 + failure_count: int = 0 + + +class SettingsSchema(BaseModel): + crawl_timeout: int = Field(default=30, ge=5, le=120) + validation_timeout: int = Field(default=10, ge=3, le=60) + max_retries: int = Field(default=3, ge=0, le=10) + default_concurrency: int = Field(default=50, ge=10, le=200) + min_proxy_score: int = Field(default=0, ge=0, le=100) + proxy_expiry_days: int = Field(default=7, ge=1, le=30) + auto_validate: bool = True + validate_interval_minutes: int = Field(default=30, ge=5, le=1440) + + +class CrawlResult(BaseModel): + plugin_id: str + proxy_count: int + valid_count: int + invalid_count: int = 0 + + +class ProxyListRequest(BaseModel): + page: int = Field(default=1, ge=1) + page_size: int = Field(default=20, ge=1, le=100) + protocol: Optional[str] = None + min_score: int = Field(default=0, ge=0) + max_score: Optional[int] = Field(default=None, ge=0) + sort_by: str = "last_check" + sort_order: str = "DESC" + + @field_validator("protocol") + @classmethod + def validate_protocol(cls, v): + if v is not None and v.lower() not in ("http", "https", "socks4", "socks5"): + raise ValueError("协议类型必须是 http, https, socks4 或 socks5") + return v.lower() if v else v + + @field_validator("sort_by") + @classmethod + def validate_sort_by(cls, v): + if v not in ("ip", "port", "protocol", "score", "last_check"): + raise ValueError("排序字段必须是 ip, port, protocol, score 或 last_check") + return v + + @field_validator("sort_order") + @classmethod + def validate_sort_order(cls, v): + if 
v.upper() not in ("ASC", "DESC"): + raise ValueError("排序方式必须是 ASC 或 DESC") + return v.upper() + + +class ProxyDeleteItem(BaseModel): + ip: str + port: int = Field(ge=1, le=65535) + + +class BatchDeleteRequest(BaseModel): + proxies: List[ProxyDeleteItem] = Field(max_length=1000) + + +class PluginToggleRequest(BaseModel): + enabled: bool + + +class ExportRequest(BaseModel): + format: str = Field(pattern=r"^(csv|txt|json)$") + protocol: Optional[str] = None + limit: int = Field(default=10000, ge=1, le=100000) diff --git a/plugins/__init__.py b/plugins/__init__.py new file mode 100644 index 0000000..118ec24 --- /dev/null +++ b/plugins/__init__.py @@ -0,0 +1,19 @@ +"""插件包 - 在这里显式注册所有爬虫插件""" +from core.plugin_system import registry + +from .fate0 import Fate0Plugin +from .proxylist_download import ProxyListDownloadPlugin +from .ip3366 import Ip3366Plugin +from .ip89 import Ip89Plugin +from .kuaidaili import KuaiDaiLiPlugin +from .speedx import SpeedXPlugin +from .yundaili import YunDaiLiPlugin + +# 显式注册所有插件 +registry.register(Fate0Plugin) +registry.register(ProxyListDownloadPlugin) +registry.register(Ip3366Plugin) +registry.register(Ip89Plugin) +registry.register(KuaiDaiLiPlugin) +registry.register(SpeedXPlugin) +registry.register(YunDaiLiPlugin) diff --git a/plugins/base.py b/plugins/base.py new file mode 100644 index 0000000..eede0b1 --- /dev/null +++ b/plugins/base.py @@ -0,0 +1,52 @@ +"""通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装""" +import random +import asyncio +import aiohttp +from typing import List +from core.plugin_system import BaseCrawlerPlugin + + +class BaseHTTPPlugin(BaseCrawlerPlugin): + """基于 HTTP 的爬虫插件基类""" + + def __init__(self): + super().__init__() + self.user_agents = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0", + ] + self.urls: List[str] = [] + self.current_url: str = "" + + def get_headers(self) -> dict: + return { + "User-Agent": random.choice(self.user_agents), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2", + "Connection": "keep-alive", + } + + async def fetch(self, url: str, timeout: float = 10.0, retries: int = 3) -> str: + """异步抓取指定 URL 的 HTML 内容""" + headers = self.get_headers() + async with aiohttp.ClientSession(headers=headers) as session: + for attempt in range(retries): + try: + async with session.get( + url, timeout=aiohttp.ClientTimeout(total=timeout) + ) as response: + if response.status == 200: + content = await response.read() + encoding = response.get_encoding() + if encoding == "utf-8" or not encoding: + try: + return content.decode("utf-8") + except UnicodeDecodeError: + return content.decode("gbk", errors="ignore") + return content.decode(encoding, errors="ignore") + except Exception: + pass + await asyncio.sleep(random.uniform(1, 3)) + return "" diff --git a/plugins/fate0.py b/plugins/fate0.py index d551971..cb2cc6d 100644 --- a/plugins/fate0.py +++ b/plugins/fate0.py @@ -1,66 +1,38 @@ -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from core.crawler import BasePlugin -from core.log import logger 
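All seven rewritten plugins below follow the same shape on top of `BaseHTTPPlugin`: loop over `self.urls`, `fetch()`, parse, append `ProxyRaw`. A brand-new source therefore looks like the sketch below (the URL and plugin name are hypothetical), plus one `registry.register(...)` line in `plugins/__init__.py`:

```python
from typing import List

from core.plugin_system import ProxyRaw, registry
from plugins.base import BaseHTTPPlugin

@registry.register
class ExamplePlugin(BaseHTTPPlugin):
    name = "example_txt"
    display_name = "Example ip:port 列表"
    description = "Hypothetical plain-text ip:port source"

    def __init__(self):
        super().__init__()
        self.urls = ["https://example.com/proxies.txt"]   # placeholder URL

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        for url in self.urls:
            text = await self.fetch(url, timeout=15)      # retries + UA rotation
            for line in text.splitlines():
                ip, _, port = line.strip().partition(":")
                if ip and port.isdigit():
                    results.append(ProxyRaw(ip, int(port), "http"))
        return results
```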
import json -import asyncio +from typing import List +from core.plugin_system import ProxyRaw +from plugins.base import BaseHTTPPlugin +from core.log import logger + + +class Fate0Plugin(BaseHTTPPlugin): + name = "fate0" + display_name = "Fate0聚合源" + description = "从 GitHub 持续更新的高质量代理聚合列表" -class Fate0Plugin(BasePlugin): def __init__(self): super().__init__() - self.name = "Fate0聚合源" - # 这是一个持续更新的高质量代理聚合列表 self.urls = ["https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list"] - async def parse(self, html): - if not html: - return - - count = 0 - # fate0 的数据格式是每行一个 JSON 对象 - for line in html.split('\n'): - if not line.strip(): + async def crawl(self) -> List[ProxyRaw]: + results = [] + for url in self.urls: + html = await self.fetch(url, timeout=30) + if not html: continue - try: - data = json.loads(line) - ip = data.get('host') - port = data.get('port') - protocol = data.get('type', 'http') - - # 协议标准化 - protocol = protocol.lower().strip() - if protocol not in ('http', 'https', 'socks4', 'socks5'): - protocol = 'http' - - if ip and port: - yield ip, int(port), protocol - count += 1 - except Exception: - continue - - if count > 0: - logger.info(f"{self.name} 解析完成,获得 {count} 个潜在代理") - - -if __name__ == "__main__": - async def test_plugin(): - plugin = Fate0Plugin() - print(f"========== 测试 {plugin.name} ==========") - print(f"目标URL数量: {len(plugin.urls)}") - print(f"开始抓取...\n") - - proxies = await plugin.run() - - print(f"\n========== 抓取结果 ==========") - print(f"总计获取 {len(proxies)} 个代理:") - print("-" * 60) - - for idx, (ip, port, protocol) in enumerate(proxies, 1): - print(f"{idx:3d}. {ip:15s} : {str(port):5s} | {protocol}") - - print("-" * 60) - print(f"完成!共 {len(proxies)} 个代理~") - - asyncio.run(test_plugin()) + for line in html.split("\n"): + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + ip = data.get("host") + port = data.get("port") + protocol = data.get("type", "http") + if ip and port: + results.append(ProxyRaw(ip, int(port), protocol)) + except Exception: + continue + if results: + logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理") + return results diff --git a/plugins/ip3366.py b/plugins/ip3366.py index 70c277e..cc21ff1 100644 --- a/plugins/ip3366.py +++ b/plugins/ip3366.py @@ -1,74 +1,51 @@ -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from core.crawler import BasePlugin -from core.log import logger -from bs4 import BeautifulSoup import re -import asyncio +from typing import List +from bs4 import BeautifulSoup +from core.plugin_system import ProxyRaw +from plugins.base import BaseHTTPPlugin +from core.log import logger -VALID_PROTOCOLS = ['http', 'https', 'socks4', 'socks5'] +VALID_PROTOCOLS = ("http", "https", "socks4", "socks5") + + +class Ip3366Plugin(BaseHTTPPlugin): + name = "ip3366" + display_name = "IP3366" + description = "从 IP3366 网站爬取免费代理" -class Ip3366Plugin(BasePlugin): def __init__(self): super().__init__() - self.name = "IP3366" - # 抓取高匿和普通代理的前 5 页 self.urls = [ f"http://www.ip3366.net/free/?stype=1&page={i}" for i in range(1, 6) ] + [ f"http://www.ip3366.net/free/?stype=2&page={i}" for i in range(1, 6) ] - async def parse(self, html): - if not html: - return - - soup = BeautifulSoup(html, 'lxml') - list_div = soup.find('div', id='list') - if not list_div: return - - table = list_div.find('table') - if not table: return + async def crawl(self) -> List[ProxyRaw]: + results = [] + for url in self.urls: + html = await self.fetch(url, timeout=15) + if not 
html: + continue + soup = BeautifulSoup(html, "lxml") + list_div = soup.find("div", id="list") + if not list_div: + continue + table = list_div.find("table") + if not table: + continue - rows = table.find_all('tr') - count = 0 - for row in rows: - tds = row.find_all('td') - if len(tds) >= 5: - ip = tds[0].get_text(strip=True) - port = tds[1].get_text(strip=True) - protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else 'http' - - if protocol not in VALID_PROTOCOLS: - protocol = 'http' - - if re.match(r'^\d+\.\d+\.\d+\.\d+$', ip) and port.isdigit(): - yield ip, int(port), protocol - count += 1 - - if count > 0: - logger.info(f"{self.name} 解析完成,获得 {count} 个潜在代理") + for row in table.find_all("tr"): + tds = row.find_all("td") + if len(tds) >= 5: + ip = tds[0].get_text(strip=True) + port = tds[1].get_text(strip=True) + protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else "http" + if protocol not in VALID_PROTOCOLS: + protocol = "http" + if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit(): + results.append(ProxyRaw(ip, int(port), protocol)) - -if __name__ == "__main__": - async def test_plugin(): - plugin = Ip3366Plugin() - print(f"========== 测试 {plugin.name} ==========") - print(f"目标URL数量: {len(plugin.urls)}") - print(f"开始抓取...\n") - - proxies = await plugin.run() - - print(f"\n========== 抓取结果 ==========") - print(f"总计获取 {len(proxies)} 个代理:") - print("-" * 60) - - for idx, (ip, port, protocol) in enumerate(proxies, 1): - print(f"{idx:3d}. {ip:15s} : {str(port):5s} | {protocol}") - - print("-" * 60) - print(f"完成!共 {len(proxies)} 个代理~") - - asyncio.run(test_plugin()) + if results: + logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理") + return results diff --git a/plugins/ip89.py b/plugins/ip89.py index 7038bd1..a5bbc70 100644 --- a/plugins/ip89.py +++ b/plugins/ip89.py @@ -1,69 +1,39 @@ -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from core.crawler import BasePlugin -from core.log import logger -from bs4 import BeautifulSoup import re -import asyncio +from typing import List +from bs4 import BeautifulSoup +from core.plugin_system import ProxyRaw +from plugins.base import BaseHTTPPlugin +from core.log import logger + + +class Ip89Plugin(BaseHTTPPlugin): + name = "ip89" + display_name = "89免费代理" + description = "从 89ip.cn 爬取免费代理" -class Ip89Plugin(BasePlugin): def __init__(self): super().__init__() - self.name = "89免费代理" - # 抓取前 5 页 - self.urls = [ - f"https://www.89ip.cn/index_{i}.html" for i in range(1, 6) - ] + self.urls = [f"https://www.89ip.cn/index_{i}.html" for i in range(1, 6)] - async def parse(self, html): - """ - 解析 89ip 页面 - """ - if not html: - return - - soup = BeautifulSoup(html, 'lxml') - table = soup.find('table', class_='layui-table') - if not table: - return + async def crawl(self) -> List[ProxyRaw]: + results = [] + for url in self.urls: + html = await self.fetch(url, timeout=15) + if not html: + continue + soup = BeautifulSoup(html, "lxml") + table = soup.find("table", class_="layui-table") + if not table: + continue - rows = table.find_all('tr') - count = 0 - for row in rows: - tds = row.find_all('td') - if len(tds) >= 2: - ip = tds[0].get_text(strip=True) - port = tds[1].get_text(strip=True) - # 89ip 通常不直接写协议,默认尝试 http - protocol = 'http' - - if re.match(r'^\d+\.\d+\.\d+\.\d+$', ip) and port.isdigit(): - yield ip, int(port), protocol - count += 1 - - if count > 0: - logger.info(f"{self.name} 解析完成,获得 {count} 个潜在代理") + for row in table.find_all("tr"): + tds = 
row.find_all("td") + if len(tds) >= 2: + ip = tds[0].get_text(strip=True) + port = tds[1].get_text(strip=True) + if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit(): + results.append(ProxyRaw(ip, int(port), "http")) - -if __name__ == "__main__": - async def test_plugin(): - plugin = Ip89Plugin() - print(f"========== 测试 {plugin.name} ==========") - print(f"目标URL数量: {len(plugin.urls)}") - print(f"开始抓取...\n") - - proxies = await plugin.run() - - print(f"\n========== 抓取结果 ==========") - print(f"总计获取 {len(proxies)} 个代理:") - print("-" * 60) - - for idx, (ip, port, protocol) in enumerate(proxies, 1): - print(f"{idx:3d}. {ip:15s} : {str(port):5s} | {protocol}") - - print("-" * 60) - print(f"完成!共 {len(proxies)} 个代理~") - - asyncio.run(test_plugin()) + if results: + logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理") + return results diff --git a/plugins/kuaidaili.py b/plugins/kuaidaili.py index fc88292..2c2fbba 100644 --- a/plugins/kuaidaili.py +++ b/plugins/kuaidaili.py @@ -1,79 +1,49 @@ -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from core.crawler import BasePlugin -from core.log import logger -from bs4 import BeautifulSoup import re -import asyncio +from typing import List +from bs4 import BeautifulSoup +from core.plugin_system import ProxyRaw +from plugins.base import BaseHTTPPlugin +from core.log import logger -VALID_PROTOCOLS = ['http', 'https', 'socks4', 'socks5'] +VALID_PROTOCOLS = ("http", "https", "socks4", "socks5") + + +class KuaiDaiLiPlugin(BaseHTTPPlugin): + name = "kuaidaili" + display_name = "快代理" + description = "从快代理网站爬取免费代理" -class KuaiDaiLiPlugin(BasePlugin): def __init__(self): super().__init__() - self.name = "快代理" - # 抓取国内高匿和国内普通代理的前 10 页 self.urls = [ f"https://www.kuaidaili.com/free/inha/{i}/" for i in range(1, 11) ] + [ f"https://www.kuaidaili.com/free/intr/{i}/" for i in range(1, 11) ] - async def parse(self, html): - """ - 解析快代理页面 - """ - if not html: - return - - soup = BeautifulSoup(html, 'lxml') - # 快代理的表格在 tbody 中 - table = soup.find('table') - if not table: - # 尝试通过正则表达式匹配可能被加密或特殊处理的数据 - logger.warning(f"{self.name} 未能找到表格,可能是触发了反爬或结构变化") - return + async def crawl(self) -> List[ProxyRaw]: + results = [] + for url in self.urls: + html = await self.fetch(url, timeout=15) + if not html: + continue + soup = BeautifulSoup(html, "lxml") + table = soup.find("table") + if not table: + logger.warning(f"{self.display_name} 未能找到表格,可能是触发了反爬") + continue - rows = table.find_all('tr') - count = 0 - for row in rows: - tds = row.find_all('td') - if len(tds) >= 5: - ip = tds[0].get_text(strip=True) - port = tds[1].get_text(strip=True) - protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else 'http' - - if protocol not in VALID_PROTOCOLS: - protocol = 'http' - - # 简单校验格式 - if re.match(r'^\d+\.\d+\.\d+\.\d+$', ip) and port.isdigit(): - yield ip, int(port), protocol - count += 1 - - if count > 0: - logger.info(f"{self.name} 解析完成,获得 {count} 个潜在代理") + for row in table.find_all("tr"): + tds = row.find_all("td") + if len(tds) >= 5: + ip = tds[0].get_text(strip=True) + port = tds[1].get_text(strip=True) + protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else "http" + if protocol not in VALID_PROTOCOLS: + protocol = "http" + if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit(): + results.append(ProxyRaw(ip, int(port), protocol)) - -if __name__ == "__main__": - async def test_plugin(): - plugin = KuaiDaiLiPlugin() - print(f"========== 测试 {plugin.name} ==========") - 
print(f"目标URL数量: {len(plugin.urls)}") - print(f"开始抓取...\n") - - proxies = await plugin.run() - - print(f"\n========== 抓取结果 ==========") - print(f"总计获取 {len(proxies)} 个代理:") - print("-" * 60) - - for idx, (ip, port, protocol) in enumerate(proxies, 1): - print(f"{idx:3d}. {ip:15s} : {str(port):5s} | {protocol}") - - print("-" * 60) - print(f"完成!共 {len(proxies)} 个代理~") - - asyncio.run(test_plugin()) + if results: + logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理") + return results diff --git a/plugins/proxylist_download.py b/plugins/proxylist_download.py index 7a0f579..0456c84 100644 --- a/plugins/proxylist_download.py +++ b/plugins/proxylist_download.py @@ -1,75 +1,55 @@ -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from core.crawler import BasePlugin +from typing import List +from core.plugin_system import ProxyRaw +from plugins.base import BaseHTTPPlugin from core.log import logger -import asyncio -class ProxyListDownloadPlugin(BasePlugin): + +class ProxyListDownloadPlugin(BaseHTTPPlugin): + name = "proxylist_download" + display_name = "ProxyListDownload" + description = "从 ProxyListDownload API 获取代理" + def __init__(self): super().__init__() - self.name = "ProxyListDownload" self.urls = [ "https://www.proxy-list.download/api/v1/get?type=http", "https://www.proxy-list.download/api/v1/get?type=https", "https://www.proxy-list.download/api/v1/get?type=socks4", - "https://www.proxy-list.download/api/v1/get?type=socks5" + "https://www.proxy-list.download/api/v1/get?type=socks5", ] - async def parse(self, html): - if not html: - return - - lines = html.split('\r\n') - if len(lines) <= 1: - lines = html.split('\n') - - count = 0 - # 根据 URL 判断协议类型 - if 'type=socks4' in self.current_url: - protocol = 'socks4' - elif 'type=socks5' in self.current_url: - protocol = 'socks5' - elif 'type=https' in self.current_url: - protocol = 'https' - else: - protocol = 'http' - - for line in lines: - line = line.strip() - if not line: + async def crawl(self) -> List[ProxyRaw]: + results = [] + for url in self.urls: + html = await self.fetch(url, timeout=30) + if not html: continue - - if ':' in line: - parts = line.split(':') + + # 根据 URL 判断协议 + if "type=socks4" in url: + protocol = "socks4" + elif "type=socks5" in url: + protocol = "socks5" + elif "type=https" in url: + protocol = "https" + else: + protocol = "http" + + lines = html.split("\r\n") + if len(lines) <= 1: + lines = html.split("\n") + + for line in lines: + line = line.strip() + if not line or ":" not in line: + continue + parts = line.split(":") if len(parts) >= 2: - ip = parts[0] - port = parts[1] - yield ip, int(port), protocol - count += 1 - - if count > 0: - logger.info(f"{self.name} 解析完成,从 {self.current_url} 获得 {count} 个潜在代理") + ip = parts[0].strip() + port = parts[1].strip() + if ip and port.isdigit(): + results.append(ProxyRaw(ip, int(port), protocol)) - -if __name__ == "__main__": - async def test_plugin(): - plugin = ProxyListDownloadPlugin() - print(f"========== 测试 {plugin.name} ==========") - print(f"目标URL数量: {len(plugin.urls)}") - print(f"开始抓取...\n") - - proxies = await plugin.run() - - print(f"\n========== 抓取结果 ==========") - print(f"总计获取 {len(proxies)} 个代理:") - print("-" * 60) - - for idx, (ip, port, protocol) in enumerate(proxies, 1): - print(f"{idx:3d}. 
{ip:15s} : {str(port):5s} | {protocol}") - - print("-" * 60) - print(f"完成!共 {len(proxies)} 个代理~") - - asyncio.run(test_plugin()) + if results: + logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理") + return results diff --git a/plugins/speedx.py b/plugins/speedx.py index 5088c84..5fc897d 100644 --- a/plugins/speedx.py +++ b/plugins/speedx.py @@ -1,78 +1,51 @@ -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from core.crawler import BasePlugin -from core.log import logger import re -import asyncio +from typing import List +from core.plugin_system import ProxyRaw +from plugins.base import BaseHTTPPlugin +from core.log import logger + + +class SpeedXPlugin(BaseHTTPPlugin): + name = "speedx" + display_name = "SpeedX代理源" + description = "从 SpeedX GitHub 仓库获取 SOCKS 代理列表" -class SpeedXPlugin(BasePlugin): def __init__(self): super().__init__() - self.name = "SpeedX代理源" self.urls = [ "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt", "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt", - "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt" + "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt", ] - async def parse(self, html): - if not html: - return - - lines = html.split('\n') - count = 0 - for line in lines: - line = line.strip() - if not line: + async def crawl(self) -> List[ProxyRaw]: + results = [] + for url in self.urls: + html = await self.fetch(url, timeout=30) + if not html: continue - if ':' in line: - parts = line.split(':') + # 根据 URL 判断协议 + protocol = "http" + if "socks5" in url: + protocol = "socks5" + elif "socks4" in url: + protocol = "socks4" + + for line in html.split("\n"): + line = line.strip() + if not line or ":" not in line: + continue + parts = line.split(":") if len(parts) >= 2: ip = parts[0].strip() port = parts[1].strip() - - # 验证IP地址格式 - if not re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', ip): + if not re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip): continue - - # 验证端口是数字 if not port.isdigit() or not (1 <= int(port) <= 65535): continue + results.append(ProxyRaw(ip, int(port), protocol)) - # 根据 URL 判断协议 - protocol = 'http' - if 'socks5' in self.current_url: - protocol = 'socks5' - elif 'socks4' in self.current_url: - protocol = 'socks4' - - yield ip, int(port), protocol - count += 1 - - if count > 0: - logger.info(f"{self.name} 解析完成,从 {self.current_url} 获得 {count} 个潜在代理") - - -if __name__ == "__main__": - async def test_plugin(): - plugin = SpeedXPlugin() - print(f"========== 测试 {plugin.name} ==========") - print(f"目标URL数量: {len(plugin.urls)}") - print(f"开始抓取...\n") - - proxies = await plugin.run() - - print(f"\n========== 抓取结果 ==========") - print(f"总计获取 {len(proxies)} 个代理:") - print("-" * 60) - - for idx, (ip, port, protocol) in enumerate(proxies, 1): - print(f"{idx:3d}. 
{ip:15s} : {str(port):5s} | {protocol}") - - print("-" * 60) - print(f"完成!共 {len(proxies)} 个代理~") - - asyncio.run(test_plugin()) + if results: + logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理") + return results diff --git a/plugins/yundaili.py b/plugins/yundaili.py index f472c98..2e5435e 100644 --- a/plugins/yundaili.py +++ b/plugins/yundaili.py @@ -1,79 +1,51 @@ -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from core.crawler import BasePlugin -from core.log import logger -from bs4 import BeautifulSoup import re -import asyncio +from typing import List +from bs4 import BeautifulSoup +from core.plugin_system import ProxyRaw +from plugins.base import BaseHTTPPlugin +from core.log import logger -VALID_PROTOCOLS = ['http', 'https', 'socks4', 'socks5'] +VALID_PROTOCOLS = ("http", "https", "socks4", "socks5") + + +class YunDaiLiPlugin(BaseHTTPPlugin): + name = "yundaili" + display_name = "云代理" + description = "从云代理网站爬取免费代理" -class YunDaiLiPlugin(BasePlugin): def __init__(self): super().__init__() - self.name = "云代理" - # 抓取高匿和普通代理的前 5 页 self.urls = [ f"http://www.ip3366.net/free/?stype=1&page={i}" for i in range(1, 6) ] + [ f"http://www.ip3366.net/free/?stype=2&page={i}" for i in range(1, 6) ] - async def parse(self, html): - """ - 解析云代理/IP3366 页面 (两者结构相似) - """ - if not html: - return - - soup = BeautifulSoup(html, 'lxml') - list_table = soup.find('div', id='list') - if not list_table: - return - - table = list_table.find('table') - if not table: - return + async def crawl(self) -> List[ProxyRaw]: + results = [] + for url in self.urls: + html = await self.fetch(url, timeout=15) + if not html: + continue + soup = BeautifulSoup(html, "lxml") + list_table = soup.find("div", id="list") + if not list_table: + continue + table = list_table.find("table") + if not table: + continue - rows = table.find_all('tr') - count = 0 - for row in rows: - tds = row.find_all('td') - if len(tds) >= 5: - ip = tds[0].get_text(strip=True) - port = tds[1].get_text(strip=True) - protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else 'http' - - if protocol not in VALID_PROTOCOLS: - protocol = 'http' - - if re.match(r'^\d+\.\d+\.\d+\.\d+$', ip) and port.isdigit(): - yield ip, int(port), protocol - count += 1 - - if count > 0: - logger.info(f"{self.name} 解析完成,获得 {count} 个潜在代理") + for row in table.find_all("tr"): + tds = row.find_all("td") + if len(tds) >= 5: + ip = tds[0].get_text(strip=True) + port = tds[1].get_text(strip=True) + protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else "http" + if protocol not in VALID_PROTOCOLS: + protocol = "http" + if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit(): + results.append(ProxyRaw(ip, int(port), protocol)) - -if __name__ == "__main__": - async def test_plugin(): - plugin = YunDaiLiPlugin() - print(f"========== 测试 {plugin.name} ==========") - print(f"目标URL数量: {len(plugin.urls)}") - print(f"开始抓取...\n") - - proxies = await plugin.run() - - print(f"\n========== 抓取结果 ==========") - print(f"总计获取 {len(proxies)} 个代理:") - print("-" * 60) - - for idx, (ip, port, protocol) in enumerate(proxies, 1): - print(f"{idx:3d}. 
{ip:15s} : {str(port):5s} | {protocol}")
-
-        print("-" * 60)
-        print(f"完成!共 {len(proxies)} 个代理~")
-
-    asyncio.run(test_plugin())
+        if results:
+            logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
+        return results
diff --git a/repositories/__init__.py b/repositories/__init__.py
new file mode 100644
index 0000000..7b6a893
--- /dev/null
+++ b/repositories/__init__.py
@@ -0,0 +1,3 @@
+from .proxy_repo import ProxyRepository
+
+__all__ = ["ProxyRepository"]
diff --git a/repositories/proxy_repo.py b/repositories/proxy_repo.py
new file mode 100644
index 0000000..1d3299b
--- /dev/null
+++ b/repositories/proxy_repo.py
@@ -0,0 +1,303 @@
+"""代理数据访问层 - 所有 SQL 操作收敛于此"""
+import aiosqlite
+from datetime import datetime
+from typing import List, Optional, Tuple, Union
+from models.domain import Proxy
+from core.log import logger
+
+
+VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")
+
+
+def _to_datetime(value: Union[str, datetime, None]) -> Optional[datetime]:
+    if value is None:
+        return None
+    if isinstance(value, datetime):
+        return value
+    if isinstance(value, str):
+        for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f"):
+            try:
+                return datetime.strptime(value, fmt)
+            except ValueError:
+                continue
+    return None
+
+
+class ProxyRepository:
+    """代理 Repository"""
+
+    @staticmethod
+    async def insert_or_update(
+        db: aiosqlite.Connection,
+        ip: str,
+        port: int,
+        protocol: str = "http",
+        score: int = 10,
+    ) -> bool:
+        if protocol not in VALID_PROTOCOLS:
+            protocol = "http"
+        try:
+            await db.execute(
+                """
+                INSERT INTO proxies (ip, port, protocol, score, last_check, created_at)
+                VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
+                ON CONFLICT(ip, port) DO UPDATE SET
+                    protocol = excluded.protocol,
+                    score = excluded.score,
+                    last_check = CURRENT_TIMESTAMP
+                """,
+                (ip, port, protocol, score),
+            )
+            await db.commit()
+            return True
+        except Exception as e:
+            logger.error(f"insert_or_update proxy failed: {e}")
+            return False
+
+    @staticmethod
+    async def update_score(
+        db: aiosqlite.Connection,
+        ip: str,
+        port: int,
+        delta: int,
+        min_score: int = 0,
+        max_score: int = 100,
+    ) -> bool:
+        try:
+            async with db.execute(
+                "SELECT score FROM proxies WHERE ip = ? AND port = ?", (ip, port)
+            ) as cursor:
+                row = await cursor.fetchone()
+                if not row:
+                    return False
+                current_score = row[0]
+            new_score = max(min_score, min(max_score, current_score + delta))
+            await db.execute(
+                "UPDATE proxies SET score = ?, last_check = CURRENT_TIMESTAMP WHERE ip = ? AND port = ?",
+                (new_score, ip, port),
+            )
+            if new_score <= 0:
+                # 降到 0 分即删除,且只删除当前这条代理,不波及其他记录
+                await db.execute(
+                    "DELETE FROM proxies WHERE ip = ? AND port = ?", (ip, port)
+                )
+            await db.commit()
+            return True
+        except Exception as e:
+            logger.error(f"update_score failed: {e}")
+            return False
+
+    @staticmethod
+    async def update_response_time(
+        db: aiosqlite.Connection,
+        ip: str,
+        port: int,
+        response_time_ms: float,
+    ) -> bool:
+        try:
+            await db.execute(
+                "UPDATE proxies SET response_time_ms = ? WHERE ip = ? AND port = ?",
+                (response_time_ms, ip, port),
+            )
+            await db.commit()
+            return True
+        except Exception as e:
+            logger.error(f"update_response_time failed: {e}")
+            return False
+
+    @staticmethod
+    async def delete(db: aiosqlite.Connection, ip: str, port: int) -> None:
+        await db.execute("DELETE FROM proxies WHERE ip = ? AND port = ?", (ip, port))
AND port = ?", (ip, port)) + await db.commit() + + @staticmethod + async def batch_delete(db: aiosqlite.Connection, proxies: List[Tuple[str, int]]) -> int: + if not proxies: + return 0 + await db.executemany("DELETE FROM proxies WHERE ip = ? AND port = ?", proxies) + await db.commit() + return len(proxies) + + @staticmethod + async def get_by_ip_port( + db: aiosqlite.Connection, ip: str, port: int + ) -> Optional[Proxy]: + async with db.execute( + "SELECT ip, port, protocol, score, response_time_ms, last_check, created_at FROM proxies WHERE ip = ? AND port = ?", + (ip, port), + ) as cursor: + row = await cursor.fetchone() + if row: + return Proxy( + ip=row[0], + port=row[1], + protocol=row[2], + score=row[3], + response_time_ms=row[4], + last_check=_to_datetime(row[5]), + created_at=_to_datetime(row[6]), + ) + return None + + @staticmethod + async def get_random(db: aiosqlite.Connection) -> Optional[Proxy]: + async with db.execute( + "SELECT ip, port, protocol, score, response_time_ms, last_check, created_at FROM proxies WHERE score > 0 ORDER BY RANDOM() LIMIT 1" + ) as cursor: + row = await cursor.fetchone() + if row: + return Proxy( + ip=row[0], + port=row[1], + protocol=row[2], + score=row[3], + response_time_ms=row[4], + last_check=_to_datetime(row[5]), + created_at=_to_datetime(row[6]), + ) + return None + + @staticmethod + async def list_all( + db: aiosqlite.Connection, + protocol: Optional[str] = None, + limit: int = 100000, + ) -> List[Proxy]: + query = "SELECT ip, port, protocol, score, response_time_ms, last_check, created_at FROM proxies" + params: List = [] + if protocol: + query += " WHERE protocol = ?" + params.append(protocol.lower()) + query += " LIMIT ?" + params.append(limit) + + async with db.execute(query, params) as cursor: + rows = await cursor.fetchall() + return [ + Proxy( + ip=row[0], + port=row[1], + protocol=row[2], + score=row[3], + response_time_ms=row[4], + last_check=_to_datetime(row[5]), + created_at=_to_datetime(row[6]), + ) + for row in rows + ] + + @staticmethod + async def list_paginated( + db: aiosqlite.Connection, + page: int = 1, + page_size: int = 20, + protocol: Optional[str] = None, + min_score: int = 0, + max_score: Optional[int] = None, + sort_by: str = "last_check", + sort_order: str = "DESC", + ) -> Tuple[List[Proxy], int]: + conditions = ["score >= ?"] + params: List = [min_score] + + if protocol: + conditions.append("protocol = ?") + params.append(protocol) + if max_score is not None: + conditions.append("score <= ?") + params.append(max_score) + + where_clause = " AND ".join(conditions) + order_clause = f"{sort_by} {sort_order}" + offset = (page - 1) * page_size + + count_query = f"SELECT COUNT(*) FROM proxies WHERE {where_clause}" + async with db.execute(count_query, list(params)) as cursor: + row = await cursor.fetchone() + total = row[0] if row else 0 + + data_query = f""" + SELECT ip, port, protocol, score, response_time_ms, last_check, created_at + FROM proxies + WHERE {where_clause} + ORDER BY {order_clause} + LIMIT ? OFFSET ? 
+ """ + params.extend([page_size, offset]) + async with db.execute(data_query, params) as cursor: + rows = await cursor.fetchall() + proxies = [ + Proxy( + ip=row[0], + port=row[1], + protocol=row[2], + score=row[3], + response_time_ms=row[4], + last_check=_to_datetime(row[5]), + created_at=_to_datetime(row[6]), + ) + for row in rows + ] + return proxies, total + + @staticmethod + async def get_stats(db: aiosqlite.Connection) -> dict: + query = """ + SELECT + COUNT(*) as total, + COUNT(CASE WHEN score > 0 THEN 1 END) as available, + AVG(score) as avg_score, + COUNT(CASE WHEN protocol = 'http' THEN 1 END) as http_count, + COUNT(CASE WHEN protocol = 'https' THEN 1 END) as https_count, + COUNT(CASE WHEN protocol = 'socks4' THEN 1 END) as socks4_count, + COUNT(CASE WHEN protocol = 'socks5' THEN 1 END) as socks5_count + FROM proxies + """ + async with db.execute(query) as cursor: + row = await cursor.fetchone() + if row: + return { + "total": row[0] or 0, + "available": row[1] or 0, + "avg_score": round(row[2], 2) if row[2] else 0, + "http_count": row[3] or 0, + "https_count": row[4] or 0, + "socks4_count": row[5] or 0, + "socks5_count": row[6] or 0, + } + return { + "total": 0, + "available": 0, + "avg_score": 0, + "http_count": 0, + "https_count": 0, + "socks4_count": 0, + "socks5_count": 0, + } + + @staticmethod + async def get_today_new_count(db: aiosqlite.Connection) -> int: + try: + async with db.execute( + "SELECT COUNT(*) FROM proxies WHERE DATE(last_check) = DATE('now', 'localtime')" + ) as cursor: + row = await cursor.fetchone() + return row[0] if row else 0 + except Exception as e: + logger.error(f"get_today_new_count failed: {e}") + return 0 + + @staticmethod + async def clean_invalid(db: aiosqlite.Connection) -> int: + await db.execute("DELETE FROM proxies WHERE score <= 0") + await db.commit() + return db.total_changes + + @staticmethod + async def clean_expired(db: aiosqlite.Connection, days: int) -> int: + try: + await db.execute( + "DELETE FROM proxies WHERE last_check < datetime('now', '-{} days')".format(days) + ) + await db.commit() + return db.total_changes + except Exception as e: + logger.error(f"clean_expired failed: {e}") + return 0 diff --git a/repositories/settings_repo.py b/repositories/settings_repo.py new file mode 100644 index 0000000..828241c --- /dev/null +++ b/repositories/settings_repo.py @@ -0,0 +1,102 @@ +"""设置数据访问层""" +import json +import aiosqlite +from typing import Optional, Dict, Any +from core.log import logger + + +DEFAULT_SETTINGS = { + "crawl_timeout": 30, + "validation_timeout": 10, + "max_retries": 3, + "default_concurrency": 50, + "min_proxy_score": 0, + "proxy_expiry_days": 7, + "auto_validate": True, + "validate_interval_minutes": 30, +} + + +class SettingsRepository: + """系统设置 Repository""" + + @staticmethod + async def get_all(db: aiosqlite.Connection) -> Dict[str, Any]: + settings = DEFAULT_SETTINGS.copy() + try: + async with db.execute("SELECT key, value FROM settings") as cursor: + rows = await cursor.fetchall() + for key, value in rows: + # 类型转换 + default = DEFAULT_SETTINGS.get(key) + if isinstance(default, bool): + settings[key] = value.lower() == "true" + elif isinstance(default, int): + settings[key] = int(value) + else: + settings[key] = value + except Exception as e: + logger.error(f"get_all settings failed: {e}") + return settings + + @staticmethod + async def save(db: aiosqlite.Connection, settings: Dict[str, Any]) -> bool: + try: + for key, value in settings.items(): + await db.execute( + """ + INSERT INTO settings (key, value, 
updated_at) + VALUES (?, ?, CURRENT_TIMESTAMP) + ON CONFLICT(key) DO UPDATE SET + value = excluded.value, + updated_at = CURRENT_TIMESTAMP + """, + (key, str(value)), + ) + await db.commit() + return True + except Exception as e: + logger.error(f"save settings failed: {e}") + return False + + +class PluginSettingsRepository: + """插件设置 Repository""" + + @staticmethod + async def get_enabled(db: aiosqlite.Connection, plugin_id: str) -> Optional[bool]: + async with db.execute( + "SELECT enabled FROM plugin_settings WHERE plugin_id = ?", (plugin_id,) + ) as cursor: + row = await cursor.fetchone() + if row: + return bool(row[0]) + return None + + @staticmethod + async def set_enabled(db: aiosqlite.Connection, plugin_id: str, enabled: bool) -> bool: + try: + await db.execute( + """ + INSERT INTO plugin_settings (plugin_id, enabled, created_at, updated_at) + VALUES (?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP) + ON CONFLICT(plugin_id) DO UPDATE SET + enabled = excluded.enabled, + updated_at = CURRENT_TIMESTAMP + """, + (plugin_id, int(enabled)), + ) + await db.commit() + return True + except Exception as e: + logger.error(f"set_enabled failed for {plugin_id}: {e}") + return False + + @staticmethod + async def list_all(db: aiosqlite.Connection) -> Dict[str, bool]: + result = {} + async with db.execute("SELECT plugin_id, enabled FROM plugin_settings") as cursor: + rows = await cursor.fetchall() + for plugin_id, enabled in rows: + result[plugin_id] = bool(enabled) + return result diff --git a/requirements.txt b/requirements.txt index b8d58ea..3d20dbe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ aiohttp==3.9.1 aiohttp-socks==0.9.1 beautifulsoup4==4.12.3 lxml==5.1.0 +pydantic-settings==2.8.1 diff --git a/services/plugin_service.py b/services/plugin_service.py new file mode 100644 index 0000000..606a020 --- /dev/null +++ b/services/plugin_service.py @@ -0,0 +1,111 @@ +"""插件业务服务""" +from datetime import datetime +from typing import List, Optional +from core.db import get_db +from core.plugin_system.registry import registry +from core.plugin_system.base import BaseCrawlerPlugin +from repositories.settings_repo import PluginSettingsRepository +from models.domain import PluginInfo, ProxyRaw +from core.log import logger + + +class PluginService: + """插件业务服务:管理插件生命周期、执行爬取""" + + def __init__(self): + self.plugin_settings_repo = PluginSettingsRepository() + self._stats: dict[str, dict] = {} + + async def list_plugins(self) -> List[PluginInfo]: + """获取所有插件信息(合并持久化状态)""" + async with get_db() as db: + db_states = await self.plugin_settings_repo.list_all(db) + + result = [] + for plugin in registry.list_plugins(): + # 如果有持久化状态,覆盖内存状态 + if plugin.name in db_states: + plugin.enabled = db_states[plugin.name] + + stat = self._stats.get(plugin.name, { + "success_count": 0, + "failure_count": 0, + "last_run": None, + }) + result.append(PluginInfo( + id=plugin.name, + name=plugin.name, + display_name=plugin.display_name or plugin.name, + description=plugin.description or f"从 {plugin.name} 爬取代理", + enabled=plugin.enabled, + last_run=stat.get("last_run"), + success_count=stat.get("success_count", 0), + failure_count=stat.get("failure_count", 0), + )) + return result + + async def toggle_plugin(self, plugin_id: str, enabled: bool) -> bool: + plugin = registry.get(plugin_id) + if not plugin: + return False + async with get_db() as db: + success = await self.plugin_settings_repo.set_enabled(db, plugin_id, enabled) + if success: + plugin.enabled = enabled + logger.info(f"Plugin {plugin_id} toggled to 
{enabled}") + return success + + def get_plugin(self, plugin_id: str) -> Optional[BaseCrawlerPlugin]: + return registry.get(plugin_id) + + async def run_plugin(self, plugin_id: str) -> List[ProxyRaw]: + """执行单个插件爬取""" + plugin = self.get_plugin(plugin_id) + if not plugin: + raise ValueError(f"Plugin {plugin_id} not found") + if not plugin.enabled: + logger.warning(f"Plugin {plugin_id} is disabled, skip crawl") + return [] + + try: + results = await plugin.crawl() + self._record_stat(plugin_id, success=len(results)) + logger.info(f"Plugin {plugin_id} crawled {len(results)} proxies") + return results + except Exception as e: + self._record_stat(plugin_id, failure=1) + logger.error(f"Plugin {plugin_id} crawl failed: {e}") + return [] + + async def run_all_plugins(self) -> List[ProxyRaw]: + """执行所有启用插件的爬取""" + all_results: List[ProxyRaw] = [] + for plugin in registry.list_plugins(): + if not plugin.enabled: + continue + try: + results = await self.run_plugin(plugin.name) + all_results.extend(results) + except Exception as e: + logger.error(f"Run all plugins error at {plugin.name}: {e}") + # 去重 + seen = set() + unique = [] + for p in all_results: + key = (p.ip, p.port, p.protocol) + if key not in seen: + seen.add(key) + unique.append(p) + return unique + + def _record_stat(self, plugin_id: str, success: int = 0, failure: int = 0): + if plugin_id not in self._stats: + self._stats[plugin_id] = { + "success_count": 0, + "failure_count": 0, + "last_run": None, + } + self._stats[plugin_id]["success_count"] += success + self._stats[plugin_id]["failure_count"] += failure + if success or failure: + self._stats[plugin_id]["last_run"] = datetime.now() diff --git a/services/proxy_service.py b/services/proxy_service.py new file mode 100644 index 0000000..6f002b0 --- /dev/null +++ b/services/proxy_service.py @@ -0,0 +1,93 @@ +"""代理业务服务""" +import csv +import json +import io +from datetime import datetime +from typing import List, Optional, Tuple, AsyncIterator +from core.db import get_db +from repositories.proxy_repo import ProxyRepository +from models.domain import Proxy +from core.log import logger + + +class ProxyService: + def __init__(self, proxy_repo: ProxyRepository = ProxyRepository()): + self.proxy_repo = proxy_repo + + async def get_stats(self) -> dict: + async with get_db() as db: + stats = await self.proxy_repo.get_stats(db) + stats["today_new"] = await self.proxy_repo.get_today_new_count(db) + return stats + + async def list_proxies( + self, + page: int = 1, + page_size: int = 20, + protocol: Optional[str] = None, + min_score: int = 0, + max_score: Optional[int] = None, + sort_by: str = "last_check", + sort_order: str = "DESC", + ) -> Tuple[List[Proxy], int]: + async with get_db() as db: + return await self.proxy_repo.list_paginated( + db, page, page_size, protocol, min_score, max_score, sort_by, sort_order + ) + + async def get_random_proxy(self) -> Optional[Proxy]: + async with get_db() as db: + return await self.proxy_repo.get_random(db) + + async def delete_proxy(self, ip: str, port: int) -> None: + async with get_db() as db: + await self.proxy_repo.delete(db, ip, port) + + async def batch_delete(self, proxies: List[Tuple[str, int]]) -> int: + async with get_db() as db: + return await self.proxy_repo.batch_delete(db, proxies) + + async def clean_invalid(self) -> int: + async with get_db() as db: + return await self.proxy_repo.clean_invalid(db) + + async def clean_expired(self, days: int) -> int: + async with get_db() as db: + return await self.proxy_repo.clean_expired(db, days) + + async 
def export_proxies(
+        self,
+        fmt: str,
+        protocol: Optional[str] = None,
+        limit: int = 10000,
+    ) -> AsyncIterator[str]:
+        async with get_db() as db:
+            proxies = await self.proxy_repo.list_all(db, protocol=protocol, limit=limit)
+
+        if fmt == "csv":
+            yield "IP,Port,Protocol,Score,Last Check\n"
+            for p in proxies:
+                yield f"{p.ip},{p.port},{p.protocol},{p.score},{self._fmt_time(p.last_check)}\n"
+        elif fmt == "txt":
+            for p in proxies:
+                yield f"{p.ip}:{p.port}\n"
+        elif fmt == "json":
+            data = [
+                {
+                    "ip": p.ip,
+                    "port": p.port,
+                    "protocol": p.protocol,
+                    "score": p.score,
+                    "last_check": self._fmt_time(p.last_check),
+                }
+                for p in proxies
+            ]
+            yield json.dumps(data, ensure_ascii=False, indent=2)
+        else:
+            # 未知格式直接抛错,避免静默产出空文件
+            raise ValueError(f"Unsupported export format: {fmt}")
+
+    @staticmethod
+    def _fmt_time(dt: Optional[datetime]) -> str:
+        if not dt:
+            return ""
+        if isinstance(dt, str):
+            return dt
+        return dt.isoformat()
diff --git a/services/scheduler_service.py b/services/scheduler_service.py
new file mode 100644
index 0000000..5d9e402
--- /dev/null
+++ b/services/scheduler_service.py
@@ -0,0 +1,88 @@
+"""调度器服务 - 定时验证存量代理"""
+import asyncio
+from core.db import get_db
+from repositories.proxy_repo import ProxyRepository
+from core.tasks.queue import ValidationQueue
+from core.log import logger
+
+
+class SchedulerService:
+    """代理验证调度器"""
+
+    def __init__(
+        self,
+        validation_queue: ValidationQueue,
+        proxy_repo: ProxyRepository = ProxyRepository(),
+    ):
+        self.validation_queue = validation_queue
+        self.proxy_repo = proxy_repo
+        self.interval_minutes = 30
+        self.running = False
+        self._task: asyncio.Task | None = None
+
+    async def start(self):
+        if self.running:
+            logger.warning("Scheduler already running")
+            return
+        self.running = True
+        await self.validation_queue.start()
+        self._task = asyncio.create_task(self._run_loop())
+        logger.info("Scheduler started")
+
+    async def stop(self):
+        self.running = False
+        if self._task:
+            self._task.cancel()
+            try:
+                await self._task
+            except asyncio.CancelledError:
+                pass
+            self._task = None
+        await self.validation_queue.stop()
+        logger.info("Scheduler stopped")
+
+    async def validate_all_now(self):
+        """立即执行一次全量验证(后台运行,不阻塞)"""
+        asyncio.create_task(self._do_validate_all())
+
+    async def _run_loop(self):
+        """定时循环"""
+        while self.running:
+            try:
+                await self._do_validate_all()
+            except Exception as e:
+                logger.error(f"Scheduler loop error: {e}")
+            # 等待下一次(按秒轮询 running 标志,便于快速停止)
+            for _ in range(self.interval_minutes * 60):
+                if not self.running:
+                    break
+                await asyncio.sleep(1)
+
+    async def _do_validate_all(self):
+        """验证数据库中所有存量代理"""
+        logger.info("Starting scheduled validation for all proxies")
+        # 先取出存量代理并释放连接,避免长时间验证期间占用 DB 连接
+        async with get_db() as db:
+            proxies = await self.proxy_repo.list_all(db)
+        if not proxies:
+            logger.info("No proxies to validate")
+            return
+
+        logger.info(f"Validating {len(proxies)} proxies from database")
+        from models.domain import ProxyRaw
+
+        # 批量提交到验证队列
+        batch_size = 100
+        for i in range(0, len(proxies), batch_size):
+            if not self.running:
+                break
+            batch = proxies[i : i + batch_size]
+            await self.validation_queue.submit([
+                ProxyRaw(p.ip, p.port, p.protocol) for p in batch
+            ])
+            # 等待当前批次处理完;drain() 假定为 ValidationQueue 对
+            # asyncio.Queue.join() 的封装,需在队列实现中一并提供
+            await self.validation_queue.drain()
+            logger.info(f"Validated batch {i//batch_size + 1}/{(len(proxies)-1)//batch_size + 1}")
+
+        logger.info("Scheduled validation completed")
diff --git a/services/settings_service.py b/services/settings_service.py
new file mode 100644
index 0000000..39da888
--- /dev/null
+++ b/services/settings_service.py
@@ -0,0 +1,19 @@
+"""系统设置业务服务""" +from typing import Any, Dict +from core.db import get_db +from repositories.settings_repo import SettingsRepository +from models.schemas import SettingsSchema + + +class SettingsService: + def __init__(self): + self.repo = SettingsRepository() + + async def get_settings(self) -> Dict[str, Any]: + async with get_db() as db: + return await self.repo.get_all(db) + + async def save_settings(self, data: SettingsSchema) -> bool: + settings_dict = data.model_dump() + async with get_db() as db: + return await self.repo.save(db, settings_dict) diff --git a/services/validator_service.py b/services/validator_service.py new file mode 100644 index 0000000..4b86137 --- /dev/null +++ b/services/validator_service.py @@ -0,0 +1,103 @@ +"""代理验证服务 - 支持 HTTP/HTTPS/SOCKS4/SOCKS5""" +import asyncio +import random +import time +import aiohttp +import aiohttp_socks +from typing import Tuple +from core.log import logger + + +class ValidatorService: + """代理验证器""" + + def __init__( + self, + timeout: float = 5.0, + connect_timeout: float = 3.0, + max_concurrency: int = 50, + ): + self.timeout = timeout + self.connect_timeout = connect_timeout + self.semaphore = asyncio.Semaphore(max_concurrency) + self.http_sources = [ + "http://httpbin.org/ip", + "http://api.ipify.org", + ] + self.https_sources = [ + "https://httpbin.org/ip", + "https://api.ipify.org", + ] + + def _get_test_url(self, protocol: str) -> str: + protocol = protocol.lower() + if protocol == "https": + return random.choice(self.https_sources) + return random.choice(self.http_sources) + + async def validate(self, ip: str, port: int, protocol: str = "http") -> Tuple[bool, float]: + """验证单个代理,返回 (是否有效, 延迟毫秒)""" + protocol = protocol.lower() + test_url = self._get_test_url(protocol) + + async with self.semaphore: + start = time.time() + try: + if protocol in ("socks4", "socks5"): + return await self._validate_socks(ip, port, protocol, test_url, start) + else: + return await self._validate_http(ip, port, protocol, test_url, start) + except asyncio.TimeoutError: + logger.debug(f"Validation timeout: {ip}:{port} ({protocol})") + return False, 0.0 + except Exception as e: + logger.debug(f"Validation error {ip}:{port} ({protocol}): {e}") + return False, 0.0 + + async def _validate_http( + self, ip: str, port: int, protocol: str, test_url: str, start: float + ) -> Tuple[bool, float]: + proxy_url = f"http://{ip}:{port}" + connector = aiohttp.TCPConnector(ssl=False, limit=0, force_close=True) + timeout = aiohttp.ClientTimeout(total=self.timeout, connect=self.connect_timeout) + + try: + async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: + async with session.get( + test_url, proxy=proxy_url, allow_redirects=True + ) as response: + if response.status in (200, 301, 302): + latency = round((time.time() - start) * 1000, 2) + logger.info(f"HTTP valid: {ip}:{port} ({protocol}) {latency}ms") + return True, latency + return False, 0.0 + finally: + await connector.close() + + async def _validate_socks( + self, ip: str, port: int, protocol: str, test_url: str, start: float + ) -> Tuple[bool, float]: + proxy_type = ( + aiohttp_socks.ProxyType.SOCKS4 + if protocol == "socks4" + else aiohttp_socks.ProxyType.SOCKS5 + ) + connector = aiohttp_socks.ProxyConnector( + proxy_type=proxy_type, + host=ip, + port=port, + rdns=True, + ssl=False, + ) + timeout = aiohttp.ClientTimeout(total=self.timeout, connect=self.connect_timeout) + + try: + async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: + async with 
session.get(test_url, allow_redirects=True) as response: + if response.status in (200, 301, 302): + latency = round((time.time() - start) * 1000, 2) + logger.info(f"SOCKS valid: {ip}:{port} ({protocol}) {latency}ms") + return True, latency + return False, 0.0 + finally: + await connector.close()
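
---

附:重构后验证链路的最小使用示意。仅为草图:假设项目根目录已在 `PYTHONPATH` 中、`proxies` 表已按上文结构初始化;`validate_and_save`、`proxypool.db` 及示例 IP 均为本示意虚构,并非补丁中的正式 API。线上场景中,这一步由 `ValidationQueue` 的 worker 在后台完成,插件 `crawl()` 的结果只需 `submit` 进队列即可。

```python
import asyncio

import aiosqlite

from repositories.proxy_repo import ProxyRepository
from services.validator_service import ValidatorService


async def validate_and_save(db_path: str, ip: str, port: int, protocol: str = "http") -> bool:
    """验证单个代理并把结果写回数据库(示意)。"""
    validator = ValidatorService(timeout=5.0, max_concurrency=1)
    ok, latency_ms = await validator.validate(ip, port, protocol)
    async with aiosqlite.connect(db_path) as db:
        if ok:
            # 有效:插入或刷新记录,并记录本次延迟
            await ProxyRepository.insert_or_update(db, ip, port, protocol)
            await ProxyRepository.update_response_time(db, ip, port, latency_ms)
        else:
            # 无效:扣分,降到 0 分即被删除
            await ProxyRepository.update_score(db, ip, port, delta=-5)
    return ok


if __name__ == "__main__":
    print("valid:", asyncio.run(validate_and_save("proxypool.db", "1.2.3.4", 8080, "http")))
```

导出侧同理:`ProxyService.export_proxies` 是异步生成器,可以直接流式写文件(同样是示意代码,`dump_txt` 为虚构的辅助函数):

```python
import asyncio

from services.proxy_service import ProxyService


async def dump_txt(path: str = "proxies.txt") -> None:
    service = ProxyService()
    with open(path, "w", encoding="utf-8") as f:
        # 逐块消费异步生成器,大量代理时无需一次性拼接整个文件内容
        async for chunk in service.export_proxies("txt"):
            f.write(chunk)


if __name__ == "__main__":
    asyncio.run(dump_txt())
```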