全面架构重构:建立分层架构与高度可扩展的插件系统

后端重构:
- 新增分层架构:API Routes -> Services -> Repositories -> Infrastructure
- 彻底移除全局单例,全面采用 FastAPI 依赖注入
- 新增 api/ 目录拆分路由(proxies, plugins, scheduler, settings, stats)
- 新增 services/ 业务逻辑层:ProxyService, PluginService, SchedulerService, ValidatorService, SettingsService
- 新增 repositories/ 数据访问层:ProxyRepository, SettingsRepository, PluginSettingsRepository
- 新增 models/ 层:Pydantic Schemas + Domain Models
- 重写 core/config.py:采用 Pydantic Settings 管理配置
- 新增 core/db.py:基于 asynccontextmanager 的连接管理,支持数据库迁移
- 新增 core/exceptions.py:统一业务异常体系

插件系统重构(核心):
- 新增 core/plugin_system/:BaseCrawlerPlugin + PluginRegistry
- 采用显式注册模式(装饰器 + plugins/__init__.py),类型安全、测试友好
- 新增 plugins/base.py:BaseHTTPPlugin 通用 HTTP 爬虫基类
- 迁移全部 7 个插件到新架构(fate0, proxylist_download, ip3366, ip89, kuaidaili, speedx, yundaili)
- 插件状态持久化到 plugin_settings 表

任务调度重构:
- 新增 core/tasks/queue.py:ValidationQueue + WorkerPool
- 解耦爬取与验证:爬虫只负责爬取,代理提交队列后由 Worker 异步验证
- 调度器定时从数据库拉取存量代理并分批投入验证队列

前端调整:
- 新增 frontend/src/services/ 层拆分 API 调用逻辑
- 调整 stores/ 和 views/ 使用 Service 层
- 保持 API 兼容性,页面无需大幅修改

其他:
- 新增 main.py 作为新入口
- 新增 DESIGN.md 架构设计文档
- 更新 requirements.txt 增加 pydantic-settings
This commit is contained in:
祀梦
2026-04-02 11:55:05 +08:00
parent a79f78b338
commit 209a744d94
56 changed files with 2891 additions and 2095 deletions

59
core/config.py Normal file
View File

@@ -0,0 +1,59 @@
"""全局配置 - 使用 Pydantic Settings 支持环境变量和 .env 文件"""
import os
from typing import List
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application configuration, loaded from environment variables / .env."""

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # --- database ---
    db_path: str = "db/proxies.sqlite"

    # --- API server ---
    host: str = "0.0.0.0"
    port: int = 9949

    # --- validator ---
    validator_timeout: int = 5
    validator_max_concurrency: int = 200
    validator_connect_timeout: int = 3

    # --- crawler ---
    crawler_num_validators: int = 50
    crawler_max_queue_size: int = 500

    # --- logging ---
    log_level: str = "INFO"
    log_dir: str = "logs"

    # --- export ---
    export_max_records: int = 10000

    # --- proxy scoring ---
    score_valid: int = 10
    score_invalid: int = -5
    score_min: int = 0
    score_max: int = 100

    # --- plugins ---
    plugins_dir: str = "plugins"

    # --- CORS ---
    cors_origins: str = "http://localhost:8080,http://localhost:5173"

    @property
    def cors_origins_list(self) -> List[str]:
        """Split the comma-separated CORS origins into a clean, non-empty list."""
        pieces = (part.strip() for part in self.cors_origins.split(","))
        return [part for part in pieces if part]

    @property
    def base_dir(self) -> str:
        """Absolute path of the project root (the parent of this package)."""
        here = os.path.abspath(__file__)
        return os.path.dirname(os.path.dirname(here))


# Global configuration instance (loaded once at import time).
settings = Settings()

View File

@@ -1,86 +0,0 @@
import aiohttp
import asyncio
import random
from core.log import logger
class BaseCrawler:
    """Async HTTP crawler base: rotating User-Agent headers plus a retrying fetch()."""

    def __init__(self):
        # Pool of desktop/mobile User-Agents rotated per request to look less bot-like.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1.2 Mobile/15E148 Safari/604.1"
        ]

    def get_headers(self):
        """Build request headers with a randomly chosen User-Agent."""
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
        }

    async def fetch(self, url, method='GET', params=None, data=None, proxies=None, timeout=10, retry_count=3):
        """Fetch *url* asynchronously with retries; return decoded text or None.

        Fix: the previous implementation built a hard-coded single User-Agent
        here, ignoring get_headers(); the rotating headers are now applied.

        Args:
            url: target URL.
            method: HTTP method, default GET.
            params/data: query parameters / request body passed to aiohttp.
            proxies: aiohttp proxy URL (format "http://user:pass@host:port",
                which differs from the requests library).
            timeout: total request timeout in seconds.
            retry_count: number of attempts before giving up.
        Returns:
            Decoded response body on HTTP 200, otherwise None.
        """
        headers = self.get_headers()
        async with aiohttp.ClientSession(headers=headers) as session:
            for attempt in range(retry_count):
                try:
                    async with session.request(
                        method=method,
                        url=url,
                        params=params,
                        data=data,
                        proxy=proxies,
                        timeout=aiohttp.ClientTimeout(total=timeout)
                    ) as response:
                        if response.status == 200:
                            # Read raw bytes first, then deal with encoding.
                            content = await response.read()
                            encoding = response.get_encoding()
                            if encoding == 'utf-8' or not encoding:
                                try:
                                    return content.decode('utf-8')
                                except UnicodeDecodeError:
                                    # Fall back to GBK, common on Chinese sites.
                                    return content.decode('gbk', errors='ignore')
                            return content.decode(encoding, errors='ignore')
                        logger.warning(f"请求失败 [{response.status}]: {url}, 正在进行第 {attempt+1} 次重试...")
                except Exception as e:
                    logger.error(f"请求异常: {url}, 错误: {e}, 正在进行第 {attempt+1} 次重试...")
                # Randomized back-off between attempts (also after non-200 responses).
                await asyncio.sleep(random.uniform(1, 3))
        return None
class BasePlugin(BaseCrawler):
    """Base class for crawler plugins: holds target URLs and collects parsed proxies."""

    def __init__(self):
        super().__init__()
        self.name = "BasePlugin"  # plugin identifier shown in logs
        self.urls = []            # pages this plugin will crawl
        self.enabled = True       # disabled plugins are skipped by the manager

    async def parse(self, html):
        """Parse page content and yield proxies; subclasses must override this."""
        raise NotImplementedError("Please implement parse method")

    async def run(self):
        """Fetch every configured URL, parse it, and return the collected proxies."""
        logger.info(f"正在运行插件: {self.name}")
        collected = []
        for target in self.urls:
            # Expose the URL currently being fetched so parse() can consult it.
            self.current_url = target
            page = await self.fetch(target)
            if page:
                async for item in self.parse(page):
                    collected.append(item)
            # Brief random pause between pages to stay polite.
            await asyncio.sleep(random.uniform(1, 2))
        return collected

95
core/db.py Normal file
View File

@@ -0,0 +1,95 @@
"""数据库连接管理 - 使用上下文管理器,避免全局单例连接泄漏"""
import os
import aiosqlite
from contextlib import asynccontextmanager
from typing import AsyncIterator
from core.config import settings
from core.log import logger
# Absolute path of the SQLite database file, derived from configuration.
DB_PATH = os.path.join(settings.base_dir, settings.db_path)


def ensure_db_dir():
    """Make sure the directory that will hold the database file exists.

    Fix: uses os.makedirs(..., exist_ok=True) directly instead of the
    race-prone exists()-then-create check.
    """
    db_dir = os.path.dirname(DB_PATH)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)
async def init_db():
    """Initialise the database schema: tables, indexes and lightweight migrations."""
    ensure_db_dir()
    async with aiosqlite.connect(DB_PATH) as db:
        # Performance pragmas: WAL allows concurrent readers alongside one
        # writer, NORMAL sync trades a little durability for speed, negative
        # cache_size is in KiB, temp tables live in memory.
        await db.execute("PRAGMA journal_mode=WAL")
        await db.execute("PRAGMA synchronous=NORMAL")
        await db.execute("PRAGMA cache_size=-64000")
        await db.execute("PRAGMA temp_store=MEMORY")
        await db.execute("""
            CREATE TABLE IF NOT EXISTS proxies (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ip TEXT NOT NULL,
                port INTEGER NOT NULL,
                protocol TEXT DEFAULT 'http',
                score INTEGER DEFAULT 10,
                response_time_ms REAL,
                last_check TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(ip, port)
            )
        """)
        # Migration: add response_time_ms when upgrading a legacy table
        # (probe with a SELECT; failure means the column is missing).
        try:
            await db.execute("SELECT response_time_ms FROM proxies LIMIT 1")
        except Exception:
            await db.execute("ALTER TABLE proxies ADD COLUMN response_time_ms REAL")
            logger.info("Migrated: added response_time_ms column")
        # Migration: add created_at and backfill existing rows.
        try:
            await db.execute("SELECT created_at FROM proxies LIMIT 1")
        except Exception:
            await db.execute("ALTER TABLE proxies ADD COLUMN created_at TIMESTAMP")
            await db.execute("UPDATE proxies SET created_at = CURRENT_TIMESTAMP WHERE created_at IS NULL")
            logger.info("Migrated: added created_at column")
        await db.execute("CREATE INDEX IF NOT EXISTS idx_score ON proxies(score)")
        await db.execute("CREATE INDEX IF NOT EXISTS idx_protocol ON proxies(protocol)")
        await db.execute("CREATE INDEX IF NOT EXISTS idx_last_check ON proxies(last_check)")
        await db.execute("CREATE INDEX IF NOT EXISTS idx_ip_port ON proxies(ip, port)")
        # Per-plugin enable/disable persistence.
        await db.execute("""
            CREATE TABLE IF NOT EXISTS plugin_settings (
                plugin_id TEXT PRIMARY KEY,
                enabled INTEGER DEFAULT 1,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        # Generic key/value store for system settings.
        await db.execute("""
            CREATE TABLE IF NOT EXISTS settings (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        await db.commit()
        logger.info("Database initialized")
@asynccontextmanager
async def get_db() -> AsyncIterator[aiosqlite.Connection]:
    """Async context manager yielding a fresh aiosqlite connection.

    A new connection is opened per use (no global singleton) and is
    guaranteed to be closed on exit, even if the body raises.
    """
    ensure_db_dir()
    conn = await aiosqlite.connect(DB_PATH)
    try:
        # Per-connection pragmas: WAL for concurrent readers, NORMAL sync
        # as a durability/speed trade-off.
        for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA synchronous=NORMAL"):
            await conn.execute(pragma)
        yield conn
    finally:
        await conn.close()

24
core/exceptions.py Normal file
View File

@@ -0,0 +1,24 @@
"""业务异常定义"""
class ProxyPoolException(Exception):
    """Root of the business exception hierarchy.

    Carries a human-readable message plus an HTTP-like status code that the
    API layer can map onto responses.
    """

    def __init__(self, message: str, code: int = 500):
        super().__init__(message)
        self.message = message
        self.code = code


class PluginNotFoundException(ProxyPoolException):
    """Raised when a plugin id does not match any registered plugin."""

    def __init__(self, plugin_id: str):
        super().__init__(f"Plugin '{plugin_id}' not found", 404)


class ProxyNotFoundException(ProxyPoolException):
    """Raised when the requested ip:port pair is absent from the pool."""

    def __init__(self, ip: str, port: int):
        super().__init__(f"Proxy {ip}:{port} not found", 404)


class ValidationException(ProxyPoolException):
    """Raised for invalid client input (maps to HTTP 400)."""

    def __init__(self, message: str):
        super().__init__(message, 400)

View File

@@ -1,125 +0,0 @@
import os
import importlib
import inspect
import asyncio
from typing import List, Dict, Optional
from core.crawler import BasePlugin
from core.log import logger
class PluginManager:
    """Loads crawler plugins from *plugin_dir* and runs them, tracking per-plugin stats.

    Stats layout: plugin_stats[name] = {'success_count', 'failure_count', 'last_run'}.
    """

    def __init__(self, plugin_dir='plugins'):
        self.plugin_dir = plugin_dir
        self.plugins = []        # instantiated, enabled-at-load plugin objects
        self.plugin_stats = {}   # per-plugin run counters
        self._load_plugins()
        self._init_stats()

    def _init_stats(self):
        """Seed a zeroed stats record for every loaded plugin."""
        for plugin in self.plugins:
            self.plugin_stats[plugin.name] = {
                'success_count': 0,
                'failure_count': 0,
                'last_run': None
            }

    def _load_plugins(self):
        """Import every module in the plugin directory and instantiate BasePlugin subclasses."""
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        full_plugin_path = os.path.join(base_dir, self.plugin_dir)
        if not os.path.exists(full_plugin_path):
            logger.error(f"插件目录不存在: {full_plugin_path}")
            return
        for filename in os.listdir(full_plugin_path):
            if filename.endswith('.py') and not filename.startswith('__'):
                module_name = f"{self.plugin_dir}.{filename[:-3]}"
                try:
                    module = importlib.import_module(module_name)
                    for name, obj in inspect.getmembers(module):
                        if inspect.isclass(obj) and issubclass(obj, BasePlugin) and obj is not BasePlugin:
                            plugin_instance = obj()
                            if plugin_instance.enabled:
                                logger.info(f"成功加载插件: {name} 来自 {module_name}")
                                self.plugins.append(plugin_instance)
                            else:
                                logger.info(f"插件已禁用,跳过加载: {name} 来自 {module_name}")
                except Exception as e:
                    logger.error(f"加载插件失败 {module_name}: {e}")

    def get_plugin_by_name(self, plugin_name: str) -> Optional["BasePlugin"]:
        """Return the loaded plugin with the given name, or None.

        (Forward-reference annotation so the class definition does not depend
        on BasePlugin being imported eagerly.)
        """
        for plugin in self.plugins:
            if plugin.name == plugin_name:
                return plugin
        return None

    def get_all_plugin_info(self) -> List[Dict]:
        """Describe every loaded plugin together with its run statistics."""
        plugins_info = []
        for plugin in self.plugins:
            stats = self.plugin_stats.get(plugin.name, {
                'success_count': 0,
                'failure_count': 0,
                'last_run': None
            })
            plugins_info.append({
                'id': plugin.name,
                'name': plugin.name,
                'enabled': plugin.enabled,
                'description': getattr(plugin, 'description', f'{plugin.name}网站爬取代理'),
                'last_run': stats['last_run'],
                'success_count': stats['success_count'],
                'failure_count': stats['failure_count']
            })
        return plugins_info

    def toggle_plugin(self, plugin_name: str, enabled: bool) -> bool:
        """Enable/disable a plugin in memory; returns True if the plugin exists."""
        plugin = self.get_plugin_by_name(plugin_name)
        if plugin:
            plugin.enabled = enabled
            logger.info(f"插件 {plugin_name}{'启用' if enabled else '禁用'}")
            return True
        return False

    def _record_run(self, name: str, success_inc: int, failure_inc: int):
        """Accumulate run counters and stamp last_run (hoists the duplicated datetime logic)."""
        from datetime import datetime
        stats = self.plugin_stats.setdefault(name, {
            'success_count': 0, 'failure_count': 0, 'last_run': None
        })
        stats['success_count'] += success_inc
        stats['failure_count'] += failure_inc
        stats['last_run'] = datetime.now().isoformat()

    async def run_plugin(self, plugin_name: str):
        """Run one plugin by name; returns its proxies ([] when missing/disabled/failing)."""
        plugin = self.get_plugin_by_name(plugin_name)
        if not plugin:
            logger.error(f"插件不存在: {plugin_name}")
            return []
        if not plugin.enabled:
            logger.warning(f"插件已禁用: {plugin_name}")
            return []
        try:
            results = await plugin.run()
            self._record_run(plugin.name, len(results), 0)
            logger.info(f"插件 {plugin_name} 执行完成,成功: {len(results)}")
            return results
        except Exception as e:
            logger.error(f"插件 {plugin_name} 执行失败: {e}")
            self._record_run(plugin.name, 0, 1)
            return []

    async def run_all(self):
        """Concurrently run all *enabled* plugins and yield every proxy produced.

        Fixes two defects of the previous implementation: plugins disabled via
        toggle_plugin() were still executed, and a single raising plugin
        aborted the whole gather() batch.
        """
        active = [p for p in self.plugins if p.enabled]
        results_list = await asyncio.gather(
            *(p.run() for p in active), return_exceptions=True
        )
        for plugin, results in zip(active, results_list):
            if isinstance(results, BaseException):
                logger.error(f"插件 {plugin.name} 执行失败: {results}")
                continue
            # Flatten each plugin's result list into a single stream.
            for proxy in results:
                yield proxy

View File

@@ -0,0 +1,4 @@
from .base import BaseCrawlerPlugin, ProxyRaw
from .registry import registry
__all__ = ["BaseCrawlerPlugin", "ProxyRaw", "registry"]

View File

@@ -0,0 +1,41 @@
"""插件基类 - 所有爬虫插件必须继承此基类"""
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List
@dataclass
class ProxyRaw:
    """Raw proxy record produced by a crawler, before any validation."""

    ip: str
    port: int
    protocol: str = "http"

    def __post_init__(self):
        # Normalise the protocol; anything unrecognised degrades to plain HTTP.
        normalised = self.protocol.strip().lower()
        known = ("http", "https", "socks4", "socks5")
        self.protocol = normalised if normalised in known else "http"


class BaseCrawlerPlugin(ABC):
    """Contract every crawler plugin implements.

    To add a new crawler:
      1. subclass BaseCrawlerPlugin,
      2. implement crawl() returning List[ProxyRaw],
      3. decorate with @registry.register (or register explicitly in __init__).
    """

    name: str = ""
    display_name: str = ""
    description: str = ""
    enabled: bool = True

    @abstractmethod
    async def crawl(self) -> List[ProxyRaw]:
        """Fetch proxies from the source. Crawl only — validation happens elsewhere."""
        raise NotImplementedError

    async def health_check(self) -> bool:
        """Optional liveness probe; defaults to healthy."""
        return True

View File

@@ -0,0 +1,77 @@
"""插件注册中心 - 显式注册,类型安全,测试友好"""
import importlib
import inspect
import os
from typing import Dict, List, Type, Optional
from core.plugin_system.base import BaseCrawlerPlugin
from core.log import logger
class PluginRegistry:
    """Central registry mapping plugin names to classes and lazily-built instances."""

    def __init__(self):
        self._plugins: Dict[str, Type[BaseCrawlerPlugin]] = {}    # name -> class
        self._instances: Dict[str, BaseCrawlerPlugin] = {}        # name -> instance

    def register(self, plugin_cls: Type[BaseCrawlerPlugin]) -> Type[BaseCrawlerPlugin]:
        """Register a plugin class; usable as a decorator (returns the class unchanged)."""
        is_plugin_class = inspect.isclass(plugin_cls) and issubclass(plugin_cls, BaseCrawlerPlugin)
        if not is_plugin_class:
            raise ValueError("Plugin must be a subclass of BaseCrawlerPlugin")
        if not plugin_cls.name:
            raise ValueError(f"Plugin {plugin_cls.__name__} must have a 'name' attribute")
        self._plugins[plugin_cls.name] = plugin_cls
        logger.info(f"Plugin registered: {plugin_cls.name} ({plugin_cls.__name__})")
        return plugin_cls

    def get(self, name: str) -> Optional[BaseCrawlerPlugin]:
        """Return the (lazily created) instance for *name*, or None if unregistered."""
        instance = self._instances.get(name)
        if instance is None:
            cls = self._plugins.get(name)
            if cls is not None:
                instance = cls()
                self._instances[name] = instance
        return instance

    def list_plugins(self) -> List[BaseCrawlerPlugin]:
        """Instances of every registered plugin, in registration order."""
        return [inst for inst in (self.get(name) for name in self._plugins) if inst]

    def get_plugin_names(self) -> List[str]:
        """All registered plugin names."""
        return list(self._plugins)

    def auto_discover(self, package_name: str):
        """Scan every module of *package_name* and register the plugin classes found.

        Explicit registration is preferred for type safety and testability;
        this exists only for compatibility.
        """
        try:
            package = importlib.import_module(package_name)
            package_dir = os.path.dirname(package.__file__)
        except Exception as e:
            logger.error(f"Auto discover failed for package {package_name}: {e}")
            return
        modules = [
            f[:-3] for f in os.listdir(package_dir)
            if f.endswith(".py") and not f.startswith("__")
        ]
        for stem in modules:
            module_name = f"{package_name}.{stem}"
            try:
                module = importlib.import_module(module_name)
                for attr_name in dir(module):
                    obj = getattr(module, attr_name)
                    if not inspect.isclass(obj):
                        continue
                    if not issubclass(obj, BaseCrawlerPlugin) or obj is BaseCrawlerPlugin:
                        continue
                    if obj in self._plugins.values():
                        continue  # already registered (e.g. via decorator)
                    self.register(obj)
            except Exception as e:
                # A failing module (import error or bad plugin) is skipped, not fatal.
                logger.error(f"Failed to load module {module_name}: {e}")


# Global registry instance.
registry = PluginRegistry()

View File

@@ -1,206 +0,0 @@
"""
代理验证调度器
负责定期验证数据库中的代理,并更新分数
"""
import asyncio
from datetime import datetime, timedelta
from typing import Optional
from core.sqlite import SQLiteManager
from core.validator import ProxyValidator
from core.log import logger
from config import config
class ValidationScheduler:
    """Periodically re-validates every stored proxy and adjusts its score."""

    def __init__(self):
        self.db = SQLiteManager()
        self.validator: Optional[ProxyValidator] = None   # created on start()
        self.running = False
        self.task: Optional[asyncio.Task] = None          # background loop task
        self.interval_minutes = 30  # default: validate every 30 minutes
        self.batch_size = 100       # proxies validated per batch

    async def start(self):
        """Start the background validation loop (no-op if already running)."""
        if self.running:
            logger.warning("验证调度器已在运行")
            return
        self.running = True
        self.validator = ProxyValidator(
            max_concurrency=config.VALIDATOR_MAX_CONCURRENCY,
            timeout=config.VALIDATOR_TIMEOUT
        )
        self.task = asyncio.create_task(self._run_loop())
        logger.info("代理验证调度器已启动")

    async def stop(self):
        """Cancel the loop task and close the validator's session."""
        self.running = False
        if self.task:
            self.task.cancel()
            try:
                await self.task
            except asyncio.CancelledError:
                pass
        if self.validator:
            # NOTE(review): __aexit__ is invoked directly here while
            # validate_all_proxies also enters `async with self.validator` —
            # the session may be closed twice; confirm ProxyValidator tolerates it.
            await self.validator.__aexit__(None, None, None)
        logger.info("代理验证调度器已停止")

    async def _run_loop(self):
        """Main loop: validate everything, then sleep until the next cycle."""
        while self.running:
            try:
                await self.validate_all_proxies()
            except Exception as e:
                logger.error(f"验证循环出错: {e}")
            # Wait for the next validation cycle.
            await asyncio.sleep(self.interval_minutes * 60)

    async def validate_all_proxies(self):
        """Validate every proxy in the database, in batches of batch_size."""
        logger.info("开始批量验证代理...")
        try:
            # Load every stored proxy row.
            proxies = await self.db.get_all_proxies()
            if not proxies:
                logger.info("数据库中没有代理需要验证")
                return
            logger.info(f"需要验证 {len(proxies)} 个代理")
            # Validate in batches.
            validated_count = 0
            valid_count = 0
            invalid_count = 0
            async with self.validator:
                for i in range(0, len(proxies), self.batch_size):
                    if not self.running:
                        break
                    batch = proxies[i:i + self.batch_size]
                    tasks = []
                    for proxy in batch:
                        # Row layout matches SQLiteManager.get_all_proxies.
                        ip, port, protocol, score, last_check = proxy
                        task = self._validate_and_update(ip, port, protocol)
                        tasks.append(task)
                    # Validate one batch concurrently.
                    results = await asyncio.gather(*tasks, return_exceptions=True)
                    for result in results:
                        validated_count += 1
                        if isinstance(result, Exception):
                            logger.error(f"验证过程出错: {result}")
                            continue
                        if result:
                            valid_count += 1
                        else:
                            invalid_count += 1
                    logger.info(f"已验证 {validated_count}/{len(proxies)} 个代理")
                    # Small pause between batches to avoid overload.
                    if i + self.batch_size < len(proxies):
                        await asyncio.sleep(1)
            logger.info(f"验证完成: 总计 {validated_count}, 有效 {valid_count}, 无效 {invalid_count}")
        except Exception as e:
            logger.error(f"批量验证代理失败: {e}", exc_info=True)

    async def _validate_and_update(self, ip: str, port: int, protocol: str) -> bool:
        """Validate one proxy and apply the score delta; returns its validity."""
        try:
            is_valid, latency = await self.validator.validate(ip, port, protocol)
            if is_valid:
                # Success: raise the score (clamped to [SCORE_MIN, SCORE_MAX]).
                await self.db.update_score(
                    ip, port,
                    config.SCORE_VALID,
                    min_score=config.SCORE_MIN,
                    max_score=config.SCORE_MAX
                )
                logger.debug(f"代理验证成功 {ip}:{port} ({protocol}) - 延迟 {latency}ms")
                return True
            else:
                # Failure: lower the score.
                await self.db.update_score(
                    ip, port,
                    config.SCORE_INVALID,
                    min_score=config.SCORE_MIN,
                    max_score=config.SCORE_MAX
                )
                logger.debug(f"代理验证失败 {ip}:{port} ({protocol})")
                return False
        except Exception as e:
            logger.error(f"验证代理 {ip}:{port} 时出错: {e}")
            # Errors count as failures, too.
            await self.db.update_score(
                ip, port,
                config.SCORE_INVALID,
                min_score=config.SCORE_MIN,
                max_score=config.SCORE_MAX
            )
            return False

    async def validate_proxies_batch(self, proxies: list) -> tuple:
        """Validate a batch of freshly crawled proxies.

        Args:
            proxies: [(ip, port, protocol), ...]
        Returns:
            (list of valid proxies, list of invalid proxies)
        """
        if not proxies:
            return [], []
        valid_proxies = []
        invalid_proxies = []
        logger.info(f"开始验证 {len(proxies)} 个新抓取代理...")
        try:
            validator = ProxyValidator(
                max_concurrency=min(config.VALIDATOR_MAX_CONCURRENCY, 50),
                timeout=config.VALIDATOR_TIMEOUT
            )
            async with validator:
                tasks = []
                for ip, port, protocol in proxies:
                    # NOTE(review): these are bare coroutines awaited one at a
                    # time below, so this path runs sequentially despite the
                    # validator's internal semaphore — confirm if intended.
                    task = validator.validate(ip, port, protocol)
                    tasks.append((ip, port, protocol, task))
                for ip, port, protocol, task in tasks:
                    try:
                        is_valid, latency = await task
                        if is_valid:
                            valid_proxies.append((ip, port, protocol))
                            logger.debug(f"新代理有效: {ip}:{port} ({protocol}) - {latency}ms")
                        else:
                            invalid_proxies.append((ip, port, protocol))
                    except Exception as e:
                        logger.warning(f"验证新代理 {ip}:{port} 失败: {e}")
                        invalid_proxies.append((ip, port, protocol))
            logger.info(f"新代理验证完成: 有效 {len(valid_proxies)}, 无效 {len(invalid_proxies)}")
        except Exception as e:
            logger.error(f"批量验证新代理失败: {e}")
        return valid_proxies, invalid_proxies


# Global scheduler instance.
scheduler = ValidationScheduler()

View File

@@ -1,331 +0,0 @@
import aiosqlite
import os
import asyncio
from core.log import logger
# Protocols accepted by the proxies table; anything else falls back to 'http'.
VALID_PROTOCOLS = ['http', 'https', 'socks4', 'socks5']


class SQLiteManager:
    """Singleton async wrapper around one shared aiosqlite connection."""

    # Singleton state shared across all instantiations.
    _instance = None
    _connection = None
    _lock = asyncio.Lock()

    def __new__(cls, *args, **kwargs):
        # Classic singleton: every SQLiteManager() call yields the same object.
        if cls._instance is None:
            cls._instance = super(SQLiteManager, cls).__new__(cls)
        return cls._instance

    def __init__(self, db_path=None):
        # Guard against re-running __init__ on the shared singleton.
        if hasattr(self, 'initialized') and self.initialized:
            return
        if db_path is None:
            # Default location: <project root>/db/proxies.sqlite.
            base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            db_dir = os.path.join(base_dir, 'db')
            if not os.path.exists(db_dir):
                os.makedirs(db_dir)
            self.db_path = os.path.join(db_dir, 'proxies.sqlite')
        else:
            self.db_path = db_path
        self.initialized = True

    async def get_connection(self):
        """Lazily open (once) and return the shared connection, applying pragmas."""
        async with self._lock:
            if self._connection is None:
                self._connection = await aiosqlite.connect(self.db_path)
                # WAL + NORMAL sync; negative cache_size is KiB; temp in memory.
                await self._connection.execute("PRAGMA journal_mode=WAL")
                await self._connection.execute("PRAGMA synchronous=NORMAL")
                await self._connection.execute("PRAGMA cache_size=-64000")
                await self._connection.execute("PRAGMA temp_store=MEMORY")
            return self._connection

    async def close_connection(self):
        """Close and forget the shared connection (next get_connection reopens it)."""
        async with self._lock:
            if self._connection is not None:
                await self._connection.close()
                self._connection = None

    async def init_db(self):
        """Create the proxies table and its indexes if they do not exist."""
        db = await self.get_connection()
        await db.execute('''
            CREATE TABLE IF NOT EXISTS proxies (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ip TEXT NOT NULL,
                port INTEGER NOT NULL,
                protocol TEXT DEFAULT 'http',
                score INTEGER DEFAULT 10,
                last_check TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(ip, port)
            )
        ''')
        await db.execute('CREATE INDEX IF NOT EXISTS idx_score ON proxies(score)')
        await db.execute('CREATE INDEX IF NOT EXISTS idx_protocol ON proxies(protocol)')
        await db.execute('CREATE INDEX IF NOT EXISTS idx_last_check ON proxies(last_check)')
        await db.execute('CREATE INDEX IF NOT EXISTS idx_ip_port ON proxies(ip, port)')
        await db.commit()

    async def insert_proxy(self, ip, port, protocol='http', score=10):
        """Insert a proxy, or refresh score/protocol/last_check if it exists."""
        try:
            # Reject unknown protocols, falling back to http.
            # NOTE(review): protocol is reassigned *before* logging, so the
            # warning always prints 'http' rather than the rejected value.
            if protocol not in VALID_PROTOCOLS:
                protocol = 'http'
                logger.warning(f"无效的协议类型 {protocol},默认使用 http")
            db = await self.get_connection()
            # Check whether the row already exists.
            async with db.execute('SELECT score FROM proxies WHERE ip = ? AND port = ?', (ip, port)) as cursor:
                row = await cursor.fetchone()
            if row:
                # Existing proxy: refresh last_check, score and protocol.
                await db.execute('''
                    UPDATE proxies SET last_check = CURRENT_TIMESTAMP, score = ?, protocol = ? WHERE ip = ? AND port = ?
                ''', (score, protocol, ip, port))
            else:
                # New proxy: insert a fresh record.
                await db.execute('''
                    INSERT INTO proxies (ip, port, protocol, score, last_check)
                    VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
                ''', (ip, port, protocol, score))
            await db.commit()
            return True
        except aiosqlite.IntegrityError as e:
            # UNIQUE(ip, port) collision: fall back to an UPDATE.
            if "UNIQUE" in str(e):
                if protocol not in VALID_PROTOCOLS:
                    protocol = 'http'
                db = await self.get_connection()
                await db.execute('''
                    UPDATE proxies SET last_check = CURRENT_TIMESTAMP, score = ?, protocol = ? WHERE ip = ? AND port = ?
                ''', (score, protocol, ip, port))
                await db.commit()
                return True
            else:
                logger.error(f"数据库完整性错误: {e}")
                return False
        except Exception as e:
            logger.error(f"插入代理失败 {ip}:{port} - {e}")
            return False

    async def get_all_proxies(self):
        """Return every proxy row as (ip, port, protocol, score, last_check)."""
        db = await self.get_connection()
        async with db.execute('SELECT ip, port, protocol, score, last_check FROM proxies') as cursor:
            return await cursor.fetchall()

    async def get_random_proxy(self):
        """Return one random proxy with a positive score, or None."""
        db = await self.get_connection()
        async with db.execute('SELECT ip, port, protocol, score, last_check FROM proxies WHERE score > 0 ORDER BY RANDOM() LIMIT 1') as cursor:
            return await cursor.fetchone()

    async def update_score(self, ip, port, delta, min_score=0, max_score=100):
        """Apply a score delta, clamped to [min_score, max_score]; True if the row exists."""
        try:
            db = await self.get_connection()
            # Read the current score first.
            async with db.execute('SELECT score FROM proxies WHERE ip = ? AND port = ?', (ip, port)) as cursor:
                row = await cursor.fetchone()
            if row:
                current_score = row[0]
                new_score = max(min_score, min(max_score, current_score + delta))
                await db.execute('''
                    UPDATE proxies SET score = ?, last_check = CURRENT_TIMESTAMP WHERE ip = ? AND port = ?
                ''', (new_score, ip, port))
                if new_score <= 0:
                    # NOTE(review): this removes EVERY proxy at score <= 0, not
                    # only (ip, port) — looks like opportunistic cleanup; confirm.
                    await db.execute('DELETE FROM proxies WHERE score <= 0')
                await db.commit()
                return True
            return False
        except Exception as e:
            logger.error(f"更新代理分数失败 {ip}:{port} - {e}")
            return False

    async def delete_proxy(self, ip, port):
        """Delete the given proxy row, if present."""
        db = await self.get_connection()
        await db.execute('DELETE FROM proxies WHERE ip = ? AND port = ?', (ip, port))
        await db.commit()

    async def count_proxies(self):
        """Return the total number of proxy rows."""
        db = await self.get_connection()
        async with db.execute('SELECT COUNT(*) FROM proxies') as cursor:
            row = await cursor.fetchone()
            return row[0] if row else 0

    async def get_proxies_paginated_with_total(self, page: int = 1, page_size: int = 20,
                                               protocol: str = None, min_score: int = 0,
                                               max_score: int = None,
                                               sort_by: str = 'last_check',
                                               sort_order: str = 'DESC'):
        """Paginated proxy listing; one query returns both rows and total count.

        NOTE(review): sort_by/sort_order are interpolated into the SQL string —
        callers must whitelist these values (injection risk if user-controlled).
        """
        db = await self.get_connection()
        conditions = ['score >= ?']
        params = [min_score]
        if protocol:
            conditions.append('protocol = ?')
            params.append(protocol)
        if max_score is not None:
            conditions.append('score <= ?')
            params.append(max_score)
        where_clause = ' AND '.join(conditions)
        order_by_clause = f'{sort_by} {sort_order}'
        offset = (page - 1) * page_size
        # COUNT(*) OVER() attaches the total match count to every row.
        query = f'''
            SELECT ip, port, protocol, score, last_check,
                   COUNT(*) OVER() as total_count
            FROM proxies
            WHERE {where_clause}
            ORDER BY {order_by_clause}
            LIMIT ? OFFSET ?
        '''
        params.extend([page_size, offset])
        async with db.execute(query, params) as cursor:
            rows = await cursor.fetchall()
            total = rows[0][5] if rows else 0
            proxies = [(row[0], row[1], row[2], row[3], row[4]) for row in rows]
            return proxies, total

    async def get_proxies_paginated(self, page: int = 1, page_size: int = 20,
                                    protocol: str = None, min_score: int = 0,
                                    max_score: int = None,
                                    sort_by: str = 'last_check',
                                    sort_order: str = 'DESC'):
        """Paginated proxy listing (rows only; see *_with_total for the count).

        NOTE(review): same ORDER BY interpolation caveat as above.
        """
        db = await self.get_connection()
        conditions = ['score >= ?']
        params = [min_score]
        if protocol:
            conditions.append('protocol = ?')
            params.append(protocol)
        if max_score is not None:
            conditions.append('score <= ?')
            params.append(max_score)
        where_clause = ' AND '.join(conditions)
        order_by_clause = f'{sort_by} {sort_order}'
        offset = (page - 1) * page_size
        query = f'''
            SELECT ip, port, protocol, score, last_check
            FROM proxies
            WHERE {where_clause}
            ORDER BY {order_by_clause}
            LIMIT ? OFFSET ?
        '''
        params.extend([page_size, offset])
        async with db.execute(query, params) as cursor:
            return await cursor.fetchall()

    async def get_proxies_total(self, protocol: str = None, min_score: int = 0, max_score: int = None):
        """Count the proxies matching the given filters."""
        db = await self.get_connection()
        conditions = ['score >= ?']
        params = [min_score]
        if protocol:
            conditions.append('protocol = ?')
            params.append(protocol)
        if max_score is not None:
            conditions.append('score <= ?')
            params.append(max_score)
        where_clause = ' AND '.join(conditions)
        query = f'SELECT COUNT(*) FROM proxies WHERE {where_clause}'
        async with db.execute(query, params) as cursor:
            row = await cursor.fetchone()
            return row[0] if row else 0

    async def get_proxy_detail(self, ip: str, port: int):
        """Return the row for one proxy, or None if absent."""
        db = await self.get_connection()
        async with db.execute(
            'SELECT ip, port, protocol, score, last_check FROM proxies WHERE ip = ? AND port = ?',
            (ip, port)
        ) as cursor:
            row = await cursor.fetchone()
            return row

    async def batch_delete_proxies(self, proxy_list: list):
        """Bulk-delete (ip, port) pairs via executemany.

        NOTE(review): returns the *requested* count, not the number of rows
        actually deleted — missing pairs are silently counted too.
        """
        if not proxy_list:
            return 0
        db = await self.get_connection()
        await db.executemany('DELETE FROM proxies WHERE ip = ? AND port = ?', proxy_list)
        await db.commit()
        return len(proxy_list)

    async def get_stats(self):
        """Aggregate pool statistics with a single GROUP-BY-free scan.

        NOTE(review): the double-quoted "http" literals rely on SQLite's
        lenient fallback of unknown identifiers to strings.
        """
        db = await self.get_connection()
        stats = {}
        query = '''
            SELECT
                COUNT(*) as total,
                COUNT(CASE WHEN score > 0 THEN 1 END) as available,
                AVG(score) as avg_score,
                COUNT(CASE WHEN protocol = "http" THEN 1 END) as http_count,
                COUNT(CASE WHEN protocol = "https" THEN 1 END) as https_count,
                COUNT(CASE WHEN protocol = "socks4" THEN 1 END) as socks4_count,
                COUNT(CASE WHEN protocol = "socks5" THEN 1 END) as socks5_count
            FROM proxies
        '''
        async with db.execute(query) as cursor:
            row = await cursor.fetchone()
            if row:
                stats = {
                    'total': row[0] if row[0] else 0,
                    'available': row[1] if row[1] else 0,
                    'avg_score': round(row[2], 2) if row[2] else 0,
                    'http_count': row[3] if row[3] else 0,
                    'https_count': row[4] if row[4] else 0,
                    'socks4_count': row[5] if row[5] else 0,
                    'socks5_count': row[6] if row[6] else 0
                }
        return stats

    async def get_today_new_count(self):
        """Count proxies whose last_check falls on today's (local) date."""
        try:
            db = await self.get_connection()
            query = '''
                SELECT COUNT(*) FROM proxies
                WHERE DATE(last_check) = DATE('now', 'localtime')
            '''
            async with db.execute(query) as cursor:
                row = await cursor.fetchone()
                return row[0] if row else 0
        except Exception as e:
            logger.error(f"获取今日新增数量失败: {e}")
            return 0

    async def clean_invalid_proxies(self):
        """Delete all proxies with score <= 0 and return the deleted row count."""
        db = await self.get_connection()
        async with db.execute('DELETE FROM proxies WHERE score <= 0') as cursor:
            deleted_count = cursor.rowcount
        await db.commit()
        return deleted_count

3
core/tasks/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from .queue import ValidationQueue
__all__ = ["ValidationQueue"]

111
core/tasks/queue.py Normal file
View File

@@ -0,0 +1,111 @@
"""验证任务队列 - 解耦爬取与验证,支持背压控制"""
import asyncio
from typing import Optional
from models.domain import ProxyRaw
from core.log import logger
class ValidationQueue:
    """Proxy validation queue decoupling crawling from validation.

    Workflow:
      1. Crawlers submit() raw proxies onto the queue.
      2. A pool of workers consumes and validates them.
      3. Proxies that pass validation are written to the database.
    """

    def __init__(
        self,
        validator,      # object exposing async validate(ip, port, protocol) -> (bool, latency)
        proxy_repo,     # repository with insert_or_update / update_response_time
        db_ctx,         # callable returning an async DB-connection context manager
        worker_count: int = 50,
        score_valid: int = 10,
        score_invalid: int = -5,
        score_min: int = 0,
        score_max: int = 100,
    ):
        self.validator = validator
        self.proxy_repo = proxy_repo
        self.db_ctx = db_ctx
        self.worker_count = worker_count
        self.score_valid = score_valid
        # NOTE(review): score_invalid/min/max are stored but never used below
        # (invalid fresh proxies are dropped, not demoted) — confirm intended.
        self.score_invalid = score_invalid
        self.score_min = score_min
        self.score_max = score_max
        # None is the shutdown sentinel, hence Optional item type.
        self._queue: asyncio.Queue[Optional[ProxyRaw]] = asyncio.Queue()
        self._workers: list[asyncio.Task] = []
        self._running = False
        # Counters since the last reset_stats().
        self.valid_count = 0
        self.invalid_count = 0

    async def start(self):
        """Spawn the worker pool (no-op if already running)."""
        if self._running:
            return
        self._running = True
        for i in range(self.worker_count):
            self._workers.append(asyncio.create_task(self._worker_loop(i)))
        logger.info(f"ValidationQueue started with {self.worker_count} workers")

    async def stop(self):
        """Stop all workers: one sentinel per worker, then await them all."""
        if not self._running:
            return
        self._running = False
        for _ in self._workers:
            self._queue.put_nowait(None)  # sentinel
        if self._workers:
            await asyncio.gather(*self._workers, return_exceptions=True)
        self._workers.clear()
        logger.info("ValidationQueue stopped")

    async def submit(self, proxies: list[ProxyRaw]):
        """Enqueue a batch of raw proxies for validation."""
        for p in proxies:
            await self._queue.put(p)

    async def submit_one(self, proxy: ProxyRaw):
        # Enqueue a single proxy.
        await self._queue.put(proxy)

    async def drain(self):
        """Block until every item currently queued has been processed."""
        await self._queue.join()

    async def _worker_loop(self, worker_id: int):
        # Consume until the None sentinel arrives; task_done() is always
        # called so drain()/join() stay balanced.
        while True:
            item = await self._queue.get()
            if item is None:
                self._queue.task_done()
                break
            try:
                await self._validate_and_save(item)
            except Exception as e:
                logger.error(f"Worker {worker_id} validation error: {e}")
            finally:
                self._queue.task_done()

    async def _validate_and_save(self, proxy: ProxyRaw):
        """Validate one proxy; persist it (with latency) only if valid."""
        is_valid, latency = await self.validator.validate(
            proxy.ip, proxy.port, proxy.protocol
        )
        async with self.db_ctx() as db:
            if is_valid:
                await self.proxy_repo.insert_or_update(
                    db, proxy.ip, proxy.port, proxy.protocol, score=self.score_valid
                )
                if latency:
                    await self.proxy_repo.update_response_time(
                        db, proxy.ip, proxy.port, latency
                    )
                self.valid_count += 1
                logger.debug(f"ValidationQueue: valid {proxy.ip}:{proxy.port}")
            else:
                # Invalid freshly-crawled proxies are simply dropped, not stored.
                self.invalid_count += 1
                logger.debug(f"ValidationQueue: invalid {proxy.ip}:{proxy.port}")

    def reset_stats(self):
        """Zero the valid/invalid counters."""
        self.valid_count = 0
        self.invalid_count = 0

View File

@@ -1,192 +0,0 @@
import asyncio
import aiohttp
import aiohttp_socks
import random
import time
from core.log import logger
class ProxyValidator:
    """Proxy validator supporting HTTP/HTTPS/SOCKS4/SOCKS5.

    Concurrency is bounded by an ``asyncio.Semaphore``; each validation
    builds its own short-lived ``ClientSession`` so sockets never outlive
    a single check.

    Fixes over the previous revision:
    - the two bare ``except:`` clauses swallowed ``BaseException``
      (including ``asyncio.CancelledError``); they are narrowed to
      ``except Exception`` so cancellation propagates correctly;
    - the identical response-success logic duplicated in the HTTP and
      SOCKS paths is factored into ``_judge_response``.
    """

    def __init__(self, max_concurrency=50, timeout=5):
        # Probe targets: both echo the caller's IP in the body, which lets
        # the response content be sanity-checked for "ip"/"origin" keywords.
        self.http_sources = [
            "http://httpbin.org/ip",
            "http://api.ipify.org"
        ]
        self.https_sources = [
            "https://httpbin.org/ip",
            "https://api.ipify.org"
        ]
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.timeout = timeout
        # Kept for interface compatibility; validate() builds its own sessions.
        self.session = None

    async def __aenter__(self):
        """Async context-manager entry."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context-manager exit: close the shared session, if any."""
        if self.session:
            await self.session.close()
            self.session = None

    def _get_test_url(self, protocol: str) -> str:
        """Return a random probe URL appropriate for *protocol*."""
        protocol = protocol.lower()
        if protocol == 'https':
            return random.choice(self.https_sources)
        return random.choice(self.http_sources)

    def _create_connector(self, ip: str, port: int, protocol: str):
        """Build a connector for *protocol* (SOCKS handled by aiohttp_socks)."""
        protocol = protocol.lower()
        if protocol == 'socks4':
            return aiohttp_socks.ProxyConnector(
                proxy_type=aiohttp_socks.ProxyType.SOCKS4,
                host=ip,
                port=port,
                rdns=True
            )
        elif protocol == 'socks5':
            return aiohttp_socks.ProxyConnector(
                proxy_type=aiohttp_socks.ProxyType.SOCKS5,
                host=ip,
                port=port,
                rdns=True
            )
        # HTTP/HTTPS and unknown protocols both use a plain connector; the
        # proxy itself is passed via the request-level `proxy=` argument.
        return aiohttp.TCPConnector(ssl=False, limit=0, force_close=True)

    async def validate(self, ip: str, port: int, protocol: str = 'http'):
        """Check whether a single proxy works.

        Args:
            ip: proxy IP
            port: proxy port
            protocol: one of http/https/socks4/socks5 (case-insensitive)

        Returns:
            ``(is_valid: bool, latency_ms: float)`` — latency is 0 on failure.
        """
        protocol = protocol.lower()
        test_url = self._get_test_url(protocol)
        async with self.semaphore:
            start_time = time.time()
            try:
                if protocol in ('socks4', 'socks5'):
                    return await self._validate_socks(ip, port, protocol, test_url, start_time)
                return await self._validate_http(ip, port, protocol, test_url, start_time)
            except asyncio.TimeoutError:
                logger.warning(f"验证超时: {ip}:{port} ({protocol})")
                return False, 0
            except Exception as e:
                logger.warning(f"验证失败: {ip}:{port} ({protocol}) - {e}")
                return False, 0

    async def _judge_response(self, response, ip: str, port: int, protocol: str, start_time: float):
        """Shared success test for the HTTP and SOCKS paths.

        A 200/301/302 status counts as usable. The body is additionally
        checked for "ip"/"origin" keywords, but an unreadable body does not
        disqualify the proxy — a sane status code is accepted as-is.
        """
        if response.status in [200, 301, 302]:
            try:
                content = await response.text()
                if 'ip' in content.lower() or 'origin' in content.lower():
                    latency = round((time.time() - start_time) * 1000, 2)
                    logger.info(f"验证成功: {ip}:{port} ({protocol}) - 延迟: {latency}ms")
                    return True, latency
            except Exception:
                # Narrowed from a bare `except:` so CancelledError propagates.
                pass
            # Body could not be parsed but the status is fine — still usable.
            latency = round((time.time() - start_time) * 1000, 2)
            logger.info(f"验证成功: {ip}:{port} ({protocol}) - 延迟: {latency}ms")
            return True, latency
        return False, 0

    async def _validate_http(self, ip: str, port: int, protocol: str, test_url: str, start_time: float):
        """Validate an HTTP/HTTPS proxy via the request-level ``proxy=`` arg."""
        proxy_url = f"http://{ip}:{port}"
        connector = aiohttp.TCPConnector(ssl=False, limit=0, force_close=True)
        timeout = aiohttp.ClientTimeout(total=self.timeout, connect=3)
        async with aiohttp.ClientSession(
            connector=connector,
            timeout=timeout
        ) as session:
            async with session.get(
                test_url,
                proxy=proxy_url,
                allow_redirects=True
            ) as response:
                return await self._judge_response(response, ip, port, protocol, start_time)

    async def _validate_socks(self, ip: str, port: int, protocol: str, test_url: str, start_time: float):
        """Validate a SOCKS4/SOCKS5 proxy through an aiohttp_socks connector."""
        proxy_type = (
            aiohttp_socks.ProxyType.SOCKS4
            if protocol == 'socks4'
            else aiohttp_socks.ProxyType.SOCKS5
        )
        connector = aiohttp_socks.ProxyConnector(
            proxy_type=proxy_type,
            host=ip,
            port=port,
            rdns=True,  # remote DNS resolution to avoid DNS leaks
            ssl=False
        )
        timeout = aiohttp.ClientTimeout(total=self.timeout, connect=3)
        try:
            async with aiohttp.ClientSession(
                connector=connector,
                timeout=timeout
            ) as session:
                async with session.get(test_url, allow_redirects=True) as response:
                    return await self._judge_response(response, ip, port, protocol, start_time)
        finally:
            # Explicit close kept from the original; closing twice is harmless.
            await connector.close()
class ProxyValidatorLegacy:
    """Backward-compatible facade over :class:`ProxyValidator`.

    Keeps the legacy public interface intact while delegating every call
    to the modern implementation held in ``self.validator``.
    """

    def __init__(self, max_concurrency=50, timeout=5):
        # All real work is delegated to the new validator.
        self.validator = ProxyValidator(max_concurrency, timeout)

    async def __aenter__(self):
        """Enter the wrapped validator's context, then yield this facade."""
        await self.validator.__aenter__()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Forward context exit to the wrapped validator."""
        await self.validator.__aexit__(exc_type, exc_val, exc_tb)

    async def validate(self, ip, port, protocol='http'):
        """Delegate a single proxy check to the wrapped validator."""
        return await self.validator.validate(ip, port, protocol)