全面架构重构:建立分层架构与高度可扩展的插件系统
后端重构: - 新增分层架构:API Routes -> Services -> Repositories -> Infrastructure - 彻底移除全局单例,全面采用 FastAPI 依赖注入 - 新增 api/ 目录拆分路由(proxies, plugins, scheduler, settings, stats) - 新增 services/ 业务逻辑层:ProxyService, PluginService, SchedulerService, ValidatorService, SettingsService - 新增 repositories/ 数据访问层:ProxyRepository, SettingsRepository, PluginSettingsRepository - 新增 models/ 层:Pydantic Schemas + Domain Models - 重写 core/config.py:采用 Pydantic Settings 管理配置 - 新增 core/db.py:基于 asynccontextmanager 的连接管理,支持数据库迁移 - 新增 core/exceptions.py:统一业务异常体系 插件系统重构(核心): - 新增 core/plugin_system/:BaseCrawlerPlugin + PluginRegistry - 采用显式注册模式(装饰器 + plugins/__init__.py),类型安全、测试友好 - 新增 plugins/base.py:BaseHTTPPlugin 通用 HTTP 爬虫基类 - 迁移全部 7 个插件到新架构(fate0, proxylist_download, ip3366, ip89, kuaidaili, speedx, yundaili) - 插件状态持久化到 plugin_settings 表 任务调度重构: - 新增 core/tasks/queue.py:ValidationQueue + WorkerPool - 解耦爬取与验证:爬虫只负责爬取,代理提交队列后由 Worker 异步验证 - 调度器定时从数据库拉取存量代理并分批投入验证队列 前端调整: - 新增 frontend/src/services/ 层拆分 API 调用逻辑 - 调整 stores/ 和 views/ 使用 Service 层 - 保持 API 兼容性,页面无需大幅修改 其他: - 新增 main.py 作为新入口 - 新增 DESIGN.md 架构设计文档 - 更新 requirements.txt 增加 pydantic-settings
This commit is contained in:
59
core/config.py
Normal file
59
core/config.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""全局配置 - 使用 Pydantic Settings 支持环境变量和 .env 文件"""
|
||||
import os
|
||||
from typing import List
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application-wide configuration.

    Values are read from environment variables and an optional ``.env``
    file (UTF-8); keys not declared here are silently ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # --- database ---
    db_path: str = "db/proxies.sqlite"

    # --- API server ---
    host: str = "0.0.0.0"
    port: int = 9949

    # --- validator ---
    validator_timeout: int = 5
    validator_max_concurrency: int = 200
    validator_connect_timeout: int = 3

    # --- crawler ---
    crawler_num_validators: int = 50
    crawler_max_queue_size: int = 500

    # --- logging ---
    log_level: str = "INFO"
    log_dir: str = "logs"

    # --- export ---
    export_max_records: int = 10000

    # --- proxy scoring ---
    score_valid: int = 10
    score_invalid: int = -5
    score_min: int = 0
    score_max: int = 100

    # --- plugins ---
    plugins_dir: str = "plugins"

    # --- CORS ---
    cors_origins: str = "http://localhost:8080,http://localhost:5173"

    @property
    def cors_origins_list(self) -> List[str]:
        """The comma-separated ``cors_origins`` value split into a clean list."""
        return [origin.strip() for origin in self.cors_origins.split(",") if origin.strip()]

    @property
    def base_dir(self) -> str:
        """Absolute path of the project root (parent directory of this package)."""
        return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


# Global settings instance, loaded once at import time.
settings = Settings()
|
||||
@@ -1,86 +0,0 @@
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import random
|
||||
from core.log import logger
|
||||
|
||||
class BaseCrawler:
    """Async HTTP crawler base: rotating User-Agent headers plus a retrying fetch."""

    def __init__(self):
        # Pool of realistic browser User-Agent strings; one is picked per request.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1.2 Mobile/15E148 Safari/604.1"
        ]

    def get_headers(self):
        """Return browser-like request headers with a randomly rotated User-Agent."""
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
        }

    async def fetch(self, url, method='GET', params=None, data=None, proxies=None, timeout=10, retry_count=3):
        """Fetch *url* and return the decoded body on HTTP 200, else None.

        Retries up to *retry_count* times with a random 1-3s pause between
        attempts.

        Fix: use ``self.get_headers()`` for the session so the rotating
        User-Agent pool is actually applied — previously a single
        hard-coded UA was used and ``user_agents`` was dead code.
        """
        async with aiohttp.ClientSession(headers=self.get_headers()) as session:
            for attempt in range(retry_count):
                try:
                    # NOTE: aiohttp takes a single proxy URL string
                    # (http://user:pass@host:port), unlike requests' per-scheme dict.
                    async with session.request(
                        method=method,
                        url=url,
                        params=params,
                        data=data,
                        proxy=proxies,
                        timeout=aiohttp.ClientTimeout(total=timeout)
                    ) as response:
                        if response.status == 200:
                            # Read raw bytes first, then sort out the encoding.
                            content = await response.read()
                            encoding = response.get_encoding()
                            if encoding == 'utf-8' or not encoding:
                                try:
                                    return content.decode('utf-8')
                                except UnicodeDecodeError:
                                    # Mainland-China sites frequently serve GBK.
                                    return content.decode('gbk', errors='ignore')
                            return content.decode(encoding, errors='ignore')
                        logger.warning(f"请求失败 [{response.status}]: {url}, 正在进行第 {attempt+1} 次重试...")
                except Exception as e:
                    logger.error(f"请求异常: {url}, 错误: {e}, 正在进行第 {attempt+1} 次重试...")

                await asyncio.sleep(random.uniform(1, 3))

        return None
|
||||
|
||||
class BasePlugin(BaseCrawler):
    """Base class for site-specific crawler plugins (legacy architecture)."""

    def __init__(self):
        super().__init__()
        self.name = "BasePlugin"  # overridden by concrete plugins
        self.urls = []            # pages this plugin scrapes
        self.enabled = True       # honored by the plugin manager

    async def parse(self, html):
        """Async-generator hook that yields proxies parsed from *html*.

        Must be overridden by subclasses.
        """
        raise NotImplementedError("Please implement parse method")

    async def run(self):
        """Crawl every configured URL and collect the proxies parse() yields."""
        logger.info(f"正在运行插件: {self.name}")
        collected = []
        for page_url in self.urls:
            # Expose the URL currently being processed so parse() can use it.
            self.current_url = page_url
            page_html = await self.fetch(page_url)
            if page_html:
                async for item in self.parse(page_html):
                    collected.append(item)
            # Polite random delay between pages.
            await asyncio.sleep(random.uniform(1, 2))
        return collected
|
||||
95
core/db.py
Normal file
95
core/db.py
Normal file
@@ -0,0 +1,95 @@
|
||||
"""数据库连接管理 - 使用上下文管理器,避免全局单例连接泄漏"""
|
||||
import os
|
||||
import aiosqlite
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import AsyncIterator
|
||||
from core.config import settings
|
||||
from core.log import logger
|
||||
|
||||
|
||||
DB_PATH = os.path.join(settings.base_dir, settings.db_path)
|
||||
|
||||
|
||||
def ensure_db_dir():
    """Create the directory holding the SQLite file if it does not exist."""
    parent = os.path.dirname(DB_PATH)
    if parent:
        # exist_ok makes this safe to call repeatedly and race-free.
        os.makedirs(parent, exist_ok=True)
|
||||
|
||||
|
||||
async def init_db():
    """Create the schema, apply lightweight column migrations, and tune SQLite.

    Safe to run on every startup: all DDL uses IF NOT EXISTS and each
    migration probes for the column before adding it.
    """
    ensure_db_dir()
    async with aiosqlite.connect(DB_PATH) as db:
        # Performance-oriented pragmas (WAL allows concurrent readers).
        for pragma in (
            "PRAGMA journal_mode=WAL",
            "PRAGMA synchronous=NORMAL",
            "PRAGMA cache_size=-64000",
            "PRAGMA temp_store=MEMORY",
        ):
            await db.execute(pragma)

        await db.execute("""
            CREATE TABLE IF NOT EXISTS proxies (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ip TEXT NOT NULL,
                port INTEGER NOT NULL,
                protocol TEXT DEFAULT 'http',
                score INTEGER DEFAULT 10,
                response_time_ms REAL,
                last_check TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(ip, port)
            )
        """)

        # Migration: databases created before this release lack response_time_ms.
        try:
            await db.execute("SELECT response_time_ms FROM proxies LIMIT 1")
        except Exception:
            await db.execute("ALTER TABLE proxies ADD COLUMN response_time_ms REAL")
            logger.info("Migrated: added response_time_ms column")

        # Migration: databases created before this release lack created_at.
        try:
            await db.execute("SELECT created_at FROM proxies LIMIT 1")
        except Exception:
            await db.execute("ALTER TABLE proxies ADD COLUMN created_at TIMESTAMP")
            await db.execute("UPDATE proxies SET created_at = CURRENT_TIMESTAMP WHERE created_at IS NULL")
            logger.info("Migrated: added created_at column")

        for index_ddl in (
            "CREATE INDEX IF NOT EXISTS idx_score ON proxies(score)",
            "CREATE INDEX IF NOT EXISTS idx_protocol ON proxies(protocol)",
            "CREATE INDEX IF NOT EXISTS idx_last_check ON proxies(last_check)",
            "CREATE INDEX IF NOT EXISTS idx_ip_port ON proxies(ip, port)",
        ):
            await db.execute(index_ddl)

        # Per-plugin enabled flags persist across restarts via this table.
        await db.execute("""
            CREATE TABLE IF NOT EXISTS plugin_settings (
                plugin_id TEXT PRIMARY KEY,
                enabled INTEGER DEFAULT 1,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        # Generic key/value store for runtime-adjustable settings.
        await db.execute("""
            CREATE TABLE IF NOT EXISTS settings (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)

        await db.commit()
        logger.info("Database initialized")
|
||||
|
||||
|
||||
@asynccontextmanager
async def get_db() -> AsyncIterator[aiosqlite.Connection]:
    """Yield a fresh aiosqlite connection and guarantee it is closed.

    Opening a short-lived connection per use avoids the connection-leak
    hazards of a shared global singleton.
    """
    ensure_db_dir()
    conn = await aiosqlite.connect(DB_PATH)
    try:
        # Match the pragmas applied at init time for consistent behavior.
        await conn.execute("PRAGMA journal_mode=WAL")
        await conn.execute("PRAGMA synchronous=NORMAL")
        yield conn
    finally:
        await conn.close()
|
||||
24
core/exceptions.py
Normal file
24
core/exceptions.py
Normal file
@@ -0,0 +1,24 @@
|
||||
"""业务异常定义"""
|
||||
|
||||
|
||||
class ProxyPoolException(Exception):
    """Root of the application's business-error hierarchy.

    Carries a human-readable message plus an HTTP-style status code that
    the API layer can map directly onto a response.
    """

    def __init__(self, message: str, code: int = 500):
        super().__init__(message)
        self.message = message
        self.code = code


class PluginNotFoundException(ProxyPoolException):
    """Raised when a plugin id does not match any registered plugin."""

    def __init__(self, plugin_id: str):
        super().__init__(f"Plugin '{plugin_id}' not found", 404)


class ProxyNotFoundException(ProxyPoolException):
    """Raised when an ip:port pair is absent from the pool."""

    def __init__(self, ip: str, port: int):
        super().__init__(f"Proxy {ip}:{port} not found", 404)


class ValidationException(ProxyPoolException):
    """Raised for client-supplied data that fails validation."""

    def __init__(self, message: str):
        super().__init__(message, 400)
|
||||
@@ -1,125 +0,0 @@
|
||||
import os
|
||||
import importlib
|
||||
import inspect
|
||||
import asyncio
|
||||
from typing import List, Dict, Optional
|
||||
from core.crawler import BasePlugin
|
||||
from core.log import logger
|
||||
|
||||
class PluginManager:
    """Discovers crawler plugins on disk and tracks per-plugin run statistics.

    Fix: plugins are now loaded regardless of their ``enabled`` flag.
    Previously a disabled plugin was never instantiated, so
    ``toggle_plugin(name, True)`` could not find it and re-enabling it at
    runtime was impossible. Enabled-ness is instead enforced when running
    (``run_plugin`` checks it; ``run_all`` filters on it).
    """

    def __init__(self, plugin_dir='plugins'):
        self.plugin_dir = plugin_dir
        self.plugins = []       # instantiated plugin objects
        self.plugin_stats = {}  # name -> {success_count, failure_count, last_run}
        self._load_plugins()
        self._init_stats()

    def _init_stats(self):
        # One zeroed stats record per loaded plugin.
        for plugin in self.plugins:
            self.plugin_stats[plugin.name] = {
                'success_count': 0,
                'failure_count': 0,
                'last_run': None
            }

    def _load_plugins(self):
        """Import every module under the plugin dir and instantiate its plugins."""
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        full_plugin_path = os.path.join(base_dir, self.plugin_dir)

        if not os.path.exists(full_plugin_path):
            logger.error(f"插件目录不存在: {full_plugin_path}")
            return

        for filename in os.listdir(full_plugin_path):
            if not filename.endswith('.py') or filename.startswith('__'):
                continue
            module_name = f"{self.plugin_dir}.{filename[:-3]}"
            try:
                module = importlib.import_module(module_name)
                for name, obj in inspect.getmembers(module):
                    if inspect.isclass(obj) and issubclass(obj, BasePlugin) and obj is not BasePlugin:
                        plugin_instance = obj()
                        # Load even when disabled so it can be toggled on later;
                        # run-time checks enforce the enabled flag.
                        self.plugins.append(plugin_instance)
                        logger.info(f"成功加载插件: {name} 来自 {module_name}")
            except Exception as e:
                logger.error(f"加载插件失败 {module_name}: {e}")

    def get_plugin_by_name(self, plugin_name: str) -> Optional[BasePlugin]:
        """Return the loaded plugin with *plugin_name*, or None."""
        for plugin in self.plugins:
            if plugin.name == plugin_name:
                return plugin
        return None

    def get_all_plugin_info(self) -> List[Dict]:
        """Return a UI-friendly summary (id, state, stats) for every plugin."""
        plugins_info = []
        for plugin in self.plugins:
            stats = self.plugin_stats.get(plugin.name, {
                'success_count': 0,
                'failure_count': 0,
                'last_run': None
            })
            plugins_info.append({
                'id': plugin.name,
                'name': plugin.name,
                'enabled': plugin.enabled,
                'description': getattr(plugin, 'description', f'从{plugin.name}网站爬取代理'),
                'last_run': stats['last_run'],
                'success_count': stats['success_count'],
                'failure_count': stats['failure_count']
            })
        return plugins_info

    def toggle_plugin(self, plugin_name: str, enabled: bool) -> bool:
        """Enable or disable a plugin by name; returns False if unknown."""
        plugin = self.get_plugin_by_name(plugin_name)
        if plugin:
            plugin.enabled = enabled
            logger.info(f"插件 {plugin_name} 已{'启用' if enabled else '禁用'}")
            return True
        return False

    def _record_run(self, plugin_name: str, success_delta: int, failure_delta: int):
        """Accumulate run counters and stamp the last-run time for a plugin."""
        from datetime import datetime
        stats = self.plugin_stats.setdefault(plugin_name, {
            'success_count': 0,
            'failure_count': 0,
            'last_run': None
        })
        stats['success_count'] += success_delta
        stats['failure_count'] += failure_delta
        stats['last_run'] = datetime.now().isoformat()

    async def run_plugin(self, plugin_name: str):
        """Run one plugin by name; returns its proxies, or [] on any failure."""
        plugin = self.get_plugin_by_name(plugin_name)
        if not plugin:
            logger.error(f"插件不存在: {plugin_name}")
            return []

        if not plugin.enabled:
            logger.warning(f"插件已禁用: {plugin_name}")
            return []

        try:
            results = await plugin.run()
            self._record_run(plugin.name, len(results), 0)
            logger.info(f"插件 {plugin_name} 执行完成,成功: {len(results)}")
            return results
        except Exception as e:
            logger.error(f"插件 {plugin_name} 执行失败: {e}")
            self._record_run(plugin.name, 0, 1)
            return []

    async def run_all(self):
        """Concurrently run every *enabled* plugin and yield each proxy found."""
        active = [plugin for plugin in self.plugins if plugin.enabled]
        results_list = await asyncio.gather(*(plugin.run() for plugin in active))

        # Flatten the per-plugin result lists into one stream.
        for results in results_list:
            for proxy in results:
                yield proxy
|
||||
4
core/plugin_system/__init__.py
Normal file
4
core/plugin_system/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .base import BaseCrawlerPlugin, ProxyRaw
|
||||
from .registry import registry
|
||||
|
||||
__all__ = ["BaseCrawlerPlugin", "ProxyRaw", "registry"]
|
||||
41
core/plugin_system/base.py
Normal file
41
core/plugin_system/base.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""插件基类 - 所有爬虫插件必须继承此基类"""
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
|
||||
@dataclass
class ProxyRaw:
    """Raw proxy record produced by a crawler plugin.

    Crawlers scrape these values straight out of HTML, so ``port`` may
    arrive as a numeric string and ``ip``/``protocol`` may carry stray
    whitespace or odd casing; ``__post_init__`` normalizes all three so
    downstream deduplication (UNIQUE(ip, port)) behaves consistently.
    """

    ip: str
    port: int
    protocol: str = "http"

    def __post_init__(self):
        # Normalize scraped values: strip whitespace, coerce the port to int,
        # lowercase the protocol, and fall back to "http" for unknown schemes.
        self.ip = str(self.ip).strip()
        self.port = int(self.port)
        self.protocol = str(self.protocol).lower().strip()
        if self.protocol not in ("http", "https", "socks4", "socks5"):
            self.protocol = "http"
|
||||
|
||||
|
||||
class BaseCrawlerPlugin(ABC):
    """Abstract base class that every crawler plugin must inherit from.

    Adding a new crawler only requires:
      1. subclassing BaseCrawlerPlugin,
      2. implementing crawl() to return List[ProxyRaw],
      3. decorating with @registry.register (or registering explicitly
         in the package __init__).
    """

    # Class-level metadata; subclasses override these.
    name: str = ""          # unique plugin id (required for registration)
    display_name: str = ""  # human-readable label
    description: str = ""   # short description shown in the UI
    enabled: bool = True    # default on/off state

    @abstractmethod
    async def crawl(self) -> List[ProxyRaw]:
        """Fetch and return raw proxies. Crawl only — validation happens elsewhere."""
        raise NotImplementedError

    async def health_check(self) -> bool:
        """Optional hook reporting whether the plugin's source is reachable."""
        return True
|
||||
77
core/plugin_system/registry.py
Normal file
77
core/plugin_system/registry.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""插件注册中心 - 显式注册,类型安全,测试友好"""
|
||||
import importlib
|
||||
import inspect
|
||||
import os
|
||||
from typing import Dict, List, Type, Optional
|
||||
from core.plugin_system.base import BaseCrawlerPlugin
|
||||
from core.log import logger
|
||||
|
||||
|
||||
class PluginRegistry:
    """Central registry mapping plugin names to classes and lazy instances."""

    def __init__(self):
        self._plugins: Dict[str, Type[BaseCrawlerPlugin]] = {}    # name -> class
        self._instances: Dict[str, BaseCrawlerPlugin] = {}        # name -> instance

    def register(self, plugin_cls: Type[BaseCrawlerPlugin]) -> Type[BaseCrawlerPlugin]:
        """Register a plugin class; returns it so it works as a decorator."""
        if not (inspect.isclass(plugin_cls) and issubclass(plugin_cls, BaseCrawlerPlugin)):
            raise ValueError("Plugin must be a subclass of BaseCrawlerPlugin")
        if not plugin_cls.name:
            raise ValueError(f"Plugin {plugin_cls.__name__} must have a 'name' attribute")

        self._plugins[plugin_cls.name] = plugin_cls
        logger.info(f"Plugin registered: {plugin_cls.name} ({plugin_cls.__name__})")
        return plugin_cls

    def get(self, name: str) -> Optional[BaseCrawlerPlugin]:
        """Return the instance for *name*, creating it on first access."""
        instance = self._instances.get(name)
        if instance is None:
            plugin_cls = self._plugins.get(name)
            if plugin_cls is not None:
                instance = plugin_cls()
                self._instances[name] = instance
        return instance

    def list_plugins(self) -> List[BaseCrawlerPlugin]:
        """Instances of every registered plugin, in registration order."""
        return [inst for inst in (self.get(name) for name in self._plugins) if inst]

    def get_plugin_names(self) -> List[str]:
        """Names of all registered plugin classes."""
        return list(self._plugins.keys())

    def auto_discover(self, package_name: str):
        """Import every module in *package_name* and register its plugin classes.

        Explicit registration is preferred for type safety and control;
        this exists only as a compatibility path.
        """
        try:
            package = importlib.import_module(package_name)
            package_dir = os.path.dirname(package.__file__)
        except Exception as e:
            logger.error(f"Auto discover failed for package {package_name}: {e}")
            return

        for filename in os.listdir(package_dir):
            if not filename.endswith(".py") or filename.startswith("__"):
                continue
            module_name = f"{package_name}.{filename[:-3]}"
            try:
                module = importlib.import_module(module_name)
                for attr_name in dir(module):
                    candidate = getattr(module, attr_name)
                    is_new_plugin = (
                        inspect.isclass(candidate)
                        and issubclass(candidate, BaseCrawlerPlugin)
                        and candidate is not BaseCrawlerPlugin
                        and candidate not in self._plugins.values()
                    )
                    if is_new_plugin:
                        self.register(candidate)
            except Exception as e:
                logger.error(f"Failed to load module {module_name}: {e}")


# Global registry instance shared by the application.
registry = PluginRegistry()
|
||||
@@ -1,206 +0,0 @@
|
||||
"""
|
||||
代理验证调度器
|
||||
负责定期验证数据库中的代理,并更新分数
|
||||
"""
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional
|
||||
from core.sqlite import SQLiteManager
|
||||
from core.validator import ProxyValidator
|
||||
from core.log import logger
|
||||
from config import config
|
||||
|
||||
|
||||
class ValidationScheduler:
    """Background scheduler that periodically re-validates stored proxies.

    Fix: ``validate_proxies_batch`` previously created the validation
    coroutines and then awaited them one by one, which made the "batch"
    validation fully sequential; it now uses ``asyncio.gather`` so the
    validator's concurrency limit is actually exercised.
    """

    def __init__(self):
        self.db = SQLiteManager()
        self.validator: Optional[ProxyValidator] = None
        self.running = False
        self.task: Optional[asyncio.Task] = None
        self.interval_minutes = 30  # minutes between full validation sweeps
        self.batch_size = 100       # proxies validated per batch

    async def start(self):
        """Start the background validation loop (no-op if already running)."""
        if self.running:
            logger.warning("验证调度器已在运行")
            return

        self.running = True
        self.validator = ProxyValidator(
            max_concurrency=config.VALIDATOR_MAX_CONCURRENCY,
            timeout=config.VALIDATOR_TIMEOUT
        )
        self.task = asyncio.create_task(self._run_loop())
        logger.info("代理验证调度器已启动")

    async def stop(self):
        """Cancel the loop and release the validator's resources."""
        self.running = False
        if self.task:
            self.task.cancel()
            try:
                await self.task
            except asyncio.CancelledError:
                pass
        if self.validator:
            # Close the validator's underlying session/semaphore resources.
            await self.validator.__aexit__(None, None, None)
        logger.info("代理验证调度器已停止")

    async def _run_loop(self):
        """Validate everything, then sleep until the next cycle, forever."""
        while self.running:
            try:
                await self.validate_all_proxies()
            except Exception as e:
                logger.error(f"验证循环出错: {e}")

            # Wait until the next scheduled sweep.
            await asyncio.sleep(self.interval_minutes * 60)

    async def validate_all_proxies(self):
        """Re-validate every stored proxy in concurrent batches."""
        logger.info("开始批量验证代理...")

        try:
            proxies = await self.db.get_all_proxies()
            if not proxies:
                logger.info("数据库中没有代理需要验证")
                return

            logger.info(f"需要验证 {len(proxies)} 个代理")

            validated_count = 0
            valid_count = 0
            invalid_count = 0

            async with self.validator:
                for i in range(0, len(proxies), self.batch_size):
                    if not self.running:
                        break

                    batch = proxies[i:i + self.batch_size]
                    # Rows are (ip, port, protocol, score, last_check).
                    tasks = [
                        self._validate_and_update(ip, port, protocol)
                        for ip, port, protocol, _score, _last_check in batch
                    ]

                    # Validate one batch concurrently.
                    results = await asyncio.gather(*tasks, return_exceptions=True)

                    for result in results:
                        validated_count += 1
                        if isinstance(result, Exception):
                            logger.error(f"验证过程出错: {result}")
                        elif result:
                            valid_count += 1
                        else:
                            invalid_count += 1

                    logger.info(f"已验证 {validated_count}/{len(proxies)} 个代理")

                    # Brief pause between batches to avoid overloading targets.
                    if i + self.batch_size < len(proxies):
                        await asyncio.sleep(1)

            logger.info(f"验证完成: 总计 {validated_count}, 有效 {valid_count}, 无效 {invalid_count}")

        except Exception as e:
            logger.error(f"批量验证代理失败: {e}", exc_info=True)

    async def _validate_and_update(self, ip: str, port: int, protocol: str) -> bool:
        """Validate one proxy and adjust its score; returns True if valid.

        Any exception during validation is treated as a failed check.
        """
        try:
            is_valid, latency = await self.validator.validate(ip, port, protocol)

            if is_valid:
                await self.db.update_score(
                    ip, port,
                    config.SCORE_VALID,
                    min_score=config.SCORE_MIN,
                    max_score=config.SCORE_MAX
                )
                logger.debug(f"代理验证成功 {ip}:{port} ({protocol}) - 延迟 {latency}ms")
                return True

            await self.db.update_score(
                ip, port,
                config.SCORE_INVALID,
                min_score=config.SCORE_MIN,
                max_score=config.SCORE_MAX
            )
            logger.debug(f"代理验证失败 {ip}:{port} ({protocol})")
            return False

        except Exception as e:
            logger.error(f"验证代理 {ip}:{port} 时出错: {e}")
            # Errors count as failures so flaky proxies drain score.
            await self.db.update_score(
                ip, port,
                config.SCORE_INVALID,
                min_score=config.SCORE_MIN,
                max_score=config.SCORE_MAX
            )
            return False

    async def validate_proxies_batch(self, proxies: list) -> tuple:
        """Validate a batch of freshly crawled proxies.

        Args:
            proxies: [(ip, port, protocol), ...]

        Returns:
            (valid proxy list, invalid proxy list)
        """
        if not proxies:
            return [], []

        valid_proxies = []
        invalid_proxies = []

        logger.info(f"开始验证 {len(proxies)} 个新抓取代理...")

        try:
            validator = ProxyValidator(
                max_concurrency=min(config.VALIDATOR_MAX_CONCURRENCY, 50),
                timeout=config.VALIDATOR_TIMEOUT
            )

            async with validator:
                # Run the whole batch concurrently (previously each coroutine
                # was awaited in sequence, defeating the concurrency limit).
                outcomes = await asyncio.gather(
                    *(validator.validate(ip, port, protocol) for ip, port, protocol in proxies),
                    return_exceptions=True
                )

            for (ip, port, protocol), outcome in zip(proxies, outcomes):
                if isinstance(outcome, Exception):
                    logger.warning(f"验证新代理 {ip}:{port} 失败: {outcome}")
                    invalid_proxies.append((ip, port, protocol))
                    continue
                is_valid, latency = outcome
                if is_valid:
                    valid_proxies.append((ip, port, protocol))
                    logger.debug(f"新代理有效: {ip}:{port} ({protocol}) - {latency}ms")
                else:
                    invalid_proxies.append((ip, port, protocol))

            logger.info(f"新代理验证完成: 有效 {len(valid_proxies)}, 无效 {len(invalid_proxies)}")

        except Exception as e:
            logger.error(f"批量验证新代理失败: {e}")

        return valid_proxies, invalid_proxies


# Global scheduler instance used by the application entry point.
scheduler = ValidationScheduler()
|
||||
331
core/sqlite.py
331
core/sqlite.py
@@ -1,331 +0,0 @@
|
||||
import aiosqlite
|
||||
import os
|
||||
import asyncio
|
||||
from core.log import logger
|
||||
|
||||
VALID_PROTOCOLS = ['http', 'https', 'socks4', 'socks5']
|
||||
|
||||
class SQLiteManager:
|
||||
_instance = None
|
||||
_connection = None
|
||||
_lock = asyncio.Lock()
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
if cls._instance is None:
|
||||
cls._instance = super(SQLiteManager, cls).__new__(cls)
|
||||
return cls._instance
|
||||
|
||||
def __init__(self, db_path=None):
|
||||
if hasattr(self, 'initialized') and self.initialized:
|
||||
return
|
||||
|
||||
if db_path is None:
|
||||
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
db_dir = os.path.join(base_dir, 'db')
|
||||
if not os.path.exists(db_dir):
|
||||
os.makedirs(db_dir)
|
||||
self.db_path = os.path.join(db_dir, 'proxies.sqlite')
|
||||
else:
|
||||
self.db_path = db_path
|
||||
|
||||
self.initialized = True
|
||||
|
||||
async def get_connection(self):
|
||||
async with self._lock:
|
||||
if self._connection is None:
|
||||
self._connection = await aiosqlite.connect(self.db_path)
|
||||
await self._connection.execute("PRAGMA journal_mode=WAL")
|
||||
await self._connection.execute("PRAGMA synchronous=NORMAL")
|
||||
await self._connection.execute("PRAGMA cache_size=-64000")
|
||||
await self._connection.execute("PRAGMA temp_store=MEMORY")
|
||||
return self._connection
|
||||
|
||||
async def close_connection(self):
|
||||
async with self._lock:
|
||||
if self._connection is not None:
|
||||
await self._connection.close()
|
||||
self._connection = None
|
||||
|
||||
async def init_db(self):
|
||||
"""初始化数据库和表结构"""
|
||||
db = await self.get_connection()
|
||||
await db.execute('''
|
||||
CREATE TABLE IF NOT EXISTS proxies (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
ip TEXT NOT NULL,
|
||||
port INTEGER NOT NULL,
|
||||
protocol TEXT DEFAULT 'http',
|
||||
score INTEGER DEFAULT 10,
|
||||
last_check TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
UNIQUE(ip, port)
|
||||
)
|
||||
''')
|
||||
|
||||
await db.execute('CREATE INDEX IF NOT EXISTS idx_score ON proxies(score)')
|
||||
await db.execute('CREATE INDEX IF NOT EXISTS idx_protocol ON proxies(protocol)')
|
||||
await db.execute('CREATE INDEX IF NOT EXISTS idx_last_check ON proxies(last_check)')
|
||||
await db.execute('CREATE INDEX IF NOT EXISTS idx_ip_port ON proxies(ip, port)')
|
||||
|
||||
await db.commit()
|
||||
|
||||
async def insert_proxy(self, ip, port, protocol='http', score=10):
|
||||
"""异步插入或更新代理"""
|
||||
try:
|
||||
# 验证协议类型
|
||||
if protocol not in VALID_PROTOCOLS:
|
||||
protocol = 'http'
|
||||
logger.warning(f"无效的协议类型 {protocol},默认使用 http")
|
||||
|
||||
db = await self.get_connection()
|
||||
# 先检查是否存在
|
||||
async with db.execute('SELECT score FROM proxies WHERE ip = ? AND port = ?', (ip, port)) as cursor:
|
||||
row = await cursor.fetchone()
|
||||
if row:
|
||||
# 如果存在,则更新最后检查时间和分数
|
||||
await db.execute('''
|
||||
UPDATE proxies SET last_check = CURRENT_TIMESTAMP, score = ?, protocol = ? WHERE ip = ? AND port = ?
|
||||
''', (score, protocol, ip, port))
|
||||
else:
|
||||
# 如果不存在,则插入新记录
|
||||
await db.execute('''
|
||||
INSERT INTO proxies (ip, port, protocol, score, last_check)
|
||||
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
|
||||
''', (ip, port, protocol, score))
|
||||
await db.commit()
|
||||
return True
|
||||
except aiosqlite.IntegrityError as e:
|
||||
# 处理唯一性约束冲突
|
||||
if "UNIQUE" in str(e):
|
||||
# 代理已存在,更新它
|
||||
if protocol not in VALID_PROTOCOLS:
|
||||
protocol = 'http'
|
||||
db = await self.get_connection()
|
||||
await db.execute('''
|
||||
UPDATE proxies SET last_check = CURRENT_TIMESTAMP, score = ?, protocol = ? WHERE ip = ? AND port = ?
|
||||
''', (score, protocol, ip, port))
|
||||
await db.commit()
|
||||
return True
|
||||
else:
|
||||
logger.error(f"数据库完整性错误: {e}")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error(f"插入代理失败 {ip}:{port} - {e}")
|
||||
return False
|
||||
|
||||
async def get_all_proxies(self):
|
||||
"""异步获取所有代理"""
|
||||
db = await self.get_connection()
|
||||
async with db.execute('SELECT ip, port, protocol, score, last_check FROM proxies') as cursor:
|
||||
return await cursor.fetchall()
|
||||
|
||||
async def get_random_proxy(self):
|
||||
"""异步随机获取一个高分代理"""
|
||||
db = await self.get_connection()
|
||||
async with db.execute('SELECT ip, port, protocol, score, last_check FROM proxies WHERE score > 0 ORDER BY RANDOM() LIMIT 1') as cursor:
|
||||
return await cursor.fetchone()
|
||||
|
||||
async def update_score(self, ip, port, delta, min_score=0, max_score=100):
|
||||
"""异步更新代理分数(增量更新,带分数限制)"""
|
||||
try:
|
||||
db = await self.get_connection()
|
||||
# 获取当前分数
|
||||
async with db.execute('SELECT score FROM proxies WHERE ip = ? AND port = ?', (ip, port)) as cursor:
|
||||
row = await cursor.fetchone()
|
||||
if row:
|
||||
current_score = row[0]
|
||||
new_score = max(min_score, min(max_score, current_score + delta))
|
||||
await db.execute('''
|
||||
UPDATE proxies SET score = ?, last_check = CURRENT_TIMESTAMP WHERE ip = ? AND port = ?
|
||||
''', (new_score, ip, port))
|
||||
if new_score <= 0:
|
||||
await db.execute('DELETE FROM proxies WHERE score <= 0')
|
||||
await db.commit()
|
||||
return True
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error(f"更新代理分数失败 {ip}:{port} - {e}")
|
||||
return False
|
||||
|
||||
async def delete_proxy(self, ip, port):
|
||||
"""异步删除指定代理"""
|
||||
db = await self.get_connection()
|
||||
await db.execute('DELETE FROM proxies WHERE ip = ? AND port = ?', (ip, port))
|
||||
await db.commit()
|
||||
|
||||
async def count_proxies(self):
|
||||
"""异步统计代理数量"""
|
||||
db = await self.get_connection()
|
||||
async with db.execute('SELECT COUNT(*) FROM proxies') as cursor:
|
||||
row = await cursor.fetchone()
|
||||
return row[0] if row else 0
|
||||
|
||||
async def get_proxies_paginated_with_total(self, page: int = 1, page_size: int = 20,
|
||||
protocol: str = None, min_score: int = 0,
|
||||
max_score: int = None,
|
||||
sort_by: str = 'last_check',
|
||||
sort_order: str = 'DESC'):
|
||||
"""分页获取代理列表(一次查询返回数据和总数)"""
|
||||
db = await self.get_connection()
|
||||
conditions = ['score >= ?']
|
||||
params = [min_score]
|
||||
|
||||
if protocol:
|
||||
conditions.append('protocol = ?')
|
||||
params.append(protocol)
|
||||
|
||||
if max_score is not None:
|
||||
conditions.append('score <= ?')
|
||||
params.append(max_score)
|
||||
|
||||
where_clause = ' AND '.join(conditions)
|
||||
|
||||
order_by_clause = f'{sort_by} {sort_order}'
|
||||
|
||||
offset = (page - 1) * page_size
|
||||
query = f'''
|
||||
SELECT ip, port, protocol, score, last_check,
|
||||
COUNT(*) OVER() as total_count
|
||||
FROM proxies
|
||||
WHERE {where_clause}
|
||||
ORDER BY {order_by_clause}
|
||||
LIMIT ? OFFSET ?
|
||||
'''
|
||||
params.extend([page_size, offset])
|
||||
|
||||
async with db.execute(query, params) as cursor:
|
||||
rows = await cursor.fetchall()
|
||||
total = rows[0][5] if rows else 0
|
||||
proxies = [(row[0], row[1], row[2], row[3], row[4]) for row in rows]
|
||||
return proxies, total
|
||||
|
||||
async def get_proxies_paginated(self, page: int = 1, page_size: int = 20,
                                protocol: str = None, min_score: int = 0,
                                max_score: int = None,
                                sort_by: str = 'last_check',
                                sort_order: str = 'DESC'):
    """Fetch one page of (ip, port, protocol, score, last_check) rows.

    Args:
        page: 1-based page number.
        page_size: rows per page.
        protocol: optional exact protocol filter.
        min_score / max_score: inclusive score range filters.
        sort_by: column to order by (whitelisted, defaults to last_check).
        sort_order: 'ASC' or 'DESC' (anything else becomes DESC).
    """
    # SECURITY: sort_by / sort_order are interpolated into the SQL text
    # (identifiers cannot be bound parameters), so whitelist them to prevent
    # SQL injection via caller-supplied sort arguments.
    if sort_by not in ('ip', 'port', 'protocol', 'score', 'last_check'):
        sort_by = 'last_check'
    sort_order = 'ASC' if str(sort_order).upper() == 'ASC' else 'DESC'

    db = await self.get_connection()
    conditions = ['score >= ?']
    params = [min_score]

    if protocol:
        conditions.append('protocol = ?')
        params.append(protocol)

    if max_score is not None:
        conditions.append('score <= ?')
        params.append(max_score)

    where_clause = ' AND '.join(conditions)
    order_by_clause = f'{sort_by} {sort_order}'

    offset = (page - 1) * page_size
    query = f'''
        SELECT ip, port, protocol, score, last_check
        FROM proxies
        WHERE {where_clause}
        ORDER BY {order_by_clause}
        LIMIT ? OFFSET ?
    '''
    params.extend([page_size, offset])

    async with db.execute(query, params) as cursor:
        return await cursor.fetchall()
|
||||
|
||||
async def get_proxies_total(self, protocol: str = None, min_score: int = 0, max_score: int = None):
    """Count proxies matching the protocol and score-range filters."""
    conn = await self.get_connection()

    # Build the WHERE clause incrementally; every filter value travels as a
    # bound parameter, never via string interpolation.
    clauses, args = ['score >= ?'], [min_score]
    if protocol:
        clauses.append('protocol = ?')
        args.append(protocol)
    if max_score is not None:
        clauses.append('score <= ?')
        args.append(max_score)

    where_clause = ' AND '.join(clauses)
    sql = f'SELECT COUNT(*) FROM proxies WHERE {where_clause}'
    async with conn.execute(sql, args) as cursor:
        result = await cursor.fetchone()
    return result[0] if result else 0
|
||||
|
||||
async def get_proxy_detail(self, ip: str, port: int):
    """Fetch the (ip, port, protocol, score, last_check) row for one proxy.

    Returns None when no matching proxy exists.
    """
    conn = await self.get_connection()
    sql = 'SELECT ip, port, protocol, score, last_check FROM proxies WHERE ip = ? AND port = ?'
    async with conn.execute(sql, (ip, port)) as cursor:
        return await cursor.fetchone()
|
||||
|
||||
async def batch_delete_proxies(self, proxy_list: list):
    """Delete many (ip, port) pairs with a single executemany call.

    Returns the number of pairs submitted. NOTE(review): executemany does
    not report per-row deletions, so this is the submitted count, which may
    exceed the rows actually removed if some pairs did not exist.
    """
    if not proxy_list:
        return 0

    conn = await self.get_connection()
    await conn.executemany('DELETE FROM proxies WHERE ip = ? AND port = ?', proxy_list)
    await conn.commit()
    return len(proxy_list)
|
||||
|
||||
async def get_stats(self):
    """Return aggregate proxy statistics via one conditional-aggregate query.

    Returns a dict with keys: total, available (score > 0), avg_score
    (rounded to 2 places), and per-protocol counts (http/https/socks4/socks5).
    Returns an empty dict if the aggregate query yields no row.
    """
    db = await self.get_connection()
    stats = {}

    # FIX: SQL string literals must use single quotes. SQLite treats
    # double-quoted tokens as identifiers first and only accepts them as
    # string literals through a legacy misfeature that breaks when
    # double-quoted-string support (DQS) is disabled.
    query = '''
        SELECT
            COUNT(*) as total,
            COUNT(CASE WHEN score > 0 THEN 1 END) as available,
            AVG(score) as avg_score,
            COUNT(CASE WHEN protocol = 'http' THEN 1 END) as http_count,
            COUNT(CASE WHEN protocol = 'https' THEN 1 END) as https_count,
            COUNT(CASE WHEN protocol = 'socks4' THEN 1 END) as socks4_count,
            COUNT(CASE WHEN protocol = 'socks5' THEN 1 END) as socks5_count
        FROM proxies
    '''

    async with db.execute(query) as cursor:
        row = await cursor.fetchone()
        if row:
            # AVG(score) is NULL on an empty table; coalesce every field to 0.
            stats = {
                'total': row[0] if row[0] else 0,
                'available': row[1] if row[1] else 0,
                'avg_score': round(row[2], 2) if row[2] else 0,
                'http_count': row[3] if row[3] else 0,
                'https_count': row[4] if row[4] else 0,
                'socks4_count': row[5] if row[5] else 0,
                'socks5_count': row[6] if row[6] else 0
            }

    return stats
|
||||
|
||||
async def get_today_new_count(self):
    """Return how many proxies were last checked on today's local date.

    Falls back to 0 on any database error (best-effort statistic).
    """
    try:
        conn = await self.get_connection()
        query = '''
            SELECT COUNT(*) FROM proxies
            WHERE DATE(last_check) = DATE('now', 'localtime')
        '''
        async with conn.execute(query) as cursor:
            result = await cursor.fetchone()
        return result[0] if result else 0
    except Exception as e:
        logger.error(f"获取今日新增数量失败: {e}")
        return 0
|
||||
|
||||
async def clean_invalid_proxies(self):
    """Remove every proxy whose score has fallen to zero or below.

    Returns the number of rows deleted (cursor.rowcount).
    """
    conn = await self.get_connection()
    async with conn.execute('DELETE FROM proxies WHERE score <= 0') as cursor:
        removed = cursor.rowcount
    await conn.commit()
    return removed
|
||||
3
core/tasks/__init__.py
Normal file
3
core/tasks/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .queue import ValidationQueue
|
||||
|
||||
__all__ = ["ValidationQueue"]
|
||||
111
core/tasks/queue.py
Normal file
111
core/tasks/queue.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""验证任务队列 - 解耦爬取与验证,支持背压控制"""
|
||||
import asyncio
|
||||
from typing import Optional
|
||||
from models.domain import ProxyRaw
|
||||
from core.log import logger
|
||||
|
||||
|
||||
class ValidationQueue:
    """Proxy validation queue.

    Workflow:
    1. Crawlers submit() raw proxies into the queue.
    2. A worker pool consumes the queue and validates each proxy.
    3. Proxies that pass validation are written to the database.
    """

    def __init__(
        self,
        validator,
        proxy_repo,
        db_ctx,
        worker_count: int = 50,
        score_valid: int = 10,
        score_invalid: int = -5,
        score_min: int = 0,
        score_max: int = 100,
    ):
        # Collaborators: validator checks proxies, proxy_repo persists them,
        # db_ctx is an async-context-manager factory yielding a DB connection.
        self.validator = validator
        self.proxy_repo = proxy_repo
        self.db_ctx = db_ctx
        self.worker_count = worker_count
        # Scoring knobs. Only score_valid is used in this class;
        # score_invalid/score_min/score_max are presumably consumed by
        # re-validation callers — TODO confirm against the scheduler.
        self.score_valid = score_valid
        self.score_invalid = score_invalid
        self.score_min = score_min
        self.score_max = score_max

        # Unbounded queue; None acts as the shutdown sentinel (see stop()).
        self._queue: asyncio.Queue[Optional[ProxyRaw]] = asyncio.Queue()
        self._workers: list[asyncio.Task] = []
        self._running = False

        # Running statistics; cleared via reset_stats().
        self.valid_count = 0
        self.invalid_count = 0

    async def start(self):
        """Spawn the worker tasks; a no-op if already running."""
        if self._running:
            return
        self._running = True
        for i in range(self.worker_count):
            self._workers.append(asyncio.create_task(self._worker_loop(i)))
        logger.info(f"ValidationQueue started with {self.worker_count} workers")

    async def stop(self):
        """Stop every worker by enqueueing one sentinel each, then await them."""
        if not self._running:
            return
        self._running = False
        for _ in self._workers:
            self._queue.put_nowait(None)  # sentinel
        if self._workers:
            # return_exceptions=True: one failed worker must not abort the join.
            await asyncio.gather(*self._workers, return_exceptions=True)
        self._workers.clear()
        logger.info("ValidationQueue stopped")

    async def submit(self, proxies: list[ProxyRaw]):
        """Enqueue a batch of raw proxies for validation."""
        for p in proxies:
            await self._queue.put(p)

    async def submit_one(self, proxy: ProxyRaw):
        """Enqueue a single raw proxy for validation."""
        await self._queue.put(proxy)

    async def drain(self):
        """Block until everything currently in the queue has been processed."""
        await self._queue.join()

    async def _worker_loop(self, worker_id: int):
        """Worker coroutine: consume items until the None sentinel arrives."""
        while True:
            item = await self._queue.get()
            if item is None:
                self._queue.task_done()
                break
            try:
                await self._validate_and_save(item)
            except Exception as e:
                logger.error(f"Worker {worker_id} validation error: {e}")
            finally:
                # Always mark the item done so drain()/join() cannot hang.
                self._queue.task_done()

    async def _validate_and_save(self, proxy: ProxyRaw):
        """Validate one proxy and persist it (plus its latency) when valid."""
        is_valid, latency = await self.validator.validate(
            proxy.ip, proxy.port, proxy.protocol
        )
        async with self.db_ctx() as db:
            if is_valid:
                await self.proxy_repo.insert_or_update(
                    db, proxy.ip, proxy.port, proxy.protocol, score=self.score_valid
                )
                # latency of 0/None is skipped — no measurable response time.
                if latency:
                    await self.proxy_repo.update_response_time(
                        db, proxy.ip, proxy.port, latency
                    )
                self.valid_count += 1
                logger.debug(f"ValidationQueue: valid {proxy.ip}:{proxy.port}")
            else:
                # Freshly crawled proxies that fail validation are discarded,
                # not written to the database.
                self.invalid_count += 1
                logger.debug(f"ValidationQueue: invalid {proxy.ip}:{proxy.port}")

    def reset_stats(self):
        """Zero the valid/invalid counters."""
        self.valid_count = 0
        self.invalid_count = 0
|
||||
@@ -1,192 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import aiohttp_socks
|
||||
import random
|
||||
import time
|
||||
from core.log import logger
|
||||
|
||||
|
||||
class ProxyValidator:
    """Proxy validator supporting HTTP/HTTPS/SOCKS4/SOCKS5.

    validate() returns (is_valid, latency_ms). Concurrency is bounded by an
    internal semaphore; each validation opens its own short-lived session.
    """

    def __init__(self, max_concurrency=50, timeout=5):
        # Validation target URLs; one is chosen at random per check.
        self.http_sources = [
            "http://httpbin.org/ip",
            "http://api.ipify.org"
        ]
        self.https_sources = [
            "https://httpbin.org/ip",
            "https://api.ipify.org"
        ]
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.timeout = timeout
        # Kept for the async-context-manager protocol; the validation paths
        # below create their own per-request sessions and never set this.
        self.session = None

    async def __aenter__(self):
        """Async context manager entry."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit: close the shared session if any."""
        if self.session:
            await self.session.close()
            self.session = None

    def _get_test_url(self, protocol: str) -> str:
        """Pick a random test URL: https sources for 'https', http otherwise."""
        protocol = protocol.lower()
        if protocol == 'https':
            return random.choice(self.https_sources)
        return random.choice(self.http_sources)

    def _create_connector(self, ip: str, port: int, protocol: str):
        """Build a connector for the given proxy.

        NOTE(review): currently dead code — validate() delegates to
        _validate_http/_validate_socks, which build their own connectors.
        Kept so external callers relying on it keep working.
        """
        protocol = protocol.lower()

        if protocol == 'socks4':
            return aiohttp_socks.ProxyConnector(
                proxy_type=aiohttp_socks.ProxyType.SOCKS4,
                host=ip,
                port=port,
                rdns=True
            )
        elif protocol == 'socks5':
            return aiohttp_socks.ProxyConnector(
                proxy_type=aiohttp_socks.ProxyType.SOCKS5,
                host=ip,
                port=port,
                rdns=True
            )
        elif protocol in ('http', 'https'):
            # HTTP/HTTPS use a plain connector; the proxy is passed per request.
            return aiohttp.TCPConnector(ssl=False, limit=0, force_close=True)
        else:
            # Unknown protocols fall back to plain HTTP handling.
            return aiohttp.TCPConnector(ssl=False, limit=0, force_close=True)

    async def validate(self, ip: str, port: int, protocol: str = 'http'):
        """Validate a single proxy.

        Args:
            ip: proxy IP address.
            port: proxy port.
            protocol: one of http/https/socks4/socks5.

        Returns:
            (is_valid: bool, latency_ms: float) — latency is 0 on failure.
        """
        protocol = protocol.lower()
        test_url = self._get_test_url(protocol)

        async with self.semaphore:
            # Latency is measured from after semaphore acquisition, so queue
            # wait time is excluded.
            start_time = time.time()

            try:
                if protocol in ('socks4', 'socks5'):
                    return await self._validate_socks(ip, port, protocol, test_url, start_time)
                else:
                    return await self._validate_http(ip, port, protocol, test_url, start_time)

            except asyncio.TimeoutError:
                logger.warning(f"验证超时: {ip}:{port} ({protocol})")
                return False, 0
            except Exception as e:
                logger.warning(f"验证失败: {ip}:{port} ({protocol}) - {e}")
                return False, 0

    async def _validate_http(self, ip: str, port: int, protocol: str, test_url: str, start_time: float):
        """Validate an HTTP/HTTPS proxy by fetching test_url through it."""
        proxy_url = f"http://{ip}:{port}"

        connector = aiohttp.TCPConnector(ssl=False, limit=0, force_close=True)
        timeout = aiohttp.ClientTimeout(total=self.timeout, connect=3)

        async with aiohttp.ClientSession(
            connector=connector,
            timeout=timeout
        ) as session:
            async with session.get(
                test_url,
                proxy=proxy_url,
                allow_redirects=True
            ) as response:
                if response.status in [200, 301, 302]:
                    try:
                        content = await response.text()
                        if 'ip' in content.lower() or 'origin' in content.lower():
                            latency = round((time.time() - start_time) * 1000, 2)
                            logger.info(f"验证成功: {ip}:{port} ({protocol}) - 延迟: {latency}ms")
                            return True, latency
                    # FIX: narrowed from a bare `except:` so CancelledError /
                    # KeyboardInterrupt still propagate in async code.
                    except Exception:
                        pass

                    # Body parsing failed but the status code is acceptable —
                    # still count the proxy as usable.
                    latency = round((time.time() - start_time) * 1000, 2)
                    logger.info(f"验证成功: {ip}:{port} ({protocol}) - 延迟: {latency}ms")
                    return True, latency

                return False, 0

    async def _validate_socks(self, ip: str, port: int, protocol: str, test_url: str, start_time: float):
        """Validate a SOCKS4/SOCKS5 proxy by fetching test_url through it."""
        proxy_type = (
            aiohttp_socks.ProxyType.SOCKS4
            if protocol == 'socks4'
            else aiohttp_socks.ProxyType.SOCKS5
        )

        connector = aiohttp_socks.ProxyConnector(
            proxy_type=proxy_type,
            host=ip,
            port=port,
            rdns=True,  # remote DNS resolution to avoid DNS leaks
            ssl=False
        )

        timeout = aiohttp.ClientTimeout(total=self.timeout, connect=3)

        try:
            async with aiohttp.ClientSession(
                connector=connector,
                timeout=timeout
            ) as session:
                async with session.get(test_url, allow_redirects=True) as response:
                    if response.status in [200, 301, 302]:
                        try:
                            content = await response.text()
                            if 'ip' in content.lower() or 'origin' in content.lower():
                                latency = round((time.time() - start_time) * 1000, 2)
                                logger.info(f"验证成功: {ip}:{port} ({protocol}) - 延迟: {latency}ms")
                                return True, latency
                        # FIX: narrowed from a bare `except:` (see _validate_http).
                        except Exception:
                            pass

                        # Body parsing failed but the status code is acceptable.
                        latency = round((time.time() - start_time) * 1000, 2)
                        logger.info(f"验证成功: {ip}:{port} ({protocol}) - 延迟: {latency}ms")
                        return True, latency

                    return False, 0
        finally:
            # Ensure the connector is released even when the request raises.
            await connector.close()
|
||||
|
||||
|
||||
class ProxyValidatorLegacy:
    """
    Backward-compatible validator wrapper.

    Preserves the original public interface (constructor signature, async
    context manager, validate()) by delegating every call to ProxyValidator.
    """
    def __init__(self, max_concurrency=50, timeout=5):
        # All real work is delegated to the new validator implementation.
        self.validator = ProxyValidator(max_concurrency, timeout)

    async def __aenter__(self):
        await self.validator.__aenter__()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.validator.__aexit__(exc_type, exc_val, exc_tb)

    async def validate(self, ip, port, protocol='http'):
        # Same contract as ProxyValidator.validate: (is_valid, latency_ms).
        return await self.validator.validate(ip, port, protocol)
|
||||
Reference in New Issue
Block a user