Comprehensive architecture refactor: layered architecture and a highly extensible plugin system
Backend refactor:
- New layered architecture: API Routes -> Services -> Repositories -> Infrastructure
- Removed all global singletons; everything now uses FastAPI dependency injection
- New api/ directory with routes split by domain (proxies, plugins, scheduler, settings, stats)
- New services/ business-logic layer: ProxyService, PluginService, SchedulerService, ValidatorService, SettingsService
- New repositories/ data-access layer: ProxyRepository, SettingsRepository, PluginSettingsRepository
- New models/ layer: Pydantic schemas + domain models
- Rewrote core/config.py to manage configuration via Pydantic Settings
- New core/db.py: asynccontextmanager-based connection management with database-migration support
- New core/exceptions.py: unified business-exception hierarchy

Plugin system refactor (the core change):
- New core/plugin_system/: BaseCrawlerPlugin + PluginRegistry
- Explicit registration (a decorator plus plugins/__init__.py), type-safe and test-friendly; see the sketch below
- New plugins/base.py: BaseHTTPPlugin, a generic base class for HTTP crawlers
- Migrated all 7 plugins to the new architecture (fate0, proxylist_download, ip3366, ip89, kuaidaili, speedx, yundaili)
- Plugin state is persisted to the plugin_settings table

Task-scheduling refactor:
- New core/tasks/queue.py: ValidationQueue + WorkerPool
- Decoupled crawling from validation: crawlers only crawl; proxies are submitted to a queue and validated asynchronously by workers
- The scheduler periodically pulls stored proxies from the database and feeds them into the validation queue in batches

Frontend changes:
- New frontend/src/services/ layer that splits out API-call logic
- Updated stores/ and views/ to go through the service layer
- API compatibility is preserved, so pages need no major changes

Other:
- New main.py entry point
- New DESIGN.md architecture document
- Updated requirements.txt to add pydantic-settings
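The registry and decorator mentioned above (core/plugin_system/) are not included in the hunks below, which cover only the services/ layer. A minimal sketch of what the explicit-registration pattern could look like, inferred from how services/plugin_service.py calls registry.get(), registry.list_plugins(), and plugin.crawl(); every name beyond those call sites is an assumption:

    # Hypothetical sketch of core/plugin_system/ -- the real module is not in
    # this diff. Names and signatures are inferred from the usage in
    # services/plugin_service.py below.
    from abc import ABC, abstractmethod
    from typing import Dict, List, Optional


    class BaseCrawlerPlugin(ABC):
        name: str = ""
        display_name: str = ""
        description: str = ""
        enabled: bool = True

        @abstractmethod
        async def crawl(self) -> list:
            """Fetch raw proxies from the upstream source."""


    class PluginRegistry:
        def __init__(self):
            self._plugins: Dict[str, BaseCrawlerPlugin] = {}

        def register(self, cls: type) -> type:
            """Class decorator: instantiate the plugin and index it by name."""
            instance = cls()
            self._plugins[instance.name] = instance
            return cls

        def get(self, name: str) -> Optional[BaseCrawlerPlugin]:
            return self._plugins.get(name)

        def list_plugins(self) -> List[BaseCrawlerPlugin]:
            return list(self._plugins.values())


    registry = PluginRegistry()


    # In a plugin module, imported explicitly from plugins/__init__.py:
    @registry.register
    class Fate0Plugin(BaseCrawlerPlugin):
        name = "fate0"
        display_name = "fate0 proxylist"

        async def crawl(self) -> list:
            return []  # a real plugin fetches and parses here

Because registration happens via an ordinary decorator and explicit imports rather than filesystem scanning, plugins stay type-checkable and easy to stub out in tests.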
services/plugin_service.py (new file, 111 lines)
@@ -0,0 +1,111 @@
"""Plugin business service."""
from datetime import datetime
from typing import List, Optional

from core.db import get_db
from core.plugin_system.registry import registry
from core.plugin_system.base import BaseCrawlerPlugin
from repositories.settings_repo import PluginSettingsRepository
from models.domain import PluginInfo, ProxyRaw
from core.log import logger


class PluginService:
    """Plugin business service: manages plugin lifecycle and runs crawls."""

    def __init__(self):
        self.plugin_settings_repo = PluginSettingsRepository()
        self._stats: dict[str, dict] = {}

    async def list_plugins(self) -> List[PluginInfo]:
        """Return all plugin info, merged with persisted enabled state."""
        async with get_db() as db:
            db_states = await self.plugin_settings_repo.list_all(db)

        result = []
        for plugin in registry.list_plugins():
            # Persisted state, if present, overrides the in-memory state
            if plugin.name in db_states:
                plugin.enabled = db_states[plugin.name]

            stat = self._stats.get(plugin.name, {
                "success_count": 0,
                "failure_count": 0,
                "last_run": None,
            })
            result.append(PluginInfo(
                id=plugin.name,
                name=plugin.name,
                display_name=plugin.display_name or plugin.name,
                description=plugin.description or f"Crawls proxies from {plugin.name}",
                enabled=plugin.enabled,
                last_run=stat.get("last_run"),
                success_count=stat.get("success_count", 0),
                failure_count=stat.get("failure_count", 0),
            ))
        return result

    async def toggle_plugin(self, plugin_id: str, enabled: bool) -> bool:
        plugin = registry.get(plugin_id)
        if not plugin:
            return False
        async with get_db() as db:
            success = await self.plugin_settings_repo.set_enabled(db, plugin_id, enabled)
        if success:
            plugin.enabled = enabled
            logger.info(f"Plugin {plugin_id} toggled to {enabled}")
        return success

    def get_plugin(self, plugin_id: str) -> Optional[BaseCrawlerPlugin]:
        return registry.get(plugin_id)

    async def run_plugin(self, plugin_id: str) -> List[ProxyRaw]:
        """Run a single plugin's crawl."""
        plugin = self.get_plugin(plugin_id)
        if not plugin:
            raise ValueError(f"Plugin {plugin_id} not found")
        if not plugin.enabled:
            logger.warning(f"Plugin {plugin_id} is disabled, skip crawl")
            return []

        try:
            results = await plugin.crawl()
            self._record_stat(plugin_id, success=len(results))
            logger.info(f"Plugin {plugin_id} crawled {len(results)} proxies")
            return results
        except Exception as e:
            self._record_stat(plugin_id, failure=1)
            logger.error(f"Plugin {plugin_id} crawl failed: {e}")
            return []

    async def run_all_plugins(self) -> List[ProxyRaw]:
        """Run crawls for every enabled plugin."""
        all_results: List[ProxyRaw] = []
        for plugin in registry.list_plugins():
            if not plugin.enabled:
                continue
            try:
                results = await self.run_plugin(plugin.name)
                all_results.extend(results)
            except Exception as e:
                logger.error(f"Run all plugins error at {plugin.name}: {e}")
        # Deduplicate by (ip, port, protocol)
        seen = set()
        unique = []
        for p in all_results:
            key = (p.ip, p.port, p.protocol)
            if key not in seen:
                seen.add(key)
                unique.append(p)
        return unique

    def _record_stat(self, plugin_id: str, success: int = 0, failure: int = 0):
        if plugin_id not in self._stats:
            self._stats[plugin_id] = {
                "success_count": 0,
                "failure_count": 0,
                "last_run": None,
            }
        self._stats[plugin_id]["success_count"] += success
        self._stats[plugin_id]["failure_count"] += failure
        if success or failure:
            self._stats[plugin_id]["last_run"] = datetime.now()
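The commit message says the global singletons were replaced with FastAPI dependency injection, but the api/ route modules are not part of this hunk. A minimal, hypothetical sketch of how PluginService could be wired in via Depends; the router paths and function names here are invented for illustration:

    # Hypothetical api/plugins.py wiring (not in this diff).
    from fastapi import APIRouter, Depends, HTTPException

    from services.plugin_service import PluginService

    router = APIRouter(prefix="/api/plugins", tags=["plugins"])


    def get_plugin_service() -> PluginService:
        # A fresh service per request; note that _stats lives on the instance,
        # so per-run statistics would need app-level state to survive requests.
        return PluginService()


    @router.get("")
    async def list_plugins(service: PluginService = Depends(get_plugin_service)):
        return await service.list_plugins()


    @router.post("/{plugin_id}/toggle")
    async def toggle_plugin(
        plugin_id: str,
        enabled: bool,
        service: PluginService = Depends(get_plugin_service),
    ):
        if not await service.toggle_plugin(plugin_id, enabled):
            raise HTTPException(status_code=404, detail="Plugin not found")
        return {"ok": True}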
services/proxy_service.py (new file, 93 lines)
@@ -0,0 +1,93 @@
"""Proxy business service."""
import csv
import json
import io
from datetime import datetime
from typing import List, Optional, Tuple, AsyncIterator

from core.db import get_db
from repositories.proxy_repo import ProxyRepository
from models.domain import Proxy
from core.log import logger


class ProxyService:
    def __init__(self, proxy_repo: ProxyRepository = ProxyRepository()):
        self.proxy_repo = proxy_repo

    async def get_stats(self) -> dict:
        async with get_db() as db:
            stats = await self.proxy_repo.get_stats(db)
            stats["today_new"] = await self.proxy_repo.get_today_new_count(db)
            return stats

    async def list_proxies(
        self,
        page: int = 1,
        page_size: int = 20,
        protocol: Optional[str] = None,
        min_score: int = 0,
        max_score: Optional[int] = None,
        sort_by: str = "last_check",
        sort_order: str = "DESC",
    ) -> Tuple[List[Proxy], int]:
        async with get_db() as db:
            return await self.proxy_repo.list_paginated(
                db, page, page_size, protocol, min_score, max_score, sort_by, sort_order
            )

    async def get_random_proxy(self) -> Optional[Proxy]:
        async with get_db() as db:
            return await self.proxy_repo.get_random(db)

    async def delete_proxy(self, ip: str, port: int) -> None:
        async with get_db() as db:
            await self.proxy_repo.delete(db, ip, port)

    async def batch_delete(self, proxies: List[Tuple[str, int]]) -> int:
        async with get_db() as db:
            return await self.proxy_repo.batch_delete(db, proxies)

    async def clean_invalid(self) -> int:
        async with get_db() as db:
            return await self.proxy_repo.clean_invalid(db)

    async def clean_expired(self, days: int) -> int:
        async with get_db() as db:
            return await self.proxy_repo.clean_expired(db, days)

    async def export_proxies(
        self,
        fmt: str,
        protocol: Optional[str] = None,
        limit: int = 10000,
    ) -> AsyncIterator[str]:
        async with get_db() as db:
            proxies = await self.proxy_repo.list_all(db, protocol=protocol, limit=limit)

        if fmt == "csv":
            yield "IP,Port,Protocol,Score,Last Check\n"
            for p in proxies:
                yield f"{p.ip},{p.port},{p.protocol},{p.score},{self._fmt_time(p.last_check)}\n"
        elif fmt == "txt":
            for p in proxies:
                yield f"{p.ip}:{p.port}\n"
        elif fmt == "json":
            data = [
                {
                    "ip": p.ip,
                    "port": p.port,
                    "protocol": p.protocol,
                    "score": p.score,
                    "last_check": self._fmt_time(p.last_check),
                }
                for p in proxies
            ]
            yield json.dumps(data, ensure_ascii=False, indent=2)

    @staticmethod
    def _fmt_time(dt: Optional[datetime]) -> str:
        if not dt:
            return ""
        if isinstance(dt, str):
            return dt
        return dt.isoformat()
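Since export_proxies is an async generator, an export route can stream it without buffering the whole result in memory. A hypothetical route sketch (the actual api/proxies.py is not shown in this diff):

    # Hypothetical export route (not in this diff).
    from fastapi import APIRouter
    from fastapi.responses import StreamingResponse

    from services.proxy_service import ProxyService

    router = APIRouter(prefix="/api/proxies")


    @router.get("/export")
    async def export(fmt: str = "txt"):
        service = ProxyService()
        media = {"csv": "text/csv", "json": "application/json"}.get(fmt, "text/plain")
        # StreamingResponse accepts an async generator and streams each yield.
        return StreamingResponse(service.export_proxies(fmt), media_type=media)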
services/scheduler_service.py (new file, 88 lines)
@@ -0,0 +1,88 @@
"""Scheduler service - periodically re-validates stored proxies."""
import asyncio
from datetime import datetime

from core.db import get_db
from repositories.proxy_repo import ProxyRepository
from core.tasks.queue import ValidationQueue
from core.config import settings as app_settings
from core.log import logger


class SchedulerService:
    """Proxy validation scheduler."""

    def __init__(
        self,
        validation_queue: ValidationQueue,
        proxy_repo: ProxyRepository = ProxyRepository(),
    ):
        self.validation_queue = validation_queue
        self.proxy_repo = proxy_repo
        self.interval_minutes = 30
        self.running = False
        self._task: asyncio.Task | None = None

    async def start(self):
        if self.running:
            logger.warning("Scheduler already running")
            return
        self.running = True
        await self.validation_queue.start()
        self._task = asyncio.create_task(self._run_loop())
        logger.info("Scheduler started")

    async def stop(self):
        self.running = False
        if self._task:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            self._task = None
        await self.validation_queue.stop()
        logger.info("Scheduler stopped")

    async def validate_all_now(self):
        """Kick off one full validation pass in the background (non-blocking)."""
        asyncio.create_task(self._do_validate_all())

    async def _run_loop(self):
        """Periodic loop."""
        while self.running:
            try:
                await self._do_validate_all()
            except Exception as e:
                logger.error(f"Scheduler loop error: {e}")
            # Wait for the next cycle, checking the stop flag every second
            for _ in range(self.interval_minutes * 60):
                if not self.running:
                    break
                await asyncio.sleep(1)

    async def _do_validate_all(self):
        """Validate every proxy currently stored in the database."""
        logger.info("Starting scheduled validation for all proxies")
        async with get_db() as db:
            proxies = await self.proxy_repo.list_all(db)
        if not proxies:
            logger.info("No proxies to validate")
            return

        logger.info(f"Validating {len(proxies)} proxies from database")
        from models.domain import ProxyRaw

        # Submit to the validation queue in batches
        batch_size = 100
        for i in range(0, len(proxies), batch_size):
            if not self.running:
                break
            batch = proxies[i : i + batch_size]
            await self.validation_queue.submit([
                ProxyRaw(p.ip, p.port, p.protocol) for p in batch
            ])
            # Wait until the current batch has been fully processed
            await self.validation_queue.drain()
            logger.info(f"Validated batch {i//batch_size + 1}/{(len(proxies)-1)//batch_size + 1}")

        logger.info("Scheduled validation completed")
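core/tasks/queue.py is not included in this hunk; SchedulerService only relies on a ValidationQueue exposing start/stop/submit/drain. A minimal sketch of that contract, assuming an asyncio.Queue drained by a pool of validator workers (the real ValidationQueue/WorkerPool may differ):

    # Sketch of the queue contract assumed by SchedulerService (hypothetical).
    import asyncio
    from typing import List


    class ValidationQueue:
        def __init__(self, validator, num_workers: int = 10):
            self._queue: asyncio.Queue = asyncio.Queue()
            self._validator = validator
            self._workers: List[asyncio.Task] = []
            self._num_workers = num_workers

        async def start(self):
            self._workers = [
                asyncio.create_task(self._worker()) for _ in range(self._num_workers)
            ]

        async def stop(self):
            for w in self._workers:
                w.cancel()
            self._workers = []

        async def submit(self, proxies: list):
            for p in proxies:
                await self._queue.put(p)

        async def drain(self):
            """Block until every queued item has been processed."""
            await self._queue.join()

        async def _worker(self):
            while True:
                proxy = await self._queue.get()
                try:
                    await self._validator.validate(proxy.ip, proxy.port, proxy.protocol)
                finally:
                    self._queue.task_done()

The join/task_done pairing is what makes drain() meaningful: each batch the scheduler submits is fully consumed before the next one is queued, which bounds memory and keeps crawling decoupled from validation.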
services/settings_service.py (new file, 19 lines)
@@ -0,0 +1,19 @@
"""System settings business service."""
from typing import Any, Dict

from core.db import get_db
from repositories.settings_repo import SettingsRepository
from models.schemas import SettingsSchema


class SettingsService:
    def __init__(self):
        self.repo = SettingsRepository()

    async def get_settings(self) -> Dict[str, Any]:
        async with get_db() as db:
            return await self.repo.get_all(db)

    async def save_settings(self, data: SettingsSchema) -> bool:
        settings_dict = data.model_dump()
        async with get_db() as db:
            return await self.repo.save(db, settings_dict)
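models/schemas.py is not shown here; save_settings only assumes a Pydantic v2 model, since it calls data.model_dump(). A hypothetical SettingsSchema with invented field names, just to illustrate the round-trip:

    # Hypothetical schema (field names are assumptions, not from this diff).
    from pydantic import BaseModel


    class SettingsSchema(BaseModel):
        validation_interval_minutes: int = 30
        validator_timeout: float = 5.0
        max_concurrency: int = 50

    # model_dump() turns the validated model into a plain dict for the repo layer.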
services/validator_service.py (new file, 103 lines)
@@ -0,0 +1,103 @@
"""Proxy validation service - supports HTTP/HTTPS/SOCKS4/SOCKS5."""
import asyncio
import random
import time

import aiohttp
import aiohttp_socks
from typing import Tuple

from core.log import logger


class ValidatorService:
    """Proxy validator."""

    def __init__(
        self,
        timeout: float = 5.0,
        connect_timeout: float = 3.0,
        max_concurrency: int = 50,
    ):
        self.timeout = timeout
        self.connect_timeout = connect_timeout
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.http_sources = [
            "http://httpbin.org/ip",
            "http://api.ipify.org",
        ]
        self.https_sources = [
            "https://httpbin.org/ip",
            "https://api.ipify.org",
        ]

    def _get_test_url(self, protocol: str) -> str:
        protocol = protocol.lower()
        if protocol == "https":
            return random.choice(self.https_sources)
        return random.choice(self.http_sources)

    async def validate(self, ip: str, port: int, protocol: str = "http") -> Tuple[bool, float]:
        """Validate a single proxy; returns (is_valid, latency_ms)."""
        protocol = protocol.lower()
        test_url = self._get_test_url(protocol)

        async with self.semaphore:
            start = time.time()
            try:
                if protocol in ("socks4", "socks5"):
                    return await self._validate_socks(ip, port, protocol, test_url, start)
                else:
                    return await self._validate_http(ip, port, protocol, test_url, start)
            except asyncio.TimeoutError:
                logger.debug(f"Validation timeout: {ip}:{port} ({protocol})")
                return False, 0.0
            except Exception as e:
                logger.debug(f"Validation error {ip}:{port} ({protocol}): {e}")
                return False, 0.0

    async def _validate_http(
        self, ip: str, port: int, protocol: str, test_url: str, start: float
    ) -> Tuple[bool, float]:
        proxy_url = f"http://{ip}:{port}"
        connector = aiohttp.TCPConnector(ssl=False, limit=0, force_close=True)
        timeout = aiohttp.ClientTimeout(total=self.timeout, connect=self.connect_timeout)

        try:
            async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
                async with session.get(
                    test_url, proxy=proxy_url, allow_redirects=True
                ) as response:
                    if response.status in (200, 301, 302):
                        latency = round((time.time() - start) * 1000, 2)
                        logger.info(f"HTTP valid: {ip}:{port} ({protocol}) {latency}ms")
                        return True, latency
                    return False, 0.0
        finally:
            await connector.close()

    async def _validate_socks(
        self, ip: str, port: int, protocol: str, test_url: str, start: float
    ) -> Tuple[bool, float]:
        proxy_type = (
            aiohttp_socks.ProxyType.SOCKS4
            if protocol == "socks4"
            else aiohttp_socks.ProxyType.SOCKS5
        )
        connector = aiohttp_socks.ProxyConnector(
            proxy_type=proxy_type,
            host=ip,
            port=port,
            rdns=True,
            ssl=False,
        )
        timeout = aiohttp.ClientTimeout(total=self.timeout, connect=self.connect_timeout)

        try:
            async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
                async with session.get(test_url, allow_redirects=True) as response:
                    if response.status in (200, 301, 302):
                        latency = round((time.time() - start) * 1000, 2)
                        logger.info(f"SOCKS valid: {ip}:{port} ({protocol}) {latency}ms")
                        return True, latency
                    return False, 0.0
        finally:
            await connector.close()
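A standalone usage sketch (hypothetical; in the new architecture the validator is normally driven by the validation queue): validate a handful of proxies concurrently, with concurrency bounded by the service's internal semaphore. The example addresses are placeholders:

    # Hypothetical standalone usage of ValidatorService (not in this diff).
    import asyncio

    from services.validator_service import ValidatorService


    async def main():
        validator = ValidatorService(timeout=5.0, max_concurrency=20)
        candidates = [("1.2.3.4", 8080, "http"), ("5.6.7.8", 1080, "socks5")]
        # gather fires all checks at once; the semaphore caps actual concurrency
        results = await asyncio.gather(
            *(validator.validate(ip, port, proto) for ip, port, proto in candidates)
        )
        for (ip, port, proto), (ok, latency) in zip(candidates, results):
            print(f"{ip}:{port} ({proto}) -> {'ok' if ok else 'dead'} {latency}ms")


    if __name__ == "__main__":
        asyncio.run(main())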