first commit

This commit is contained in:
祀梦
2026-01-27 21:17:36 +08:00
commit b06044c91c
57 changed files with 6714 additions and 0 deletions

89
core/auth.py Normal file
View File

@@ -0,0 +1,89 @@
from fastapi import HTTPException, Depends, Header, status
from typing import Optional
from config import Config
from core.log import logger
class PermissionLevel:
    """Permission tiers that ``verify_api_key`` resolves an API key to."""

    # Regular consumers: may read data but not mutate it.
    READ_ONLY = "read_only"
    # Full administrative access.
    ADMIN = "admin"
def verify_api_key(
    x_api_key: Optional[str] = Header(None, alias="X-API-Key"),
    authorization: Optional[str] = Header(None)
) -> str:
    """Authenticate a request by API key and return its permission level.

    The key may arrive either in the ``X-API-Key`` header or as an
    ``Authorization: Bearer <key>`` header; the bearer form wins when both
    are present.

    Args:
        x_api_key: value of the ``X-API-Key`` header, if any.
        authorization: value of the ``Authorization`` header, if any.

    Returns:
        str: ``PermissionLevel.ADMIN`` or ``PermissionLevel.READ_ONLY``.

    Raises:
        HTTPException: 401 when the key is missing or unknown.
    """
    api_key = x_api_key
    if authorization and authorization.startswith("Bearer "):
        # Strip only the leading scheme. str.replace() would also remove
        # the substring "Bearer " anywhere inside the token itself.
        api_key = authorization[len("Bearer "):]
    if not api_key:
        logger.warning("API请求缺少API Key")
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="缺少API Key请在请求头中添加 X-API-Key 或 Authorization: Bearer <key>",
            headers={"WWW-Authenticate": "Bearer"},
        )
    if api_key == Config.ADMIN_API_KEY:
        # Only an 8-char prefix is logged so full keys never land in logs.
        logger.info(f"管理员API认证成功: {api_key[:8]}...")
        return PermissionLevel.ADMIN
    elif api_key == Config.API_KEY:
        logger.info(f"普通用户API认证成功: {api_key[:8]}...")
        return PermissionLevel.READ_ONLY
    else:
        logger.warning(f"无效的API Key尝试: {api_key[:8]}...")
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="无效的API Key",
            headers={"WWW-Authenticate": "Bearer"},
        )
def require_admin(permission_level: str = Depends(verify_api_key)) -> str:
    """FastAPI dependency that lets only admin-level keys through.

    Args:
        permission_level: level resolved by ``verify_api_key``.

    Returns:
        str: the caller's permission level (always admin on success).

    Raises:
        HTTPException: 403 when the caller is not an admin.
    """
    # Guard clause: admins pass straight through.
    if permission_level == PermissionLevel.ADMIN:
        return permission_level
    logger.warning(f"非管理员用户尝试访问管理接口")
    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="需要管理员权限才能执行此操作"
    )
def skip_auth_for_dev() -> Optional[str]:
    """Bypass authentication when SKIP_AUTH=true (development only).

    Returns:
        Optional[str]: admin permission level when bypass is on, else None.

    Warning:
        Development convenience only — never enable in production.
    """
    import os

    flag = os.getenv("SKIP_AUTH", "false")
    if flag.lower() != "true":
        return None
    logger.warning("开发模式跳过API Key认证")
    return PermissionLevel.ADMIN

86
core/crawler.py Normal file
View File

@@ -0,0 +1,86 @@
import aiohttp
import asyncio
import random
from core.log import logger
class BaseCrawler:
    """Async HTTP fetcher with a rotating pool of browser User-Agents."""

    def __init__(self):
        # Realistic desktop/mobile UA strings; one is picked per request to
        # reduce the chance of being blocked by simple anti-bot rules.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1.2 Mobile/15E148 Safari/604.1"
        ]

    def get_headers(self):
        """Return browser-like request headers with a random User-Agent."""
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
        }

    async def fetch(self, url, method='GET', params=None, data=None, proxies=None, timeout=10, retry_count=3):
        """Fetch *url* and return the decoded body text, or None.

        Retries up to *retry_count* times with a random 1-3s pause between
        attempts.

        Args:
            url: target URL.
            method: HTTP method, default GET.
            params: query parameters.
            data: request body.
            proxies: proxy URL string (aiohttp format, e.g.
                ``http://user:pass@host:port`` — not a requests-style dict).
            timeout: total timeout per attempt, seconds.
            retry_count: maximum number of attempts.
        """
        # Fix: use the rotating header set rather than a single hard-coded
        # UA, so fetch() benefits from the pool defined above.
        headers = self.get_headers()
        async with aiohttp.ClientSession(headers=headers) as session:
            for attempt in range(retry_count):
                try:
                    async with session.request(
                        method=method,
                        url=url,
                        params=params,
                        data=data,
                        proxy=proxies,
                        timeout=aiohttp.ClientTimeout(total=timeout)
                    ) as response:
                        if response.status == 200:
                            # Read raw bytes first, then deal with encoding.
                            content = await response.read()
                            encoding = response.get_encoding()
                            if encoding == 'utf-8' or not encoding:
                                try:
                                    return content.decode('utf-8')
                                except UnicodeDecodeError:
                                    # GBK fallback — common on Chinese sites.
                                    return content.decode('gbk', errors='ignore')
                            return content.decode(encoding, errors='ignore')
                        else:
                            logger.warning(f"请求失败 [{response.status}]: {url}, 正在进行第 {attempt+1} 次重试...")
                except Exception as e:
                    logger.error(f"请求异常: {url}, 错误: {e}, 正在进行第 {attempt+1} 次重试...")
                # Randomized backoff between attempts.
                await asyncio.sleep(random.uniform(1, 3))
        return None
class BasePlugin(BaseCrawler):
    """Base class for site-specific proxy-source plugins.

    Subclasses set ``name``/``urls`` and implement :meth:`parse` as an
    *async generator* — ``run()`` consumes it with ``async for``.
    """

    def __init__(self):
        super().__init__()
        self.name = "BasePlugin"
        self.urls = []
        self.enabled = True
        # URL currently being fetched; run() keeps this up to date so
        # parse() can read it. Initialized here so the attribute exists
        # even before the first run (previously it was only created
        # inside run(), risking AttributeError in subclasses).
        self.current_url = None

    async def parse(self, html):
        """Yield proxies parsed from *html*; must be an async generator."""
        raise NotImplementedError("Please implement parse method")

    async def run(self):
        """Fetch every configured URL and collect the parsed proxies."""
        logger.info(f"正在运行插件: {self.name}")
        results = []
        for url in self.urls:
            self.current_url = url
            html = await self.fetch(url)
            if html:
                async for proxy in self.parse(html):
                    results.append(proxy)
            # Small random pause between sites to stay polite.
            await asyncio.sleep(random.uniform(1, 2))
        return results

38
core/log.py Normal file
View File

@@ -0,0 +1,38 @@
import logging
import os
from datetime import datetime
class LogHandler(logging.Logger):
    """Logger writing to both the console and a per-day file.

    Files are created under ``<project root>/logs`` and named
    ``YYYY-MM-DD.log`` based on the date the logger is constructed.
    """

    def __init__(self, name='ProxyPool', level=logging.INFO):
        super().__init__(name, level)
        # logs/ lives one level above this file (the project root).
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        log_dir = os.path.join(base_dir, 'logs')
        # exist_ok avoids the create-check race when two processes start
        # at the same time (the old exists()+makedirs could crash).
        os.makedirs(log_dir, exist_ok=True)
        # Date-only file name: one file per day.
        log_filename = f"{datetime.now().strftime('%Y-%m-%d')}.log"
        log_file = os.path.join(log_dir, log_filename)
        formatter = logging.Formatter(
            '[%(asctime)s] %(name)s [%(levelname)s] %(filename)s[line:%(lineno)d]: %(message)s'
        )
        # File handler (UTF-8 so Chinese messages round-trip correctly).
        file_handler = logging.FileHandler(log_file, encoding='utf-8')
        file_handler.setFormatter(formatter)
        self.addHandler(file_handler)
        # Console handler mirrors everything to stderr.
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        self.addHandler(console_handler)
# Default module-level logger instance for other modules to import directly.
logger = LogHandler()
if __name__ == '__main__':
    # Manual smoke test: writes one dated entry to logs/<YYYY-MM-DD>.log.
    logger.info('这是一条按日期存储的日志测试')

125
core/plugin_manager.py Normal file
View File

@@ -0,0 +1,125 @@
import os
import importlib
import inspect
import asyncio
from typing import List, Dict, Optional
from core.crawler import BasePlugin
from core.log import logger
class PluginManager:
    """Discovers, tracks and runs crawler plugins found in *plugin_dir*.

    Every class in the plugin directory that subclasses ``BasePlugin`` is
    instantiated once at construction time; per-plugin success/failure
    counters live in ``plugin_stats``.
    """

    def __init__(self, plugin_dir='plugins'):
        self.plugin_dir = plugin_dir
        self.plugins = []
        # name -> {'success_count', 'failure_count', 'last_run'}
        self.plugin_stats = {}
        self._load_plugins()
        self._init_stats()

    def _init_stats(self):
        """Create a zeroed stats entry for every loaded plugin."""
        for plugin in self.plugins:
            self.plugin_stats[plugin.name] = {
                'success_count': 0,
                'failure_count': 0,
                'last_run': None
            }

    def _load_plugins(self):
        """Import each module under the plugin directory and collect plugins."""
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        full_plugin_path = os.path.join(base_dir, self.plugin_dir)
        if not os.path.exists(full_plugin_path):
            logger.error(f"插件目录不存在: {full_plugin_path}")
            return
        for filename in os.listdir(full_plugin_path):
            if filename.endswith('.py') and not filename.startswith('__'):
                module_name = f"{self.plugin_dir}.{filename[:-3]}"
                try:
                    module = importlib.import_module(module_name)
                    for name, obj in inspect.getmembers(module):
                        # Concrete subclasses only; skip BasePlugin itself.
                        if inspect.isclass(obj) and issubclass(obj, BasePlugin) and obj is not BasePlugin:
                            plugin_instance = obj()
                            if plugin_instance.enabled:
                                logger.info(f"成功加载插件: {name} 来自 {module_name}")
                                self.plugins.append(plugin_instance)
                            else:
                                logger.info(f"插件已禁用,跳过加载: {name} 来自 {module_name}")
                except Exception as e:
                    logger.error(f"加载插件失败 {module_name}: {e}")

    def get_plugin_by_name(self, plugin_name: str) -> Optional[BasePlugin]:
        """Return the loaded plugin with the given name, or None."""
        for plugin in self.plugins:
            if plugin.name == plugin_name:
                return plugin
        return None

    def get_all_plugin_info(self) -> List[Dict]:
        """Return a UI-friendly summary dict for every loaded plugin."""
        plugins_info = []
        for plugin in self.plugins:
            stats = self.plugin_stats.get(plugin.name, {
                'success_count': 0,
                'failure_count': 0,
                'last_run': None
            })
            plugins_info.append({
                'id': plugin.name,
                'name': plugin.name,
                'enabled': plugin.enabled,
                'description': getattr(plugin, 'description', f'{plugin.name}网站爬取代理'),
                'last_run': stats['last_run'],
                'success_count': stats['success_count'],
                'failure_count': stats['failure_count']
            })
        return plugins_info

    def toggle_plugin(self, plugin_name: str, enabled: bool) -> bool:
        """Enable/disable a plugin by name; returns False if not found."""
        plugin = self.get_plugin_by_name(plugin_name)
        if plugin:
            plugin.enabled = enabled
            logger.info(f"插件 {plugin_name}{'启用' if enabled else '禁用'}")
            return True
        return False

    async def run_plugin(self, plugin_name: str):
        """Run one plugin by name, updating its stats.

        Returns the plugin's result list, or [] when the plugin is missing,
        disabled, or raised an exception.
        """
        from datetime import datetime
        plugin = self.get_plugin_by_name(plugin_name)
        if not plugin:
            logger.error(f"插件不存在: {plugin_name}")
            return []
        if not plugin.enabled:
            logger.warning(f"插件已禁用: {plugin_name}")
            return []
        # setdefault makes this robust even if the stats entry is missing.
        stats = self.plugin_stats.setdefault(plugin.name, {
            'success_count': 0,
            'failure_count': 0,
            'last_run': None
        })
        try:
            results = await plugin.run()
            stats['success_count'] += len(results)
            stats['last_run'] = datetime.now().isoformat()
            logger.info(f"插件 {plugin_name} 执行完成,成功: {len(results)}")
            return results
        except Exception as e:
            logger.error(f"插件 {plugin_name} 执行失败: {e}")
            stats['failure_count'] += 1
            stats['last_run'] = datetime.now().isoformat()
            return []

    async def run_all(self):
        """Run all *enabled* plugins concurrently, yielding proxies flat.

        Fixes two issues of the original version: disabled plugins were
        still executed (making toggle_plugin ineffective here), and a
        single raising plugin aborted the whole gather.
        """
        enabled = [p for p in self.plugins if p.enabled]
        tasks = [plugin.run() for plugin in enabled]
        # return_exceptions keeps one crashing plugin from killing the rest.
        results_list = await asyncio.gather(*tasks, return_exceptions=True)
        for plugin, results in zip(enabled, results_list):
            if isinstance(results, Exception):
                logger.error(f"插件 {plugin.name} 执行失败: {results}")
                continue
            for proxy in results:
                yield proxy

334
core/sqlite.py Normal file
View File

@@ -0,0 +1,334 @@
import aiosqlite
import os
import asyncio
from core.log import logger
# Protocol values accepted by SQLiteManager.insert_proxy; anything else is
# coerced to 'http'.
VALID_PROTOCOLS = ['http', 'https', 'socks4', 'socks5']
class SQLiteManager:
    """Singleton async wrapper around the proxy-pool SQLite database.

    A single shared aiosqlite connection is opened lazily (WAL mode, tuned
    pragmas) and guarded by an asyncio lock. All public methods are
    coroutines; every construction returns the same instance.
    """

    _instance = None
    _connection = None
    _lock = asyncio.Lock()
    # Columns callers may sort by. ORDER BY fragments are built exclusively
    # from this whitelist, so caller-supplied sort parameters can never be
    # injected into the SQL text.
    _SORT_COLUMNS = {'ip', 'port', 'protocol', 'score', 'last_check'}

    def __new__(cls, *args, **kwargs):
        # Classic singleton: every "new" instance is the same object.
        if cls._instance is None:
            cls._instance = super(SQLiteManager, cls).__new__(cls)
        return cls._instance

    def __init__(self, db_path=None):
        # __init__ re-runs on every construction of the singleton; bail
        # out once the first initialisation has completed.
        if hasattr(self, 'initialized') and self.initialized:
            return
        if db_path is None:
            base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            db_dir = os.path.join(base_dir, 'db')
            if not os.path.exists(db_dir):
                os.makedirs(db_dir)
            self.db_path = os.path.join(db_dir, 'proxies.sqlite')
        else:
            self.db_path = db_path
        self.initialized = True

    @classmethod
    def _order_clause(cls, sort_by, sort_order):
        """Build a safe ORDER BY fragment from whitelisted column/direction."""
        column = sort_by if sort_by in cls._SORT_COLUMNS else 'last_check'
        direction = 'ASC' if str(sort_order).upper() == 'ASC' else 'DESC'
        return f'{column} {direction}'

    async def get_connection(self):
        """Return the shared connection, opening and tuning it on first use."""
        async with self._lock:
            if self._connection is None:
                self._connection = await aiosqlite.connect(self.db_path)
                # WAL + relaxed sync: better concurrency for a local
                # cache-style database.
                await self._connection.execute("PRAGMA journal_mode=WAL")
                await self._connection.execute("PRAGMA synchronous=NORMAL")
                await self._connection.execute("PRAGMA cache_size=-64000")
                await self._connection.execute("PRAGMA temp_store=MEMORY")
            return self._connection

    async def close_connection(self):
        """Close and forget the shared connection (reopened on next use)."""
        async with self._lock:
            if self._connection is not None:
                await self._connection.close()
                self._connection = None

    async def init_db(self):
        """Create the proxies table and its indexes if they do not exist."""
        db = await self.get_connection()
        await db.execute('''
            CREATE TABLE IF NOT EXISTS proxies (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ip TEXT NOT NULL,
                port INTEGER NOT NULL,
                protocol TEXT DEFAULT 'http',
                score INTEGER DEFAULT 10,
                last_check TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(ip, port)
            )
        ''')
        await db.execute('CREATE INDEX IF NOT EXISTS idx_score ON proxies(score)')
        await db.execute('CREATE INDEX IF NOT EXISTS idx_protocol ON proxies(protocol)')
        await db.execute('CREATE INDEX IF NOT EXISTS idx_last_check ON proxies(last_check)')
        await db.execute('CREATE INDEX IF NOT EXISTS idx_ip_port ON proxies(ip, port)')
        await db.commit()

    async def insert_proxy(self, ip, port, protocol='http', score=10):
        """Insert a proxy, or refresh score/protocol/last_check if it exists.

        Returns:
            bool: True on success, False on any database error.
        """
        try:
            if protocol not in VALID_PROTOCOLS:
                # Warn with the *offending* value before coercing it — the
                # previous order logged after the overwrite, so the warning
                # always (uselessly) said 'http'.
                logger.warning(f"无效的协议类型 {protocol},默认使用 http")
                protocol = 'http'
            db = await self.get_connection()
            async with db.execute('SELECT score FROM proxies WHERE ip = ? AND port = ?', (ip, port)) as cursor:
                row = await cursor.fetchone()
            if row:
                # Existing proxy: refresh check time, score and protocol.
                await db.execute('''
                    UPDATE proxies SET last_check = CURRENT_TIMESTAMP, score = ?, protocol = ? WHERE ip = ? AND port = ?
                ''', (score, protocol, ip, port))
            else:
                await db.execute('''
                    INSERT INTO proxies (ip, port, protocol, score, last_check)
                    VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
                ''', (ip, port, protocol, score))
            await db.commit()
            return True
        except aiosqlite.IntegrityError as e:
            # Row appeared between our SELECT and INSERT (UNIQUE(ip, port)
            # race): fall back to an UPDATE.
            if "UNIQUE" in str(e):
                db = await self.get_connection()
                await db.execute('''
                    UPDATE proxies SET last_check = CURRENT_TIMESTAMP, score = ?, protocol = ? WHERE ip = ? AND port = ?
                ''', (score, protocol, ip, port))
                await db.commit()
                return True
            logger.error(f"数据库完整性错误: {e}")
            return False
        except Exception as e:
            logger.error(f"插入代理失败 {ip}:{port} - {e}")
            return False

    async def get_all_proxies(self):
        """Return every stored proxy as (ip, port, protocol, score, last_check)."""
        db = await self.get_connection()
        async with db.execute('SELECT ip, port, protocol, score, last_check FROM proxies') as cursor:
            return await cursor.fetchall()

    async def get_random_proxy(self):
        """Return one random proxy row with score > 0, or None."""
        db = await self.get_connection()
        async with db.execute('SELECT ip, port, protocol, score, last_check FROM proxies WHERE score > 0 ORDER BY RANDOM() LIMIT 1') as cursor:
            return await cursor.fetchone()

    async def update_score(self, ip, port, delta, min_score=0, max_score=100):
        """Add *delta* to a proxy's score, clamped to [min_score, max_score].

        Returns True when the proxy existed and was updated. When the new
        score hits 0, all dead proxies (score <= 0) are purged.
        """
        try:
            db = await self.get_connection()
            async with db.execute('SELECT score FROM proxies WHERE ip = ? AND port = ?', (ip, port)) as cursor:
                row = await cursor.fetchone()
            if not row:
                return False
            new_score = max(min_score, min(max_score, row[0] + delta))
            await db.execute('''
                UPDATE proxies SET score = ?, last_check = CURRENT_TIMESTAMP WHERE ip = ? AND port = ?
            ''', (new_score, ip, port))
            if new_score <= 0:
                # NOTE: purges *every* proxy at or below zero, not only this
                # one — an opportunistic cleanup sweep.
                await db.execute('DELETE FROM proxies WHERE score <= 0')
            await db.commit()
            return True
        except Exception as e:
            logger.error(f"更新代理分数失败 {ip}:{port} - {e}")
            return False

    async def delete_proxy(self, ip, port):
        """Delete the proxy identified by (ip, port), if present."""
        db = await self.get_connection()
        await db.execute('DELETE FROM proxies WHERE ip = ? AND port = ?', (ip, port))
        await db.commit()

    async def count_proxies(self):
        """Return the total number of stored proxies."""
        db = await self.get_connection()
        async with db.execute('SELECT COUNT(*) FROM proxies') as cursor:
            row = await cursor.fetchone()
            return row[0] if row else 0

    async def get_proxies_paginated_with_total(self, page: int = 1, page_size: int = 20,
                                               protocol: str = None, min_score: int = 0,
                                               max_score: int = None,
                                               sort_by: str = 'last_check',
                                               sort_order: str = 'DESC'):
        """Page through proxies, returning (rows, total) from a single query.

        COUNT(*) OVER() supplies the total alongside the page of data, so
        only one table scan is needed.
        """
        db = await self.get_connection()
        conditions = ['score >= ?']
        params = [min_score]
        if protocol:
            conditions.append('protocol = ?')
            params.append(protocol)
        if max_score is not None:
            conditions.append('score <= ?')
            params.append(max_score)
        where_clause = ' AND '.join(conditions)
        # Sort inputs go through the whitelist — they are the only
        # caller-controlled text interpolated into the statement.
        order_by_clause = self._order_clause(sort_by, sort_order)
        offset = (page - 1) * page_size
        query = f'''
            SELECT ip, port, protocol, score, last_check,
                   COUNT(*) OVER() as total_count
            FROM proxies
            WHERE {where_clause}
            ORDER BY {order_by_clause}
            LIMIT ? OFFSET ?
        '''
        params.extend([page_size, offset])
        async with db.execute(query, params) as cursor:
            rows = await cursor.fetchall()
            total = rows[0][5] if rows else 0
            proxies = [(row[0], row[1], row[2], row[3], row[4]) for row in rows]
            return proxies, total

    async def get_proxies_paginated(self, page: int = 1, page_size: int = 20,
                                    protocol: str = None, min_score: int = 0,
                                    max_score: int = None,
                                    sort_by: str = 'last_check',
                                    sort_order: str = 'DESC'):
        """Return one page of proxies matching the given filters."""
        db = await self.get_connection()
        conditions = ['score >= ?']
        params = [min_score]
        if protocol:
            conditions.append('protocol = ?')
            params.append(protocol)
        if max_score is not None:
            conditions.append('score <= ?')
            params.append(max_score)
        where_clause = ' AND '.join(conditions)
        # Whitelisted ORDER BY — see _order_clause.
        order_by_clause = self._order_clause(sort_by, sort_order)
        offset = (page - 1) * page_size
        query = f'''
            SELECT ip, port, protocol, score, last_check
            FROM proxies
            WHERE {where_clause}
            ORDER BY {order_by_clause}
            LIMIT ? OFFSET ?
        '''
        params.extend([page_size, offset])
        async with db.execute(query, params) as cursor:
            return await cursor.fetchall()

    async def get_proxies_total(self, protocol: str = None, min_score: int = 0, max_score: int = None):
        """Return the number of proxies matching the given filters."""
        db = await self.get_connection()
        conditions = ['score >= ?']
        params = [min_score]
        if protocol:
            conditions.append('protocol = ?')
            params.append(protocol)
        if max_score is not None:
            conditions.append('score <= ?')
            params.append(max_score)
        where_clause = ' AND '.join(conditions)
        query = f'SELECT COUNT(*) FROM proxies WHERE {where_clause}'
        async with db.execute(query, params) as cursor:
            row = await cursor.fetchone()
            return row[0] if row else 0

    async def get_proxy_detail(self, ip: str, port: int):
        """Return the full row for one proxy, or None if not stored."""
        db = await self.get_connection()
        async with db.execute(
            'SELECT ip, port, protocol, score, last_check FROM proxies WHERE ip = ? AND port = ?',
            (ip, port)
        ) as cursor:
            row = await cursor.fetchone()
            return row

    async def batch_delete_proxies(self, proxy_list: list):
        """Delete each (ip, port) pair; return the number actually removed."""
        deleted_count = 0
        db = await self.get_connection()
        for ip, port in proxy_list:
            cursor = await db.execute('DELETE FROM proxies WHERE ip = ? AND port = ?', (ip, port))
            deleted_count += cursor.rowcount
        await db.commit()
        return deleted_count

    async def get_stats(self):
        """Return aggregate counts per protocol plus availability and average score."""
        db = await self.get_connection()
        stats = {}
        async with db.execute('SELECT COUNT(*) FROM proxies') as cursor:
            row = await cursor.fetchone()
            stats['total'] = row[0] if row else 0
        async with db.execute('SELECT COUNT(*) FROM proxies WHERE score > 0') as cursor:
            row = await cursor.fetchone()
            stats['available'] = row[0] if row else 0
        # Parameterized per-protocol counts. The old queries used
        # double-quoted "literals", which SQLite only accepts as a
        # compatibility quirk (it first tries them as identifiers).
        for proto in VALID_PROTOCOLS:
            async with db.execute('SELECT COUNT(*) FROM proxies WHERE protocol = ?', (proto,)) as cursor:
                row = await cursor.fetchone()
                stats[f'{proto}_count'] = row[0] if row else 0
        async with db.execute('SELECT AVG(score) FROM proxies') as cursor:
            row = await cursor.fetchone()
            stats['avg_score'] = row[0] if row and row[0] else 0
        return stats

    async def get_today_new_count(self):
        """Return the number of proxies last checked today (local time)."""
        try:
            db = await self.get_connection()
            query = '''
                SELECT COUNT(*) FROM proxies
                WHERE DATE(last_check) = DATE('now', 'localtime')
            '''
            async with db.execute(query) as cursor:
                row = await cursor.fetchone()
                return row[0] if row else 0
        except Exception as e:
            logger.error(f"获取今日新增数量失败: {e}")
            return 0

    async def clean_invalid_proxies(self):
        """Delete all proxies with score <= 0; return how many were removed."""
        db = await self.get_connection()
        async with db.execute('DELETE FROM proxies WHERE score <= 0') as cursor:
            deleted_count = cursor.rowcount
        await db.commit()
        return deleted_count

76
core/validator.py Normal file
View File

@@ -0,0 +1,76 @@
import asyncio
import aiohttp
import random
import time
from core.log import logger
class ProxyValidator:
    """Concurrent proxy checker built on aiohttp.

    Use as ``async with ProxyValidator() as v: await v.validate(...)`` so
    the shared ClientSession is created and closed properly.
    """

    def __init__(self, max_concurrency=50, timeout=5):
        # Lightweight endpoints that echo the caller's IP — well suited to
        # verifying that traffic really flows through the proxy.
        self.http_sources = [
            "http://httpbin.org/ip",
            "http://api.ipify.org"
        ]
        self.https_sources = [
            "https://httpbin.org/ip",
            "https://api.ipify.org"
        ]
        # Bounds the number of in-flight validations.
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.timeout = timeout
        self.session = None

    async def __aenter__(self):
        """Create the shared session on entering ``async with``."""
        if not self.session:
            self.session = aiohttp.ClientSession(
                connector=aiohttp.TCPConnector(ssl=False, limit=0, force_close=True),
                timeout=aiohttp.ClientTimeout(total=self.timeout, connect=3)
            )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the shared session on exit."""
        if self.session:
            await self.session.close()

    async def validate(self, ip, port, protocol='http'):
        """Check whether one proxy works; return (ok, latency_ms).

        NOTE(review): the proxy URL is always ``http://`` — socks4/socks5
        entries are therefore exercised as plain HTTP proxies (aiohttp has
        no built-in socks support). Confirm this is intended.
        """
        protocol = protocol.lower()
        sources = self.https_sources if protocol == 'https' else self.http_sources
        test_url = random.choice(sources)
        proxy_url = f"http://{ip}:{port}"
        async with self.semaphore:
            start_time = time.time()
            try:
                # Reuse the session created in __aenter__.
                async with self.session.get(
                    test_url,
                    proxy=proxy_url,
                    allow_redirects=True,
                    timeout=aiohttp.ClientTimeout(total=self.timeout, connect=3)
                ) as response:
                    if response.status in [200, 301, 302]:
                        try:
                            content = await response.text()
                            # The echo services mention "ip"/"origin" in the body.
                            if 'ip' in content.lower() or 'origin' in content.lower():
                                latency = round((time.time() - start_time) * 1000, 2)
                                logger.info(f"验证成功: {ip}:{port} ({protocol}) - 延迟: {latency}ms")
                                return True, latency
                        except Exception:
                            # Was a bare `except:` — that would also swallow
                            # CancelledError/KeyboardInterrupt. An undecodable
                            # body with a good status still counts as working.
                            latency = round((time.time() - start_time) * 1000, 2)
                            logger.info(f"验证成功: {ip}:{port} ({protocol}) - 延迟: {latency}ms")
                            return True, latency
                    return False, 0
            except asyncio.TimeoutError:
                logger.warning(f"验证超时: {ip}:{port} ({protocol})")
                return False, 0
            except Exception as e:
                logger.warning(f"验证失败: {ip}:{port} ({protocol}) - {e}")
                return False, 0