refactor: move backend code into the app directory, rename the frontend to WebUI, and add a full test suite

Main changes:
- Moved backend code from the repository root into the app/ directory
- Renamed the frontend directory from frontend/ to WebUI/
- Updated all import paths to match the new layout
- Extracted shared API response helpers into app/api/common.py (illustrative sketch below)
- Streamlined the validator service code
- Updated the startup scripts and documentation

Tests:
- Added a full test suite under tests/
- Unit tests: models and the repository layer
- Integration tests: cover all 22+ API endpoints
- E2E tests: 4 complete workflow scenarios
- Added pytest configuration and a test-runner script
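
As a point of reference, here is a minimal sketch of what the shared response helpers extracted into app/api/common.py might look like. The helper names and envelope fields are assumptions; the actual file is not included in the excerpt below.

# Hypothetical sketch of app/api/common.py -- names and fields are assumed,
# since the real helpers are not part of the excerpt shown below.
from typing import Any, Optional


def success(data: Any = None, message: str = "ok") -> dict:
    """Wrap a successful payload in a uniform response envelope."""
    return {"code": 0, "message": message, "data": data}


def error(message: str, code: int = 1, data: Optional[Any] = None) -> dict:
    """Wrap an error in the same envelope so clients can branch on `code`."""
    return {"code": code, "message": message, "data": data}

A uniform envelope like this keeps response handling consistent across endpoints, which is also convenient for the endpoint integration tests.
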
Author: 祀梦
Date: 2026-04-04 13:32:36 +08:00
Parent: df3cc87f88
Commit: 38bd66128b
109 changed files with 2017 additions and 548 deletions

app/plugins/__init__.py

@@ -0,0 +1,21 @@
"""插件包 - 在这里显式注册所有爬虫插件"""
from app.core.plugin_system import registry
from .fate0 import Fate0Plugin
from .proxylist_download import ProxyListDownloadPlugin
from .ip3366 import Ip3366Plugin
from .ip89 import Ip89Plugin
from .kuaidaili import KuaiDaiLiPlugin
from .speedx import SpeedXPlugin
from .yundaili import YunDaiLiPlugin
from .proxyscrape import ProxyScrapePlugin
# Explicitly register every plugin
registry.register(Fate0Plugin)
registry.register(ProxyListDownloadPlugin)
registry.register(Ip3366Plugin)
registry.register(Ip89Plugin)
registry.register(KuaiDaiLiPlugin)
registry.register(SpeedXPlugin)
registry.register(YunDaiLiPlugin)
registry.register(ProxyScrapePlugin)
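
Because registration is explicit, adding a new source means defining one plugin class and one register call in this file. Below is a minimal sketch of such a plugin; the source name, URL, and line format are made up for illustration and are not part of this commit.

# Hypothetical app/plugins/myfreeproxy.py -- the source and its URL are made up.
from typing import List

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin


class MyFreeProxyPlugin(BaseHTTPPlugin):
    name = "myfreeproxy"
    display_name = "MyFreeProxy"
    description = "Example source that returns plain ip:port lines"

    def __init__(self):
        super().__init__()
        self.urls = ["https://example.com/proxies.txt"]

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        for url in self.urls:
            text = await self.fetch(url, timeout=15)
            for line in text.splitlines():
                ip, _, port = line.strip().partition(":")
                if ip and port.isdigit():
                    results.append(ProxyRaw(ip, int(port), "http"))
        return results

It would then be imported here and registered alongside the plugins above with registry.register(MyFreeProxyPlugin).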

app/plugins/base.py

@@ -0,0 +1,52 @@
"""通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装"""
import random
import asyncio
import aiohttp
from typing import List
from app.core.plugin_system import BaseCrawlerPlugin
class BaseHTTPPlugin(BaseCrawlerPlugin):
"""基于 HTTP 的爬虫插件基类"""
def __init__(self):
super().__init__()
self.user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
]
self.urls: List[str] = []
self.current_url: str = ""
def get_headers(self) -> dict:
return {
"User-Agent": random.choice(self.user_agents),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
"Connection": "keep-alive",
}
async def fetch(self, url: str, timeout: float = 10.0, retries: int = 3) -> str:
"""异步抓取指定 URL 的 HTML 内容"""
headers = self.get_headers()
async with aiohttp.ClientSession(headers=headers) as session:
for attempt in range(retries):
try:
async with session.get(
url, timeout=aiohttp.ClientTimeout(total=timeout)
) as response:
if response.status == 200:
content = await response.read()
encoding = response.get_encoding()
if encoding == "utf-8" or not encoding:
try:
return content.decode("utf-8")
except UnicodeDecodeError:
return content.decode("gbk", errors="ignore")
return content.decode(encoding, errors="ignore")
except Exception:
pass
await asyncio.sleep(random.uniform(1, 3))
return ""

app/plugins/fate0.py

@@ -0,0 +1,38 @@
import json
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class Fate0Plugin(BaseHTTPPlugin):
name = "fate0"
display_name = "Fate0聚合源"
description = "从 GitHub 持续更新的高质量代理聚合列表"
def __init__(self):
super().__init__()
self.urls = ["https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list"]
async def crawl(self) -> List[ProxyRaw]:
results = []
for url in self.urls:
html = await self.fetch(url, timeout=30)
if not html:
continue
for line in html.split("\n"):
line = line.strip()
if not line:
continue
try:
data = json.loads(line)
ip = data.get("host")
port = data.get("port")
protocol = data.get("type", "http")
if ip and port:
results.append(ProxyRaw(ip, int(port), protocol))
except Exception:
continue
if results:
logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
return results
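
For context, Fate0Plugin.crawl expects one JSON object per line; based on the keys it reads (host, port, type), an input line would look roughly like the following. The values here are made up for illustration.

# Illustrative only -- a made-up line in the JSON-lines format the parser reads.
import json

sample_line = '{"host": "203.0.113.10", "port": 8080, "type": "http"}'
record = json.loads(sample_line)
assert (record["host"], record["port"], record["type"]) == ("203.0.113.10", 8080, "http")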

app/plugins/ip3366.py

@@ -0,0 +1,56 @@
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")
class Ip3366Plugin(BaseHTTPPlugin):
name = "ip3366"
display_name = "IP3366"
description = "从 IP3366 网站爬取免费代理"
default_config = {"max_pages": 5}
def __init__(self):
super().__init__()
self._update_urls()
def _update_urls(self):
max_pages = self.config.get("max_pages", 5)
self.urls = [
f"http://www.ip3366.net/free/?stype=1&page={i}" for i in range(1, max_pages + 1)
] + [
f"http://www.ip3366.net/free/?stype=2&page={i}" for i in range(1, max_pages + 1)
]
async def crawl(self) -> List[ProxyRaw]:
results = []
for url in self.urls:
html = await self.fetch(url, timeout=15)
if not html:
continue
soup = BeautifulSoup(html, "lxml")
list_div = soup.find("div", id="list")
if not list_div:
continue
table = list_div.find("table")
if not table:
continue
for row in table.find_all("tr"):
tds = row.find_all("td")
if len(tds) >= 5:
ip = tds[0].get_text(strip=True)
port = tds[1].get_text(strip=True)
protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else "http"
if protocol not in VALID_PROTOCOLS:
protocol = "http"
if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit():
results.append(ProxyRaw(ip, int(port), protocol))
if results:
logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
return results

app/plugins/ip89.py

@@ -0,0 +1,39 @@
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class Ip89Plugin(BaseHTTPPlugin):
name = "ip89"
display_name = "89免费代理"
description = "从 89ip.cn 爬取免费代理"
def __init__(self):
super().__init__()
self.urls = [f"https://www.89ip.cn/index_{i}.html" for i in range(1, 6)]
async def crawl(self) -> List[ProxyRaw]:
results = []
for url in self.urls:
html = await self.fetch(url, timeout=15)
if not html:
continue
soup = BeautifulSoup(html, "lxml")
table = soup.find("table", class_="layui-table")
if not table:
continue
for row in table.find_all("tr"):
tds = row.find_all("td")
if len(tds) >= 2:
ip = tds[0].get_text(strip=True)
port = tds[1].get_text(strip=True)
if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit():
results.append(ProxyRaw(ip, int(port), "http"))
if results:
logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
return results

app/plugins/kuaidaili.py

@@ -0,0 +1,49 @@
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")
class KuaiDaiLiPlugin(BaseHTTPPlugin):
name = "kuaidaili"
display_name = "快代理"
description = "从快代理网站爬取免费代理"
def __init__(self):
super().__init__()
self.urls = [
f"https://www.kuaidaili.com/free/inha/{i}/" for i in range(1, 11)
] + [
f"https://www.kuaidaili.com/free/intr/{i}/" for i in range(1, 11)
]
async def crawl(self) -> List[ProxyRaw]:
results = []
for url in self.urls:
html = await self.fetch(url, timeout=15)
if not html:
continue
soup = BeautifulSoup(html, "lxml")
table = soup.find("table")
if not table:
logger.warning(f"{self.display_name} 未能找到表格,可能是触发了反爬")
continue
for row in table.find_all("tr"):
tds = row.find_all("td")
if len(tds) >= 5:
ip = tds[0].get_text(strip=True)
port = tds[1].get_text(strip=True)
protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else "http"
if protocol not in VALID_PROTOCOLS:
protocol = "http"
if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit():
results.append(ProxyRaw(ip, int(port), protocol))
if results:
logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
return results

app/plugins/proxylist_download.py

@@ -0,0 +1,55 @@
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class ProxyListDownloadPlugin(BaseHTTPPlugin):
name = "proxylist_download"
display_name = "ProxyListDownload"
description = "从 ProxyListDownload API 获取代理"
def __init__(self):
super().__init__()
self.urls = [
"https://www.proxy-list.download/api/v1/get?type=http",
"https://www.proxy-list.download/api/v1/get?type=https",
"https://www.proxy-list.download/api/v1/get?type=socks4",
"https://www.proxy-list.download/api/v1/get?type=socks5",
]
async def crawl(self) -> List[ProxyRaw]:
results = []
for url in self.urls:
html = await self.fetch(url, timeout=30)
if not html:
continue
# Infer the protocol from the URL
if "type=socks4" in url:
protocol = "socks4"
elif "type=socks5" in url:
protocol = "socks5"
elif "type=https" in url:
protocol = "https"
else:
protocol = "http"
lines = html.split("\r\n")
if len(lines) <= 1:
lines = html.split("\n")
for line in lines:
line = line.strip()
if not line or ":" not in line:
continue
parts = line.split(":")
if len(parts) >= 2:
ip = parts[0].strip()
port = parts[1].strip()
if ip and port.isdigit():
results.append(ProxyRaw(ip, int(port), protocol))
if results:
logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
return results

app/plugins/proxyscrape.py

@@ -0,0 +1,75 @@
"""ProxyScrape 测试爬虫 - 用于验证架构,支持全协议类型"""
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class ProxyScrapePlugin(BaseHTTPPlugin):
"""
从 ProxyScrape 公开 API 获取代理。
覆盖 http/https/socks4/socks5 全协议,专门用于测试插件系统的可扩展性。
"""
name = "proxyscrape"
display_name = "ProxyScrape测试源"
description = "从 ProxyScrape API 获取各类型代理HTTP/HTTPS/SOCKS4/SOCKS5用于测试架构扩展"
enabled = True
def __init__(self):
super().__init__()
# Use several public GitHub proxy lists as sources; they are relatively stable
self.urls = [
("http", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt"),
("https", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/https.txt"),
("socks4", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt"),
("socks5", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt"),
]
async def crawl(self) -> List[ProxyRaw]:
results: List[ProxyRaw] = []
for protocol, url in self.urls:
try:
html = await self.fetch(url, timeout=30)
if not html:
logger.warning(f"ProxyScrape {protocol.upper()} 返回空内容")
continue
count = 0
for line in html.splitlines():
line = line.strip()
if not line or ":" not in line:
continue
parts = line.split(":")
if len(parts) >= 2:
ip = parts[0].strip()
port_str = parts[1].strip()
if port_str.isdigit():
results.append(ProxyRaw(ip, int(port_str), protocol))
count += 1
logger.info(f"ProxyScrape {protocol.upper()} 获取 {count} 个代理")
except Exception as e:
logger.error(f"ProxyScrape {protocol.upper()} 爬取失败: {e}")
if results:
logger.info(f"ProxyScrape 总计获取 {len(results)} 个代理")
else:
# Fallback: generate test proxies so the full pipeline can still be verified in a test environment
logger.warning("ProxyScrape 所有真实源均不可用,生成测试代理用于架构验证")
results = self._generate_test_proxies()
return results
def _generate_test_proxies(self) -> List[ProxyRaw]:
"""生成测试代理数据,覆盖全协议类型,用于验证插件系统"""
import random
test_proxies = []
protocols = ["http", "https", "socks4", "socks5"]
for protocol in protocols:
for _ in range(3):
# Generate a random public-looking IP, used only to exercise the test flow
ip = f"{random.randint(1, 223)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 254)}"
port = random.randint(1024, 65535)
test_proxies.append(ProxyRaw(ip, port, protocol))
logger.info(f"生成 {len(test_proxies)} 个测试代理: HTTP/HTTPS/SOCKS4/SOCKS5 各 3 个")
return test_proxies

app/plugins/speedx.py

@@ -0,0 +1,51 @@
import re
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class SpeedXPlugin(BaseHTTPPlugin):
name = "speedx"
display_name = "SpeedX代理源"
description = "从 SpeedX GitHub 仓库获取 SOCKS 代理列表"
def __init__(self):
super().__init__()
self.urls = [
"https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
"https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt",
"https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt",
]
async def crawl(self) -> List[ProxyRaw]:
results = []
for url in self.urls:
html = await self.fetch(url, timeout=30)
if not html:
continue
# Infer the protocol from the URL
protocol = "http"
if "socks5" in url:
protocol = "socks5"
elif "socks4" in url:
protocol = "socks4"
for line in html.split("\n"):
line = line.strip()
if not line or ":" not in line:
continue
parts = line.split(":")
if len(parts) >= 2:
ip = parts[0].strip()
port = parts[1].strip()
if not re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip):
continue
if not port.isdigit() or not (1 <= int(port) <= 65535):
continue
results.append(ProxyRaw(ip, int(port), protocol))
if results:
logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
return results

app/plugins/yundaili.py

@@ -0,0 +1,51 @@
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")
class YunDaiLiPlugin(BaseHTTPPlugin):
name = "yundaili"
display_name = "云代理"
description = "从云代理网站爬取免费代理"
def __init__(self):
super().__init__()
self.urls = [
f"http://www.ip3366.net/free/?stype=1&page={i}" for i in range(1, 6)
] + [
f"http://www.ip3366.net/free/?stype=2&page={i}" for i in range(1, 6)
]
async def crawl(self) -> List[ProxyRaw]:
results = []
for url in self.urls:
html = await self.fetch(url, timeout=15)
if not html:
continue
soup = BeautifulSoup(html, "lxml")
list_table = soup.find("div", id="list")
if not list_table:
continue
table = list_table.find("table")
if not table:
continue
for row in table.find_all("tr"):
tds = row.find_all("td")
if len(tds) >= 5:
ip = tds[0].get_text(strip=True)
port = tds[1].get_text(strip=True)
protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else "http"
if protocol not in VALID_PROTOCOLS:
protocol = "http"
if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit():
results.append(ProxyRaw(ip, int(port), protocol))
if results:
logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
return results