refactor: rebuild the core architecture to eliminate the root causes of repeated rework
- Remove the ValidationQueue dual-track persistent queue and replace it with a pure in-memory AsyncWorkerPool
- Introduce a unified background-job framework, JobExecutor (Job/CrawlJob/ValidateAllJob)
- Add PluginRunner to unify plugin execution (timeouts, retries, health checks, statistics)
- Refactor SchedulerService so its only responsibility is triggering ValidateAllJob on a schedule
- Rework lifespan with AsyncExitStack to manage long-lived resources safely
- Slim the route layer by 50%+; business exceptions bubble up and are handled centrally by the global middleware
- Implement full hot-reload of settings (WorkerPool concurrency and Validator timeouts take effect immediately)
- Force the frontend store to re-fetch after writes, eliminating data drift from optimistic updates
- Delete queue.py / task_repo.py / task_service.py
- Add unit tests for execution; all 85 tests pass
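None of the new framework code (JobExecutor, PluginRunner) appears in the diff below, which only touches the HTTP plugin base class. As a minimal sketch of the PluginRunner idea named above, a timeout-plus-retry guard around a plugin call, note that the function name, signature, and defaults here are assumptions for illustration, not the project's actual API:

import asyncio
from typing import Awaitable, Callable, List, Optional


async def run_with_guard(
    crawl: Callable[[], Awaitable[List[str]]],
    timeout: float = 30.0,
    retries: int = 2,
) -> List[str]:
    # Hypothetical guard approximating what the commit message calls PluginRunner.
    last_error: Optional[Exception] = None
    for attempt in range(retries + 1):
        try:
            # Bound every attempt so one hung plugin cannot stall the worker pool.
            return await asyncio.wait_for(crawl(), timeout=timeout)
        except Exception as exc:
            last_error = exc
    assert last_error is not None
    raise last_error

The SchedulerService split follows the same principle: the scheduler only decides when a ValidateAllJob is enqueued, and the executor owns how it runs.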
@@ -1,9 +1,15 @@
"""Generic HTTP crawler base class - provides a wrapper for plugins based on HTTP requests"""
import re
import random
import asyncio
import httpx
from typing import List, Optional
from bs4 import BeautifulSoup
from app.core.plugin_system import BaseCrawlerPlugin
from app.models.domain import ProxyRaw


VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")


class BaseHTTPPlugin(BaseCrawlerPlugin):
@@ -20,6 +26,7 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
        self.urls: List[str] = []
        self.current_url: str = ""
        self._client: Optional[httpx.AsyncClient] = None
        self.max_concurrency: int = 3

    def get_headers(self) -> dict:
        return {
@@ -39,13 +46,21 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
            )
        return self._client

    async def fetch(self, url: str, timeout: float = 15.0, retries: int = 2) -> str:
    async def fetch(
        self,
        url: str,
        timeout: float = 15.0,
        retries: int = 2,
        raise_for_status: bool = False,
    ) -> str:
        """Asynchronously fetch the HTML content of the given URL"""
        from app.core.log import logger
        client = self._get_client()
        for attempt in range(retries):
            try:
                response = await client.get(url, headers=self.get_headers(), timeout=timeout)
                if raise_for_status:
                    response.raise_for_status()
                if response.status_code == 200:
                    content = response.content
                    encoding = response.encoding
@@ -64,8 +79,8 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
        return ""

    async def fetch_all(self, urls: List[str], timeout: float = 15.0) -> List[str]:
        """Fetch multiple URLs concurrently; internal per-plugin concurrency is capped at 3"""
        semaphore = asyncio.Semaphore(3)
        """Fetch multiple URLs concurrently; internal per-plugin concurrency is capped"""
        semaphore = asyncio.Semaphore(self.max_concurrency)

        async def _fetch_limited(url: str):
            async with semaphore:
@@ -74,6 +89,70 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
        tasks = [_fetch_limited(url) for url in urls]
        return await asyncio.gather(*tasks)

    def parse_text_proxies(self, text: str, protocol: str = "http") -> List[ProxyRaw]:
        """Parse a plain-text proxy list in ip:port format

        Handles both \r\n and \n line endings uniformly, as well as any blank lines.
        """
        results = []
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        for line in text.split("\n"):
            line = line.strip()
            if not line or ":" not in line:
                continue
            ip, _, port = line.rpartition(":")
            ip = ip.strip()
            port = port.strip()
            if ip and port.isdigit() and 1 <= int(port) <= 65535:
                try:
                    results.append(ProxyRaw(ip, int(port), protocol))
                except ValueError:
                    continue
        return results

    def parse_html_table(
        self,
        html: str,
        column_map: dict,
        protocol: str = "http",
    ) -> List[ProxyRaw]:
        """Generic HTML table parser

        Args:
            html: HTML text
            column_map: mapping from column names to indices, e.g. {"ip": 0, "port": 1, "protocol": 4}
            protocol: default protocol, used when the table has no protocol column
        """
        results = []
        soup = BeautifulSoup(html, "lxml")
        table = soup.find("table")
        if not table:
            return results

        ip_idx = column_map.get("ip", 0)
        port_idx = column_map.get("port", 1)
        protocol_idx = column_map.get("protocol", -1)

        for row in table.find_all("tr"):
            tds = row.find_all("td")
            if len(tds) <= max(ip_idx, port_idx):
                continue
            ip = tds[ip_idx].get_text(strip=True)
            port = tds[port_idx].get_text(strip=True)
            if protocol_idx >= 0 and len(tds) > protocol_idx:
                proto = tds[protocol_idx].get_text(strip=True).lower()
                if proto not in VALID_PROTOCOLS:
                    proto = protocol
            else:
                proto = protocol

            if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit() and 1 <= int(port) <= 65535:
                try:
                    results.append(ProxyRaw(ip, int(port), proto))
                except ValueError:
                    continue
        return results

    async def close(self):
        """Close the reused HTTP client"""
        if self._client and not self._client.is_closed:
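For context on how the helpers above compose, a concrete plugin would typically pair fetch_all with one of the parsers. The subclass below is purely illustrative: the crawl() entry point, the zero-argument constructor call, and the placeholder URL are assumptions, since the BaseCrawlerPlugin contract is defined outside this diff; only the BaseHTTPPlugin helpers shown above are real.

from typing import List

from app.models.domain import ProxyRaw


class ExampleTablePlugin(BaseHTTPPlugin):
    # Hypothetical subclass for illustration; not part of this commit.
    def __init__(self):
        super().__init__()
        self.urls = ["https://example.com/free-proxy-list"]  # placeholder source

    async def crawl(self) -> List[ProxyRaw]:
        pages = await self.fetch_all(self.urls)
        proxies: List[ProxyRaw] = []
        for html in pages:
            # Failed fetches come back as "", which the parser maps to zero rows.
            # Assumed column layout: IP in column 0, port in column 1, protocol in column 4.
            proxies.extend(
                self.parse_html_table(html, {"ip": 0, "port": 1, "protocol": 4})
            )
        return proxies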