refactor: 全面重构核心架构，消除反复修改的根因

- 删除 ValidationQueue 双轨持久化队列，替换为纯内存 AsyncWorkerPool
- 引入统一后台任务框架 JobExecutor（Job/CrawlJob/ValidateAllJob）
- 新增 PluginRunner 统一插件执行（超时、重试、健康检查、统计）
- 重构 SchedulerService，职责收敛为仅定时触发 ValidateAllJob
- 使用 AsyncExitStack 重构 lifespan，安全管理长生命周期资源
- 路由层瘦身 50%+，业务异常上抛由全局中间件统一处理
- 实现设置全热更新（WorkerPool 并发、Validator 超时即时生效）
- 前端 Store 强制写后重新拉取，消除乐观更新导致的数据不同步
- 删除 queue.py / task_repo.py / task_service.py
- 新增 execution 单元测试，全部 85 个测试通过
2026-04-04 22:36:57 +08:00
parent 4ef7931941
commit b972b64616
33 changed files with 1168 additions and 864 deletions
--- a/app/plugins/base.py
+++ b/app/plugins/base.py
@@ -1,9 +1,15 @@
 """通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装"""
+import re
 import random
 import asyncio
 import httpx
 from typing import List, Optional
+from bs4 import BeautifulSoup
 from app.core.plugin_system import BaseCrawlerPlugin
+from app.models.domain import ProxyRaw
+
+
+VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")


 class BaseHTTPPlugin(BaseCrawlerPlugin):
@@ -20,6 +26,7 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
        self.urls: List[str] = []
        self.current_url: str = ""
        self._client: Optional[httpx.AsyncClient] = None
+        self.max_concurrency: int = 3

    def get_headers(self) -> dict:
        return {
@@ -39,13 +46,21 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
            )
        return self._client

-    async def fetch(self, url: str, timeout: float = 15.0, retries: int = 2) -> str:
+    async def fetch(
+        self,
+        url: str,
+        timeout: float = 15.0,
+        retries: int = 2,
+        raise_for_status: bool = False,
+    ) -> str:
        """异步抓取指定 URL 的 HTML 内容"""
        from app.core.log import logger
        client = self._get_client()
        for attempt in range(retries):
            try:
                response = await client.get(url, headers=self.get_headers(), timeout=timeout)
+                if raise_for_status:
+                    response.raise_for_status()
                if response.status_code == 200:
                    content = response.content
                    encoding = response.encoding
@@ -64,8 +79,8 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
        return ""

    async def fetch_all(self, urls: List[str], timeout: float = 15.0) -> List[str]:
-        """并发抓取多个 URL，限制单个插件内部并发为 3"""
-        semaphore = asyncio.Semaphore(3)
+        """并发抓取多个 URL，限制单个插件内部并发"""
+        semaphore = asyncio.Semaphore(self.max_concurrency)

        async def _fetch_limited(url: str):
            async with semaphore:
@@ -74,6 +89,70 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
        tasks = [_fetch_limited(url) for url in urls]
        return await asyncio.gather(*tasks)

+    def parse_text_proxies(self, text: str, protocol: str = "http") -> List[ProxyRaw]:
+        """解析 ip:port 格式的文本代理列表
+
+        统一处理 \r\n、\n 两种换行以及可能存在的空行。
+        """
+        results = []
+        text = text.replace("\r\n", "\n").replace("\r", "\n")
+        for line in text.split("\n"):
+            line = line.strip()
+            if not line or ":" not in line:
+                continue
+            ip, _, port = line.rpartition(":")
+            ip = ip.strip()
+            port = port.strip()
+            if ip and port.isdigit() and 1 <= int(port) <= 65535:
+                try:
+                    results.append(ProxyRaw(ip, int(port), protocol))
+                except ValueError:
+                    continue
+        return results
+
+    def parse_html_table(
+        self,
+        html: str,
+        column_map: dict,
+        protocol: str = "http",
+    ) -> List[ProxyRaw]:
+        """通用 HTML 表格解析器
+
+        Args:
+            html: HTML 文本
+            column_map: 列名到索引的映射，如 {"ip": 0, "port": 1, "protocol": 4}
+            protocol: 默认协议，如果表格中没有协议列则使用此值
+        """
+        results = []
+        soup = BeautifulSoup(html, "lxml")
+        table = soup.find("table")
+        if not table:
+            return results
+
+        ip_idx = column_map.get("ip", 0)
+        port_idx = column_map.get("port", 1)
+        protocol_idx = column_map.get("protocol", -1)
+
+        for row in table.find_all("tr"):
+            tds = row.find_all("td")
+            if len(tds) <= max(ip_idx, port_idx):
+                continue
+            ip = tds[ip_idx].get_text(strip=True)
+            port = tds[port_idx].get_text(strip=True)
+            if protocol_idx >= 0 and len(tds) > protocol_idx:
+                proto = tds[protocol_idx].get_text(strip=True).lower()
+                if proto not in VALID_PROTOCOLS:
+                    proto = protocol
+            else:
+                proto = protocol
+
+            if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit() and 1 <= int(port) <= 65535:
+                try:
+                    results.append(ProxyRaw(ip, int(port), proto))
+                except ValueError:
+                    continue
+        return results
+
    async def close(self):
        """关闭复用的 HTTP 客户端"""
        if self._client and not self._client.is_closed: