feat: fpw plugins, validation/crawl perf, WS stats, test DB isolation
- Add Free_Proxy_Website-style fpw_* plugins and register them
- Per-plugin crawl timeout (crawl_timeout_seconds=120); remove global crawl_timeout setting
- Validator: fix connect vs total timeout on save; SOCKS session LRU cache; drop redundant semaphore
- Validation handler uses single DB connection; batch upsert after crawl; WorkerPool put_nowait
- Remove unused max_retries from settings API/UI; settings maintenance SQL + init_db cleanup of deprecated keys
- WebSocket dashboard stats; ProxyList pool_filter and API alignment
- POST /api/proxies/delete-one for IPv6-safe deletes; task poll stops on 404
- pytest uses PROXYPOOL_DB_PATH=db/proxies.test.sqlite so tests do not wipe the production DB (see the sketch below)
- .gitignore: explicit proxies.test.sqlite patterns; fix plugin_service ValidationException import

Made-with: Cursor
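For the test-database isolation bullet above, the essential move is exporting PROXYPOOL_DB_PATH before any application module binds to the default database file. A minimal conftest.py sketch, assuming the app reads that variable at import/startup time; the fixture name and cleanup step are illustrative and not taken from the repository:

# conftest.py (illustrative sketch; the project's actual test wiring may differ)
import os
import pathlib

import pytest

# Must run before app modules are imported, so the engine binds to the test file
# instead of the production db/proxies.sqlite.
os.environ.setdefault("PROXYPOOL_DB_PATH", "db/proxies.test.sqlite")


@pytest.fixture(scope="session", autouse=True)
def _throwaway_test_db():
    yield
    # Drop the throwaway database once the whole test session has finished.
    pathlib.Path(os.environ["PROXYPOOL_DB_PATH"]).unlink(missing_ok=True)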
@@ -3,7 +3,7 @@ import re
 import random
 import asyncio
 import httpx
-from typing import List, Optional
+from typing import Dict, List, Optional
 from bs4 import BeautifulSoup
 from app.core.plugin_system import BaseCrawlerPlugin
 from app.models.domain import ProxyRaw
@@ -43,9 +43,56 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
             self._client = httpx.AsyncClient(
                 transport=transport,
                 follow_redirects=True,
+                # Ignore system HTTP(S)_PROXY settings; a misconfigured proxy would make every listing site unreachable
+                trust_env=False,
             )
         return self._client
 
+    @staticmethod
+    def _http_timeout(seconds: float) -> httpx.Timeout:
+        """Tighten the connect phase separately, so AsyncClient cannot hang in connect for long in some environments."""
+        t = max(2.0, float(seconds))
+        c = min(6.0, max(3.0, t * 0.35))
+        return httpx.Timeout(t, connect=c)
+
+    @staticmethod
+    def _decode_response_body(response: httpx.Response) -> str:
+        content = response.content
+        encoding = response.encoding
+        if encoding == "utf-8" or not encoding:
+            try:
+                return content.decode("utf-8")
+            except UnicodeDecodeError:
+                return content.decode("gbk", errors="ignore")
+        return content.decode(encoding, errors="ignore")
+
+    def _sync_get(self, url: str, timeout: float, headers: dict) -> str:
+        """Synchronous GET (on Windows some sites hit ConnectTimeout with AsyncClient but work with a synchronous Client)."""
+        to = BaseHTTPPlugin._http_timeout(timeout)
+        with httpx.Client(
+            transport=httpx.HTTPTransport(retries=0),
+            follow_redirects=True,
+            trust_env=False,
+        ) as c:
+            r = c.get(url, headers=headers, timeout=to)
+            if r.status_code != 200:
+                return ""
+            return self._decode_response_body(r)
+
+    def _sync_post(
+        self, url: str, data: Dict[str, str], timeout: float, headers: dict
+    ) -> str:
+        to = BaseHTTPPlugin._http_timeout(timeout)
+        with httpx.Client(
+            transport=httpx.HTTPTransport(retries=0),
+            follow_redirects=True,
+            trust_env=False,
+        ) as c:
+            r = c.post(url, headers=headers, data=data, timeout=to)
+            if r.status_code != 200:
+                return ""
+            return self._decode_response_body(r)
+
     async def fetch(
         self,
         url: str,
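The helpers added above implement two ideas: the connect phase gets its own, tighter budget (roughly 35% of the total, clamped to 3-6 seconds), and a blocking httpx.Client call can be offloaded to a worker thread as a fallback when AsyncClient stalls on connect. A self-contained sketch of that fallback pattern, independent of the plugin class; the function names here are illustrative and not part of the project's API:

import asyncio
import httpx


def sync_get(url: str, timeout: httpx.Timeout) -> str:
    # Blocking client as the fallback path; trust_env=False ignores system proxy vars.
    with httpx.Client(follow_redirects=True, trust_env=False) as client:
        resp = client.get(url, timeout=timeout)
        return resp.text if resp.status_code == 200 else ""


async def fetch_with_fallback(url: str, total: float = 15.0) -> str:
    # Connect budget: ~35% of the total, clamped to 3-6 s (same rule as _http_timeout above).
    timeout = httpx.Timeout(max(2.0, total), connect=min(6.0, max(3.0, total * 0.35)))
    try:
        async with httpx.AsyncClient(follow_redirects=True, trust_env=False) as client:
            resp = await client.get(url, timeout=timeout)
            if resp.status_code == 200:
                return resp.text
    except httpx.HTTPError:
        pass
    # Run the blocking client in a worker thread so the event loop stays free.
    return await asyncio.to_thread(sync_get, url, timeout)

# Usage: asyncio.run(fetch_with_fallback("https://example.com"))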
@@ -56,35 +103,81 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
         """Asynchronously fetch the HTML content of the given URL."""
         from app.core.log import logger
         client = self._get_client()
+        to = self._http_timeout(timeout)
         for attempt in range(retries):
             try:
-                response = await client.get(url, headers=self.get_headers(), timeout=timeout)
+                response = await client.get(url, headers=self.get_headers(), timeout=to)
                 if raise_for_status:
                     response.raise_for_status()
                 if response.status_code == 200:
-                    content = response.content
-                    encoding = response.encoding
-                    if encoding == "utf-8" or not encoding:
-                        try:
-                            return content.decode("utf-8")
-                        except UnicodeDecodeError:
-                            return content.decode("gbk", errors="ignore")
-                    return content.decode(encoding, errors="ignore")
-                else:
-                    logger.warning(f"Fetch {url} returned status {response.status_code}")
+                    return self._decode_response_body(response)
+                logger.warning(f"Fetch {url} returned status {response.status_code}")
             except Exception as e:
                 logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
             if attempt < retries - 1:
                 await asyncio.sleep(random.uniform(1, 3))
+        try:
+            text = await asyncio.to_thread(
+                self._sync_get, url, timeout, self.get_headers()
+            )
+            if text:
+                logger.info(f"Fetch {url} succeeded via synchronous fallback")
+                return text
+        except Exception as e:
+            logger.warning(f"Fetch {url} synchronous fallback failed: {e}")
         return ""
 
-    async def fetch_all(self, urls: List[str], timeout: float = 15.0) -> List[str]:
+    async def fetch_post(
+        self,
+        url: str,
+        data: Optional[Dict[str, str]] = None,
+        timeout: float = 15.0,
+        retries: int = 2,
+    ) -> str:
+        """POST application/x-www-form-urlencoded; used for form-driven pages such as spys.one."""
+        from app.core.log import logger
+
+        client = self._get_client()
+        payload = data or {}
+        to = self._http_timeout(timeout)
+        for attempt in range(retries):
+            try:
+                response = await client.post(
+                    url,
+                    headers=self.get_headers(),
+                    data=payload,
+                    timeout=to,
+                )
+                if response.status_code == 200:
+                    return self._decode_response_body(response)
+                logger.warning(f"POST {url} returned status {response.status_code}")
+            except Exception as e:
+                logger.warning(f"POST {url} failed (attempt {attempt + 1}/{retries}): {e}")
+            if attempt < retries - 1:
+                await asyncio.sleep(random.uniform(1, 3))
+        try:
+            text = await asyncio.to_thread(
+                self._sync_post, url, payload, timeout, self.get_headers()
+            )
+            if text:
+                logger.info(f"POST {url} succeeded via synchronous fallback")
+                return text
+        except Exception as e:
+            logger.warning(f"POST {url} synchronous fallback failed: {e}")
+        return ""
+
+    async def fetch_all(
+        self,
+        urls: List[str],
+        timeout: float = 15.0,
+        retries: int = 2,
+    ) -> List[str]:
         """Fetch multiple URLs concurrently, bounded by the plugin's own concurrency limit."""
         semaphore = asyncio.Semaphore(self.max_concurrency)
 
         async def _fetch_limited(url: str):
             async with semaphore:
-                return await self.fetch(url, timeout=timeout)
+                return await self.fetch(url, timeout=timeout, retries=retries)
 
         tasks = [_fetch_limited(url) for url in urls]
         return await asyncio.gather(*tasks)
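As a usage sketch only: the concrete fpw_* plugins, their URLs, and their parsing live elsewhere in this commit, and the plugin instance, URLs, and form field below are illustrative. The new fetch_post and fetch_all entry points would be driven roughly like this:

import asyncio
from typing import List


async def crawl_listing_pages(plugin) -> List[str]:
    # Form-driven listing pages (spys.one style) go through fetch_post ...
    form_page = await plugin.fetch_post(
        "https://spys.one/en/http-proxy-list/",
        data={"xpp": "5"},  # illustrative form field
        timeout=20.0,
        retries=2,
    )
    # ... while plain listing pages are fetched concurrently; fetch_all bounds the
    # fan-out with the plugin's own max_concurrency semaphore.
    plain_pages = await plugin.fetch_all(
        ["https://free-proxy-list.net/", "https://www.sslproxies.org/"],
        timeout=15.0,
        retries=2,
    )
    return [form_page, *plain_pages]

# asyncio.run(crawl_listing_pages(SomeFpwPlugin()))  # SomeFpwPlugin is hypothetical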