feat: fpw plugins, validation/crawl perf, WS stats, test DB isolation
- Add Free_Proxy_Website-style fpw_* plugins and register them
- Per-plugin crawl timeout (crawl_timeout_seconds=120); remove global crawl_timeout setting
- Validator: fix connect vs total timeout on save; SOCKS session LRU cache; drop redundant semaphore
- Validation handler uses single DB connection; batch upsert after crawl; WorkerPool put_nowait
- Remove unused max_retries from settings API/UI; settings maintenance SQL + init_db cleanup of deprecated keys
- WebSocket dashboard stats; ProxyList pool_filter and API alignment
- POST /api/proxies/delete-one for IPv6-safe deletes; task poll stops on 404
- pytest uses PROXYPOOL_DB_PATH=db/proxies.test.sqlite so tests do not wipe the production DB (see the sketch below)
- .gitignore: explicit proxies.test.sqlite patterns; fix plugin_service ValidationException import

Made-with: Cursor
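For the test-database isolation bullet above, the essential move is exporting PROXYPOOL_DB_PATH before any application module binds to the default database file. A minimal conftest.py sketch, assuming the app reads that variable at import/startup time; the fixture name and cleanup step are illustrative and not taken from the repository:

# conftest.py (illustrative sketch; the project's actual test wiring may differ)
import os
import pathlib

import pytest

# Must run before app modules are imported, so the engine binds to the test file
# instead of the production db/proxies.sqlite.
os.environ.setdefault("PROXYPOOL_DB_PATH", "db/proxies.test.sqlite")


@pytest.fixture(scope="session", autouse=True)
def _throwaway_test_db():
    yield
    # Drop the throwaway database once the whole test session has finished.
    pathlib.Path(os.environ["PROXYPOOL_DB_PATH"]).unlink(missing_ok=True)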
@@ -3,7 +3,7 @@ import re
 import random
 import asyncio
 import httpx
-from typing import List, Optional
+from typing import Dict, List, Optional
 from bs4 import BeautifulSoup
 from app.core.plugin_system import BaseCrawlerPlugin
 from app.models.domain import ProxyRaw
@@ -43,9 +43,56 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
             self._client = httpx.AsyncClient(
                 transport=transport,
                 follow_redirects=True,
+                # Ignore system HTTP(S)_PROXY settings; a misconfigured proxy would make every listing site unreachable
+                trust_env=False,
             )
         return self._client
 
+    @staticmethod
+    def _http_timeout(seconds: float) -> httpx.Timeout:
+        """Tighten the connect phase separately, so AsyncClient cannot hang in connect for long in some environments."""
+        t = max(2.0, float(seconds))
+        c = min(6.0, max(3.0, t * 0.35))
+        return httpx.Timeout(t, connect=c)
+
+    @staticmethod
+    def _decode_response_body(response: httpx.Response) -> str:
+        content = response.content
+        encoding = response.encoding
+        if encoding == "utf-8" or not encoding:
+            try:
+                return content.decode("utf-8")
+            except UnicodeDecodeError:
+                return content.decode("gbk", errors="ignore")
+        return content.decode(encoding, errors="ignore")
+
+    def _sync_get(self, url: str, timeout: float, headers: dict) -> str:
+        """Synchronous GET (on Windows some sites hit ConnectTimeout with AsyncClient but work with a synchronous Client)."""
+        to = BaseHTTPPlugin._http_timeout(timeout)
+        with httpx.Client(
+            transport=httpx.HTTPTransport(retries=0),
+            follow_redirects=True,
+            trust_env=False,
+        ) as c:
+            r = c.get(url, headers=headers, timeout=to)
+            if r.status_code != 200:
+                return ""
+            return self._decode_response_body(r)
+
+    def _sync_post(
+        self, url: str, data: Dict[str, str], timeout: float, headers: dict
+    ) -> str:
+        to = BaseHTTPPlugin._http_timeout(timeout)
+        with httpx.Client(
+            transport=httpx.HTTPTransport(retries=0),
+            follow_redirects=True,
+            trust_env=False,
+        ) as c:
+            r = c.post(url, headers=headers, data=data, timeout=to)
+            if r.status_code != 200:
+                return ""
+            return self._decode_response_body(r)
+
     async def fetch(
         self,
         url: str,
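The helpers added above implement two ideas: the connect phase gets its own, tighter budget (roughly 35% of the total, clamped to 3-6 seconds), and a blocking httpx.Client call can be offloaded to a worker thread as a fallback when AsyncClient stalls on connect. A self-contained sketch of that fallback pattern, independent of the plugin class; the function names here are illustrative and not part of the project's API:

import asyncio
import httpx


def sync_get(url: str, timeout: httpx.Timeout) -> str:
    # Blocking client as the fallback path; trust_env=False ignores system proxy vars.
    with httpx.Client(follow_redirects=True, trust_env=False) as client:
        resp = client.get(url, timeout=timeout)
        return resp.text if resp.status_code == 200 else ""


async def fetch_with_fallback(url: str, total: float = 15.0) -> str:
    # Connect budget: ~35% of the total, clamped to 3-6 s (same rule as _http_timeout above).
    timeout = httpx.Timeout(max(2.0, total), connect=min(6.0, max(3.0, total * 0.35)))
    try:
        async with httpx.AsyncClient(follow_redirects=True, trust_env=False) as client:
            resp = await client.get(url, timeout=timeout)
            if resp.status_code == 200:
                return resp.text
    except httpx.HTTPError:
        pass
    # Run the blocking client in a worker thread so the event loop stays free.
    return await asyncio.to_thread(sync_get, url, timeout)

# Usage: asyncio.run(fetch_with_fallback("https://example.com"))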
@@ -56,35 +103,81 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
         """Asynchronously fetch the HTML content of the given URL."""
         from app.core.log import logger
         client = self._get_client()
+        to = self._http_timeout(timeout)
         for attempt in range(retries):
             try:
-                response = await client.get(url, headers=self.get_headers(), timeout=timeout)
+                response = await client.get(url, headers=self.get_headers(), timeout=to)
                 if raise_for_status:
                     response.raise_for_status()
                 if response.status_code == 200:
-                    content = response.content
-                    encoding = response.encoding
-                    if encoding == "utf-8" or not encoding:
-                        try:
-                            return content.decode("utf-8")
-                        except UnicodeDecodeError:
-                            return content.decode("gbk", errors="ignore")
-                    return content.decode(encoding, errors="ignore")
-                else:
-                    logger.warning(f"Fetch {url} returned status {response.status_code}")
+                    return self._decode_response_body(response)
+                logger.warning(f"Fetch {url} returned status {response.status_code}")
             except Exception as e:
                 logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
             if attempt < retries - 1:
                 await asyncio.sleep(random.uniform(1, 3))
+        try:
+            text = await asyncio.to_thread(
+                self._sync_get, url, timeout, self.get_headers()
+            )
+            if text:
+                logger.info(f"Fetch {url} succeeded via synchronous fallback")
+                return text
+        except Exception as e:
+            logger.warning(f"Fetch {url} synchronous fallback failed: {e}")
         return ""
 
-    async def fetch_all(self, urls: List[str], timeout: float = 15.0) -> List[str]:
+    async def fetch_post(
+        self,
+        url: str,
+        data: Optional[Dict[str, str]] = None,
+        timeout: float = 15.0,
+        retries: int = 2,
+    ) -> str:
+        """POST application/x-www-form-urlencoded; used for form-driven pages such as spys.one."""
+        from app.core.log import logger
+
+        client = self._get_client()
+        payload = data or {}
+        to = self._http_timeout(timeout)
+        for attempt in range(retries):
+            try:
+                response = await client.post(
+                    url,
+                    headers=self.get_headers(),
+                    data=payload,
+                    timeout=to,
+                )
+                if response.status_code == 200:
+                    return self._decode_response_body(response)
+                logger.warning(f"POST {url} returned status {response.status_code}")
+            except Exception as e:
+                logger.warning(f"POST {url} failed (attempt {attempt + 1}/{retries}): {e}")
+            if attempt < retries - 1:
+                await asyncio.sleep(random.uniform(1, 3))
+        try:
+            text = await asyncio.to_thread(
+                self._sync_post, url, payload, timeout, self.get_headers()
+            )
+            if text:
+                logger.info(f"POST {url} succeeded via synchronous fallback")
+                return text
+        except Exception as e:
+            logger.warning(f"POST {url} synchronous fallback failed: {e}")
+        return ""
+
+    async def fetch_all(
+        self,
+        urls: List[str],
+        timeout: float = 15.0,
+        retries: int = 2,
+    ) -> List[str]:
         """Fetch multiple URLs concurrently, bounded by the plugin's own concurrency limit."""
         semaphore = asyncio.Semaphore(self.max_concurrency)
 
         async def _fetch_limited(url: str):
             async with semaphore:
-                return await self.fetch(url, timeout=timeout)
+                return await self.fetch(url, timeout=timeout, retries=retries)
 
         tasks = [_fetch_limited(url) for url in urls]
         return await asyncio.gather(*tasks)
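As a usage sketch only: the concrete fpw_* plugins, their URLs, and their parsing live elsewhere in this commit, and the plugin instance, URLs, and form field below are illustrative. The new fetch_post and fetch_all entry points would be driven roughly like this:

import asyncio
from typing import List


async def crawl_listing_pages(plugin) -> List[str]:
    # Form-driven listing pages (spys.one style) go through fetch_post ...
    form_page = await plugin.fetch_post(
        "https://spys.one/en/http-proxy-list/",
        data={"xpp": "5"},  # illustrative form field
        timeout=20.0,
        retries=2,
    )
    # ... while plain listing pages are fetched concurrently; fetch_all bounds the
    # fan-out with the plugin's own max_concurrency semaphore.
    plain_pages = await plugin.fetch_all(
        ["https://free-proxy-list.net/", "https://www.sslproxies.org/"],
        timeout=15.0,
        retries=2,
    )
    return [form_page, *plain_pages]

# asyncio.run(crawl_listing_pages(SomeFpwPlugin()))  # SomeFpwPlugin is hypothetical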