Round 3 fixes: cancelled polling, aggregator invalid_count, filter state, scheduler atomicity, HTTP exception handler, tests

This commit is contained in:
祀梦
2026-04-05 10:20:23 +08:00
parent 49e440cb41
commit dc5f050683
32 changed files with 321 additions and 163 deletions

View File

@@ -19,7 +19,7 @@ class PluginRunner:
"""
def __init__(self, timeout: Optional[float] = None):
self.timeout = timeout or getattr(app_settings, "crawler_timeout", 30)
self.timeout = timeout if timeout is not None else getattr(app_settings, "crawler_timeout", 30)
async def run(self, plugin: BaseCrawlerPlugin) -> CrawlResult:
"""执行单个插件爬取"""
@@ -37,6 +37,10 @@ class PluginRunner:
return result
except Exception as e:
logger.warning(f"Plugin {plugin.name} health check error: {e}")
result.error = f"health check error: {e}"
result.failure_count = 1
await self._save_stats(plugin, result)
return result
# 执行爬取
try:
@@ -66,7 +70,7 @@ class PluginRunner:
seen = set()
unique = []
for p in proxies:
key = (p.ip, p.port)
key = (p.ip, p.port, p.protocol)
if key not in seen:
seen.add(key)
unique.append(p)

View File

@@ -132,7 +132,7 @@ class PluginService:
seen = set()
unique = []
for p in all_results:
key = (p.ip, p.port)
key = (p.ip, p.port, p.protocol)
if key not in seen:
seen.add(key)
unique.append(p)

View File

@@ -5,7 +5,7 @@ import io
from datetime import datetime
from typing import List, Optional, Tuple, AsyncIterator
from app.core.db import get_db, transaction
from app.core.db import get_db
from app.repositories.proxy_repo import ProxyRepository
from app.models.domain import Proxy
from app.core.log import logger

View File

@@ -1,6 +1,6 @@
"""调度器服务 - 定时触发全量验证"""
import asyncio
from typing import Optional
from typing import Optional, Any
from app.core.execution.executor import JobExecutor
from app.core.execution.job import ValidateAllJob
@@ -17,9 +17,11 @@ class SchedulerService:
def __init__(
self,
executor: JobExecutor,
worker_pool: Optional[Any] = None,
interval_minutes: int = 30,
):
self.executor = executor
self.worker_pool = worker_pool
self.interval_minutes = interval_minutes
self.running = False
self._stop_event = asyncio.Event()
@@ -50,7 +52,7 @@ class SchedulerService:
def validate_all_now(self) -> str:
"""立即执行一次全量验证,返回 Job ID"""
job_id = self.executor.submit_job(ValidateAllJob())
job_id = self.executor.submit_job(ValidateAllJob(validator_pool=self.worker_pool))
logger.info(f"ValidateAllJob submitted: {job_id}")
return job_id
@@ -58,7 +60,7 @@ class SchedulerService:
"""定时循环"""
while self.running:
try:
self.executor.submit_job(ValidateAllJob())
self.executor.submit_job(ValidateAllJob(validator_pool=self.worker_pool))
except Exception as e:
logger.error(f"Scheduler loop error: {e}", exc_info=True)
# 等待下一次

View File

@@ -39,9 +39,9 @@ class ValidatorService:
max_concurrency: Optional[int] = None,
):
# 初始化时使用传入值或默认值,但运行期会动态读取 settings
self._init_timeout = timeout or app_settings.validator_timeout
self._init_connect_timeout = connect_timeout or app_settings.validator_connect_timeout
self._init_max_concurrency = max_concurrency or app_settings.validator_max_concurrency
self._init_timeout = timeout if timeout is not None else app_settings.validator_timeout
self._init_connect_timeout = connect_timeout if connect_timeout is not None else app_settings.validator_connect_timeout
self._init_max_concurrency = max_concurrency if max_concurrency is not None else app_settings.validator_max_concurrency
self._http_connector: Optional[aiohttp.TCPConnector] = None
self._http_session: Optional[aiohttp.ClientSession] = None
@@ -61,23 +61,26 @@ class ValidatorService:
def max_concurrency(self) -> int:
return int(self._init_max_concurrency)
def _ensure_session(self) -> aiohttp.ClientSession:
async def _ensure_session(self) -> aiohttp.ClientSession:
"""懒加载共享 HTTP session"""
if self._http_session is None or self._http_session.closed:
connector = aiohttp.TCPConnector(
ssl=False,
limit=self.max_concurrency,
limit_per_host=self.max_concurrency,
force_close=False,
)
timeout = aiohttp.ClientTimeout(
total=self.timeout, connect=self.connect_timeout
)
self._http_connector = connector
self._http_session = aiohttp.ClientSession(
connector=connector,
timeout=timeout,
)
async with self._lock:
# 双重检查,避免多个协程在获取锁后重复创建
if self._http_session is None or self._http_session.closed:
connector = aiohttp.TCPConnector(
ssl=False,
limit=self.max_concurrency,
limit_per_host=self.max_concurrency,
force_close=False,
)
timeout = aiohttp.ClientTimeout(
total=self.timeout, connect=self.connect_timeout
)
self._http_connector = connector
self._http_session = aiohttp.ClientSession(
connector=connector,
timeout=timeout,
)
return self._http_session
def _ensure_semaphore(self) -> asyncio.Semaphore:
@@ -119,9 +122,9 @@ class ValidatorService:
return False, 0.0
async def _validate_http(self, ip: str, port: int, protocol: str, start: float) -> Tuple[bool, float]:
proxy_url = f"http://{ip}:{port}"
proxy_url = f"{protocol}://{ip}:{port}"
test_url = self._get_test_url(protocol)
session = self._ensure_session()
session = await self._ensure_session()
async with session.get(test_url, proxy=proxy_url, allow_redirects=True) as response:
if response.status in (200, 301, 302):