Round 3 fixes: cancelled polling, aggregator invalid_count, filter state, scheduler atomicity, HTTP exception handler, tests
@@ -19,7 +19,7 @@ class PluginRunner:
     """

     def __init__(self, timeout: Optional[float] = None):
-        self.timeout = timeout or getattr(app_settings, "crawler_timeout", 30)
+        self.timeout = timeout if timeout is not None else getattr(app_settings, "crawler_timeout", 30)

     async def run(self, plugin: BaseCrawlerPlugin) -> CrawlResult:
         """Run a single plugin crawl."""
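Note on the change above: `timeout or default` falls back to the default for any falsy value, so an explicit `timeout=0` was silently replaced, while the `is not None` check only substitutes the default when the argument was actually omitted. A standalone sketch of the difference (not part of the diff; DEFAULT_TIMEOUT stands in for the settings lookup):

    # Standalone illustration: `or` treats 0 (and other falsy values) as "not provided".
    DEFAULT_TIMEOUT = 30

    def timeout_with_or(timeout=None):
        return timeout or DEFAULT_TIMEOUT                            # 0 -> 30 (unintended)

    def timeout_with_none_check(timeout=None):
        return timeout if timeout is not None else DEFAULT_TIMEOUT   # 0 -> 0 (intended)

    assert timeout_with_or(0) == 30
    assert timeout_with_none_check(0) == 0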
@@ -37,6 +37,10 @@ class PluginRunner:
             return result
         except Exception as e:
             logger.warning(f"Plugin {plugin.name} health check error: {e}")
+            result.error = f"health check error: {e}"
+            result.failure_count = 1
+            await self._save_stats(plugin, result)
+            return result

         # Run the crawl
         try:
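The four added lines make a failed health check behave like a failed run: the error and failure count are recorded and the stats are persisted before returning, rather than only emitting a warning. A rough standalone sketch of that pattern, with a hypothetical Result dataclass standing in for CrawlResult and a caller-supplied save_stats callback standing in for _save_stats:

    # Hypothetical stand-ins for CrawlResult / _save_stats, showing the record-then-return pattern.
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Result:
        error: Optional[str] = None
        failure_count: int = 0

    async def run_health_check(check, save_stats) -> Result:
        result = Result()
        try:
            await check()
        except Exception as e:
            # Persist the failure so it is visible in stats instead of being lost in the log.
            result.error = f"health check error: {e}"
            result.failure_count = 1
            await save_stats(result)
        return result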
@@ -66,7 +70,7 @@
         seen = set()
         unique = []
         for p in proxies:
-            key = (p.ip, p.port)
+            key = (p.ip, p.port, p.protocol)
             if key not in seen:
                 seen.add(key)
                 unique.append(p)
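Keying the deduplication on (ip, port) alone would drop a proxy that serves several protocols on the same endpoint; adding the protocol keeps one entry per (ip, port, protocol). A standalone sketch with a hypothetical ProxyEntry in place of the domain model:

    from dataclasses import dataclass
    from typing import List

    @dataclass(frozen=True)
    class ProxyEntry:                      # hypothetical stand-in for the Proxy domain model
        ip: str
        port: int
        protocol: str

    def dedupe(proxies: List[ProxyEntry]) -> List[ProxyEntry]:
        seen, unique = set(), []
        for p in proxies:
            key = (p.ip, p.port, p.protocol)   # protocol is part of the identity
            if key not in seen:
                seen.add(key)
                unique.append(p)
        return unique

    entries = [
        ProxyEntry("1.2.3.4", 8080, "http"),
        ProxyEntry("1.2.3.4", 8080, "https"),  # kept: same endpoint, different protocol
        ProxyEntry("1.2.3.4", 8080, "http"),   # dropped: exact duplicate
    ]
    assert len(dedupe(entries)) == 2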
@@ -132,7 +132,7 @@ class PluginService:
         seen = set()
         unique = []
         for p in all_results:
-            key = (p.ip, p.port)
+            key = (p.ip, p.port, p.protocol)
             if key not in seen:
                 seen.add(key)
                 unique.append(p)
@@ -5,7 +5,7 @@ import io
 from datetime import datetime
 from typing import List, Optional, Tuple, AsyncIterator

-from app.core.db import get_db, transaction
+from app.core.db import get_db
 from app.repositories.proxy_repo import ProxyRepository
 from app.models.domain import Proxy
 from app.core.log import logger
@@ -1,6 +1,6 @@
 """Scheduler service - periodically triggers full validation."""
 import asyncio
-from typing import Optional
+from typing import Optional, Any

 from app.core.execution.executor import JobExecutor
 from app.core.execution.job import ValidateAllJob
@@ -17,9 +17,11 @@ class SchedulerService:
     def __init__(
         self,
         executor: JobExecutor,
+        worker_pool: Optional[Any] = None,
         interval_minutes: int = 30,
     ):
         self.executor = executor
+        self.worker_pool = worker_pool
         self.interval_minutes = interval_minutes
         self.running = False
         self._stop_event = asyncio.Event()
@@ -50,7 +52,7 @@ class SchedulerService:

     def validate_all_now(self) -> str:
         """Run one full validation immediately and return the Job ID."""
-        job_id = self.executor.submit_job(ValidateAllJob())
+        job_id = self.executor.submit_job(ValidateAllJob(validator_pool=self.worker_pool))
         logger.info(f"ValidateAllJob submitted: {job_id}")
         return job_id
@@ -58,7 +60,7 @@ class SchedulerService:
         """Timed loop."""
         while self.running:
             try:
-                self.executor.submit_job(ValidateAllJob())
+                self.executor.submit_job(ValidateAllJob(validator_pool=self.worker_pool))
             except Exception as e:
                 logger.error(f"Scheduler loop error: {e}", exc_info=True)
             # Wait for the next run
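Both the manual trigger and the timed loop now hand the scheduler's worker_pool to ValidateAllJob, so every scheduled validation reuses one shared pool instead of each job building its own. A minimal standalone sketch of that injection (the stub class below is an assumption, not the project's ValidateAllJob):

    from typing import Any, Optional

    class ValidateAllJobStub:
        """Stub: reuses an injected validator pool instead of creating a private one per run."""
        def __init__(self, validator_pool: Optional[Any] = None):
            self.validator_pool = validator_pool

    shared_pool = object()                 # placeholder for the real validator/worker pool
    jobs = [ValidateAllJobStub(validator_pool=shared_pool) for _ in range(3)]
    assert all(j.validator_pool is shared_pool for j in jobs)   # every run shares the same pool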
@@ -39,9 +39,9 @@ class ValidatorService:
         max_concurrency: Optional[int] = None,
     ):
         # Use the passed-in values (or the defaults) at init; settings are re-read dynamically at runtime
-        self._init_timeout = timeout or app_settings.validator_timeout
-        self._init_connect_timeout = connect_timeout or app_settings.validator_connect_timeout
-        self._init_max_concurrency = max_concurrency or app_settings.validator_max_concurrency
+        self._init_timeout = timeout if timeout is not None else app_settings.validator_timeout
+        self._init_connect_timeout = connect_timeout if connect_timeout is not None else app_settings.validator_connect_timeout
+        self._init_max_concurrency = max_concurrency if max_concurrency is not None else app_settings.validator_max_concurrency

         self._http_connector: Optional[aiohttp.TCPConnector] = None
         self._http_session: Optional[aiohttp.ClientSession] = None
@@ -61,23 +61,26 @@ class ValidatorService:
     def max_concurrency(self) -> int:
         return int(self._init_max_concurrency)

-    def _ensure_session(self) -> aiohttp.ClientSession:
+    async def _ensure_session(self) -> aiohttp.ClientSession:
         """Lazily create the shared HTTP session."""
-        if self._http_session is None or self._http_session.closed:
-            connector = aiohttp.TCPConnector(
-                ssl=False,
-                limit=self.max_concurrency,
-                limit_per_host=self.max_concurrency,
-                force_close=False,
-            )
-            timeout = aiohttp.ClientTimeout(
-                total=self.timeout, connect=self.connect_timeout
-            )
-            self._http_connector = connector
-            self._http_session = aiohttp.ClientSession(
-                connector=connector,
-                timeout=timeout,
-            )
+        async with self._lock:
+            # Double-check so coroutines that waited on the lock do not recreate the session
+            if self._http_session is None or self._http_session.closed:
+                connector = aiohttp.TCPConnector(
+                    ssl=False,
+                    limit=self.max_concurrency,
+                    limit_per_host=self.max_concurrency,
+                    force_close=False,
+                )
+                timeout = aiohttp.ClientTimeout(
+                    total=self.timeout, connect=self.connect_timeout
+                )
+                self._http_connector = connector
+                self._http_session = aiohttp.ClientSession(
+                    connector=connector,
+                    timeout=timeout,
+                )
         return self._http_session

     def _ensure_semaphore(self) -> asyncio.Semaphore:
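The rewritten _ensure_session serializes creation behind an asyncio.Lock and re-checks inside it, so concurrent callers cannot each build their own session; this assumes the service creates self._lock = asyncio.Lock() in __init__, which this hunk does not show. A self-contained sketch of the same create-at-most-once pattern, with an optional fast path added before the lock and a plain object standing in for the aiohttp session:

    import asyncio
    from typing import Optional

    class LazyResource:
        """Create an expensive shared resource at most once, even under concurrent callers."""
        def __init__(self) -> None:
            self._lock = asyncio.Lock()
            self._resource: Optional[object] = None
            self.created = 0

        async def get(self) -> object:
            if self._resource is None:              # fast path: skip the lock once created
                async with self._lock:
                    if self._resource is None:      # re-check after waiting on the lock
                        self.created += 1
                        self._resource = object()
            return self._resource

    async def main() -> None:
        lazy = LazyResource()
        await asyncio.gather(*(lazy.get() for _ in range(50)))
        assert lazy.created == 1                    # only one coroutine actually built it

    asyncio.run(main())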
@@ -119,9 +122,9 @@ class ValidatorService:
             return False, 0.0

     async def _validate_http(self, ip: str, port: int, protocol: str, start: float) -> Tuple[bool, float]:
-        proxy_url = f"http://{ip}:{port}"
+        proxy_url = f"{protocol}://{ip}:{port}"
         test_url = self._get_test_url(protocol)
-        session = self._ensure_session()
+        session = await self._ensure_session()

         async with session.get(test_url, proxy=proxy_url, allow_redirects=True) as response:
             if response.status in (200, 301, 302):
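Because _ensure_session is now a coroutine, every call site has to await it, as the hunk above does. A standalone sketch of that calling pattern in isolation; the test URL, accepted status codes and timing are illustrative, and the proxy URL scheme simply mirrors the diff's use of the proxy's own protocol:

    import asyncio
    import time
    from typing import Tuple

    import aiohttp

    async def validate_through_proxy(ip: str, port: int, protocol: str,
                                     session: aiohttp.ClientSession,
                                     test_url: str = "http://example.com/") -> Tuple[bool, float]:
        """Fetch test_url through the proxy and report (reachable, latency_seconds)."""
        proxy_url = f"{protocol}://{ip}:{port}"          # scheme follows the proxy's protocol
        start = time.monotonic()
        try:
            async with session.get(test_url, proxy=proxy_url, allow_redirects=True) as resp:
                return resp.status in (200, 301, 302), time.monotonic() - start
        except (aiohttp.ClientError, asyncio.TimeoutError):
            return False, 0.0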