feat: external plugin loading, score threshold, expiry cleanup and more improvements

Made-with: Cursor
This commit is contained in:
祀梦
2026-04-05 18:53:33 +08:00
parent 7bc6d4e4de
commit 7d5eaa438a
13 changed files with 302 additions and 39 deletions

View File

@@ -5,7 +5,12 @@ from typing import List, Optional
from app.core.db import get_db
from app.core.plugin_system.registry import registry
from app.core.plugin_system.base import BaseCrawlerPlugin
from app.core.exceptions import PluginNotFoundException, ValidationException
from app.core.exceptions import (
PluginNotFoundException,
ProxyPoolException,
ValidationException,
)
from app.core.config import settings as app_settings
from app.repositories.settings_repo import PluginSettingsRepository
from app.models.domain import PluginInfo, ProxyRaw, CrawlResult
from app.core.log import logger
@@ -110,7 +115,8 @@ class PluginService:
async def run_all_plugins(self, plugin_runner) -> List[ProxyRaw]:
"""执行所有启用插件的爬取,限制并发数以避免触发目标站反爬"""
all_results: List[ProxyRaw] = []
semaphore = asyncio.Semaphore(5)
n = max(1, int(app_settings.crawler_num_validators))
semaphore = asyncio.Semaphore(n)
async def _run_with_limit(plugin_name: str):
plugin = self.get_plugin_or_raise(plugin_name)

View File

@@ -7,6 +7,7 @@ from typing import List, Optional, Tuple, AsyncIterator
from app.core.db import get_db
from app.repositories.proxy_repo import ProxyRepository
from app.repositories.settings_repo import SettingsRepository
from app.models.domain import Proxy
from app.core.log import logger
from app.core.config import settings as app_settings
@@ -19,7 +20,9 @@ class ProxyService:
async def get_stats(self) -> dict:
async with get_db() as db:
stats = await self.proxy_repo.get_stats(db)
s = await SettingsRepository.get_all(db)
floor = int(s.get("min_proxy_score", 0))
stats = await self.proxy_repo.get_stats(db, low_score_threshold=floor)
stats["today_new"] = await self.proxy_repo.get_today_new_count(db)
return stats
@@ -49,7 +52,10 @@ class ProxyService:
async def get_random_proxy(self) -> Optional[Proxy]:
async with get_db() as db:
p = await self.proxy_repo.get_random(db)
s = await SettingsRepository.get_all(db)
floor = int(s.get("min_proxy_score", 0))
ms = max(1, floor)
p = await self.proxy_repo.get_random(db, min_score=ms)
if not p:
return None
new_uc = int(getattr(p, "use_count", 0) or 0) + 1
@@ -73,7 +79,9 @@ class ProxyService:
async def clean_invalid(self) -> int:
async with get_db() as db:
return await self.proxy_repo.clean_invalid(db)
s = await SettingsRepository.get_all(db)
floor = int(s.get("min_proxy_score", 0))
return await self.proxy_repo.clean_invalid(db, low_score_threshold=floor)
async def clean_expired(self, days: int) -> int:
async with get_db() as db:
@@ -83,8 +91,11 @@ class ProxyService:
self,
fmt: str,
protocol: Optional[str] = None,
limit: int = 10000,
limit: Optional[int] = None,
) -> AsyncIterator[str]:
cap = int(app_settings.export_max_records) if limit is None else int(limit)
if cap < 1:
cap = 1
if fmt == "csv":
yield "\ufeffIP,Port,Protocol,Score,Last Check\n"
elif fmt == "txt":
@@ -95,11 +106,17 @@ class ProxyService:
exported = 0
async with get_db() as db:
s = await SettingsRepository.get_all(db)
floor = max(1, int(s.get("min_proxy_score", 0)))
async for batch in self.proxy_repo.iter_batches(
db, protocol=protocol, batch_size=1000, only_usable=True
db,
protocol=protocol,
batch_size=1000,
only_usable=True,
usable_min_score=floor,
):
for p in batch:
if exported >= limit:
if exported >= cap:
break
if fmt == "csv":
yield f"{p.ip},{p.port},{p.protocol},{p.score},{self._fmt_time(p.last_check)}\n"
@@ -117,7 +134,7 @@ class ProxyService:
yield prefix + json.dumps(item, ensure_ascii=False)
first = False
exported += 1
if exported >= limit:
if exported >= cap:
break
if fmt == "json":

View File

@@ -19,10 +19,14 @@ class SchedulerService:
executor: JobExecutor,
worker_pool: Optional[Any] = None,
interval_minutes: int = 30,
proxy_service: Optional[Any] = None,
settings_repo: Optional[Any] = None,
):
self.executor = executor
self.worker_pool = worker_pool
self.interval_minutes = interval_minutes
self._proxy_service = proxy_service
self._settings_repo = settings_repo
self.running = False
self._stop_event = asyncio.Event()
self._task: Optional[asyncio.Task] = None
@@ -59,6 +63,22 @@ class SchedulerService:
async def _run_loop(self) -> None:
"""定时循环"""
while self.running:
if self._proxy_service is not None and self._settings_repo is not None:
try:
from app.core.db import get_db
async with get_db() as db:
s = await self._settings_repo.get_all(db)
days = int(s.get("proxy_expiry_days", 7))
removed = await self._proxy_service.clean_expired(days)
if removed:
logger.info(
"Scheduler removed %s proxies (last_check older than %s days)",
removed,
days,
)
except Exception as e:
logger.error("Scheduler clean_expired failed: %s", e, exc_info=True)
try:
self.executor.submit_job(ValidateAllJob(validator_pool=self.worker_pool))
except Exception as e: