feat: external plugin loading, score threshold, expiry cleanup and more improvements
Made-with: Cursor
This commit is contained in:
@@ -5,7 +5,12 @@ from typing import List, Optional
|
||||
from app.core.db import get_db
|
||||
from app.core.plugin_system.registry import registry
|
||||
from app.core.plugin_system.base import BaseCrawlerPlugin
|
||||
from app.core.exceptions import PluginNotFoundException, ValidationException
|
||||
from app.core.exceptions import (
|
||||
PluginNotFoundException,
|
||||
ProxyPoolException,
|
||||
ValidationException,
|
||||
)
|
||||
from app.core.config import settings as app_settings
|
||||
from app.repositories.settings_repo import PluginSettingsRepository
|
||||
from app.models.domain import PluginInfo, ProxyRaw, CrawlResult
|
||||
from app.core.log import logger
|
||||
@@ -110,7 +115,8 @@ class PluginService:
|
||||
async def run_all_plugins(self, plugin_runner) -> List[ProxyRaw]:
|
||||
"""执行所有启用插件的爬取,限制并发数以避免触发目标站反爬"""
|
||||
all_results: List[ProxyRaw] = []
|
||||
semaphore = asyncio.Semaphore(5)
|
||||
n = max(1, int(app_settings.crawler_num_validators))
|
||||
semaphore = asyncio.Semaphore(n)
|
||||
|
||||
async def _run_with_limit(plugin_name: str):
|
||||
plugin = self.get_plugin_or_raise(plugin_name)
|
||||
|
||||
@@ -7,6 +7,7 @@ from typing import List, Optional, Tuple, AsyncIterator
|
||||
|
||||
from app.core.db import get_db
|
||||
from app.repositories.proxy_repo import ProxyRepository
|
||||
from app.repositories.settings_repo import SettingsRepository
|
||||
from app.models.domain import Proxy
|
||||
from app.core.log import logger
|
||||
from app.core.config import settings as app_settings
|
||||
@@ -19,7 +20,9 @@ class ProxyService:
|
||||
|
||||
async def get_stats(self) -> dict:
|
||||
async with get_db() as db:
|
||||
stats = await self.proxy_repo.get_stats(db)
|
||||
s = await SettingsRepository.get_all(db)
|
||||
floor = int(s.get("min_proxy_score", 0))
|
||||
stats = await self.proxy_repo.get_stats(db, low_score_threshold=floor)
|
||||
stats["today_new"] = await self.proxy_repo.get_today_new_count(db)
|
||||
return stats
|
||||
|
||||
@@ -49,7 +52,10 @@ class ProxyService:
|
||||
|
||||
async def get_random_proxy(self) -> Optional[Proxy]:
|
||||
async with get_db() as db:
|
||||
p = await self.proxy_repo.get_random(db)
|
||||
s = await SettingsRepository.get_all(db)
|
||||
floor = int(s.get("min_proxy_score", 0))
|
||||
ms = max(1, floor)
|
||||
p = await self.proxy_repo.get_random(db, min_score=ms)
|
||||
if not p:
|
||||
return None
|
||||
new_uc = int(getattr(p, "use_count", 0) or 0) + 1
|
||||
@@ -73,7 +79,9 @@ class ProxyService:
|
||||
|
||||
async def clean_invalid(self) -> int:
|
||||
async with get_db() as db:
|
||||
return await self.proxy_repo.clean_invalid(db)
|
||||
s = await SettingsRepository.get_all(db)
|
||||
floor = int(s.get("min_proxy_score", 0))
|
||||
return await self.proxy_repo.clean_invalid(db, low_score_threshold=floor)
|
||||
|
||||
async def clean_expired(self, days: int) -> int:
|
||||
async with get_db() as db:
|
||||
@@ -83,8 +91,11 @@ class ProxyService:
|
||||
self,
|
||||
fmt: str,
|
||||
protocol: Optional[str] = None,
|
||||
limit: int = 10000,
|
||||
limit: Optional[int] = None,
|
||||
) -> AsyncIterator[str]:
|
||||
cap = int(app_settings.export_max_records) if limit is None else int(limit)
|
||||
if cap < 1:
|
||||
cap = 1
|
||||
if fmt == "csv":
|
||||
yield "\ufeffIP,Port,Protocol,Score,Last Check\n"
|
||||
elif fmt == "txt":
|
||||
@@ -95,11 +106,17 @@ class ProxyService:
|
||||
|
||||
exported = 0
|
||||
async with get_db() as db:
|
||||
s = await SettingsRepository.get_all(db)
|
||||
floor = max(1, int(s.get("min_proxy_score", 0)))
|
||||
async for batch in self.proxy_repo.iter_batches(
|
||||
db, protocol=protocol, batch_size=1000, only_usable=True
|
||||
db,
|
||||
protocol=protocol,
|
||||
batch_size=1000,
|
||||
only_usable=True,
|
||||
usable_min_score=floor,
|
||||
):
|
||||
for p in batch:
|
||||
if exported >= limit:
|
||||
if exported >= cap:
|
||||
break
|
||||
if fmt == "csv":
|
||||
yield f"{p.ip},{p.port},{p.protocol},{p.score},{self._fmt_time(p.last_check)}\n"
|
||||
@@ -117,7 +134,7 @@ class ProxyService:
|
||||
yield prefix + json.dumps(item, ensure_ascii=False)
|
||||
first = False
|
||||
exported += 1
|
||||
if exported >= limit:
|
||||
if exported >= cap:
|
||||
break
|
||||
|
||||
if fmt == "json":
|
||||
|
||||
@@ -19,10 +19,14 @@ class SchedulerService:
|
||||
executor: JobExecutor,
|
||||
worker_pool: Optional[Any] = None,
|
||||
interval_minutes: int = 30,
|
||||
proxy_service: Optional[Any] = None,
|
||||
settings_repo: Optional[Any] = None,
|
||||
):
|
||||
self.executor = executor
|
||||
self.worker_pool = worker_pool
|
||||
self.interval_minutes = interval_minutes
|
||||
self._proxy_service = proxy_service
|
||||
self._settings_repo = settings_repo
|
||||
self.running = False
|
||||
self._stop_event = asyncio.Event()
|
||||
self._task: Optional[asyncio.Task] = None
|
||||
@@ -59,6 +63,22 @@ class SchedulerService:
|
||||
async def _run_loop(self) -> None:
|
||||
"""定时循环"""
|
||||
while self.running:
|
||||
if self._proxy_service is not None and self._settings_repo is not None:
|
||||
try:
|
||||
from app.core.db import get_db
|
||||
|
||||
async with get_db() as db:
|
||||
s = await self._settings_repo.get_all(db)
|
||||
days = int(s.get("proxy_expiry_days", 7))
|
||||
removed = await self._proxy_service.clean_expired(days)
|
||||
if removed:
|
||||
logger.info(
|
||||
"Scheduler removed %s proxies (last_check older than %s days)",
|
||||
removed,
|
||||
days,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Scheduler clean_expired failed: %s", e, exc_info=True)
|
||||
try:
|
||||
self.executor.submit_job(ValidateAllJob(validator_pool=self.worker_pool))
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user