feat: external plugin loading, score threshold, expiry cleanup and more improvements

Made-with: Cursor
This commit is contained in:
祀梦
2026-04-05 18:53:33 +08:00
parent 7bc6d4e4de
commit 7d5eaa438a
13 changed files with 302 additions and 39 deletions

View File

@@ -14,6 +14,7 @@ from app.services.validator_service import ValidatorService
from app.services.proxy_scoring import compute_proxy_quality_score
from app.services.plugin_runner import PluginRunner
from app.services.scheduler_service import SchedulerService
from app.services.proxy_service import ProxyService
from app.api.ws_manager import ConnectionManager
from app.api.realtime import stats_broadcaster_loop
@@ -80,10 +81,14 @@ async def lifespan(app: FastAPI):
proxy.protocol,
score=q_score,
)
if latency:
await proxy_repo.update_response_time(
db, proxy.ip, proxy.port, latency
)
rt_ms = (
float(latency)
if latency is not None and float(latency) > 0
else float(app_settings.score_default_latency_ms)
)
await proxy_repo.update_response_time(
db, proxy.ip, proxy.port, rt_ms
)
else:
await proxy_repo.delete(db, proxy.ip, proxy.port)
else:
@@ -104,10 +109,14 @@ async def lifespan(app: FastAPI):
proxy.protocol,
score=q_score,
)
if latency:
await proxy_repo.update_response_time(
db, proxy.ip, proxy.port, latency
)
rt_ms = (
float(latency)
if latency is not None and float(latency) > 0
else float(app_settings.score_default_latency_ms)
)
await proxy_repo.update_response_time(
db, proxy.ip, proxy.port, rt_ms
)
else:
await proxy_repo.update_score(
db,
@@ -125,20 +134,25 @@ async def lifespan(app: FastAPI):
)
await stack.enter_async_context(worker_pool)
# Job 执行器:槽位需覆盖「全部爬取」时 N 个 CrawlJob + 聚合任务 + 全量验证等
# Job 执行器:并发槽位(crawler_max_queue_size 与插件数共同约束,避免 crawl-all 死锁)
_n_plugins = len(registry.list_plugins())
_max_jobs = max(24, _n_plugins + 8)
_floor = max(24, _n_plugins + 8)
_max_jobs = max(_floor, app_settings.crawler_max_queue_size)
executor = JobExecutor(worker_pool=worker_pool, max_concurrent_jobs=_max_jobs)
await stack.enter_async_context(executor)
# 插件运行器
plugin_runner = PluginRunner()
proxy_service = ProxyService()
# 调度器
scheduler = SchedulerService(
executor=executor,
worker_pool=worker_pool,
interval_minutes=db_settings.get("validate_interval_minutes", 30),
proxy_service=proxy_service,
settings_repo=settings_repo,
)
# 挂载到 app.state

View File

@@ -10,9 +10,12 @@ from app.models.schemas import ProxyListRequest, BatchDeleteRequest, ProxyDelete
from app.api.deps import get_proxy_service, get_scheduler_service
from app.api.common import success_response, format_proxy
from app.core.exceptions import ProxyPoolException, ProxyNotFoundException
from app.core.config import settings as app_settings
router = APIRouter(prefix="/api/proxies", tags=["proxies"])
_EXPORT_MAX = int(app_settings.export_max_records)
@router.get("/stats")
async def get_stats(
@@ -60,7 +63,7 @@ async def get_random_proxy(service: ProxyService = Depends(get_proxy_service)):
async def export_proxies(
fmt: str,
protocol: Optional[str] = None,
limit: int = Query(default=10000, ge=1, le=100000),
limit: int = Query(default=_EXPORT_MAX, ge=1, le=_EXPORT_MAX),
service: ProxyService = Depends(get_proxy_service),
):
if fmt not in ("csv", "txt", "json"):

View File

@@ -19,7 +19,7 @@ _DEFAULTS: Dict[str, Any] = {
"validator_max_concurrency": 200,
"validator_connect_timeout": 3,
"crawler_num_validators": 50,
"crawler_max_queue_size": 500,
"crawler_max_queue_size": 48,
"log_level": "INFO",
"log_dir": "logs",
"ws_stats_interval_seconds": 1,

View File

@@ -102,14 +102,22 @@ class CrawlJob(Job):
proxies: List[ProxyRaw] = result.proxies if result else []
if proxies:
from app.core.config import settings as app_settings
from app.core.db import transaction
from app.repositories.proxy_repo import ProxyRepository
try:
initial = max(
app_settings.score_min,
min(app_settings.score_max, int(app_settings.score_valid)),
)
async with transaction() as db:
await ProxyRepository.upsert_many_from_crawl(db, proxies, 0)
await ProxyRepository.upsert_many_from_crawl(
db, proxies, initial
)
logger.info(
f"CrawlJob {self.id}: persisted {len(proxies)} crawled proxies as pending"
f"CrawlJob {self.id}: persisted {len(proxies)} crawled proxies "
f"as pending (initial score={initial})"
)
except Exception as e:
logger.error(

View File

@@ -1,7 +1,10 @@
"""插件注册中心 - 显式注册,类型安全,测试友好"""
import importlib
import importlib.util
import inspect
import os
import sys
from pathlib import Path
from typing import Dict, List, Type, Optional
from app.core.plugin_system.base import BaseCrawlerPlugin
from app.core.log import logger
@@ -77,6 +80,57 @@ class PluginRegistry:
except Exception as e:
logger.error(f"Failed to load module {module_name}: {e}")
def load_external_plugins_directory(self, directory: Path) -> int:
    """Load ``BaseCrawlerPlugin`` subclasses from an arbitrary project directory.

    Each ``*.py`` file (names starting with ``_`` are skipped) is imported as
    its own module. Coexists with the built-in ``app.plugins`` package; a class
    whose ``name`` collides with an already-registered plugin is skipped with a
    warning instead of overwriting it.

    Args:
        directory: Directory to scan for external plugin modules.

    Returns:
        Number of plugin classes successfully registered.
    """
    directory = Path(directory).resolve()
    if not directory.is_dir():
        logger.info("外部插件目录不存在,已跳过: %s", directory)
        return 0
    loaded = 0
    for path in sorted(directory.glob("*.py")):
        # Skip private/dunder modules such as __init__.py.
        if path.name.startswith("_"):
            continue
        # Unique module name keyed by the absolute path so two files with the
        # same stem in different directories cannot clash in sys.modules.
        mod_name = f"proxypool_ext_{path.stem}_{abs(hash(str(path))) % 10_000_000_000}"
        try:
            spec = importlib.util.spec_from_file_location(mod_name, path)
            if spec is None or spec.loader is None:
                continue
            module = importlib.util.module_from_spec(spec)
            # Register before exec so intra-module relative lookups resolve.
            sys.modules[mod_name] = module
            spec.loader.exec_module(module)
            for attr_name in dir(module):
                obj = getattr(module, attr_name)
                if (
                    inspect.isclass(obj)
                    and issubclass(obj, BaseCrawlerPlugin)
                    and obj is not BaseCrawlerPlugin
                    # Ignore classes merely re-imported from plugins that are
                    # already registered (identity check, not name check).
                    and obj not in self._plugins.values()
                ):
                    if not getattr(obj, "name", None):
                        # BUGFIX: original message had an unbalanced "(".
                        logger.warning(
                            "跳过外部插件类(缺少 name): %s in %s",
                            obj.__name__,
                            path,
                        )
                        continue
                    if obj.name in self._plugins:
                        logger.warning(
                            "外部插件 %s 与已注册插件重名,已跳过: %s",
                            obj.name,
                            path,
                        )
                        continue
                    self.register(obj)
                    loaded += 1
        except Exception as e:
            # A broken external plugin must not abort startup; log and move on.
            logger.error("加载外部插件失败 %s: %s", path, e, exc_info=True)
    if loaded:
        logger.info("%s 额外加载 %s 个插件", directory, loaded)
    return loaded
# Global registry instance — module-level singleton shared across the app.
registry = PluginRegistry()

View File

registry.register(FpwPremproxyPlugin)
registry.register(FpwFreeproxylistsPlugin)
registry.register(FpwGatherproxyPlugin)
registry.register(FpwCheckerproxyPlugin)
# Optional: load user plugins from the config-declared ``plugins_dir`` (a
# directory under the project root, NOT part of the ``app/plugins`` package).
# NOTE(review): imports are placed at module bottom, presumably to avoid an
# import cycle with ``app.core.config`` — confirm against the import graph.
from pathlib import Path
from app.core.config import settings as _app_settings
from app.core.config_paths import project_root as _project_root
_ext_dir = _project_root() / _app_settings.plugins_dir
registry.load_external_plugins_directory(_ext_dir)

View File

@@ -3,6 +3,8 @@ import aiosqlite
from datetime import datetime, timedelta
from typing import List, Optional, Tuple, Union
from app.core.config import settings as app_settings
from app.models.domain import Proxy, ProxyRaw
from app.core.log import logger
@@ -54,10 +56,12 @@ class ProxyRepository:
ip: str,
port: int,
protocol: str = "http",
score: int = 10,
score: Optional[int] = None,
) -> bool:
if protocol not in VALID_PROTOCOLS:
protocol = "http"
if score is None:
score = int(app_settings.score_valid)
try:
await db.execute(
"""
@@ -85,7 +89,7 @@ class ProxyRepository:
protocol: str = "http",
initial_score: int = 0,
) -> None:
"""爬取入库:待验证状态validated=0, score=0再次爬取同一条则重置为待验证"""
"""爬取入库待验证validated=0score 由 initial_score 决定(通常来自配置 score_valid"""
if protocol not in VALID_PROTOCOLS:
protocol = "http"
await db.execute(
@@ -232,13 +236,17 @@ class ProxyRepository:
return None
@staticmethod
async def get_random(db: aiosqlite.Connection) -> Optional[Proxy]:
async def get_random(
db: aiosqlite.Connection, min_score: int = 1
) -> Optional[Proxy]:
ms = max(1, int(min_score))
async with db.execute(
f"""
SELECT {_SELECT_PROXY_COLS} FROM proxies
WHERE validated = 1 AND score > 0
WHERE validated = 1 AND score >= ?
ORDER BY RANDOM() LIMIT 1
"""
""",
(ms,),
) as cursor:
row = await cursor.fetchone()
if row:
@@ -306,12 +314,18 @@ class ProxyRepository:
protocol: Optional[str] = None,
batch_size: int = 1000,
only_usable: bool = False,
usable_min_score: int = 1,
):
"""流式分批读取代理,避免一次性加载大量数据到内存"""
offset = 0
while True:
batch = await ProxyRepository._list_batch_offset(
db, protocol, batch_size, offset, only_usable=only_usable
db,
protocol,
batch_size,
offset,
only_usable=only_usable,
usable_min_score=usable_min_score,
)
if not batch:
break
@@ -325,12 +339,15 @@ class ProxyRepository:
batch_size: int,
offset: int,
only_usable: bool,
usable_min_score: int = 1,
) -> List[Proxy]:
query = f"SELECT {_SELECT_PROXY_COLS} FROM proxies"
params: List = []
clauses = []
if only_usable:
clauses.append("validated = 1 AND score > 0")
ms = max(1, int(usable_min_score))
clauses.append("validated = 1 AND score >= ?")
params.append(ms)
if protocol:
clauses.append("protocol = ?")
params.append(protocol.lower())
@@ -396,12 +413,16 @@ class ProxyRepository:
return proxies, total
@staticmethod
async def get_stats(db: aiosqlite.Connection) -> dict:
async def get_stats(
db: aiosqlite.Connection, low_score_threshold: int = 0
) -> dict:
"""统计快照。
协议计数(http/https/socks*)仅含已验证且 score>0 的可用代理,供首页图表与「可用」口径一致。
pending_* 为待验证池validated=0按协议分布。
invalid_count:已验证且 score<=0,或 score 低于系统「最低分」阈值(阈值>0 时)。
"""
thr = max(0, int(low_score_threshold))
query = """
SELECT
COUNT(*) as total,
@@ -416,12 +437,12 @@ class ProxyRepository:
COUNT(CASE WHEN validated = 0 AND protocol = 'https' THEN 1 END) as pending_https_count,
COUNT(CASE WHEN validated = 0 AND protocol = 'socks4' THEN 1 END) as pending_socks4_count,
COUNT(CASE WHEN validated = 0 AND protocol = 'socks5' THEN 1 END) as pending_socks5_count,
COUNT(CASE WHEN validated = 1 AND score <= 0 THEN 1 END) as invalid_count,
COUNT(CASE WHEN validated = 1 AND (score <= 0 OR (? > 0 AND score < ?)) THEN 1 END) as invalid_count,
(SELECT AVG(response_time_ms) FROM proxies WHERE validated = 1 AND score > 0
AND response_time_ms IS NOT NULL AND response_time_ms > 0) as avg_response_ms
FROM proxies
"""
async with db.execute(query) as cursor:
async with db.execute(query, (thr, thr)) as cursor:
row = await cursor.fetchone()
if row:
avg_lat = row[13]
@@ -477,10 +498,19 @@ class ProxyRepository:
return 0
@staticmethod
async def clean_invalid(db: aiosqlite.Connection) -> int:
await db.execute(
"DELETE FROM proxies WHERE validated = 1 AND score <= 0"
)
async def clean_invalid(
db: aiosqlite.Connection, low_score_threshold: int = 0
) -> int:
thr = max(0, int(low_score_threshold))
if thr > 0:
await db.execute(
"DELETE FROM proxies WHERE validated = 1 AND (score <= 0 OR score < ?)",
(thr,),
)
else:
await db.execute(
"DELETE FROM proxies WHERE validated = 1 AND score <= 0"
)
await db.commit()
return db.total_changes

View File

@@ -5,7 +5,12 @@ from typing import List, Optional
from app.core.db import get_db
from app.core.plugin_system.registry import registry
from app.core.plugin_system.base import BaseCrawlerPlugin
from app.core.exceptions import PluginNotFoundException, ValidationException
from app.core.exceptions import (
PluginNotFoundException,
ProxyPoolException,
ValidationException,
)
from app.core.config import settings as app_settings
from app.repositories.settings_repo import PluginSettingsRepository
from app.models.domain import PluginInfo, ProxyRaw, CrawlResult
from app.core.log import logger
@@ -110,7 +115,8 @@ class PluginService:
async def run_all_plugins(self, plugin_runner) -> List[ProxyRaw]:
"""执行所有启用插件的爬取,限制并发数以避免触发目标站反爬"""
all_results: List[ProxyRaw] = []
semaphore = asyncio.Semaphore(5)
n = max(1, int(app_settings.crawler_num_validators))
semaphore = asyncio.Semaphore(n)
async def _run_with_limit(plugin_name: str):
plugin = self.get_plugin_or_raise(plugin_name)

View File

@@ -7,6 +7,7 @@ from typing import List, Optional, Tuple, AsyncIterator
from app.core.db import get_db
from app.repositories.proxy_repo import ProxyRepository
from app.repositories.settings_repo import SettingsRepository
from app.models.domain import Proxy
from app.core.log import logger
from app.core.config import settings as app_settings
@@ -19,7 +20,9 @@ class ProxyService:
async def get_stats(self) -> dict:
async with get_db() as db:
stats = await self.proxy_repo.get_stats(db)
s = await SettingsRepository.get_all(db)
floor = int(s.get("min_proxy_score", 0))
stats = await self.proxy_repo.get_stats(db, low_score_threshold=floor)
stats["today_new"] = await self.proxy_repo.get_today_new_count(db)
return stats
@@ -49,7 +52,10 @@ class ProxyService:
async def get_random_proxy(self) -> Optional[Proxy]:
async with get_db() as db:
p = await self.proxy_repo.get_random(db)
s = await SettingsRepository.get_all(db)
floor = int(s.get("min_proxy_score", 0))
ms = max(1, floor)
p = await self.proxy_repo.get_random(db, min_score=ms)
if not p:
return None
new_uc = int(getattr(p, "use_count", 0) or 0) + 1
@@ -73,7 +79,9 @@ class ProxyService:
async def clean_invalid(self) -> int:
async with get_db() as db:
return await self.proxy_repo.clean_invalid(db)
s = await SettingsRepository.get_all(db)
floor = int(s.get("min_proxy_score", 0))
return await self.proxy_repo.clean_invalid(db, low_score_threshold=floor)
async def clean_expired(self, days: int) -> int:
async with get_db() as db:
@@ -83,8 +91,11 @@ class ProxyService:
self,
fmt: str,
protocol: Optional[str] = None,
limit: int = 10000,
limit: Optional[int] = None,
) -> AsyncIterator[str]:
cap = int(app_settings.export_max_records) if limit is None else int(limit)
if cap < 1:
cap = 1
if fmt == "csv":
yield "\ufeffIP,Port,Protocol,Score,Last Check\n"
elif fmt == "txt":
@@ -95,11 +106,17 @@ class ProxyService:
exported = 0
async with get_db() as db:
s = await SettingsRepository.get_all(db)
floor = max(1, int(s.get("min_proxy_score", 0)))
async for batch in self.proxy_repo.iter_batches(
db, protocol=protocol, batch_size=1000, only_usable=True
db,
protocol=protocol,
batch_size=1000,
only_usable=True,
usable_min_score=floor,
):
for p in batch:
if exported >= limit:
if exported >= cap:
break
if fmt == "csv":
yield f"{p.ip},{p.port},{p.protocol},{p.score},{self._fmt_time(p.last_check)}\n"
@@ -117,7 +134,7 @@ class ProxyService:
yield prefix + json.dumps(item, ensure_ascii=False)
first = False
exported += 1
if exported >= limit:
if exported >= cap:
break
if fmt == "json":

View File

@@ -19,10 +19,14 @@ class SchedulerService:
executor: JobExecutor,
worker_pool: Optional[Any] = None,
interval_minutes: int = 30,
proxy_service: Optional[Any] = None,
settings_repo: Optional[Any] = None,
):
self.executor = executor
self.worker_pool = worker_pool
self.interval_minutes = interval_minutes
self._proxy_service = proxy_service
self._settings_repo = settings_repo
self.running = False
self._stop_event = asyncio.Event()
self._task: Optional[asyncio.Task] = None
@@ -59,6 +63,22 @@ class SchedulerService:
async def _run_loop(self) -> None:
"""定时循环"""
while self.running:
if self._proxy_service is not None and self._settings_repo is not None:
try:
from app.core.db import get_db
async with get_db() as db:
s = await self._settings_repo.get_all(db)
days = int(s.get("proxy_expiry_days", 7))
removed = await self._proxy_service.clean_expired(days)
if removed:
logger.info(
"Scheduler removed %s proxies (last_check older than %s days)",
removed,
days,
)
except Exception as e:
logger.error("Scheduler clean_expired failed: %s", e, exc_info=True)
try:
self.executor.submit_job(ValidateAllJob(validator_pool=self.worker_pool))
except Exception as e: