feat: external plugin loading, score threshold, expiry cleanup and more improvements

Made-with: Cursor
This commit is contained in:
祀梦
2026-04-05 18:53:33 +08:00
parent 7bc6d4e4de
commit 7d5eaa438a
13 changed files with 302 additions and 39 deletions

102
script/reset_and_recrawl.py Normal file
View File

@@ -0,0 +1,102 @@
"""清空 proxies 表,并依次执行各启用插件爬取;可选触发运行中 API 的全量验证。
用法(在项目根目录)::

    python script/reset_and_recrawl.py
    python script/reset_and_recrawl.py --api-base http://127.0.0.1:18080
    python script/reset_and_recrawl.py --skip-validate
"""
from __future__ import annotations
import argparse
import asyncio
import sys
from pathlib import Path
# Project root: two levels up from this file (the script's directory, then its
# parent). It goes first on sys.path, unless already listed, so the imports
# performed later in this script resolve against it.
_ROOT = Path(__file__).resolve().parent.parent
if str(_ROOT) not in sys.path:
    sys.path.insert(0, str(_ROOT))
async def _request_full_validation(api_base: str) -> None:
    """Ask a running API instance to start a full validation pass.

    Sends ``POST {api_base}/api/scheduler/validate-now`` and prints the
    outcome. This is best-effort: every failure is reported on stdout and
    nothing is raised to the caller.

    Args:
        api_base: Root URL of the API; trailing slashes are ignored.
    """
    try:
        import httpx
    except ImportError:
        print("未安装 httpx跳过远程全量验证。")
        return

    url = api_base.rstrip("/") + "/api/scheduler/validate-now"

    # Only the network call sits in this ``try`` so that the "cannot connect"
    # message is printed for request failures and not for response-parsing
    # problems. The catch stays broad on purpose: this is the script's
    # top-level boundary and a failed trigger must not abort the run.
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            r = await client.post(url)
    except Exception as e:
        print(f"无法连接 API {url}: {e}")
        return

    data = {}
    if r.headers.get("content-type", "").startswith("application/json"):
        try:
            data = r.json()
        except ValueError:
            # Body was declared as JSON but could not be decoded; fall back
            # to reporting the raw text below.
            data = {}

    # A non-object JSON payload (list, string, ...) has no ``code`` field and
    # is therefore treated as an abnormal response rather than crashing.
    accepted = (
        r.status_code == 200
        and isinstance(data, dict)
        and data.get("code") == 200
    )
    if accepted:
        print("已提交全量验证:", data.get("data"))
    else:
        print(f"全量验证请求异常 HTTP {r.status_code}: {data or r.text[:200]}")


async def _main(api_base: str, skip_validate: bool) -> None:
    """Empty the ``proxies`` table, re-crawl all enabled plugins, then validate.

    Steps:
      1. Initialise the database and delete every row from ``proxies``.
      2. Run each enabled plugin from the registry and upsert its proxies with
         an initial score (``settings.score_valid`` truncated to ``int`` and
         clamped to ``[settings.score_min, settings.score_max]``).
      3. Unless ``skip_validate`` is set, ask the running API to start a full
         validation pass.

    A failing plugin is reported and skipped; it does not stop the run.

    Args:
        api_base: Root URL of the running API used for the validation request.
        skip_validate: When true, skip the HTTP validation request.
    """
    # Project imports are local to this coroutine.
    # NOTE(review): presumably they rely on the sys.path bootstrap at module
    # level having run first - confirm before hoisting them to the file top.
    from app.core.db import init_db, get_db, transaction
    from app.repositories.proxy_repo import ProxyRepository
    from app.core.config import settings
    import app.plugins  # noqa: F401 - registers built-in and external plugins
    from app.core.plugin_system.registry import registry
    from app.services.plugin_runner import PluginRunner

    await init_db()
    async with get_db() as db:
        await db.execute("DELETE FROM proxies")
        await db.commit()
    print("已清空表 proxies")

    # Clamp the configured "valid" score into the allowed score range.
    initial = max(
        settings.score_min,
        min(settings.score_max, int(settings.score_valid)),
    )

    runner = PluginRunner()
    total_in = 0
    for plugin in registry.list_plugins():
        if not plugin.enabled:
            print(f"[跳过] {plugin.name}(已禁用)")
            continue
        # Flush so progress is visible while a crawl is still running.
        print(f"[爬取] {plugin.name}", flush=True)
        try:
            result = await runner.run(plugin)
            proxies = result.proxies or []
            if not proxies:
                err = result.error or "无数据"
                print(f"  -> 0 条 ({err})")
                continue
            async with transaction() as db:
                await ProxyRepository.upsert_many_from_crawl(db, proxies, initial)
            total_in += len(proxies)
            print(f"  -> {len(proxies)} 条已入库(待验证)")
        except Exception as e:
            # Deliberate best-effort: one broken plugin must not abort the
            # remaining crawls.
            print(f"  -> 失败: {e}")

    print(f"爬取阶段结束,累计入库约 {total_in} 条(去重前按插件计)。")

    if skip_validate:
        print("已跳过远程全量验证。请启动 API 后执行 POST /api/scheduler/validate-now")
        return

    await _request_full_validation(api_base)
def main() -> None:
    """Parse the command-line options and run the reset-and-recrawl coroutine.

    Options:
        --api-base: Root URL of the running API (default
            ``http://127.0.0.1:18080``).
        --skip-validate: Flag; when given, the HTTP validation call is skipped.
    """
    parser = argparse.ArgumentParser(description="清空代理并逐插件爬取")

    # Each entry is (flag, keyword arguments for ``add_argument``).
    option_table = (
        (
            "--api-base",
            {
                "default": "http://127.0.0.1:18080",
                "help": "运行中的 ProxyPool API 根地址,用于提交全量验证",
            },
        ),
        (
            "--skip-validate",
            {
                "action": "store_true",
                "help": "不调用 HTTP 全量验证",
            },
        ),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    parsed = parser.parse_args()
    asyncio.run(_main(parsed.api_base, parsed.skip_validate))
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()