feat: external plugin loading, score threshold, expiry cleanup and more improvements
Made-with: Cursor
This commit is contained in:
102
script/reset_and_recrawl.py
Normal file
102
script/reset_and_recrawl.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""清空 proxies 表,并依次执行各启用插件爬取;可选触发运行中 API 的全量验证。
|
||||
|
||||
用法(在项目根目录)::
|
||||
python script/reset_and_recrawl.py
|
||||
python script/reset_and_recrawl.py --api-base http://127.0.0.1:18080
|
||||
python script/reset_and_recrawl.py --skip-validate
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Project root: two directory levels above this file. It is put at the front
# of sys.path (once) so the ``app`` package imported further down resolves
# when this script is executed directly.
_ROOT = Path(__file__).resolve().parents[1]
if sys.path.count(str(_ROOT)) == 0:
    sys.path.insert(0, str(_ROOT))
|
||||
|
||||
|
||||
async def _main(api_base: str, skip_validate: bool) -> None:
    """Empty the ``proxies`` table, re-crawl with every enabled plugin, then
    optionally ask a running API instance to validate everything.

    Args:
        api_base: Root URL of the running API; only used when validation is
            not skipped.
        skip_validate: When true, stop after the crawl phase and only print a
            hint on how to trigger validation manually.
    """
    # Project imports are deferred to call time; ``app.plugins`` is imported
    # purely for its side effect of registering built-in and external plugins.
    from app.core.db import init_db, get_db, transaction
    from app.repositories.proxy_repo import ProxyRepository
    from app.core.config import settings
    import app.plugins  # noqa: F401
    from app.core.plugin_system.registry import registry
    from app.services.plugin_runner import PluginRunner

    await init_db()
    async with get_db() as db:
        await db.execute("DELETE FROM proxies")
        await db.commit()
    print("已清空表 proxies")

    # Initial score handed to the repository: ``score_valid`` clamped into
    # the configured [score_min, score_max] range.
    initial = max(
        settings.score_min,
        min(settings.score_max, int(settings.score_valid)),
    )
    runner = PluginRunner()
    total_in = 0
    for plugin in registry.list_plugins():
        if not plugin.enabled:
            print(f"[跳过] {plugin.name}(已禁用)")
            continue
        print(f"[爬取] {plugin.name} …", flush=True)
        try:
            result = await runner.run(plugin)
            proxies = result.proxies or []
            if not proxies:
                err = result.error or "无数据"
                print(f" -> 0 条 ({err})")
                continue
            async with transaction() as db:
                await ProxyRepository.upsert_many_from_crawl(db, proxies, initial)
            # Counted only after the transaction block exits without raising.
            total_in += len(proxies)
            print(f" -> {len(proxies)} 条已入库(待验证)")
        except Exception as e:
            # Deliberate best-effort: one failing plugin must not abort the
            # remaining plugins.
            print(f" -> 失败: {e}")

    print(f"爬取阶段结束,累计入库约 {total_in} 条(去重前按插件计)。")

    if skip_validate:
        print("已跳过远程全量验证。请启动 API 后执行 POST /api/scheduler/validate-now")
        return

    await _submit_full_validation(api_base)


async def _submit_full_validation(api_base: str) -> None:
    """POST to ``<api_base>/api/scheduler/validate-now`` and print the outcome.

    Best-effort: a missing ``httpx`` package, a failed request or an
    unexpected response body is reported on stdout and never raised.

    Args:
        api_base: Root URL of the running API; trailing slashes are stripped.
    """
    try:
        import httpx
    except ImportError:
        print("未安装 httpx,跳过远程全量验证。")
        return

    url = api_base.rstrip("/") + "/api/scheduler/validate-now"
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            r = await client.post(url)
    except Exception as e:
        # Only the request itself sits under this handler, so the
        # "cannot reach the API" message is printed for request failures
        # and not for problems with the response body.
        print(f"无法连接 API({url}):{e}")
        return

    data: dict = {}
    if r.headers.get("content-type", "").startswith("application/json"):
        try:
            payload = r.json()
        except ValueError:
            # Body claimed to be JSON but could not be decoded.
            payload = None
        # A JSON array or scalar has no ``.get``; treat it as "no envelope".
        if isinstance(payload, dict):
            data = payload

    if r.status_code == 200 and data.get("code") == 200:
        print("已提交全量验证:", data.get("data"))
    else:
        print(f"全量验证请求异常 HTTP {r.status_code}: {data or r.text[:200]}")
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command-line options and run the ``_main`` coroutine.

    Options:
        --api-base: root URL of the running API (default
            ``http://127.0.0.1:18080``).
        --skip-validate: flag; when given, the HTTP validation call is skipped.
    """
    parser = argparse.ArgumentParser(description="清空代理并逐插件爬取")
    parser.add_argument(
        "--api-base",
        default="http://127.0.0.1:18080",
        help="运行中的 ProxyPool API 根地址,用于提交全量验证",
    )
    parser.add_argument("--skip-validate", action="store_true", help="不调用 HTTP 全量验证")
    options = parser.parse_args()

    job = _main(options.api_base, options.skip_validate)
    asyncio.run(job)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user