"""清空 proxies 表,并依次执行各启用插件爬取;可选触发运行中 API 的全量验证。 用法(在项目根目录):: python script/reset_and_recrawl.py python script/reset_and_recrawl.py --api-base http://127.0.0.1:18080 python script/reset_and_recrawl.py --skip-validate """ from __future__ import annotations import argparse import asyncio import sys from pathlib import Path # 项目根 _ROOT = Path(__file__).resolve().parents[1] if str(_ROOT) not in sys.path: sys.path.insert(0, str(_ROOT)) async def _main(api_base: str, skip_validate: bool) -> None: from app.core.db import init_db, get_db, transaction from app.repositories.proxy_repo import ProxyRepository from app.core.config import settings import app.plugins # noqa: F401 — 注册内置与外部插件 from app.core.plugin_system.registry import registry from app.services.plugin_runner import PluginRunner await init_db() async with get_db() as db: await db.execute("DELETE FROM proxies") await db.commit() print("已清空表 proxies") initial = max( settings.score_min, min(settings.score_max, int(settings.score_valid)), ) runner = PluginRunner() total_in = 0 for plugin in registry.list_plugins(): if not plugin.enabled: print(f"[跳过] {plugin.name}(已禁用)") continue print(f"[爬取] {plugin.name} …", flush=True) try: result = await runner.run(plugin) proxies = result.proxies or [] if not proxies: err = result.error or "无数据" print(f" -> 0 条 ({err})") continue async with transaction() as db: await ProxyRepository.upsert_many_from_crawl(db, proxies, initial) total_in += len(proxies) print(f" -> {len(proxies)} 条已入库(待验证)") except Exception as e: print(f" -> 失败: {e}") print(f"爬取阶段结束,累计入库约 {total_in} 条(去重前按插件计)。") if skip_validate: print("已跳过远程全量验证。请启动 API 后执行 POST /api/scheduler/validate-now") return try: import httpx except ImportError: print("未安装 httpx,跳过远程全量验证。") return url = api_base.rstrip("/") + "/api/scheduler/validate-now" try: async with httpx.AsyncClient(timeout=60.0) as client: r = await client.post(url) data = r.json() if r.headers.get("content-type", "").startswith("application/json") else {} if r.status_code == 200 and data.get("code") == 200: print("已提交全量验证:", data.get("data")) else: print(f"全量验证请求异常 HTTP {r.status_code}: {data or r.text[:200]}") except Exception as e: print(f"无法连接 API({url}):{e}") def main() -> None: p = argparse.ArgumentParser(description="清空代理并逐插件爬取") p.add_argument( "--api-base", default="http://127.0.0.1:18080", help="运行中的 ProxyPool API 根地址,用于提交全量验证", ) p.add_argument( "--skip-validate", action="store_true", help="不调用 HTTP 全量验证", ) args = p.parse_args() asyncio.run(_main(args.api_base, args.skip_validate)) if __name__ == "__main__": main()