Files
ProxyPool/script/reset_and_recrawl.py

103 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""清空 proxies 表,并依次执行各启用插件爬取;可选触发运行中 API 的全量验证。
用法(在项目根目录)::
python script/reset_and_recrawl.py
python script/reset_and_recrawl.py --api-base http://127.0.0.1:18080
python script/reset_and_recrawl.py --skip-validate
"""
from __future__ import annotations
import argparse
import asyncio
import sys
from pathlib import Path
# Project root: two levels above this file (the parent of the script's
# directory). It is put at the front of sys.path, unless already listed, so
# that imports resolve against the project tree when the script is run directly.
_ROOT = Path(__file__).resolve().parents[1]
_root_entry = str(_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)
async def _main(api_base: str, skip_validate: bool) -> None:
    """Reset the proxy table, re-crawl with every enabled plugin, then optionally request validation.

    Steps, in order:

    1. Initialise the database and delete every row from ``proxies``.
    2. Run each enabled plugin from the registry and hand the proxies it
       returns to ``ProxyRepository.upsert_many_from_crawl`` together with an
       initial value: ``int(settings.score_valid)`` limited to at most
       ``settings.score_max`` and at least ``settings.score_min``.
    3. Unless ``skip_validate`` is set, POST to
       ``<api_base>/api/scheduler/validate-now`` and report the outcome.

    The run is best-effort: a failing plugin or a failing validation request
    is reported on stdout and does not abort the remaining work.

    Args:
        api_base: Root URL of the running API; trailing slashes are ignored.
        skip_validate: When true, return right after the crawl phase without
            making any HTTP request.
    """
    # Project imports stay inside the function so they are resolved only at
    # call time, after the module has put the project root on sys.path.
    from app.core.db import init_db, get_db, transaction
    from app.repositories.proxy_repo import ProxyRepository
    from app.core.config import settings
    import app.plugins  # noqa: F401 - importing registers built-in and external plugins
    from app.core.plugin_system.registry import registry
    from app.services.plugin_runner import PluginRunner

    # --- 1. wipe the table -------------------------------------------------
    await init_db()
    async with get_db() as db:
        await db.execute("DELETE FROM proxies")
        await db.commit()
    print("已清空表 proxies")

    # --- 2. crawl with every enabled plugin --------------------------------
    initial = max(
        settings.score_min,
        min(settings.score_max, int(settings.score_valid)),
    )
    runner = PluginRunner()
    total_in = 0
    for plugin in registry.list_plugins():
        if not plugin.enabled:
            print(f"[跳过] {plugin.name}(已禁用)")
            continue
        print(f"[爬取] {plugin.name}", flush=True)
        try:
            result = await runner.run(plugin)
            proxies = result.proxies or []
            if not proxies:
                err = result.error or "无数据"
                print(f" -> 0 条 ({err})")
                continue
            async with transaction() as db:
                await ProxyRepository.upsert_many_from_crawl(db, proxies, initial)
            # Counted only once the transaction block has exited cleanly.
            total_in += len(proxies)
            print(f" -> {len(proxies)} 条已入库(待验证)")
        except Exception as e:
            # Deliberately broad: one broken plugin must not stop the others.
            print(f" -> 失败: {e}")
    print(f"爬取阶段结束,累计入库约 {total_in} 条(去重前按插件计)。")

    # --- 3. optionally ask the running API to validate everything ----------
    if skip_validate:
        print("已跳过远程全量验证。请启动 API 后执行 POST /api/scheduler/validate-now")
        return
    try:
        import httpx
    except ImportError:
        print("未安装 httpx跳过远程全量验证。")
        return

    url = api_base.rstrip("/") + "/api/scheduler/validate-now"
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(url)
    except Exception as e:
        # Only the request itself is guarded here, so this message is printed
        # for request failures and not for problems decoding the response.
        print(f"无法连接 API({url}):{e}")
        return

    # Decode the body defensively: an invalid or non-object JSON payload is
    # reported as an abnormal response instead of raising.
    data = {}
    if response.headers.get("content-type", "").startswith("application/json"):
        try:
            data = response.json()
        except ValueError:
            data = {}
    accepted = (
        response.status_code == 200
        and isinstance(data, dict)
        and data.get("code") == 200
    )
    if accepted:
        print("已提交全量验证:", data.get("data"))
    else:
        print(f"全量验证请求异常 HTTP {response.status_code}: {data or response.text[:200]}")
def main() -> None:
    """Parse the command-line options and run the ``_main`` coroutine to completion.

    Recognised options:

    * ``--api-base``: root URL of the running ProxyPool API that receives the
      full-validation request (default ``http://127.0.0.1:18080``).
    * ``--skip-validate``: flag; when present it is forwarded as
      ``skip_validate=True``.
    """
    parser = argparse.ArgumentParser(description="清空代理并逐插件爬取")
    # Each entry is (flag, keyword arguments for add_argument).
    option_specs = (
        (
            "--api-base",
            {
                "default": "http://127.0.0.1:18080",
                "help": "运行中的 ProxyPool API 根地址,用于提交全量验证",
            },
        ),
        (
            "--skip-validate",
            {
                "action": "store_true",
                "help": "不调用 HTTP 全量验证",
            },
        ),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    options = parser.parse_args()
    job = _main(options.api_base, options.skip_validate)
    asyncio.run(job)


if __name__ == "__main__":
    main()