103 lines
3.4 KiB
Python
103 lines
3.4 KiB
Python
"""清空 proxies 表,并依次执行各启用插件爬取;可选触发运行中 API 的全量验证。
|
||
|
||
用法(在项目根目录)::
|
||
python script/reset_and_recrawl.py
|
||
python script/reset_and_recrawl.py --api-base http://127.0.0.1:18080
|
||
python script/reset_and_recrawl.py --skip-validate
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import asyncio
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
# Project root: one directory above the folder that contains this script.
# It is put at the front of sys.path (only if absent) so that the ``app``
# package can be imported when the script is launched directly.
_ROOT = Path(__file__).resolve().parents[1]
if not any(entry == str(_ROOT) for entry in sys.path):
    sys.path.insert(0, str(_ROOT))
|
||
|
||
|
||
async def _main(api_base: str, skip_validate: bool) -> None:
    """Wipe the ``proxies`` table, re-crawl with every enabled plugin, then
    optionally ask a running API to validate everything.

    Args:
        api_base: Root URL of the running API; only used for the final
            ``POST /api/scheduler/validate-now`` request.
        skip_validate: When true, stop after the crawl phase and only print
            a hint on how to trigger the validation manually.

    Failures of individual plugins and of the validation request are printed
    and do not abort the run.
    """
    # Project imports are deferred to call time (presumably so that the
    # sys.path adjustment at module top has already happened).
    from app.core.db import init_db, get_db, transaction
    from app.repositories.proxy_repo import ProxyRepository
    from app.core.config import settings
    import app.plugins  # noqa: F401 — registers built-in and external plugins
    from app.core.plugin_system.registry import registry
    from app.services.plugin_runner import PluginRunner

    await init_db()
    async with get_db() as db:
        await db.execute("DELETE FROM proxies")
        await db.commit()
    print("已清空表 proxies")

    # Clamp the configured "valid" score into [score_min, score_max]; the
    # result is handed to the repository for every crawled proxy
    # (presumably as the initial score — confirm in ProxyRepository).
    initial = max(
        settings.score_min,
        min(settings.score_max, int(settings.score_valid)),
    )
    runner = PluginRunner()
    total_in = 0
    for plugin in registry.list_plugins():
        if not plugin.enabled:
            print(f"[跳过] {plugin.name}(已禁用)")
            continue
        print(f"[爬取] {plugin.name} …", flush=True)
        try:
            result = await runner.run(plugin)
            proxies = result.proxies or []
            if not proxies:
                err = result.error or "无数据"
                print(f" -> 0 条 ({err})")
                continue
            async with transaction() as db:
                await ProxyRepository.upsert_many_from_crawl(db, proxies, initial)
            total_in += len(proxies)
            print(f" -> {len(proxies)} 条已入库(待验证)")
        except Exception as e:
            # Best effort: one failing plugin must not stop the others.
            print(f" -> 失败: {e}")

    print(f"爬取阶段结束,累计入库约 {total_in} 条(去重前按插件计)。")

    if skip_validate:
        print("已跳过远程全量验证。请启动 API 后执行 POST /api/scheduler/validate-now")
        return

    await _submit_full_validation(api_base)


async def _submit_full_validation(api_base: str) -> None:
    """POST to ``<api_base>/api/scheduler/validate-now`` and print the outcome.

    Does nothing except print a notice when ``httpx`` is not installed.
    Transport failures are reported as a connection problem; a response that
    did arrive is always reported with its HTTP status, even when its body
    is not the expected JSON object.
    """
    try:
        import httpx
    except ImportError:
        print("未安装 httpx,跳过远程全量验证。")
        return

    url = api_base.rstrip("/") + "/api/scheduler/validate-now"
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            r = await client.post(url)
    except Exception as e:
        # Only the request itself is guarded here, so that a bad response
        # body is not misreported as "cannot connect".
        print(f"无法连接 API({url}):{e}")
        return

    data = {}
    if r.headers.get("content-type", "").startswith("application/json"):
        try:
            payload = r.json()
        except ValueError:
            # Body claimed to be JSON but could not be decoded.
            payload = None
        # Anything other than a JSON object cannot carry the "code" field.
        if isinstance(payload, dict):
            data = payload

    if r.status_code == 200 and data.get("code") == 200:
        print("已提交全量验证:", data.get("data"))
    else:
        print(f"全量验证请求异常 HTTP {r.status_code}: {data or r.text[:200]}")
|
||
|
||
|
||
def main() -> None:
    """Parse the command-line options and run the async workflow.

    Recognised options are ``--api-base`` (defaults to
    ``http://127.0.0.1:18080``) and the ``--skip-validate`` flag; both are
    forwarded to ``_main``.
    """
    parser = argparse.ArgumentParser(description="清空代理并逐插件爬取")
    parser.add_argument(
        "--api-base",
        default="http://127.0.0.1:18080",
        help="运行中的 ProxyPool API 根地址,用于提交全量验证",
    )
    parser.add_argument(
        "--skip-validate",
        action="store_true",
        help="不调用 HTTP 全量验证",
    )
    options = parser.parse_args()

    workflow = _main(
        api_base=options.api_base,
        skip_validate=options.skip_validate,
    )
    asyncio.run(workflow)
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|