feat: external plugin loading, score threshold, expiry cleanup and more improvements

Made-with: Cursor
This commit is contained in:
祀梦
2026-04-05 18:53:33 +08:00
parent 7bc6d4e4de
commit 7d5eaa438a
13 changed files with 302 additions and 39 deletions

View File

@@ -14,6 +14,7 @@ from app.services.validator_service import ValidatorService
from app.services.proxy_scoring import compute_proxy_quality_score from app.services.proxy_scoring import compute_proxy_quality_score
from app.services.plugin_runner import PluginRunner from app.services.plugin_runner import PluginRunner
from app.services.scheduler_service import SchedulerService from app.services.scheduler_service import SchedulerService
from app.services.proxy_service import ProxyService
from app.api.ws_manager import ConnectionManager from app.api.ws_manager import ConnectionManager
from app.api.realtime import stats_broadcaster_loop from app.api.realtime import stats_broadcaster_loop
@@ -80,10 +81,14 @@ async def lifespan(app: FastAPI):
proxy.protocol, proxy.protocol,
score=q_score, score=q_score,
) )
if latency: rt_ms = (
await proxy_repo.update_response_time( float(latency)
db, proxy.ip, proxy.port, latency if latency is not None and float(latency) > 0
) else float(app_settings.score_default_latency_ms)
)
await proxy_repo.update_response_time(
db, proxy.ip, proxy.port, rt_ms
)
else: else:
await proxy_repo.delete(db, proxy.ip, proxy.port) await proxy_repo.delete(db, proxy.ip, proxy.port)
else: else:
@@ -104,10 +109,14 @@ async def lifespan(app: FastAPI):
proxy.protocol, proxy.protocol,
score=q_score, score=q_score,
) )
if latency: rt_ms = (
await proxy_repo.update_response_time( float(latency)
db, proxy.ip, proxy.port, latency if latency is not None and float(latency) > 0
) else float(app_settings.score_default_latency_ms)
)
await proxy_repo.update_response_time(
db, proxy.ip, proxy.port, rt_ms
)
else: else:
await proxy_repo.update_score( await proxy_repo.update_score(
db, db,
@@ -125,20 +134,25 @@ async def lifespan(app: FastAPI):
) )
await stack.enter_async_context(worker_pool) await stack.enter_async_context(worker_pool)
# Job 执行器:槽位需覆盖「全部爬取」时 N 个 CrawlJob + 聚合任务 + 全量验证等 # Job 执行器:并发槽位crawler_max_queue_size 与插件数共同约束,避免 crawl-all 死锁)
_n_plugins = len(registry.list_plugins()) _n_plugins = len(registry.list_plugins())
_max_jobs = max(24, _n_plugins + 8) _floor = max(24, _n_plugins + 8)
_max_jobs = max(_floor, app_settings.crawler_max_queue_size)
executor = JobExecutor(worker_pool=worker_pool, max_concurrent_jobs=_max_jobs) executor = JobExecutor(worker_pool=worker_pool, max_concurrent_jobs=_max_jobs)
await stack.enter_async_context(executor) await stack.enter_async_context(executor)
# 插件运行器 # 插件运行器
plugin_runner = PluginRunner() plugin_runner = PluginRunner()
proxy_service = ProxyService()
# 调度器 # 调度器
scheduler = SchedulerService( scheduler = SchedulerService(
executor=executor, executor=executor,
worker_pool=worker_pool, worker_pool=worker_pool,
interval_minutes=db_settings.get("validate_interval_minutes", 30), interval_minutes=db_settings.get("validate_interval_minutes", 30),
proxy_service=proxy_service,
settings_repo=settings_repo,
) )
# 挂载到 app.state # 挂载到 app.state

View File

@@ -10,9 +10,12 @@ from app.models.schemas import ProxyListRequest, BatchDeleteRequest, ProxyDelete
from app.api.deps import get_proxy_service, get_scheduler_service from app.api.deps import get_proxy_service, get_scheduler_service
from app.api.common import success_response, format_proxy from app.api.common import success_response, format_proxy
from app.core.exceptions import ProxyPoolException, ProxyNotFoundException from app.core.exceptions import ProxyPoolException, ProxyNotFoundException
from app.core.config import settings as app_settings
router = APIRouter(prefix="/api/proxies", tags=["proxies"]) router = APIRouter(prefix="/api/proxies", tags=["proxies"])
_EXPORT_MAX = int(app_settings.export_max_records)
@router.get("/stats") @router.get("/stats")
async def get_stats( async def get_stats(
@@ -60,7 +63,7 @@ async def get_random_proxy(service: ProxyService = Depends(get_proxy_service)):
async def export_proxies( async def export_proxies(
fmt: str, fmt: str,
protocol: Optional[str] = None, protocol: Optional[str] = None,
limit: int = Query(default=10000, ge=1, le=100000), limit: int = Query(default=_EXPORT_MAX, ge=1, le=_EXPORT_MAX),
service: ProxyService = Depends(get_proxy_service), service: ProxyService = Depends(get_proxy_service),
): ):
if fmt not in ("csv", "txt", "json"): if fmt not in ("csv", "txt", "json"):

View File

@@ -19,7 +19,7 @@ _DEFAULTS: Dict[str, Any] = {
"validator_max_concurrency": 200, "validator_max_concurrency": 200,
"validator_connect_timeout": 3, "validator_connect_timeout": 3,
"crawler_num_validators": 50, "crawler_num_validators": 50,
"crawler_max_queue_size": 500, "crawler_max_queue_size": 48,
"log_level": "INFO", "log_level": "INFO",
"log_dir": "logs", "log_dir": "logs",
"ws_stats_interval_seconds": 1, "ws_stats_interval_seconds": 1,

View File

@@ -102,14 +102,22 @@ class CrawlJob(Job):
proxies: List[ProxyRaw] = result.proxies if result else [] proxies: List[ProxyRaw] = result.proxies if result else []
if proxies: if proxies:
from app.core.config import settings as app_settings
from app.core.db import transaction from app.core.db import transaction
from app.repositories.proxy_repo import ProxyRepository from app.repositories.proxy_repo import ProxyRepository
try: try:
initial = max(
app_settings.score_min,
min(app_settings.score_max, int(app_settings.score_valid)),
)
async with transaction() as db: async with transaction() as db:
await ProxyRepository.upsert_many_from_crawl(db, proxies, 0) await ProxyRepository.upsert_many_from_crawl(
db, proxies, initial
)
logger.info( logger.info(
f"CrawlJob {self.id}: persisted {len(proxies)} crawled proxies as pending" f"CrawlJob {self.id}: persisted {len(proxies)} crawled proxies "
f"as pending (initial score={initial})"
) )
except Exception as e: except Exception as e:
logger.error( logger.error(

View File

@@ -1,7 +1,10 @@
"""插件注册中心 - 显式注册,类型安全,测试友好""" """插件注册中心 - 显式注册,类型安全,测试友好"""
import importlib import importlib
import importlib.util
import inspect import inspect
import os import os
import sys
from pathlib import Path
from typing import Dict, List, Type, Optional from typing import Dict, List, Type, Optional
from app.core.plugin_system.base import BaseCrawlerPlugin from app.core.plugin_system.base import BaseCrawlerPlugin
from app.core.log import logger from app.core.log import logger
@@ -77,6 +80,57 @@ class PluginRegistry:
except Exception as e: except Exception as e:
logger.error(f"Failed to load module {module_name}: {e}") logger.error(f"Failed to load module {module_name}: {e}")
def load_external_plugins_directory(self, directory: Path) -> int:
"""从项目下任意目录加载 ``BaseCrawlerPlugin`` 子类(每个 ``.py`` 一个模块)。
与内置 ``app.plugins`` 并存;若 ``name`` 与已注册插件冲突则跳过并打日志。
"""
directory = Path(directory).resolve()
if not directory.is_dir():
logger.info("外部插件目录不存在,已跳过: %s", directory)
return 0
loaded = 0
for path in sorted(directory.glob("*.py")):
if path.name.startswith("_"):
continue
mod_name = f"proxypool_ext_{path.stem}_{abs(hash(str(path))) % 10_000_000_000}"
try:
spec = importlib.util.spec_from_file_location(mod_name, path)
if spec is None or spec.loader is None:
continue
module = importlib.util.module_from_spec(spec)
sys.modules[mod_name] = module
spec.loader.exec_module(module)
for attr_name in dir(module):
obj = getattr(module, attr_name)
if (
inspect.isclass(obj)
and issubclass(obj, BaseCrawlerPlugin)
and obj is not BaseCrawlerPlugin
and obj not in self._plugins.values()
):
if not getattr(obj, "name", None):
logger.warning(
"跳过外部插件类(缺少 name: %s in %s",
obj.__name__,
path,
)
continue
if obj.name in self._plugins:
logger.warning(
"外部插件 %s 与已注册插件重名,已跳过: %s",
obj.name,
path,
)
continue
self.register(obj)
loaded += 1
except Exception as e:
logger.error("加载外部插件失败 %s: %s", path, e, exc_info=True)
if loaded:
logger.info("%s 额外加载 %s 个插件", directory, loaded)
return loaded
# 全局注册中心实例 # 全局注册中心实例
registry = PluginRegistry() registry = PluginRegistry()

View File

@@ -37,3 +37,12 @@ registry.register(FpwPremproxyPlugin)
registry.register(FpwFreeproxylistsPlugin) registry.register(FpwFreeproxylistsPlugin)
registry.register(FpwGatherproxyPlugin) registry.register(FpwGatherproxyPlugin)
registry.register(FpwCheckerproxyPlugin) registry.register(FpwCheckerproxyPlugin)
# 可选:从 config 的 plugins_dir 加载用户插件(根目录下目录,非 app/plugins 包)
from pathlib import Path
from app.core.config import settings as _app_settings
from app.core.config_paths import project_root as _project_root
_ext_dir = _project_root() / _app_settings.plugins_dir
registry.load_external_plugins_directory(_ext_dir)

View File

@@ -3,6 +3,8 @@ import aiosqlite
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
from app.core.config import settings as app_settings
from app.models.domain import Proxy, ProxyRaw from app.models.domain import Proxy, ProxyRaw
from app.core.log import logger from app.core.log import logger
@@ -54,10 +56,12 @@ class ProxyRepository:
ip: str, ip: str,
port: int, port: int,
protocol: str = "http", protocol: str = "http",
score: int = 10, score: Optional[int] = None,
) -> bool: ) -> bool:
if protocol not in VALID_PROTOCOLS: if protocol not in VALID_PROTOCOLS:
protocol = "http" protocol = "http"
if score is None:
score = int(app_settings.score_valid)
try: try:
await db.execute( await db.execute(
""" """
@@ -85,7 +89,7 @@ class ProxyRepository:
protocol: str = "http", protocol: str = "http",
initial_score: int = 0, initial_score: int = 0,
) -> None: ) -> None:
"""爬取入库:待验证状态validated=0, score=0再次爬取同一条则重置为待验证""" """爬取入库待验证validated=0score 由 initial_score 决定(通常来自配置 score_valid"""
if protocol not in VALID_PROTOCOLS: if protocol not in VALID_PROTOCOLS:
protocol = "http" protocol = "http"
await db.execute( await db.execute(
@@ -232,13 +236,17 @@ class ProxyRepository:
return None return None
@staticmethod @staticmethod
async def get_random(db: aiosqlite.Connection) -> Optional[Proxy]: async def get_random(
db: aiosqlite.Connection, min_score: int = 1
) -> Optional[Proxy]:
ms = max(1, int(min_score))
async with db.execute( async with db.execute(
f""" f"""
SELECT {_SELECT_PROXY_COLS} FROM proxies SELECT {_SELECT_PROXY_COLS} FROM proxies
WHERE validated = 1 AND score > 0 WHERE validated = 1 AND score >= ?
ORDER BY RANDOM() LIMIT 1 ORDER BY RANDOM() LIMIT 1
""" """,
(ms,),
) as cursor: ) as cursor:
row = await cursor.fetchone() row = await cursor.fetchone()
if row: if row:
@@ -306,12 +314,18 @@ class ProxyRepository:
protocol: Optional[str] = None, protocol: Optional[str] = None,
batch_size: int = 1000, batch_size: int = 1000,
only_usable: bool = False, only_usable: bool = False,
usable_min_score: int = 1,
): ):
"""流式分批读取代理,避免一次性加载大量数据到内存""" """流式分批读取代理,避免一次性加载大量数据到内存"""
offset = 0 offset = 0
while True: while True:
batch = await ProxyRepository._list_batch_offset( batch = await ProxyRepository._list_batch_offset(
db, protocol, batch_size, offset, only_usable=only_usable db,
protocol,
batch_size,
offset,
only_usable=only_usable,
usable_min_score=usable_min_score,
) )
if not batch: if not batch:
break break
@@ -325,12 +339,15 @@ class ProxyRepository:
batch_size: int, batch_size: int,
offset: int, offset: int,
only_usable: bool, only_usable: bool,
usable_min_score: int = 1,
) -> List[Proxy]: ) -> List[Proxy]:
query = f"SELECT {_SELECT_PROXY_COLS} FROM proxies" query = f"SELECT {_SELECT_PROXY_COLS} FROM proxies"
params: List = [] params: List = []
clauses = [] clauses = []
if only_usable: if only_usable:
clauses.append("validated = 1 AND score > 0") ms = max(1, int(usable_min_score))
clauses.append("validated = 1 AND score >= ?")
params.append(ms)
if protocol: if protocol:
clauses.append("protocol = ?") clauses.append("protocol = ?")
params.append(protocol.lower()) params.append(protocol.lower())
@@ -396,12 +413,16 @@ class ProxyRepository:
return proxies, total return proxies, total
@staticmethod @staticmethod
async def get_stats(db: aiosqlite.Connection) -> dict: async def get_stats(
db: aiosqlite.Connection, low_score_threshold: int = 0
) -> dict:
"""统计快照。 """统计快照。
协议计数http/https/socks*)仅含已验证且 score>0 的可用代理,供首页图表与「可用」口径一致。 协议计数http/https/socks*)仅含已验证且 score>0 的可用代理,供首页图表与「可用」口径一致。
pending_* 为待验证池validated=0按协议分布。 pending_* 为待验证池validated=0按协议分布。
invalid_count已验证且 score<=0或 score 低于系统「最低分」阈值(阈值>0 时)。
""" """
thr = max(0, int(low_score_threshold))
query = """ query = """
SELECT SELECT
COUNT(*) as total, COUNT(*) as total,
@@ -416,12 +437,12 @@ class ProxyRepository:
COUNT(CASE WHEN validated = 0 AND protocol = 'https' THEN 1 END) as pending_https_count, COUNT(CASE WHEN validated = 0 AND protocol = 'https' THEN 1 END) as pending_https_count,
COUNT(CASE WHEN validated = 0 AND protocol = 'socks4' THEN 1 END) as pending_socks4_count, COUNT(CASE WHEN validated = 0 AND protocol = 'socks4' THEN 1 END) as pending_socks4_count,
COUNT(CASE WHEN validated = 0 AND protocol = 'socks5' THEN 1 END) as pending_socks5_count, COUNT(CASE WHEN validated = 0 AND protocol = 'socks5' THEN 1 END) as pending_socks5_count,
COUNT(CASE WHEN validated = 1 AND score <= 0 THEN 1 END) as invalid_count, COUNT(CASE WHEN validated = 1 AND (score <= 0 OR (? > 0 AND score < ?)) THEN 1 END) as invalid_count,
(SELECT AVG(response_time_ms) FROM proxies WHERE validated = 1 AND score > 0 (SELECT AVG(response_time_ms) FROM proxies WHERE validated = 1 AND score > 0
AND response_time_ms IS NOT NULL AND response_time_ms > 0) as avg_response_ms AND response_time_ms IS NOT NULL AND response_time_ms > 0) as avg_response_ms
FROM proxies FROM proxies
""" """
async with db.execute(query) as cursor: async with db.execute(query, (thr, thr)) as cursor:
row = await cursor.fetchone() row = await cursor.fetchone()
if row: if row:
avg_lat = row[13] avg_lat = row[13]
@@ -477,10 +498,19 @@ class ProxyRepository:
return 0 return 0
@staticmethod @staticmethod
async def clean_invalid(db: aiosqlite.Connection) -> int: async def clean_invalid(
await db.execute( db: aiosqlite.Connection, low_score_threshold: int = 0
"DELETE FROM proxies WHERE validated = 1 AND score <= 0" ) -> int:
) thr = max(0, int(low_score_threshold))
if thr > 0:
await db.execute(
"DELETE FROM proxies WHERE validated = 1 AND (score <= 0 OR score < ?)",
(thr,),
)
else:
await db.execute(
"DELETE FROM proxies WHERE validated = 1 AND score <= 0"
)
await db.commit() await db.commit()
return db.total_changes return db.total_changes

View File

@@ -5,7 +5,12 @@ from typing import List, Optional
from app.core.db import get_db from app.core.db import get_db
from app.core.plugin_system.registry import registry from app.core.plugin_system.registry import registry
from app.core.plugin_system.base import BaseCrawlerPlugin from app.core.plugin_system.base import BaseCrawlerPlugin
from app.core.exceptions import PluginNotFoundException, ValidationException from app.core.exceptions import (
PluginNotFoundException,
ProxyPoolException,
ValidationException,
)
from app.core.config import settings as app_settings
from app.repositories.settings_repo import PluginSettingsRepository from app.repositories.settings_repo import PluginSettingsRepository
from app.models.domain import PluginInfo, ProxyRaw, CrawlResult from app.models.domain import PluginInfo, ProxyRaw, CrawlResult
from app.core.log import logger from app.core.log import logger
@@ -110,7 +115,8 @@ class PluginService:
async def run_all_plugins(self, plugin_runner) -> List[ProxyRaw]: async def run_all_plugins(self, plugin_runner) -> List[ProxyRaw]:
"""执行所有启用插件的爬取,限制并发数以避免触发目标站反爬""" """执行所有启用插件的爬取,限制并发数以避免触发目标站反爬"""
all_results: List[ProxyRaw] = [] all_results: List[ProxyRaw] = []
semaphore = asyncio.Semaphore(5) n = max(1, int(app_settings.crawler_num_validators))
semaphore = asyncio.Semaphore(n)
async def _run_with_limit(plugin_name: str): async def _run_with_limit(plugin_name: str):
plugin = self.get_plugin_or_raise(plugin_name) plugin = self.get_plugin_or_raise(plugin_name)

View File

@@ -7,6 +7,7 @@ from typing import List, Optional, Tuple, AsyncIterator
from app.core.db import get_db from app.core.db import get_db
from app.repositories.proxy_repo import ProxyRepository from app.repositories.proxy_repo import ProxyRepository
from app.repositories.settings_repo import SettingsRepository
from app.models.domain import Proxy from app.models.domain import Proxy
from app.core.log import logger from app.core.log import logger
from app.core.config import settings as app_settings from app.core.config import settings as app_settings
@@ -19,7 +20,9 @@ class ProxyService:
async def get_stats(self) -> dict: async def get_stats(self) -> dict:
async with get_db() as db: async with get_db() as db:
stats = await self.proxy_repo.get_stats(db) s = await SettingsRepository.get_all(db)
floor = int(s.get("min_proxy_score", 0))
stats = await self.proxy_repo.get_stats(db, low_score_threshold=floor)
stats["today_new"] = await self.proxy_repo.get_today_new_count(db) stats["today_new"] = await self.proxy_repo.get_today_new_count(db)
return stats return stats
@@ -49,7 +52,10 @@ class ProxyService:
async def get_random_proxy(self) -> Optional[Proxy]: async def get_random_proxy(self) -> Optional[Proxy]:
async with get_db() as db: async with get_db() as db:
p = await self.proxy_repo.get_random(db) s = await SettingsRepository.get_all(db)
floor = int(s.get("min_proxy_score", 0))
ms = max(1, floor)
p = await self.proxy_repo.get_random(db, min_score=ms)
if not p: if not p:
return None return None
new_uc = int(getattr(p, "use_count", 0) or 0) + 1 new_uc = int(getattr(p, "use_count", 0) or 0) + 1
@@ -73,7 +79,9 @@ class ProxyService:
async def clean_invalid(self) -> int: async def clean_invalid(self) -> int:
async with get_db() as db: async with get_db() as db:
return await self.proxy_repo.clean_invalid(db) s = await SettingsRepository.get_all(db)
floor = int(s.get("min_proxy_score", 0))
return await self.proxy_repo.clean_invalid(db, low_score_threshold=floor)
async def clean_expired(self, days: int) -> int: async def clean_expired(self, days: int) -> int:
async with get_db() as db: async with get_db() as db:
@@ -83,8 +91,11 @@ class ProxyService:
self, self,
fmt: str, fmt: str,
protocol: Optional[str] = None, protocol: Optional[str] = None,
limit: int = 10000, limit: Optional[int] = None,
) -> AsyncIterator[str]: ) -> AsyncIterator[str]:
cap = int(app_settings.export_max_records) if limit is None else int(limit)
if cap < 1:
cap = 1
if fmt == "csv": if fmt == "csv":
yield "\ufeffIP,Port,Protocol,Score,Last Check\n" yield "\ufeffIP,Port,Protocol,Score,Last Check\n"
elif fmt == "txt": elif fmt == "txt":
@@ -95,11 +106,17 @@ class ProxyService:
exported = 0 exported = 0
async with get_db() as db: async with get_db() as db:
s = await SettingsRepository.get_all(db)
floor = max(1, int(s.get("min_proxy_score", 0)))
async for batch in self.proxy_repo.iter_batches( async for batch in self.proxy_repo.iter_batches(
db, protocol=protocol, batch_size=1000, only_usable=True db,
protocol=protocol,
batch_size=1000,
only_usable=True,
usable_min_score=floor,
): ):
for p in batch: for p in batch:
if exported >= limit: if exported >= cap:
break break
if fmt == "csv": if fmt == "csv":
yield f"{p.ip},{p.port},{p.protocol},{p.score},{self._fmt_time(p.last_check)}\n" yield f"{p.ip},{p.port},{p.protocol},{p.score},{self._fmt_time(p.last_check)}\n"
@@ -117,7 +134,7 @@ class ProxyService:
yield prefix + json.dumps(item, ensure_ascii=False) yield prefix + json.dumps(item, ensure_ascii=False)
first = False first = False
exported += 1 exported += 1
if exported >= limit: if exported >= cap:
break break
if fmt == "json": if fmt == "json":

View File

@@ -19,10 +19,14 @@ class SchedulerService:
executor: JobExecutor, executor: JobExecutor,
worker_pool: Optional[Any] = None, worker_pool: Optional[Any] = None,
interval_minutes: int = 30, interval_minutes: int = 30,
proxy_service: Optional[Any] = None,
settings_repo: Optional[Any] = None,
): ):
self.executor = executor self.executor = executor
self.worker_pool = worker_pool self.worker_pool = worker_pool
self.interval_minutes = interval_minutes self.interval_minutes = interval_minutes
self._proxy_service = proxy_service
self._settings_repo = settings_repo
self.running = False self.running = False
self._stop_event = asyncio.Event() self._stop_event = asyncio.Event()
self._task: Optional[asyncio.Task] = None self._task: Optional[asyncio.Task] = None
@@ -59,6 +63,22 @@ class SchedulerService:
async def _run_loop(self) -> None: async def _run_loop(self) -> None:
"""定时循环""" """定时循环"""
while self.running: while self.running:
if self._proxy_service is not None and self._settings_repo is not None:
try:
from app.core.db import get_db
async with get_db() as db:
s = await self._settings_repo.get_all(db)
days = int(s.get("proxy_expiry_days", 7))
removed = await self._proxy_service.clean_expired(days)
if removed:
logger.info(
"Scheduler removed %s proxies (last_check older than %s days)",
removed,
days,
)
except Exception as e:
logger.error("Scheduler clean_expired failed: %s", e, exc_info=True)
try: try:
self.executor.submit_job(ValidateAllJob(validator_pool=self.worker_pool)) self.executor.submit_job(ValidateAllJob(validator_pool=self.worker_pool))
except Exception as e: except Exception as e:

View File

@@ -6,7 +6,7 @@
"validator_max_concurrency": 200, "validator_max_concurrency": 200,
"validator_connect_timeout": 3, "validator_connect_timeout": 3,
"crawler_num_validators": 50, "crawler_num_validators": 50,
"crawler_max_queue_size": 500, "crawler_max_queue_size": 48,
"log_level": "INFO", "log_level": "INFO",
"log_dir": "logs", "log_dir": "logs",
"ws_stats_interval_seconds": 1, "ws_stats_interval_seconds": 1,

View File

@@ -6,7 +6,7 @@
"validator_max_concurrency": 200, "validator_max_concurrency": 200,
"validator_connect_timeout": 3, "validator_connect_timeout": 3,
"crawler_num_validators": 50, "crawler_num_validators": 50,
"crawler_max_queue_size": 500, "crawler_max_queue_size": 48,
"log_level": "INFO", "log_level": "INFO",
"log_dir": "logs", "log_dir": "logs",
"ws_stats_interval_seconds": 1, "ws_stats_interval_seconds": 1,

102
script/reset_and_recrawl.py Normal file
View File

@@ -0,0 +1,102 @@
"""清空 proxies 表,并依次执行各启用插件爬取;可选触发运行中 API 的全量验证。
用法(在项目根目录)::
python script/reset_and_recrawl.py
python script/reset_and_recrawl.py --api-base http://127.0.0.1:18080
python script/reset_and_recrawl.py --skip-validate
"""
from __future__ import annotations
import argparse
import asyncio
import sys
from pathlib import Path
# 项目根
_ROOT = Path(__file__).resolve().parents[1]
if str(_ROOT) not in sys.path:
sys.path.insert(0, str(_ROOT))
async def _main(api_base: str, skip_validate: bool) -> None:
    """Wipe the proxies table, re-crawl every enabled plugin, then optionally
    ask a running API instance to start a full validation pass.

    Args:
        api_base: Root URL of a running ProxyPool API (used for the
            validate-now call).
        skip_validate: When True, skip the remote validation trigger entirely.
    """
    # Imports are deferred until after the sys.path bootstrap at module top.
    from app.core.db import init_db, get_db, transaction
    from app.repositories.proxy_repo import ProxyRepository
    from app.core.config import settings
    import app.plugins  # noqa: F401 — importing registers built-in and external plugins
    from app.core.plugin_system.registry import registry
    from app.services.plugin_runner import PluginRunner

    await init_db()

    # Phase 1: clear the proxy table.
    async with get_db() as conn:
        await conn.execute("DELETE FROM proxies")
        await conn.commit()
    print("已清空表 proxies")

    # Initial score for freshly crawled rows: score_valid clamped into
    # [score_min, score_max].
    initial = max(
        settings.score_min,
        min(settings.score_max, int(settings.score_valid)),
    )

    # Phase 2: crawl each enabled plugin sequentially and persist results.
    plugin_runner = PluginRunner()
    total_in = 0
    for plugin in registry.list_plugins():
        if not plugin.enabled:
            print(f"[跳过] {plugin.name}(已禁用)")
            continue
        print(f"[爬取] {plugin.name}", flush=True)
        try:
            result = await plugin_runner.run(plugin)
            proxies = result.proxies or []
            if proxies:
                async with transaction() as conn:
                    await ProxyRepository.upsert_many_from_crawl(
                        conn, proxies, initial
                    )
                total_in += len(proxies)
                print(f" -> {len(proxies)} 条已入库(待验证)")
            else:
                err = result.error or "无数据"
                print(f" -> 0 条 ({err})")
        except Exception as exc:
            # Best-effort per plugin: report and keep going.
            print(f" -> 失败: {exc}")
    print(f"爬取阶段结束,累计入库约 {total_in} 条(去重前按插件计)。")

    # Phase 3: optionally trigger a full validation on a running API.
    if skip_validate:
        print("已跳过远程全量验证。请启动 API 后执行 POST /api/scheduler/validate-now")
        return
    try:
        import httpx
    except ImportError:
        print("未安装 httpx,跳过远程全量验证。")
        return
    url = api_base.rstrip("/") + "/api/scheduler/validate-now"
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.post(url)
            content_type = resp.headers.get("content-type", "")
            is_json = content_type.startswith("application/json")
            data = resp.json() if is_json else {}
            if resp.status_code == 200 and data.get("code") == 200:
                print("已提交全量验证:", data.get("data"))
            else:
                print(f"全量验证请求异常 HTTP {resp.status_code}: {data or resp.text[:200]}")
    except Exception as exc:
        print(f"无法连接 API({url}):{exc}")
def main() -> None:
    """CLI entry point: parse arguments and run the async reset/crawl flow."""
    parser = argparse.ArgumentParser(description="清空代理并逐插件爬取")
    parser.add_argument(
        "--api-base",
        default="http://127.0.0.1:18080",
        help="运行中的 ProxyPool API 根地址,用于提交全量验证",
    )
    parser.add_argument(
        "--skip-validate",
        action="store_true",
        help="不调用 HTTP 全量验证",
    )
    ns = parser.parse_args()
    asyncio.run(_main(ns.api_base, ns.skip_validate))


if __name__ == "__main__":
    main()