feat: external plugin loading, score threshold, expiry cleanup, and other improvements
Made-with: Cursor
@@ -14,6 +14,7 @@ from app.services.validator_service import ValidatorService
 from app.services.proxy_scoring import compute_proxy_quality_score
 from app.services.plugin_runner import PluginRunner
 from app.services.scheduler_service import SchedulerService
+from app.services.proxy_service import ProxyService
 from app.api.ws_manager import ConnectionManager
 from app.api.realtime import stats_broadcaster_loop

@@ -80,10 +81,14 @@ async def lifespan(app: FastAPI):
                     proxy.protocol,
                     score=q_score,
                 )
-                if latency:
-                    await proxy_repo.update_response_time(
-                        db, proxy.ip, proxy.port, latency
-                    )
+                rt_ms = (
+                    float(latency)
+                    if latency is not None and float(latency) > 0
+                    else float(app_settings.score_default_latency_ms)
+                )
+                await proxy_repo.update_response_time(
+                    db, proxy.ip, proxy.port, rt_ms
+                )
             else:
                 await proxy_repo.delete(db, proxy.ip, proxy.port)
         else:
@@ -104,10 +109,14 @@ async def lifespan(app: FastAPI):
                     proxy.protocol,
                     score=q_score,
                 )
-                if latency:
-                    await proxy_repo.update_response_time(
-                        db, proxy.ip, proxy.port, latency
-                    )
+                rt_ms = (
+                    float(latency)
+                    if latency is not None and float(latency) > 0
+                    else float(app_settings.score_default_latency_ms)
+                )
+                await proxy_repo.update_response_time(
+                    db, proxy.ip, proxy.port, rt_ms
+                )
             else:
                 await proxy_repo.update_score(
                     db,
@@ -125,20 +134,25 @@ async def lifespan(app: FastAPI):
         )
         await stack.enter_async_context(worker_pool)

-        # Job executor: slots must cover the N CrawlJobs of a "crawl all" run plus the aggregation job, full validation, etc.
+        # Job executor: concurrency slots (bounded jointly by crawler_max_queue_size and the plugin count, to avoid crawl-all deadlock)
         _n_plugins = len(registry.list_plugins())
-        _max_jobs = max(24, _n_plugins + 8)
+        _floor = max(24, _n_plugins + 8)
+        _max_jobs = max(_floor, app_settings.crawler_max_queue_size)
         executor = JobExecutor(worker_pool=worker_pool, max_concurrent_jobs=_max_jobs)
         await stack.enter_async_context(executor)

         # Plugin runner
         plugin_runner = PluginRunner()

+        proxy_service = ProxyService()
+
         # Scheduler
         scheduler = SchedulerService(
             executor=executor,
             worker_pool=worker_pool,
             interval_minutes=db_settings.get("validate_interval_minutes", 30),
+            proxy_service=proxy_service,
+            settings_repo=settings_repo,
         )

         # Mount onto app.state
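[Note] To make the slot arithmetic above concrete: with, say, 30 registered plugins (an illustrative count) and the new default crawler_max_queue_size = 48, _floor = max(24, 30 + 8) = 38 and _max_jobs = max(38, 48) = 48, so the executor always has at least as many concurrent-job slots as the crawl queue can hold.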
@@ -10,9 +10,12 @@ from app.models.schemas import ProxyListRequest, BatchDeleteRequest, ProxyDelete
 from app.api.deps import get_proxy_service, get_scheduler_service
 from app.api.common import success_response, format_proxy
 from app.core.exceptions import ProxyPoolException, ProxyNotFoundException
+from app.core.config import settings as app_settings

 router = APIRouter(prefix="/api/proxies", tags=["proxies"])

+_EXPORT_MAX = int(app_settings.export_max_records)
+

 @router.get("/stats")
 async def get_stats(
@@ -60,7 +63,7 @@ async def get_random_proxy(service: ProxyService = Depends(get_proxy_service)):
 async def export_proxies(
     fmt: str,
     protocol: Optional[str] = None,
-    limit: int = Query(default=10000, ge=1, le=100000),
+    limit: int = Query(default=_EXPORT_MAX, ge=1, le=_EXPORT_MAX),
     service: ProxyService = Depends(get_proxy_service),
 ):
     if fmt not in ("csv", "txt", "json"):
@@ -19,7 +19,7 @@ _DEFAULTS: Dict[str, Any] = {
     "validator_max_concurrency": 200,
     "validator_connect_timeout": 3,
     "crawler_num_validators": 50,
-    "crawler_max_queue_size": 500,
+    "crawler_max_queue_size": 48,
     "log_level": "INFO",
     "log_dir": "logs",
     "ws_stats_interval_seconds": 1,
@@ -102,14 +102,22 @@ class CrawlJob(Job):
         proxies: List[ProxyRaw] = result.proxies if result else []

         if proxies:
+            from app.core.config import settings as app_settings
             from app.core.db import transaction
             from app.repositories.proxy_repo import ProxyRepository

             try:
+                initial = max(
+                    app_settings.score_min,
+                    min(app_settings.score_max, int(app_settings.score_valid)),
+                )
                 async with transaction() as db:
-                    await ProxyRepository.upsert_many_from_crawl(db, proxies, 0)
+                    await ProxyRepository.upsert_many_from_crawl(
+                        db, proxies, initial
+                    )
                 logger.info(
-                    f"CrawlJob {self.id}: persisted {len(proxies)} crawled proxies as pending"
+                    f"CrawlJob {self.id}: persisted {len(proxies)} crawled proxies "
+                    f"as pending (initial score={initial})"
                 )
             except Exception as e:
                 logger.error(
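[Note] Worked example for the initial-score clamp above, using illustrative config values score_min = 0, score_max = 100, score_valid = 10: initial = max(0, min(100, 10)) = 10, so freshly crawled proxies now enter the pool at the configured valid score instead of the previous hard-coded 0.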
@@ -1,7 +1,10 @@
 """Plugin registry - explicit registration, type-safe, test-friendly"""
 import importlib
+import importlib.util
 import inspect
 import os
+import sys
+from pathlib import Path
 from typing import Dict, List, Type, Optional
 from app.core.plugin_system.base import BaseCrawlerPlugin
 from app.core.log import logger
@@ -77,6 +80,57 @@ class PluginRegistry:
             except Exception as e:
                 logger.error(f"Failed to load module {module_name}: {e}")

+    def load_external_plugins_directory(self, directory: Path) -> int:
+        """Load ``BaseCrawlerPlugin`` subclasses from an arbitrary directory under the project (one module per ``.py``).
+
+        Coexists with the built-in ``app.plugins``; if a ``name`` collides with an already registered plugin, it is skipped and logged.
+        """
+        directory = Path(directory).resolve()
+        if not directory.is_dir():
+            logger.info("External plugin directory does not exist, skipping: %s", directory)
+            return 0
+        loaded = 0
+        for path in sorted(directory.glob("*.py")):
+            if path.name.startswith("_"):
+                continue
+            mod_name = f"proxypool_ext_{path.stem}_{abs(hash(str(path))) % 10_000_000_000}"
+            try:
+                spec = importlib.util.spec_from_file_location(mod_name, path)
+                if spec is None or spec.loader is None:
+                    continue
+                module = importlib.util.module_from_spec(spec)
+                sys.modules[mod_name] = module
+                spec.loader.exec_module(module)
+                for attr_name in dir(module):
+                    obj = getattr(module, attr_name)
+                    if (
+                        inspect.isclass(obj)
+                        and issubclass(obj, BaseCrawlerPlugin)
+                        and obj is not BaseCrawlerPlugin
+                        and obj not in self._plugins.values()
+                    ):
+                        if not getattr(obj, "name", None):
+                            logger.warning(
+                                "Skipping external plugin class (missing name): %s in %s",
+                                obj.__name__,
+                                path,
+                            )
+                            continue
+                        if obj.name in self._plugins:
+                            logger.warning(
+                                "External plugin %s duplicates a registered plugin name, skipped: %s",
+                                obj.name,
+                                path,
+                            )
+                            continue
+                        self.register(obj)
+                        loaded += 1
+            except Exception as e:
+                logger.error("Failed to load external plugin %s: %s", path, e, exc_info=True)
+        if loaded:
+            logger.info("External plugins loaded from %s: %s", directory, loaded)
+        return loaded
+

 # Global registry instance
 registry = PluginRegistry()
@@ -37,3 +37,12 @@ registry.register(FpwPremproxyPlugin)
 registry.register(FpwFreeproxylistsPlugin)
 registry.register(FpwGatherproxyPlugin)
 registry.register(FpwCheckerproxyPlugin)
+
+# Optional: load user plugins from the plugins_dir configured in config (a directory under the project root, not the app/plugins package)
+from pathlib import Path
+
+from app.core.config import settings as _app_settings
+from app.core.config_paths import project_root as _project_root
+
+_ext_dir = _project_root() / _app_settings.plugins_dir
+registry.load_external_plugins_directory(_ext_dir)
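[Note] With the plugins_dir wiring above, adding a crawler source no longer requires touching app/plugins: drop a single .py file into <project_root>/<plugins_dir>/ and it is registered at import time. A minimal sketch follows; the crawl() hook name, the enabled flag, and the ProxyRaw fields are assumptions for illustration, since BaseCrawlerPlugin's abstract interface is not part of this diff — only the required name class attribute and the duplicate-name check are.

    # <plugins_dir>/my_source.py — picked up by load_external_plugins_directory()
    from app.core.plugin_system.base import BaseCrawlerPlugin
    from app.models.domain import ProxyRaw


    class MySourcePlugin(BaseCrawlerPlugin):
        name = "my_source"  # required: classes without a name are skipped with a warning
        enabled = True      # assumed flag, mirroring the plugin.enabled checks used elsewhere

        async def crawl(self) -> list[ProxyRaw]:  # assumed hook name
            # Fetch and parse your source here; the ProxyRaw fields below are illustrative.
            return [ProxyRaw(ip="203.0.113.10", port=8080, protocol="http")]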
@@ -3,6 +3,8 @@ import aiosqlite
 from datetime import datetime, timedelta
 from typing import List, Optional, Tuple, Union

+from app.core.config import settings as app_settings
+
 from app.models.domain import Proxy, ProxyRaw
 from app.core.log import logger

@@ -54,10 +56,12 @@ class ProxyRepository:
         ip: str,
         port: int,
         protocol: str = "http",
-        score: int = 10,
+        score: Optional[int] = None,
     ) -> bool:
         if protocol not in VALID_PROTOCOLS:
             protocol = "http"
+        if score is None:
+            score = int(app_settings.score_valid)
         try:
             await db.execute(
                 """
@@ -85,7 +89,7 @@ class ProxyRepository:
         protocol: str = "http",
         initial_score: int = 0,
     ) -> None:
-        """Crawl insert: pending-validation state (validated=0, score=0); re-crawling the same proxy resets it to pending."""
+        """Crawl insert: pending validation (validated=0); the score comes from initial_score (usually the score_valid setting)."""
         if protocol not in VALID_PROTOCOLS:
             protocol = "http"
         await db.execute(
@@ -232,13 +236,17 @@ class ProxyRepository:
             return None

     @staticmethod
-    async def get_random(db: aiosqlite.Connection) -> Optional[Proxy]:
+    async def get_random(
+        db: aiosqlite.Connection, min_score: int = 1
+    ) -> Optional[Proxy]:
+        ms = max(1, int(min_score))
         async with db.execute(
             f"""
             SELECT {_SELECT_PROXY_COLS} FROM proxies
-            WHERE validated = 1 AND score > 0
+            WHERE validated = 1 AND score >= ?
             ORDER BY RANDOM() LIMIT 1
-            """
+            """,
+            (ms,),
         ) as cursor:
             row = await cursor.fetchone()
             if row:
@@ -306,12 +314,18 @@ class ProxyRepository:
         protocol: Optional[str] = None,
         batch_size: int = 1000,
         only_usable: bool = False,
+        usable_min_score: int = 1,
     ):
         """Stream proxies in batches to avoid loading a large amount of data into memory at once"""
         offset = 0
         while True:
             batch = await ProxyRepository._list_batch_offset(
-                db, protocol, batch_size, offset, only_usable=only_usable
+                db,
+                protocol,
+                batch_size,
+                offset,
+                only_usable=only_usable,
+                usable_min_score=usable_min_score,
             )
             if not batch:
                 break
@@ -325,12 +339,15 @@ class ProxyRepository:
         batch_size: int,
         offset: int,
         only_usable: bool,
+        usable_min_score: int = 1,
     ) -> List[Proxy]:
         query = f"SELECT {_SELECT_PROXY_COLS} FROM proxies"
         params: List = []
         clauses = []
         if only_usable:
-            clauses.append("validated = 1 AND score > 0")
+            ms = max(1, int(usable_min_score))
+            clauses.append("validated = 1 AND score >= ?")
+            params.append(ms)
         if protocol:
             clauses.append("protocol = ?")
             params.append(protocol.lower())
@@ -396,12 +413,16 @@ class ProxyRepository:
         return proxies, total

     @staticmethod
-    async def get_stats(db: aiosqlite.Connection) -> dict:
+    async def get_stats(
+        db: aiosqlite.Connection, low_score_threshold: int = 0
+    ) -> dict:
         """Stats snapshot.

         Protocol counts (http/https/socks*) only include validated, usable proxies with score > 0, so the dashboard chart matches the "usable" definition.
         pending_* give the per-protocol breakdown of the pending pool (validated=0).
+        invalid_count: validated with score <= 0, or with a score below the system minimum-score threshold (when the threshold > 0).
         """
+        thr = max(0, int(low_score_threshold))
         query = """
             SELECT
                 COUNT(*) as total,
@@ -416,12 +437,12 @@ class ProxyRepository:
                 COUNT(CASE WHEN validated = 0 AND protocol = 'https' THEN 1 END) as pending_https_count,
                 COUNT(CASE WHEN validated = 0 AND protocol = 'socks4' THEN 1 END) as pending_socks4_count,
                 COUNT(CASE WHEN validated = 0 AND protocol = 'socks5' THEN 1 END) as pending_socks5_count,
-                COUNT(CASE WHEN validated = 1 AND score <= 0 THEN 1 END) as invalid_count,
+                COUNT(CASE WHEN validated = 1 AND (score <= 0 OR (? > 0 AND score < ?)) THEN 1 END) as invalid_count,
                 (SELECT AVG(response_time_ms) FROM proxies WHERE validated = 1 AND score > 0
                  AND response_time_ms IS NOT NULL AND response_time_ms > 0) as avg_response_ms
             FROM proxies
         """
-        async with db.execute(query) as cursor:
+        async with db.execute(query, (thr, thr)) as cursor:
             row = await cursor.fetchone()
             if row:
                 avg_lat = row[13]
@@ -477,10 +498,19 @@ class ProxyRepository:
         return 0

     @staticmethod
-    async def clean_invalid(db: aiosqlite.Connection) -> int:
-        await db.execute(
-            "DELETE FROM proxies WHERE validated = 1 AND score <= 0"
-        )
+    async def clean_invalid(
+        db: aiosqlite.Connection, low_score_threshold: int = 0
+    ) -> int:
+        thr = max(0, int(low_score_threshold))
+        if thr > 0:
+            await db.execute(
+                "DELETE FROM proxies WHERE validated = 1 AND (score <= 0 OR score < ?)",
+                (thr,),
+            )
+        else:
+            await db.execute(
+                "DELETE FROM proxies WHERE validated = 1 AND score <= 0"
+            )
         await db.commit()
         return db.total_changes

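[Note] The threshold changes clean_invalid's behaviour only when a floor is set: with low_score_threshold = 20, a validated proxy scored 15 is deleted together with the score <= 0 rows; with the threshold left at 0, the query falls back to the old form and removes only score <= 0 rows.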
@@ -5,7 +5,12 @@ from typing import List, Optional
 from app.core.db import get_db
 from app.core.plugin_system.registry import registry
 from app.core.plugin_system.base import BaseCrawlerPlugin
-from app.core.exceptions import PluginNotFoundException, ValidationException
+from app.core.exceptions import (
+    PluginNotFoundException,
+    ProxyPoolException,
+    ValidationException,
+)
+from app.core.config import settings as app_settings
 from app.repositories.settings_repo import PluginSettingsRepository
 from app.models.domain import PluginInfo, ProxyRaw, CrawlResult
 from app.core.log import logger
@@ -110,7 +115,8 @@ class PluginService:
     async def run_all_plugins(self, plugin_runner) -> List[ProxyRaw]:
         """Run crawling for all enabled plugins, capping concurrency to avoid triggering anti-scraping on target sites"""
         all_results: List[ProxyRaw] = []
-        semaphore = asyncio.Semaphore(5)
+        n = max(1, int(app_settings.crawler_num_validators))
+        semaphore = asyncio.Semaphore(n)

         async def _run_with_limit(plugin_name: str):
             plugin = self.get_plugin_or_raise(plugin_name)
@@ -7,6 +7,7 @@ from typing import List, Optional, Tuple, AsyncIterator

 from app.core.db import get_db
 from app.repositories.proxy_repo import ProxyRepository
+from app.repositories.settings_repo import SettingsRepository
 from app.models.domain import Proxy
 from app.core.log import logger
 from app.core.config import settings as app_settings
@@ -19,7 +20,9 @@ class ProxyService:

     async def get_stats(self) -> dict:
         async with get_db() as db:
-            stats = await self.proxy_repo.get_stats(db)
+            s = await SettingsRepository.get_all(db)
+            floor = int(s.get("min_proxy_score", 0))
+            stats = await self.proxy_repo.get_stats(db, low_score_threshold=floor)
             stats["today_new"] = await self.proxy_repo.get_today_new_count(db)
             return stats

@@ -49,7 +52,10 @@ class ProxyService:

     async def get_random_proxy(self) -> Optional[Proxy]:
         async with get_db() as db:
-            p = await self.proxy_repo.get_random(db)
+            s = await SettingsRepository.get_all(db)
+            floor = int(s.get("min_proxy_score", 0))
+            ms = max(1, floor)
+            p = await self.proxy_repo.get_random(db, min_score=ms)
             if not p:
                 return None
             new_uc = int(getattr(p, "use_count", 0) or 0) + 1
@@ -73,7 +79,9 @@ class ProxyService:

     async def clean_invalid(self) -> int:
         async with get_db() as db:
-            return await self.proxy_repo.clean_invalid(db)
+            s = await SettingsRepository.get_all(db)
+            floor = int(s.get("min_proxy_score", 0))
+            return await self.proxy_repo.clean_invalid(db, low_score_threshold=floor)

     async def clean_expired(self, days: int) -> int:
         async with get_db() as db:
@@ -83,8 +91,11 @@ class ProxyService:
         self,
         fmt: str,
         protocol: Optional[str] = None,
-        limit: int = 10000,
+        limit: Optional[int] = None,
     ) -> AsyncIterator[str]:
+        cap = int(app_settings.export_max_records) if limit is None else int(limit)
+        if cap < 1:
+            cap = 1
         if fmt == "csv":
             yield "\ufeffIP,Port,Protocol,Score,Last Check\n"
         elif fmt == "txt":
@@ -95,11 +106,17 @@ class ProxyService:

         exported = 0
         async with get_db() as db:
+            s = await SettingsRepository.get_all(db)
+            floor = max(1, int(s.get("min_proxy_score", 0)))
             async for batch in self.proxy_repo.iter_batches(
-                db, protocol=protocol, batch_size=1000, only_usable=True
+                db,
+                protocol=protocol,
+                batch_size=1000,
+                only_usable=True,
+                usable_min_score=floor,
             ):
                 for p in batch:
-                    if exported >= limit:
+                    if exported >= cap:
                         break
                     if fmt == "csv":
                         yield f"{p.ip},{p.port},{p.protocol},{p.score},{self._fmt_time(p.last_check)}\n"
@@ -117,7 +134,7 @@ class ProxyService:
                         yield prefix + json.dumps(item, ensure_ascii=False)
                         first = False
                     exported += 1
-                    if exported >= limit:
+                    if exported >= cap:
                         break

         if fmt == "json":
@@ -19,10 +19,14 @@ class SchedulerService:
         executor: JobExecutor,
         worker_pool: Optional[Any] = None,
         interval_minutes: int = 30,
+        proxy_service: Optional[Any] = None,
+        settings_repo: Optional[Any] = None,
     ):
         self.executor = executor
         self.worker_pool = worker_pool
         self.interval_minutes = interval_minutes
+        self._proxy_service = proxy_service
+        self._settings_repo = settings_repo
         self.running = False
         self._stop_event = asyncio.Event()
         self._task: Optional[asyncio.Task] = None
@@ -59,6 +63,22 @@ class SchedulerService:
     async def _run_loop(self) -> None:
         """Periodic loop"""
         while self.running:
+            if self._proxy_service is not None and self._settings_repo is not None:
+                try:
+                    from app.core.db import get_db
+
+                    async with get_db() as db:
+                        s = await self._settings_repo.get_all(db)
+                        days = int(s.get("proxy_expiry_days", 7))
+                    removed = await self._proxy_service.clean_expired(days)
+                    if removed:
+                        logger.info(
+                            "Scheduler removed %s proxies (last_check older than %s days)",
+                            removed,
+                            days,
+                        )
+                except Exception as e:
+                    logger.error("Scheduler clean_expired failed: %s", e, exc_info=True)
             try:
                 self.executor.submit_job(ValidateAllJob(validator_pool=self.worker_pool))
             except Exception as e:
@@ -6,7 +6,7 @@
     "validator_max_concurrency": 200,
     "validator_connect_timeout": 3,
     "crawler_num_validators": 50,
-    "crawler_max_queue_size": 500,
+    "crawler_max_queue_size": 48,
     "log_level": "INFO",
     "log_dir": "logs",
     "ws_stats_interval_seconds": 1,
@@ -6,7 +6,7 @@
     "validator_max_concurrency": 200,
     "validator_connect_timeout": 3,
    "crawler_num_validators": 50,
-    "crawler_max_queue_size": 500,
+    "crawler_max_queue_size": 48,
     "log_level": "INFO",
     "log_dir": "logs",
     "ws_stats_interval_seconds": 1,
script/reset_and_recrawl.py (new file, 102 lines)
@@ -0,0 +1,102 @@
+"""Clear the proxies table and run each enabled plugin's crawl in turn; optionally trigger a full validation on a running API.
+
+Usage (from the project root)::
+    python script/reset_and_recrawl.py
+    python script/reset_and_recrawl.py --api-base http://127.0.0.1:18080
+    python script/reset_and_recrawl.py --skip-validate
+"""
+from __future__ import annotations
+
+import argparse
+import asyncio
+import sys
+from pathlib import Path
+
+# Project root
+_ROOT = Path(__file__).resolve().parents[1]
+if str(_ROOT) not in sys.path:
+    sys.path.insert(0, str(_ROOT))
+
+
+async def _main(api_base: str, skip_validate: bool) -> None:
+    from app.core.db import init_db, get_db, transaction
+    from app.repositories.proxy_repo import ProxyRepository
+    from app.core.config import settings
+    import app.plugins  # noqa: F401 — registers built-in and external plugins
+    from app.core.plugin_system.registry import registry
+    from app.services.plugin_runner import PluginRunner
+
+    await init_db()
+    async with get_db() as db:
+        await db.execute("DELETE FROM proxies")
+        await db.commit()
+    print("Cleared the proxies table")
+
+    initial = max(
+        settings.score_min,
+        min(settings.score_max, int(settings.score_valid)),
+    )
+    runner = PluginRunner()
+    total_in = 0
+    for plugin in registry.list_plugins():
+        if not plugin.enabled:
+            print(f"[skip] {plugin.name} (disabled)")
+            continue
+        print(f"[crawl] {plugin.name} …", flush=True)
+        try:
+            result = await runner.run(plugin)
+            proxies = result.proxies or []
+            if not proxies:
+                err = result.error or "no data"
+                print(f" -> 0 proxies ({err})")
+                continue
+            async with transaction() as db:
+                await ProxyRepository.upsert_many_from_crawl(db, proxies, initial)
+            total_in += len(proxies)
+            print(f" -> {len(proxies)} proxies stored (pending validation)")
+        except Exception as e:
+            print(f" -> failed: {e}")
+
+    print(f"Crawl phase finished; roughly {total_in} proxies stored in total (counted per plugin, before dedup).")
+
+    if skip_validate:
+        print("Skipped remote full validation. Start the API and then POST /api/scheduler/validate-now")
+        return
+
+    try:
+        import httpx
+    except ImportError:
+        print("httpx is not installed; skipping remote full validation.")
+        return
+
+    url = api_base.rstrip("/") + "/api/scheduler/validate-now"
+    try:
+        async with httpx.AsyncClient(timeout=60.0) as client:
+            r = await client.post(url)
+            data = r.json() if r.headers.get("content-type", "").startswith("application/json") else {}
+            if r.status_code == 200 and data.get("code") == 200:
+                print("Full validation submitted:", data.get("data"))
+            else:
+                print(f"Full validation request failed, HTTP {r.status_code}: {data or r.text[:200]}")
+    except Exception as e:
+        print(f"Could not reach the API ({url}): {e}")
+
+
+def main() -> None:
+    p = argparse.ArgumentParser(description="Clear all proxies and re-crawl plugin by plugin")
+    p.add_argument(
+        "--api-base",
+        default="http://127.0.0.1:18080",
+        help="Base URL of a running ProxyPool API, used to submit the full validation",
+    )
+    p.add_argument(
+        "--skip-validate",
+        action="store_true",
+        help="Do not call the HTTP full-validation endpoint",
+    )
+    args = p.parse_args()
+    asyncio.run(_main(args.api_base, args.skip_validate))
+
+
+if __name__ == "__main__":
+    main()