refactor(backend): optimize database safety, validator performance, and scheduler concurrency

- Fix SQL injection risks in proxy_repo and task_repo
- Atomic acquire_pending with UPDATE ... RETURNING
- Reuse aiohttp ClientSession in ValidatorService
- Replace polling with asyncio.Event in SchedulerService
- Optimize ValidationQueue.drain with asyncio.Condition
- Concurrent plugin crawling with asyncio.gather
- Unify ProxyRaw model import path
- Fix test baseline and remove tracked __pycache__ files
This commit is contained in:
祀梦
2026-04-04 14:43:31 +08:00
parent abb8b32ed3
commit 635c524a7e
27 changed files with 103 additions and 89 deletions

View File

@@ -1,20 +1,7 @@
"""插件基类 - 所有爬虫插件必须继承此基类"""
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Dict, Any
@dataclass
class ProxyRaw:
"""爬虫产出的原始代理数据"""
ip: str
port: int
protocol: str = "http"
def __post_init__(self):
self.protocol = self.protocol.lower().strip()
if self.protocol not in ("http", "https", "socks4", "socks5"):
self.protocol = "http"
from app.models.domain import ProxyRaw
class BaseCrawlerPlugin(ABC):

View File

@@ -40,6 +40,8 @@ class ValidationQueue:
self._workers: list[asyncio.Task] = []
self._running = False
self._db_lock = asyncio.Lock()
self._pending_count = 0
self._condition = asyncio.Condition()
# 统计
self.valid_count = 0
@@ -58,6 +60,8 @@ class ValidationQueue:
logger.info(f"ValidationQueue recovered {recovered} interrupted tasks")
if pending:
logger.info(f"ValidationQueue has {pending} pending tasks to process")
async with self._condition:
self._pending_count = pending
for i in range(self.worker_count):
self._workers.append(asyncio.create_task(self._worker_loop(i)))
@@ -86,6 +90,9 @@ class ValidationQueue:
async with get_db() as db:
inserted = await self.task_repo.insert_batch(db, proxies)
if inserted:
async with self._condition:
self._pending_count += inserted
self._condition.notify_all()
for _ in range(min(inserted, self.worker_count)):
self._signal.put_nowait(None)
@@ -94,12 +101,9 @@ class ValidationQueue:
async def drain(self):
"""等待队列中当前所有 pending 任务处理完毕"""
while True:
async with get_db() as db:
count = await self.task_repo.get_pending_count(db)
if count == 0:
break
await asyncio.sleep(0.5)
async with self._condition:
if self._pending_count > 0:
await self._condition.wait_for(lambda: self._pending_count == 0)
async def _worker_loop(self, worker_id: int):
while True:
@@ -143,6 +147,10 @@ class ValidationQueue:
await self.task_repo.complete_task(db, task["id"], False, 0.0)
self.invalid_count += 1
logger.debug(f"ValidationQueue: invalid {proxy.ip}:{proxy.port}")
async with self._condition:
self._pending_count = max(0, self._pending_count - 1)
if self._pending_count == 0:
self._condition.notify_all()
def reset_stats(self):
self.valid_count = 0