refactor(backend): optimize database safety, validator performance, and scheduler concurrency
- Fix SQL injection risks in proxy_repo and task_repo - Atomic acquire_pending with UPDATE ... RETURNING - Reuse aiohttp ClientSession in ValidatorService - Replace polling with asyncio.Event in SchedulerService - Optimize ValidationQueue.drain with asyncio.Condition - Concurrent plugin crawling with asyncio.gather - Unify ProxyRaw model import path - Fix test baseline and remove tracked __pycache__ files
This commit is contained in:
@@ -1,20 +1,7 @@
|
||||
"""插件基类 - 所有爬虫插件必须继承此基类"""
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict, Any
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProxyRaw:
|
||||
"""爬虫产出的原始代理数据"""
|
||||
ip: str
|
||||
port: int
|
||||
protocol: str = "http"
|
||||
|
||||
def __post_init__(self):
|
||||
self.protocol = self.protocol.lower().strip()
|
||||
if self.protocol not in ("http", "https", "socks4", "socks5"):
|
||||
self.protocol = "http"
|
||||
from app.models.domain import ProxyRaw
|
||||
|
||||
|
||||
class BaseCrawlerPlugin(ABC):
|
||||
|
||||
@@ -40,6 +40,8 @@ class ValidationQueue:
|
||||
self._workers: list[asyncio.Task] = []
|
||||
self._running = False
|
||||
self._db_lock = asyncio.Lock()
|
||||
self._pending_count = 0
|
||||
self._condition = asyncio.Condition()
|
||||
|
||||
# 统计
|
||||
self.valid_count = 0
|
||||
@@ -58,6 +60,8 @@ class ValidationQueue:
|
||||
logger.info(f"ValidationQueue recovered {recovered} interrupted tasks")
|
||||
if pending:
|
||||
logger.info(f"ValidationQueue has {pending} pending tasks to process")
|
||||
async with self._condition:
|
||||
self._pending_count = pending
|
||||
|
||||
for i in range(self.worker_count):
|
||||
self._workers.append(asyncio.create_task(self._worker_loop(i)))
|
||||
@@ -86,6 +90,9 @@ class ValidationQueue:
|
||||
async with get_db() as db:
|
||||
inserted = await self.task_repo.insert_batch(db, proxies)
|
||||
if inserted:
|
||||
async with self._condition:
|
||||
self._pending_count += inserted
|
||||
self._condition.notify_all()
|
||||
for _ in range(min(inserted, self.worker_count)):
|
||||
self._signal.put_nowait(None)
|
||||
|
||||
@@ -94,12 +101,9 @@ class ValidationQueue:
|
||||
|
||||
async def drain(self):
|
||||
"""等待队列中当前所有 pending 任务处理完毕"""
|
||||
while True:
|
||||
async with get_db() as db:
|
||||
count = await self.task_repo.get_pending_count(db)
|
||||
if count == 0:
|
||||
break
|
||||
await asyncio.sleep(0.5)
|
||||
async with self._condition:
|
||||
if self._pending_count > 0:
|
||||
await self._condition.wait_for(lambda: self._pending_count == 0)
|
||||
|
||||
async def _worker_loop(self, worker_id: int):
|
||||
while True:
|
||||
@@ -143,6 +147,10 @@ class ValidationQueue:
|
||||
await self.task_repo.complete_task(db, task["id"], False, 0.0)
|
||||
self.invalid_count += 1
|
||||
logger.debug(f"ValidationQueue: invalid {proxy.ip}:{proxy.port}")
|
||||
async with self._condition:
|
||||
self._pending_count = max(0, self._pending_count - 1)
|
||||
if self._pending_count == 0:
|
||||
self._condition.notify_all()
|
||||
|
||||
def reset_stats(self):
|
||||
self.valid_count = 0
|
||||
|
||||
Reference in New Issue
Block a user