"""Generic HTTP crawler base class - shared plumbing for HTTP-request-based plugins."""
import random
import asyncio
from typing import List

import httpx

from app.core.plugin_system import BaseCrawlerPlugin


class BaseHTTPPlugin(BaseCrawlerPlugin):
    """Base class for HTTP-based crawler plugins.

    Provides rotating User-Agent request headers, a retrying single-URL
    fetch with UTF-8/GBK charset fallback, and a concurrency-limited
    multi-URL fetch.
    """

    def __init__(self):
        super().__init__()
        # Pool of desktop-browser User-Agent strings; one is chosen at random
        # per request (see get_headers) to reduce naive bot blocking.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
        ]
        # URL list and cursor for subclasses to populate/use.
        self.urls: List[str] = []
        self.current_url: str = ""

    def get_headers(self) -> dict:
        """Return browser-like request headers with a randomized User-Agent."""
        return {
            "User-Agent": random.choice(self.user_agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
        }

    async def fetch(self, url: str, timeout: float = 15.0, retries: int = 2) -> str:
        """Fetch the HTML content of *url* asynchronously.

        Makes up to *retries* attempts (the name denotes total attempts),
        sleeping a random 1-3 s between attempts. Follows redirects.

        Returns the decoded body on HTTP 200, or "" if every attempt fails
        or returns a non-200 status.
        """
        from app.core.log import logger

        headers = self.get_headers()
        # Disable transport-level retries: retry logic lives here so each
        # attempt can be logged and separated by a randomized backoff.
        transport = httpx.AsyncHTTPTransport(retries=0)
        # One client for all attempts — reuses the connection across retries
        # instead of rebuilding a client per attempt.
        async with httpx.AsyncClient(
            headers=headers, transport=transport, follow_redirects=True
        ) as client:
            for attempt in range(retries):
                try:
                    response = await client.get(url, timeout=timeout)
                    if response.status_code == 200:
                        return self._decode_body(response)
                    logger.warning(f"Fetch {url} returned status {response.status_code}")
                except Exception as e:
                    logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
                if attempt < retries - 1:
                    await asyncio.sleep(random.uniform(1, 3))
        return ""

    @staticmethod
    def _decode_body(response: "httpx.Response") -> str:
        """Decode a response body, preferring UTF-8 with a GBK fallback."""
        content = response.content
        encoding = response.encoding
        # Case-insensitive compare: servers may advertise "UTF-8"/"utf8" in
        # the Content-Type header, which should still take the UTF-8 path.
        if not encoding or encoding.lower() in ("utf-8", "utf8"):
            try:
                return content.decode("utf-8")
            except UnicodeDecodeError:
                # Chinese sites commonly serve GBK content mislabeled (or
                # unlabeled) as UTF-8; fall back rather than fail.
                return content.decode("gbk", errors="ignore")
        return content.decode(encoding, errors="ignore")

    async def fetch_all(self, urls: List[str], timeout: float = 15.0) -> List[str]:
        """Concurrently fetch multiple URLs, capping in-plugin concurrency at 3.

        Returns the bodies in the same order as *urls* ("" entries for
        failed fetches, per fetch()).
        """
        semaphore = asyncio.Semaphore(3)

        async def _fetch_limited(url: str) -> str:
            async with semaphore:
                return await self.fetch(url, timeout=timeout)

        tasks = [_fetch_limited(url) for url in urls]
        return await asyncio.gather(*tasks)