"""Generic HTTP crawler base class - request/response plumbing for HTTP-based plugins."""

import random
import asyncio
import aiohttp
from typing import List

from core.plugin_system import BaseCrawlerPlugin


class BaseHTTPPlugin(BaseCrawlerPlugin):
    """Base class for crawler plugins that fetch pages over plain HTTP.

    Provides a rotating User-Agent pool, browser-like default headers, and a
    retrying async ``fetch`` helper with GBK fallback decoding (common on
    Chinese sites that serve GBK without declaring a charset).
    """

    def __init__(self):
        super().__init__()
        # Rotating desktop User-Agent pool; one is picked at random per request
        # to reduce the chance of naive bot blocking.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
        ]
        # URLs this plugin will crawl; populated by subclasses — TODO confirm against callers.
        self.urls: List[str] = []
        # URL currently being processed (empty until a crawl starts).
        self.current_url: str = ""

    def get_headers(self) -> dict:
        """Build browser-like request headers with a randomly chosen User-Agent.

        Returns:
            dict: Header mapping suitable for ``aiohttp.ClientSession``.
        """
        return {
            "User-Agent": random.choice(self.user_agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
        }

    @staticmethod
    def _decode(content: bytes, encoding: str) -> str:
        """Decode a response body, falling back to GBK for mislabeled UTF-8 pages.

        Args:
            content: Raw response bytes.
            encoding: Charset reported by the server (may be empty/None).

        Returns:
            str: Decoded text; undecodable bytes are dropped rather than raising.
        """
        if encoding == "utf-8" or not encoding:
            try:
                return content.decode("utf-8")
            except UnicodeDecodeError:
                # Many sites serve GBK content while claiming (or omitting) UTF-8.
                return content.decode("gbk", errors="ignore")
        return content.decode(encoding, errors="ignore")

    async def fetch(self, url: str, timeout: float = 10.0, retries: int = 3) -> str:
        """Asynchronously fetch the HTML content of *url*, retrying on failure.

        Args:
            url: Target URL.
            timeout: Total per-request timeout in seconds.
            retries: Maximum number of attempts.

        Returns:
            str: Decoded page body on success, or ``""`` if every attempt
            fails or returns a non-200 status (best-effort contract).
        """
        headers = self.get_headers()
        async with aiohttp.ClientSession(headers=headers) as session:
            for attempt in range(retries):
                try:
                    async with session.get(
                        url, timeout=aiohttp.ClientTimeout(total=timeout)
                    ) as response:
                        if response.status == 200:
                            content = await response.read()
                            return self._decode(content, response.get_encoding())
                        # Non-200 status: treat as a failed attempt and retry.
                except (aiohttp.ClientError, asyncio.TimeoutError):
                    # Transient network/timeout errors are expected here and are
                    # the only ones retrying can fix; anything else propagates
                    # instead of being silently swallowed.
                    pass
                if attempt < retries - 1:
                    # Jittered backoff between attempts only — the original code
                    # also slept 1-3 s after the final failure for no benefit.
                    await asyncio.sleep(random.uniform(1, 3))
        return ""