Backend refactoring:
- Introduce a layered architecture: API Routes -> Services -> Repositories -> Infrastructure
- Remove all global singletons and adopt FastAPI dependency injection throughout (see the wiring sketch below)
- Add an api/ directory that splits the routes apart (proxies, plugins, scheduler, settings, stats)
- Add a services/ business-logic layer: ProxyService, PluginService, SchedulerService, ValidatorService, SettingsService
- Add a repositories/ data-access layer: ProxyRepository, SettingsRepository, PluginSettingsRepository
- Add a models/ layer: Pydantic schemas + domain models
- Rewrite core/config.py to manage configuration with Pydantic Settings (see the settings sketch below)
- Add core/db.py: asynccontextmanager-based connection management, with database-migration support
- Add core/exceptions.py: a unified hierarchy of business exceptions

Plugin system refactoring (the core change):
- Add core/plugin_system/: BaseCrawlerPlugin + PluginRegistry
- Adopt an explicit registration pattern (decorator + plugins/__init__.py) that is type-safe and test-friendly (see the registry sketch below)
- Add plugins/base.py: BaseHTTPPlugin, a generic base class for HTTP crawlers
- Migrate all 7 plugins to the new architecture (fate0, proxylist_download, ip3366, ip89, kuaidaili, speedx, yundaili)
- Persist plugin state to the plugin_settings table

Task scheduling refactoring:
- Add core/tasks/queue.py: ValidationQueue + WorkerPool (see the queue sketch below)
- Decouple crawling from validation: crawlers only crawl; proxies are submitted to a queue and validated asynchronously by workers
- The scheduler periodically pulls existing proxies from the database and feeds them into the validation queue in batches

Frontend adjustments:
- Add a frontend/src/services/ layer that splits out the API-call logic
- Update stores/ and views/ to go through the service layer
- The API stays compatible, so pages need no major changes

Misc:
- Add main.py as the new entry point
- Add DESIGN.md, an architecture design document
- Update requirements.txt to add pydantic-settings

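A minimal sketch of how the Routes -> Services -> Repositories wiring can look with FastAPI's Depends. The stub class bodies and the get_* factory names here are illustrative, not the actual implementation:

# Illustrative sketch only -- class bodies and factory names are made up.
from fastapi import APIRouter, Depends

class ProxyRepository:
    """Data-access layer: would wrap the shared connection from core/db.py."""
    async def list_all(self) -> list:
        return []

class ProxyService:
    """Business-logic layer: depends on the repository, not on the DB."""
    def __init__(self, repo: ProxyRepository) -> None:
        self.repo = repo

    async def list_proxies(self) -> list:
        return await self.repo.list_all()

router = APIRouter(prefix="/api/proxies")

def get_proxy_repository() -> ProxyRepository:
    return ProxyRepository()

def get_proxy_service(
    repo: ProxyRepository = Depends(get_proxy_repository),
) -> ProxyService:
    # No global singleton: each request resolves its own service graph.
    return ProxyService(repo)

@router.get("/")
async def list_proxies(service: ProxyService = Depends(get_proxy_service)):
    return await service.list_proxies()
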
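A minimal sketch of a pydantic-settings based core/config.py; the field names and the env prefix are assumptions for illustration:

# Illustrative sketch only -- field names and env prefix are made up.
from functools import lru_cache

from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_prefix="PROXYPOOL_", env_file=".env")

    db_path: str = "proxies.db"
    validation_workers: int = 20
    validation_timeout: float = 10.0

@lru_cache
def get_settings() -> Settings:
    # Cached factory instead of a module-level singleton, so the settings
    # object can be injected (and overridden in tests) via Depends.
    return Settings()
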
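The explicit registration pattern could look roughly like this; the real BaseCrawlerPlugin and PluginRegistry in core/plugin_system/ may differ in detail:

# Illustrative sketch only -- the real core/plugin_system/ may differ.
from typing import Dict, List, Type

class BaseCrawlerPlugin:
    name: str = ""

    async def crawl(self) -> List[str]:
        raise NotImplementedError

class PluginRegistry:
    def __init__(self) -> None:
        self._plugins: Dict[str, Type[BaseCrawlerPlugin]] = {}

    def register(self, cls: Type[BaseCrawlerPlugin]) -> Type[BaseCrawlerPlugin]:
        # Used as a decorator; registration is explicit code, so type
        # checkers see it and tests can build registries of fake plugins.
        self._plugins[cls.name] = cls
        return cls

    def create_all(self) -> List[BaseCrawlerPlugin]:
        return [cls() for cls in self._plugins.values()]

registry = PluginRegistry()

@registry.register
class Fate0Plugin(BaseCrawlerPlugin):
    name = "fate0"

    async def crawl(self) -> List[str]:
        return []

Importing the plugin modules from plugins/__init__.py is what triggers the decorators, so the set of registered plugins is an explicit import list rather than a filesystem scan.
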
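A rough sketch of the crawl/validation decoupling described above, assuming illustrative names and signatures for ValidationQueue and WorkerPool:

# Illustrative sketch only -- names and signatures are made up.
import asyncio
from typing import Awaitable, Callable, List

class ValidationQueue:
    """Crawlers submit proxies here; workers pull them for validation."""

    def __init__(self, maxsize: int = 10_000) -> None:
        self._queue: "asyncio.Queue[str]" = asyncio.Queue(maxsize=maxsize)

    async def submit(self, proxy: str) -> None:
        await self._queue.put(proxy)

    async def get(self) -> str:
        return await self._queue.get()

    def task_done(self) -> None:
        self._queue.task_done()

class WorkerPool:
    """Runs N coroutine workers that drain the validation queue."""

    def __init__(
        self,
        queue: ValidationQueue,
        validate: Callable[[str], Awaitable[bool]],
        size: int = 20,
    ) -> None:
        self.queue = queue
        self.validate = validate
        self.size = size
        self._tasks: List[asyncio.Task] = []

    async def _worker(self) -> None:
        while True:
            proxy = await self.queue.get()
            try:
                await self.validate(proxy)  # real code would update scores here
            finally:
                self.queue.task_done()

    def start(self) -> None:
        self._tasks = [
            asyncio.create_task(self._worker()) for _ in range(self.size)
        ]
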
plugins/base.py · 53 lines · 2.4 KiB · Python

"""通用 HTTP 爬虫基类 - 为基于 HTTP 请求的插件提供封装"""
|
|
import random
|
|
import asyncio
|
|
import aiohttp
|
|
from typing import List
|
|
from core.plugin_system import BaseCrawlerPlugin
|
|
|
|
|
|
class BaseHTTPPlugin(BaseCrawlerPlugin):
    """Base class for HTTP-based crawler plugins."""

    def __init__(self):
        super().__init__()
        # A small pool of desktop User-Agents; one is picked at random per
        # request to reduce the chance of being blocked.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
        ]
        self.urls: List[str] = []
        self.current_url: str = ""

    def get_headers(self) -> dict:
        """Build request headers with a randomly chosen User-Agent."""
        return {
            "User-Agent": random.choice(self.user_agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
        }

    async def fetch(self, url: str, timeout: float = 10.0, retries: int = 3) -> str:
        """Asynchronously fetch the HTML content of the given URL.

        Returns an empty string once all retry attempts are exhausted.
        """
        headers = self.get_headers()
        async with aiohttp.ClientSession(headers=headers) as session:
            for attempt in range(retries):
                try:
                    async with session.get(
                        url, timeout=aiohttp.ClientTimeout(total=timeout)
                    ) as response:
                        if response.status == 200:
                            content = await response.read()
                            encoding = response.get_encoding()
                            if encoding == "utf-8" or not encoding:
                                try:
                                    return content.decode("utf-8")
                                except UnicodeDecodeError:
                                    # Some Chinese proxy sites mislabel GBK
                                    # pages as UTF-8; fall back to GBK.
                                    return content.decode("gbk", errors="ignore")
                            return content.decode(encoding, errors="ignore")
                except Exception:
                    # Swallow network/timeout errors and retry.
                    pass
                # Random back-off between attempts; skip it after the last one.
                if attempt < retries - 1:
                    await asyncio.sleep(random.uniform(1, 3))
        return ""
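
For reference, a concrete plugin built on this base class might look like the hypothetical example below; the repository's actual plugins (fate0, ip3366, and the rest) differ in their URLs and per-site parsing:

# Hypothetical usage example -- not one of the repository's real plugins.
import re
from typing import List

class ExamplePlugin(BaseHTTPPlugin):
    name = "example"

    def __init__(self):
        super().__init__()
        self.urls = ["https://example.com/free-proxy-list"]

    async def crawl(self) -> List[str]:
        proxies: List[str] = []
        for url in self.urls:
            self.current_url = url
            html = await self.fetch(url)
            # Naive ip:port extraction; real plugins parse each site's markup.
            proxies.extend(re.findall(r"\d{1,3}(?:\.\d{1,3}){3}:\d{2,5}", html))
        return proxies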