ProxyPool/core/crawler.py

import aiohttp
import asyncio
import random

from core.log import logger


class BaseCrawler:
    def __init__(self):
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1.2 Mobile/15E148 Safari/604.1"
        ]

    def get_headers(self):
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
        }
    async def fetch(self, url, method='GET', params=None, data=None, proxies=None, timeout=10, retry_count=3):
        """Fetch a URL asynchronously, with retries and encoding fallback."""
        # Reuse the rotating User-Agent headers defined above
        async with aiohttp.ClientSession(headers=self.get_headers()) as session:
            for i in range(retry_count):
                try:
                    # Note: aiohttp's proxy format differs from requests';
                    # it is typically a single URL such as http://user:pass@host:port
                    async with session.request(
                        method=method,
                        url=url,
                        params=params,
                        data=data,
                        proxy=proxies,
                        timeout=aiohttp.ClientTimeout(total=timeout)
                    ) as response:
                        if response.status == 200:
                            # Read the raw bytes first, then handle the encoding
                            content = await response.read()
                            # Try the encoding reported by the response
                            encoding = response.get_encoding()
                            if encoding == 'utf-8' or not encoding:
                                try:
                                    return content.decode('utf-8')
                                except UnicodeDecodeError:
                                    # Fall back to GBK, which is common on Chinese sites
                                    return content.decode('gbk', errors='ignore')
                            return content.decode(encoding, errors='ignore')
                        else:
                            logger.warning(f"Request failed [{response.status}]: {url}, retry {i + 1}...")
                except Exception as e:
                    logger.error(f"Request error: {url}, error: {e}, retry {i + 1}...")
                await asyncio.sleep(random.uniform(1, 3))
        return None

class BasePlugin(BaseCrawler):
    def __init__(self):
        super().__init__()
        self.name = "BasePlugin"
        self.urls = []
        self.enabled = True

    async def parse(self, html):
        """Parse the page content; implement in subclasses as an async generator that yields proxies."""
        raise NotImplementedError("Please implement parse method")

    async def run(self):
        """Run the plugin: fetch every URL and collect the proxies parsed from each page."""
        logger.info(f"Running plugin: {self.name}")
        results = []
        for url in self.urls:
            self.current_url = url  # Record the URL currently being fetched, for use by parse
            html = await self.fetch(url)
            if html:
                async for proxy in self.parse(html):
                    results.append(proxy)
            await asyncio.sleep(random.uniform(1, 2))
        return results
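
A minimal usage sketch (not part of the repository): the plugin name, the URL, and the assumed one-"ip:port"-per-line page format below are hypothetical, chosen only to illustrate that parse must be an async generator, since run() consumes it with async for.

# example_plugin.py -- illustrative sketch, not shipped with ProxyPool
import asyncio

from core.crawler import BasePlugin


class ExamplePlugin(BasePlugin):
    def __init__(self):
        super().__init__()
        self.name = "ExamplePlugin"
        # Hypothetical source that serves one "ip:port" pair per line
        self.urls = ["http://example.com/proxies.txt"]

    async def parse(self, html):
        # An async generator: yield each proxy instead of returning a list
        for line in html.splitlines():
            line = line.strip()
            if line:
                yield f"http://{line}"


if __name__ == "__main__":
    proxies = asyncio.run(ExamplePlugin().run())
    print(f"Collected {len(proxies)} proxies")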