first commit
This commit is contained in:
86
core/crawler.py
Normal file
86
core/crawler.py
Normal file
@@ -0,0 +1,86 @@
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import random
|
||||
from core.log import logger
|
||||
|
||||
class BaseCrawler:
    """Async HTTP crawler base class.

    Provides a pool of User-Agent strings, header construction with UA
    rotation, and an async ``fetch`` with retries, proxy support and
    encoding fallback for Chinese sites.
    """

    def __init__(self):
        # Pool of desktop/mobile User-Agent strings; get_headers() picks one
        # at random per request to reduce the chance of being blocked.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1.2 Mobile/15E148 Safari/604.1"
        ]

    def get_headers(self):
        """Return browser-like request headers with a randomly chosen User-Agent."""
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
        }

    async def fetch(self, url, method='GET', params=None, data=None, proxies=None, timeout=10, retry_count=3):
        """Fetch *url* asynchronously and return the decoded body text.

        Args:
            url: Target URL.
            method: HTTP method, defaults to 'GET'.
            params: Optional query-string parameters.
            data: Optional request body.
            proxies: Optional proxy URL. Note: aiohttp's proxy format differs
                from requests' — typically ``http://user:pass@host:port``.
            timeout: Total request timeout in seconds.
            retry_count: Number of attempts before giving up.

        Returns:
            The decoded response text on HTTP 200, or None after all
            retries are exhausted.
        """
        # FIX: use the rotating headers from get_headers() instead of a
        # hard-coded single User-Agent, so the UA pool in __init__ is
        # actually exercised.
        async with aiohttp.ClientSession(headers=self.get_headers()) as session:
            for attempt in range(retry_count):
                try:
                    async with session.request(
                        method=method,
                        url=url,
                        params=params,
                        data=data,
                        proxy=proxies,
                        timeout=aiohttp.ClientTimeout(total=timeout)
                    ) as response:
                        if response.status == 200:
                            # Read raw bytes first, then handle decoding ourselves.
                            content = await response.read()
                            encoding = response.get_encoding()
                            if encoding == 'utf-8' or not encoding:
                                try:
                                    return content.decode('utf-8')
                                except UnicodeDecodeError:
                                    # Fall back to GBK, common on Chinese sites.
                                    return content.decode('gbk', errors='ignore')
                            return content.decode(encoding, errors='ignore')
                        logger.warning(f"请求失败 [{response.status}]: {url}, 正在进行第 {attempt+1} 次重试...")
                except Exception as e:
                    logger.error(f"请求异常: {url}, 错误: {e}, 正在进行第 {attempt+1} 次重试...")
                # FIX: random backoff only BETWEEN retries — the original also
                # slept 1-3s after the final failed attempt for no benefit.
                if attempt < retry_count - 1:
                    await asyncio.sleep(random.uniform(1, 3))
        return None
|
||||
|
||||
class BasePlugin(BaseCrawler):
    """Base class for crawler plugins.

    A plugin declares a list of source URLs and an async-generator
    ``parse``; ``run`` fetches each URL and collects everything parse
    yields.
    """

    def __init__(self):
        super().__init__()
        self.enabled = True          # plugins can be switched off individually
        self.urls = []               # source URLs to crawl, filled by subclasses
        self.name = "BasePlugin"     # display name used in log output

    async def parse(self, html):
        """Async-parse page content; must be overridden by subclasses."""
        raise NotImplementedError("Please implement parse method")

    async def run(self):
        """Run the plugin: fetch every URL, parse it, and return all results."""
        logger.info(f"正在运行插件: {self.name}")
        collected = []
        for target in self.urls:
            # Expose the URL currently being crawled so parse() can read it.
            self.current_url = target
            page = await self.fetch(target)
            if page:
                async for item in self.parse(page):
                    collected.append(item)
            # Polite random pause between sources.
            await asyncio.sleep(random.uniform(1, 2))
        return collected
|
||||
Reference in New Issue
Block a user