first commit
This commit is contained in:
86
core/crawler.py
Normal file
86
core/crawler.py
Normal file
@@ -0,0 +1,86 @@
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import random
|
||||
from core.log import logger
|
||||
|
||||
class BaseCrawler:
    """Async HTTP crawler base class.

    Provides a pool of User-Agent strings, header construction with UA
    rotation, and an async ``fetch`` with retries, proxy support and
    encoding fallback for Chinese sites.
    """

    def __init__(self):
        # Pool of desktop/mobile User-Agent strings; get_headers() picks one
        # at random per request to reduce the chance of being blocked.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1.2 Mobile/15E148 Safari/604.1"
        ]

    def get_headers(self):
        """Return browser-like request headers with a randomly chosen User-Agent."""
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
        }

    async def fetch(self, url, method='GET', params=None, data=None, proxies=None, timeout=10, retry_count=3):
        """Fetch *url* asynchronously and return the decoded body text.

        Args:
            url: Target URL.
            method: HTTP method, defaults to 'GET'.
            params: Optional query-string parameters.
            data: Optional request body.
            proxies: Optional proxy URL. Note: aiohttp's proxy format differs
                from requests' — typically ``http://user:pass@host:port``.
            timeout: Total request timeout in seconds.
            retry_count: Number of attempts before giving up.

        Returns:
            The decoded response text on HTTP 200, or None after all
            retries are exhausted.
        """
        # FIX: use the rotating headers from get_headers() instead of a
        # hard-coded single User-Agent, so the UA pool in __init__ is
        # actually exercised.
        async with aiohttp.ClientSession(headers=self.get_headers()) as session:
            for attempt in range(retry_count):
                try:
                    async with session.request(
                        method=method,
                        url=url,
                        params=params,
                        data=data,
                        proxy=proxies,
                        timeout=aiohttp.ClientTimeout(total=timeout)
                    ) as response:
                        if response.status == 200:
                            # Read raw bytes first, then handle decoding ourselves.
                            content = await response.read()
                            encoding = response.get_encoding()
                            if encoding == 'utf-8' or not encoding:
                                try:
                                    return content.decode('utf-8')
                                except UnicodeDecodeError:
                                    # Fall back to GBK, common on Chinese sites.
                                    return content.decode('gbk', errors='ignore')
                            return content.decode(encoding, errors='ignore')
                        logger.warning(f"请求失败 [{response.status}]: {url}, 正在进行第 {attempt+1} 次重试...")
                except Exception as e:
                    logger.error(f"请求异常: {url}, 错误: {e}, 正在进行第 {attempt+1} 次重试...")
                # FIX: random backoff only BETWEEN retries — the original also
                # slept 1-3s after the final failed attempt for no benefit.
                if attempt < retry_count - 1:
                    await asyncio.sleep(random.uniform(1, 3))
        return None
|
||||
|
||||
class BasePlugin(BaseCrawler):
    """Base class for crawler plugins.

    A plugin declares a list of source URLs and an async-generator
    ``parse``; ``run`` fetches each URL and collects everything parse
    yields.
    """

    def __init__(self):
        super().__init__()
        self.enabled = True          # plugins can be switched off individually
        self.urls = []               # source URLs to crawl, filled by subclasses
        self.name = "BasePlugin"     # display name used in log output

    async def parse(self, html):
        """Async-parse page content; must be overridden by subclasses."""
        raise NotImplementedError("Please implement parse method")

    async def run(self):
        """Run the plugin: fetch every URL, parse it, and return all results."""
        logger.info(f"正在运行插件: {self.name}")
        collected = []
        for target in self.urls:
            # Expose the URL currently being crawled so parse() can read it.
            self.current_url = target
            page = await self.fetch(target)
            if page:
                async for item in self.parse(page):
                    collected.append(item)
            # Polite random pause between sources.
            await asyncio.sleep(random.uniform(1, 2))
        return collected
|
||||
Reference in New Issue
Block a user