import asyncio
import random
import re
from typing import List

from bs4 import BeautifulSoup

from app.core.log import logger
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin

# Protocols the site's protocol column may report; anything else falls back to "http".
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")

# Dotted-quad shape check, compiled once at import time instead of per table row.
# Octet range (0-255) is verified separately in _is_valid_endpoint.
_IP_RE = re.compile(r"^\d{1,3}(?:\.\d{1,3}){3}$")


class KuaiDaiLiPlugin(BaseHTTPPlugin):
    """Crawl free proxy listings from kuaidaili.com (快代理)."""

    default_config = {"max_pages": 5}
    name = "kuaidaili"
    display_name = "快代理"
    description = "从快代理网站爬取免费代理"

    def __init__(self):
        super().__init__()
        # 减少页数,降低被反爬概率,确保至少能拿到数据
        # (Fewer pages lowers the chance of tripping anti-scraping defenses.)
        self.urls = [
            "https://www.kuaidaili.com/free/inha/1/",
            "https://www.kuaidaili.com/free/intr/1/",
        ]

    def get_headers(self) -> dict:
        """Return browser-like request headers so fetches resemble a normal page visit.

        Extends the base plugin's headers with a Referer and the Sec-Fetch /
        Accept fields a real Chrome navigation would send.
        """
        headers = super().get_headers()
        headers["Referer"] = "https://www.kuaidaili.com/free/inha/"
        headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
        headers["Accept-Encoding"] = "gzip, deflate, br"
        headers["Accept-Language"] = "zh-CN,zh;q=0.9,en;q=0.8"
        headers["Sec-Fetch-Dest"] = "document"
        headers["Sec-Fetch-Mode"] = "navigate"
        headers["Sec-Fetch-Site"] = "same-origin"
        headers["Upgrade-Insecure-Requests"] = "1"
        return headers

    @staticmethod
    def _is_valid_endpoint(ip: str, port: str) -> bool:
        """Return True when *ip* is a well-formed IPv4 address and *port* is in 1-65535.

        Tightens the original check: the old regex accepted octets > 255 and
        ``port.isdigit()`` alone accepted 0 and values above 65535.
        """
        if not _IP_RE.match(ip):
            return False
        if any(int(octet) > 255 for octet in ip.split(".")):
            return False
        return port.isdigit() and 0 < int(port) <= 65535

    async def crawl(self) -> List[ProxyRaw]:
        """Fetch the free-proxy pages and parse their tables into ProxyRaw entries.

        Returns a (possibly empty) list; pages that fail to fetch or that come
        back without a <table> (likely an anti-bot page) are skipped with a log.
        """
        results: List[ProxyRaw] = []
        # 先访问首页预热会话,获取 cookie,降低被反爬概率
        # (Warm up the session on the homepage to pick up cookies first.)
        await self.fetch("https://www.kuaidaili.com/", timeout=10)
        await asyncio.sleep(random.uniform(2, 4))
        # 顺序请求免费代理页面 — pages are fetched sequentially with a random
        # pause *between* requests to mimic human browsing. The pause is taken
        # before every page except the first, so there is no wasted sleep after
        # the final page (the original slept once more after the last URL).
        for index, url in enumerate(self.urls):
            if index:
                await asyncio.sleep(random.uniform(5, 8))
            html = await self.fetch(url, timeout=10)
            if not html:
                continue
            soup = BeautifulSoup(html, "lxml")
            table = soup.find("table")
            if not table:
                logger.warning(f"{self.display_name} 未能找到表格,可能是触发了反爬: {url}")
                continue
            for row in table.find_all("tr"):
                tds = row.find_all("td")
                if len(tds) < 5:
                    # Header rows (<th> only) and malformed rows produce < 5 cells.
                    continue
                ip = tds[0].get_text(strip=True)
                port = tds[1].get_text(strip=True)
                # len(tds) >= 5 is already guaranteed here, so tds[4] is safe;
                # the original carried a redundant `if len(tds) > 4` guard.
                protocol = tds[4].get_text(strip=True).lower()
                if protocol not in VALID_PROTOCOLS:
                    protocol = "http"
                if self._is_valid_endpoint(ip, port):
                    results.append(ProxyRaw(ip, int(port), protocol))
        if results:
            logger.info(f"{self.display_name} 解析完成,获取 {len(results)} 个潜在代理")
        else:
            # An empty harvest usually means every page hit the anti-bot wall.
            logger.warning(f"{self.display_name} 未解析到任何代理: {self.urls}")
        return results