import re
import asyncio
import random
from typing import List, Optional

from bs4 import BeautifulSoup

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger

VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")


class KuaiDaiLiPlugin(BaseHTTPPlugin):
    default_config = {"max_pages": 5}
    name = "kuaidaili"
    display_name = "快代理"
    description = "从快代理网站爬取免费代理"

    def __init__(self):
        super().__init__()
        # The fps/dps list pages still return 200; inha/intr often answer 567
        # (anti-bot), so they are kept only as a last-resort fallback.
        self.urls = [
            "https://www.kuaidaili.com/free/fps/",
            "https://www.kuaidaili.com/free/dps/",
            "https://www.kuaidaili.com/free/inha/1/",
            "https://www.kuaidaili.com/free/intr/1/",
        ]

    def get_headers(self, url: Optional[str] = None, **kwargs) -> dict:
        headers = super().get_headers(url=url, **kwargs)
        headers["Referer"] = "https://www.kuaidaili.com/free/"
        headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
        headers["Accept-Encoding"] = "gzip, deflate"
        headers["Accept-Language"] = "zh-CN,zh;q=0.9,en;q=0.8"
        headers["Sec-Fetch-Dest"] = "document"
        headers["Sec-Fetch-Mode"] = "navigate"
        headers["Sec-Fetch-Site"] = "same-origin"
        headers["Upgrade-Insecure-Requests"] = "1"
        return headers

    @staticmethod
    def _infer_protocol(texts: List[str]) -> str:
        """Infer the protocol from one row's cell texts (handles the fps / dps / inha layouts)."""
        for t in texts[2:]:
            tl = t.lower().replace(" ", "")
            if tl in VALID_PROTOCOLS:
                return tl
            if "http(s)" in tl or tl in ("http/https",):
                return "http"
            if "socks5" in tl:
                return "socks5"
            if "socks4" in tl:
                return "socks4"
        # Fallback: some layouts put the protocol in the fifth column.
        if len(texts) >= 5:
            t4 = texts[4].lower().strip()
            if t4 in VALID_PROTOCOLS:
                return t4
        return "http"

    def _parse_table(self, table) -> List[ProxyRaw]:
        out: List[ProxyRaw] = []
        for row in table.find_all("tr"):
            tds = row.find_all("td")
            if len(tds) < 2:
                continue
            texts = [td.get_text(strip=True) for td in tds]
            ip = texts[0]
            port_s = texts[1]
            if not re.match(r"^\d+\.\d+\.\d+\.\d+$", ip):
                continue
            if not port_s.isdigit() or not (1 <= int(port_s) <= 65535):
                continue
            protocol = self._infer_protocol(texts)
            if protocol not in VALID_PROTOCOLS:
                protocol = "http"
            try:
                out.append(ProxyRaw(ip, int(port_s), protocol))
            except ValueError:
                continue
        return out

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        # Warm up the session on the index page before hitting the list pages.
        await self.fetch("https://www.kuaidaili.com/free/", timeout=10)
        await asyncio.sleep(random.uniform(1, 2))
        for url in self.urls:
            html = await self.fetch(url, timeout=15)
            if not html:
                continue
            soup = BeautifulSoup(html, "lxml")
            table = soup.find("table")
            if not table:
                logger.warning(f"{self.display_name} 未能找到表格,可能是触发了反爬: {url}")
                continue
            batch = self._parse_table(table)
            if batch:
                results.extend(batch)
                logger.info(f"{self.display_name} {url} 解析 {len(batch)} 条")
            await asyncio.sleep(random.uniform(1, 2))
        if results:
            logger.info(f"{self.display_name} 解析完成,获取 {len(results)} 个潜在代理")
        return results
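

# A minimal manual smoke test, added for illustration. It assumes KuaiDaiLiPlugin can be
# instantiated standalone (i.e. BaseHTTPPlugin.__init__ takes no extra arguments) and that
# fetch() works outside the plugin manager; adjust or remove if the framework requires a
# different bootstrap. The invocation path depends on where this module lives in the package.
if __name__ == "__main__":
    async def _demo() -> None:
        proxies = await KuaiDaiLiPlugin().crawl()
        # Print a small sample of the parsed proxies for a quick sanity check.
        for p in proxies[:10]:
            print(p)

    asyncio.run(_demo())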