import re
from typing import List

from bs4 import BeautifulSoup

from app.core.log import logger
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin

# Protocols accepted as-is; anything else is normalized to "http".
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")


class Ip3366Plugin(BaseHTTPPlugin):
    name = "ip3366"
    display_name = "IP3366"
    description = "Crawls free proxies from the IP3366 website"
    default_config = {"max_pages": 3}

    def __init__(self):
        super().__init__()
        self._update_urls()

    def _update_urls(self):
        # Build the page URLs for both list types (stype=1 and stype=2),
        # up to the configured number of pages each.
        max_pages = self.config.get("max_pages", 3)
        self.urls = [
            f"http://www.ip3366.net/free/?stype=1&page={i}"
            for i in range(1, max_pages + 1)
        ] + [
            f"http://www.ip3366.net/free/?stype=2&page={i}"
            for i in range(1, max_pages + 1)
        ]

    def get_headers(self) -> dict:
        headers = super().get_headers()
        headers["Referer"] = "http://www.ip3366.net/free/"
        return headers

    async def crawl(self) -> List[ProxyRaw]:
        results = []
        htmls = await self.fetch_all(self.urls)
        for html in htmls:
            if not html:
                continue
            soup = BeautifulSoup(html, "lxml")
            # Proxy entries live in a table inside <div id="list">.
            list_div = soup.find("div", id="list")
            if not list_div:
                continue
            table = list_div.find("table")
            if not table:
                continue
            for row in table.find_all("tr"):
                tds = row.find_all("td")
                if len(tds) < 5:
                    continue  # header row or malformed row
                ip = tds[0].get_text(strip=True)
                port = tds[1].get_text(strip=True)
                protocol = tds[4].get_text(strip=True).lower()
                if protocol not in VALID_PROTOCOLS:
                    protocol = "http"
                # Keep only rows with a plausible IPv4 address and numeric port.
                if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit():
                    results.append(ProxyRaw(ip, int(port), protocol))
        if results:
            logger.info(
                f"{self.display_name} parsing finished, found {len(results)} candidate proxies"
            )
        return results
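

# Usage sketch: one way to exercise this plugin standalone. This is an
# illustrative assumption, not part of the plugin system; it presumes that
# Ip3366Plugin() takes no constructor arguments and that BaseHTTPPlugin
# supplies the async fetch_all() that crawl() relies on.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        plugin = Ip3366Plugin()
        proxies = await plugin.crawl()
        print(f"Collected {len(proxies)} candidate proxies")

    asyncio.run(_demo())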