# ProxyPool/app/plugins/ip3366.py

import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger

# Protocol names accepted from the scraped listing; the crawler replaces any
# other scraped value with "http".
VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")


class Ip3366Plugin(BaseHTTPPlugin):
    """Plugin that scrapes free proxy listings from www.ip3366.net.

    The listing is paginated and split into two sections (``stype=1`` and
    ``stype=2``). For each section the first ``max_pages`` pages are fetched
    and every table row that looks like a proxy is turned into a
    ``ProxyRaw``.

    Config keys:
        max_pages: number of pages to fetch per section (default 5).
    """

    name = "ip3366"
    display_name = "IP3366"
    description = "从 IP3366 网站爬取免费代理"
    default_config = {"max_pages": 5}

    # Dotted-quad matcher, compiled once. re.ASCII keeps ``\d`` from matching
    # non-ASCII digit characters that int() might not accept.
    _IPV4_RE = re.compile(r"(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})", re.ASCII)

    def __init__(self):
        super().__init__()
        self._update_urls()

    def _update_urls(self):
        """Rebuild ``self.urls`` from the current ``max_pages`` setting.

        A ``max_pages`` value that cannot be converted to an integer falls
        back to 5 instead of raising while the plugin is being constructed.
        """
        try:
            max_pages = int(self.config.get("max_pages", 5))
        except (TypeError, ValueError):
            max_pages = 5
        # Order matters for callers that rely on it: every stype=1 page
        # first, then every stype=2 page.
        self.urls = [
            f"http://www.ip3366.net/free/?stype={stype}&page={page}"
            for stype in (1, 2)
            for page in range(1, max_pages + 1)
        ]

    @classmethod
    def _valid_ipv4(cls, text) -> bool:
        """Return True when *text* is a dotted quad with every octet <= 255."""
        match = cls._IPV4_RE.fullmatch(text)
        return bool(match) and all(int(octet) <= 255 for octet in match.groups())

    @staticmethod
    def _parse_port(text):
        """Return *text* as a port number in 1..65535, or None if invalid.

        ``str.isdigit`` alone is not a safe guard for ``int()``: it is also
        true for characters such as superscript digits, which ``int()``
        rejects with ValueError. Requiring ASCII avoids that crash.
        """
        if not (text.isascii() and text.isdigit()) or len(text) > 5:
            return None
        port = int(text)
        return port if 1 <= port <= 65535 else None

    def _parse_listing(self, html) -> List[ProxyRaw]:
        """Extract proxies from one listing page.

        Looks for a ``<table>`` inside ``<div id="list">`` and reads the IP
        from the first cell, the port from the second and the protocol from
        the fifth cell of every row with at least five ``<td>`` cells.
        Returns an empty list when the expected markup is missing.

        NOTE(review): the protocol is read from cell index 4; confirm
        against the live page that this is the protocol column rather than
        a neighbouring one, since unrecognised values silently become "http".
        """
        soup = BeautifulSoup(html, "lxml")
        list_div = soup.find("div", id="list")
        table = list_div.find("table") if list_div else None
        if not table:
            return []

        proxies = []
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 5:
                # Header rows (<th>) and malformed rows carry no proxy.
                continue
            ip = cells[0].get_text(strip=True)
            port = self._parse_port(cells[1].get_text(strip=True))
            if port is None or not self._valid_ipv4(ip):
                continue
            protocol = cells[4].get_text(strip=True).lower()
            if protocol not in VALID_PROTOCOLS:
                protocol = "http"
            proxies.append(ProxyRaw(ip, port, protocol))
        return proxies

    async def crawl(self) -> List[ProxyRaw]:
        """Fetch every URL in ``self.urls`` in order and collect the proxies.

        Pages for which ``fetch`` returns a falsy value are skipped.

        Returns:
            All proxies parsed from the fetched pages, in page order;
            possibly empty.
        """
        results: List[ProxyRaw] = []
        for url in self.urls:
            html = await self.fetch(url, timeout=15)
            if not html:
                continue
            results.extend(self._parse_listing(html))

        if results:
            logger.info(f"{self.display_name} 解析完成，获得 {len(results)} 个潜在代理")
        return results