- CrawlJob waits on crawl_slot before the JobExecutor semaphore, so crawl-all does not fill slots while queued (see the sketch below)
- BaseHTTPPlugin: longer connect budget for slow international links
- proxyscrape: jsDelivr mirror + longer GitHub/API phases
- fpw_*: higher timeouts/retries; lower internal concurrency on heavy multi-URL plugins

Made-with: Cursor
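A minimal sketch of that acquisition order, assuming both limits are plain asyncio.Semaphore objects; crawl_slot, executor_sem, and run_crawl_job are illustrative names, not the project's actual API:

import asyncio

# Illustrative caps only; the real CrawlJob/JobExecutor wiring may differ.
crawl_slot = asyncio.Semaphore(2)    # cap on concurrent crawl jobs
executor_sem = asyncio.Semaphore(8)  # JobExecutor's overall concurrency cap

async def run_crawl_job(job):
    # Acquire the crawl slot first: a crawl-all job that is still waiting
    # for a crawl slot never occupies one of the executor's slots.
    async with crawl_slot:
        async with executor_sem:
            await job()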
70 lines · 2.4 KiB · Python
"""freeproxylists.net 及常见镜像路径(表格 / 纯文本)。"""
|
||
import re
|
||
from typing import List
|
||
|
||
from bs4 import BeautifulSoup
|
||
|
||
from app.core.plugin_system import ProxyRaw
|
||
from app.plugins.base import BaseHTTPPlugin
|
||
from app.core.log import logger
|
||
|
||
|
||
class FpwFreeproxylistsPlugin(BaseHTTPPlugin):
|
||
name = "fpw_freeproxylists"
|
||
display_name = "FreeProxyLists"
|
||
description = "freeproxylists.net 系列页面(易被 403,多 URL 尝试)"
|
||
|
||
def __init__(self):
|
||
super().__init__()
|
||
self.urls = [
|
||
"http://www.freeproxylists.net/",
|
||
"http://freeproxylists.net/",
|
||
"http://www.freeproxylists.net/en/http-txt.html",
|
||
]
|
||
|
||
def _parse_any(self, html: str) -> List[ProxyRaw]:
|
||
ipport = re.findall(
|
||
r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})\b",
|
||
html,
|
||
)
|
||
if len(ipport) >= 5:
|
||
out: List[ProxyRaw] = []
|
||
for ip, ps in ipport:
|
||
if ps.isdigit() and 1 <= int(ps) <= 65535:
|
||
try:
|
||
out.append(ProxyRaw(ip, int(ps), "http"))
|
||
except ValueError:
|
||
pass
|
||
return out
|
||
soup = BeautifulSoup(html, "lxml")
|
||
results: List[ProxyRaw] = []
|
||
for tr in soup.find_all("tr"):
|
||
tds = tr.find_all("td")
|
||
if len(tds) < 2:
|
||
continue
|
||
ip = tds[0].get_text(strip=True)
|
||
port = tds[1].get_text(strip=True)
|
||
if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip) and port.isdigit():
|
||
if 1 <= int(port) <= 65535:
|
||
try:
|
||
results.append(ProxyRaw(ip, int(port), "http"))
|
||
except ValueError:
|
||
pass
|
||
return results
|
||
|
||
async def crawl(self) -> List[ProxyRaw]:
|
||
seen = set()
|
||
out: List[ProxyRaw] = []
|
||
htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
|
||
for url, html in zip(self.urls, htmls):
|
||
if not html:
|
||
continue
|
||
for p in self._parse_any(html):
|
||
key = (p.ip, p.port, p.protocol)
|
||
if key not in seen:
|
||
seen.add(key)
|
||
out.append(p)
|
||
if out:
|
||
logger.info(f"{self.display_name} 自 {url} 累计 {len(out)} 条")
|
||
return out
|