ProxyPool/app/plugins/fpw_freeproxylists.py
祀梦 957cee3100 fix(crawl): throttle concurrent CrawlJobs and relax fpw/proxyscrape HTTP
- CrawlJob now acquires crawl_slot before the JobExecutor semaphore, so a queued crawl-all cannot fill executor slots while waiting (see the sketch after this commit message)
- BaseHTTPPlugin: longer connect budget for slow international links
- proxyscrape: jsDelivr mirror + longer GitHub/API phases
- fpw_*: higher timeouts/retries; lower internal concurrency on heavy multi-URL plugins

Made-with: Cursor
2026-04-05 13:48:41 +08:00
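
The first bullet describes an ordering constraint between two semaphores. Below is a minimal sketch of that ordering using plain asyncio primitives; CRAWL_SLOTS, EXECUTOR_SLOTS, and run_plugin are illustrative names for this sketch, not the repo's actual API.

import asyncio

CRAWL_SLOTS = asyncio.Semaphore(2)     # cap on concurrent crawl jobs (illustrative)
EXECUTOR_SLOTS = asyncio.Semaphore(8)  # shared JobExecutor capacity (illustrative)

async def run_crawl_job(run_plugin):
    # Acquire the crawl slot first, so a queued crawl-all batch waits here
    # instead of occupying general executor slots while still queued.
    async with CRAWL_SLOTS:
        async with EXECUTOR_SLOTS:
            return await run_plugin()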


"""freeproxylists.net 及常见镜像路径(表格 / 纯文本)。"""
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwFreeproxylistsPlugin(BaseHTTPPlugin):
name = "fpw_freeproxylists"
display_name = "FreeProxyLists"
description = "freeproxylists.net 系列页面(易被 403多 URL 尝试)"
def __init__(self):
super().__init__()
self.urls = [
"http://www.freeproxylists.net/",
"http://freeproxylists.net/",
"http://www.freeproxylists.net/en/http-txt.html",
]
def _parse_any(self, html: str) -> List[ProxyRaw]:
ipport = re.findall(
r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})\b",
html,
)
if len(ipport) >= 5:
out: List[ProxyRaw] = []
for ip, ps in ipport:
if ps.isdigit() and 1 <= int(ps) <= 65535:
try:
out.append(ProxyRaw(ip, int(ps), "http"))
except ValueError:
pass
return out
soup = BeautifulSoup(html, "lxml")
results: List[ProxyRaw] = []
for tr in soup.find_all("tr"):
tds = tr.find_all("td")
if len(tds) < 2:
continue
ip = tds[0].get_text(strip=True)
port = tds[1].get_text(strip=True)
if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip) and port.isdigit():
if 1 <= int(port) <= 65535:
try:
results.append(ProxyRaw(ip, int(port), "http"))
except ValueError:
pass
return results
async def crawl(self) -> List[ProxyRaw]:
seen = set()
out: List[ProxyRaw] = []
htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
for url, html in zip(self.urls, htmls):
if not html:
continue
for p in self._parse_any(html):
key = (p.ip, p.port, p.protocol)
if key not in seen:
seen.add(key)
out.append(p)
if out:
logger.info(f"{self.display_name}{url} 累计 {len(out)}")
return out
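
For a quick manual check, the plugin can be driven directly with asyncio. This is a hypothetical standalone run; in the real system crawl() is presumably scheduled by the plugin/executor machinery rather than called by hand.

import asyncio

async def main():
    plugin = FpwFreeproxylistsPlugin()
    proxies = await plugin.crawl()  # returns a de-duplicated List[ProxyRaw]
    print(f"fetched {len(proxies)} unique proxies")

if __name__ == "__main__":
    asyncio.run(main())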