Files
ProxyPool/app/plugins/fpw_checkerproxy.py
祀梦 957cee3100 fix(crawl): throttle concurrent CrawlJobs and relax fpw/proxyscrape HTTP
- CrawlJob waits on crawl_slot before JobExecutor semaphore so crawl-all does not fill slots while queued
- BaseHTTPPlugin: longer connect budget for slow international links
- proxyscrape: jsDelivr mirror + longer GitHub/API phases
- fpw_*: higher timeouts/retries; lower internal concurrency on heavy multi-URL plugins

Made-with: Cursor
2026-04-05 13:48:41 +08:00

66 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""checkerproxy.net尝试常见导出路径 + 正文中的 ip:port排除示例占位"""
import re
from typing import List, Set, Tuple
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwCheckerproxyPlugin(BaseHTTPPlugin):
    """Crawler plugin for checkerproxy.net.

    The site exposes no stable public API, so this plugin tries a few
    common export paths and scrapes bare ``ip:port`` pairs out of the
    response bodies. Results may legitimately be empty.
    """

    name = "fpw_checkerproxy"
    display_name = "CheckerProxy.net"
    description = "checkerproxy.net无稳定公开 API 时可能为空;多路径尝试)"

    def __init__(self):
        super().__init__()
        # Candidate export endpoints, tried in order; all results merged.
        self.urls = [
            "https://checkerproxy.net/",
            "https://checkerproxy.net/export",
            "https://checkerproxy.net/api/export",
        ]

    @staticmethod
    def _parse_ip_ports(text: str) -> List[ProxyRaw]:
        """Extract unique ``ip:port`` pairs from *text* as HTTP proxies.

        Rejects known placeholder/loopback IPs, addresses with octets
        > 255 (the regex alone admits e.g. ``999.1.2.3``), and ports
        outside 1-65535; de-duplicates on ``(ip, port)``.
        """
        bad = {"123.123.123.123", "127.0.0.1", "0.0.0.0"}
        seen: Set[Tuple[str, int]] = set()
        out: List[ProxyRaw] = []
        for m in re.finditer(
            # \d{1,5} (was \d{2,5}): single-digit ports are valid and the
            # explicit range check below already covers them.
            r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})\b",
            text,
        ):
            ip, ps = m.group(1), m.group(2)
            if ip in bad:
                continue
            # The regex accepts octets up to 999; validate each one.
            if any(int(octet) > 255 for octet in ip.split(".")):
                continue
            port = int(ps)
            if not (1 <= port <= 65535):
                continue
            key = (ip, port)
            if key in seen:
                continue
            seen.add(key)
            try:
                out.append(ProxyRaw(ip, port, "http"))
            except ValueError:
                # ProxyRaw may reject values it considers malformed; skip.
                continue
        return out

    async def crawl(self) -> List[ProxyRaw]:
        """Fetch every candidate URL and merge parsed proxies, capped at 50."""
        merged: List[ProxyRaw] = []
        seen: Set[Tuple[str, int, str]] = set()
        htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
        for html in htmls:
            # Skip failed fetches and bodies too short to carry proxy data.
            if not html or len(html) < 200:
                continue
            for p in self._parse_ip_ports(html):
                k = (p.ip, p.port, p.protocol)
                if k not in seen:
                    seen.add(k)
                    merged.append(p)
                if len(merged) >= 50:
                    break
            if len(merged) >= 50:
                # Cap reached: also stop scanning the remaining pages
                # (the original break only exited the inner loop, so the
                # total could creep past 50).
                break
        if merged:
            logger.info(f"{self.display_name} 解析 {len(merged)}")
        else:
            logger.warning(f"{self.display_name} 未解析到代理(站点可能仅提供在线检测)")
        return merged