- fpw_proxy_list_download: parse JSON list/proxies bodies; jsDelivr monosans tier; crawl timeout 300s
- fpw_socks_ssl: try parse_html_table before regex
- fpw_hidemy: loose row scan when fixed columns fail
- fpw_proxynova: plain IP/port row fallback
- fpw_spys_one: HTTPS endpoints; crawl timeout 180s
- fpw_gatherproxy: HTTPS + extra JSON key patterns
- fpw_checkerproxy: lower min HTML length for parse
- fpw_premproxy: ip:port regex fallback when few table rows (sketched below)

Made-with: Cursor
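
Several of the changes above (fpw_proxynova, fpw_premproxy) share one fallback: scan the raw HTML for bare ip:port pairs once structured parsing yields too few rows. A minimal sketch of that fallback; the helper name and return type are illustrative, not lifted from those plugins:

import re
from typing import List, Tuple

IP_PORT_RE = re.compile(r"\b((?:\d{1,3}\.){3}\d{1,3}):(\d{1,5})\b")

def ip_port_fallback(html: str) -> List[Tuple[str, int]]:
    """Collect plausible ip:port pairs from raw HTML, range-checking the port."""
    return [
        (ip, int(port))
        for ip, port in IP_PORT_RE.findall(html)
        if 1 <= int(port) <= 65535
    ]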
73 lines · 2.6 KiB · Python
"""gatherproxy.com 页面内嵌 JSON(PROXY_IP / PROXY_PORT)。"""

import re
from typing import List

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger


class FpwGatherproxyPlugin(BaseHTTPPlugin):
    name = "fpw_gatherproxy"
    display_name = "GatherProxy"
    description = "Embedded proxy JSON on gatherproxy.com (the site is often rate-limited)"

    def __init__(self):
        super().__init__()
        self.urls = [
            "https://www.gatherproxy.com/proxylist/anonymity/?t=Elite",
            "https://www.gatherproxy.com/proxylist/country/?c=United%20States",
        ]

    def _extract_from_text(self, text: str) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        # The page embeds proxies in a few JSON-ish shapes; each pattern below
        # captures an (ip, port) pair. Cross-pattern de-duplication happens in crawl().
        patterns = [
            # Loose PROXY_IP ... PROXY_PORT pairs (quoted or bare keys, any case).
            (
                r"PROXY_IP['\"]?\s*:\s*['\"]([\d.]+)['\"]"
                r".{0,120}?PROXY_PORT['\"]?\s*:\s*['\"](\d+)['\"]",
                re.DOTALL | re.IGNORECASE,
            ),
            # Strict per-proxy objects: {"PROXY_IP": "...", "PROXY_PORT": "..."}.
            (
                r"\{[^{}]*\"PROXY_IP\"\s*:\s*\"([\d.]+)\""
                r"[^{}]*\"PROXY_PORT\"\s*:\s*\"(\d+)\"[^{}]*\}",
                0,
            ),
            # Generic "ip"/"port" keys, optionally prefixed with "proxy_".
            (
                r'"(?:proxy_)?ip"\s*:\s*"([\d.]+)"\s*,\s*"(?:proxy_)?port"\s*:\s*"?(\d+)"?',
                re.IGNORECASE,
            ),
        ]
        for pattern, flags in patterns:
            for m in re.finditer(pattern, text, flags):
                ip, port = m.group(1), m.group(2)
                # (\d+) already guarantees digits; only range-check the port.
                if not 1 <= int(port) <= 65535:
                    continue
                try:
                    results.append(ProxyRaw(ip, int(port), "http"))
                except ValueError:
                    # ProxyRaw may reject malformed IPs; skip those entries.
                    continue
        return results

    async def crawl(self) -> List[ProxyRaw]:
        seen = set()
        out: List[ProxyRaw] = []
        # One body per URL, aligned with self.urls; failed fetches come back falsy.
        htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
        for url, html in zip(self.urls, htmls):
            if not html:
                continue
            for p in self._extract_from_text(html):
                k = (p.ip, p.port)
                if k not in seen:
                    seen.add(k)
                    out.append(p)
            if out:
                logger.info(f"{self.display_name}: {len(out)} proxies accumulated through {url}")
        return out
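

# --- Illustrative offline check (a sketch, not part of the plugin API).
# Assumes ProxyRaw exposes .ip/.port as constructed above; the sample payload
# is synthetic, not a real gatherproxy.com response.
if __name__ == "__main__":
    sample = (
        'var data = [{"PROXY_IP": "203.0.113.7", "PROXY_PORT": "3128"},'
        ' {"ip": "198.51.100.2", "port": 8080}];'
    )
    found = {(p.ip, p.port) for p in FpwGatherproxyPlugin()._extract_from_text(sample)}
    print(found)  # expected: {('203.0.113.7', 3128), ('198.51.100.2', 8080)}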