# - CrawlJob waits on crawl_slot before the JobExecutor semaphore so crawl-all
#   does not fill slots while queued
# - BaseHTTPPlugin: longer connect budget for slow international links
# - proxyscrape: jsDelivr mirror + longer GitHub/API phases
# - fpw_*: higher timeouts/retries; lower internal concurrency on heavy
#   multi-URL plugins
# Made-with: Cursor
# 62 lines, 2.2 KiB, Python
"""gatherproxy.com 页面内嵌 JSON(PROXY_IP / PROXY_PORT)。"""
|
||
import re
|
||
from typing import List
|
||
|
||
from app.core.plugin_system import ProxyRaw
|
||
from app.plugins.base import BaseHTTPPlugin
|
||
from app.core.log import logger
|
||
|
||
|
||
class FpwGatherproxyPlugin(BaseHTTPPlugin):
    """Scrape gatherproxy.com pages for proxies embedded as PROXY_IP / PROXY_PORT JSON.

    The site embeds proxy data in two shapes depending on the page: a loose
    ``PROXY_IP ... PROXY_PORT`` key pair, and a strict one-level JSON object.
    Both are matched. The site is frequently rate-limited, so fetches may
    come back empty and are simply skipped.
    """

    name = "fpw_gatherproxy"
    display_name = "GatherProxy"
    description = "gatherproxy.com 内嵌代理 JSON(站点常有限流)"

    # Compiled once at class creation instead of on every _extract_from_text
    # call. _PAIR_RE tolerates quoting variations and up to 120 chars between
    # the two keys; _JSON_RE matches the strict double-quoted object form.
    _PAIR_RE = re.compile(
        r"PROXY_IP['\"]?\s*:\s*['\"]([\d.]+)['\"].{0,120}?PROXY_PORT['\"]?\s*:\s*['\"](\d+)['\"]",
        re.DOTALL | re.IGNORECASE,
    )
    _JSON_RE = re.compile(
        r"\{[^{}]*\"PROXY_IP\"\s*:\s*\"([\d.]+)\"[^{}]*\"PROXY_PORT\"\s*:\s*\"(\d+)\"[^{}]*\}"
    )

    def __init__(self):
        super().__init__()
        # Two listing pages; fetched concurrently via BaseHTTPPlugin.fetch_all.
        self.urls = [
            "http://www.gatherproxy.com/proxylist/anonymity/?t=Elite",
            "http://www.gatherproxy.com/proxylist/country/?c=United%20States",
        ]

    def _extract_from_text(self, text: str) -> List[ProxyRaw]:
        """Return a ProxyRaw for every IP/port pair found in *text*.

        The two patterns can match the same underlying data, so the result
        may contain duplicates; callers are expected to deduplicate.
        The port-range check (1-65535) is the only validation needed: the
        regex group ``(\\d+)`` already guarantees ``int()`` cannot raise,
        which is why the original's ``isdigit()`` / ``except ValueError``
        guards were dead code and have been removed.
        """
        results: List[ProxyRaw] = []
        for pattern in (self._PAIR_RE, self._JSON_RE):
            for ip, port in pattern.findall(text):
                port_num = int(port)
                if 1 <= port_num <= 65535:
                    results.append(ProxyRaw(ip, port_num, "http"))
        return results

    async def crawl(self) -> List[ProxyRaw]:
        """Fetch all configured URLs and return deduplicated proxies.

        Empty/failed fetches are skipped; dedup key is (ip, port). Logs the
        running total after each page that yielded at least one proxy.
        """
        seen = set()
        out: List[ProxyRaw] = []
        htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
        for url, html in zip(self.urls, htmls):
            if not html:
                continue
            for p in self._extract_from_text(html):
                key = (p.ip, p.port)
                if key not in seen:
                    seen.add(key)
                    out.append(p)
            if out:
                logger.info(f"{self.display_name} 自 {url} 累计 {len(out)} 条")
        return out
|