ProxyPool/app/plugins/proxyscrape.py
祀梦 957cee3100 fix(crawl): throttle concurrent CrawlJobs and relax fpw/proxyscrape HTTP
- CrawlJob waits on crawl_slot before the JobExecutor semaphore, so crawl-all does not fill executor slots while queued (see the sketch after this commit message)
- BaseHTTPPlugin: longer connect budget for slow international links
- proxyscrape: jsDelivr mirror + longer GitHub/API phases
- fpw_*: higher timeouts/retries; lower internal concurrency on heavy multi-URL plugins

Made-with: Cursor
2026-04-05 13:48:41 +08:00
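
The crawl_slot ordering described in the first bullet lives in the job scheduler, not in this file. As a rough illustration only, here is a minimal sketch of the acquire order, assuming an asyncio-based executor; the names `crawl_slot`, `executor_slots`, and `run_crawl_job` are placeholders, not the project's actual API:

```python
import asyncio

# Illustrative only: the crawl-specific semaphore is acquired *before* the
# general executor semaphore, so queued crawl jobs hold no executor slot.
crawl_slot = asyncio.Semaphore(2)       # hypothetical: max concurrent CrawlJobs
executor_slots = asyncio.Semaphore(8)   # hypothetical: JobExecutor-wide limit

async def run_crawl_job(plugin):
    # Wait for a crawl slot first; while queued here the job consumes no
    # JobExecutor slot, so other job types keep running.
    async with crawl_slot:
        async with executor_slots:
            return await plugin.crawl()
```

Acquiring the crawl-specific semaphore first means a queued crawl-all job blocks on its own limit and never pins an executor slot that other job types could use.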


"""ProxyScrape 测试爬虫 - 用于验证架构,支持全协议类型"""
import asyncio
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class ProxyScrapePlugin(BaseHTTPPlugin):
    """
    Fetch proxy lists from the public ProxyScrape sources.

    Covers all of http/https/socks4/socks5; intended specifically to exercise
    the extensibility of the plugin system.
    """

    default_config = {"max_pages": 5}

    name = "proxyscrape"
    display_name = "ProxyScrape Test Site"
    description = "Fetch proxies of every type (HTTP/HTTPS/SOCKS4/SOCKS5) from the ProxyScrape API; used to test architecture extensibility"
    enabled = True

    def __init__(self):
        super().__init__()
        # GitHub raw is the primary source; the jsDelivr mirror is often more
        # reliable from mainland China or under heavy load
        self.urls = [
            ("http", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt"),
            ("https", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/https.txt"),
            ("socks4", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt"),
            ("socks5", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt"),
        ]
        self._mirror_prefix = (
            "https://cdn.jsdelivr.net/gh/monosans/proxy-list@main/proxies/"
        )
        # ProxyScrape's official API serves as the final fallback
        self.api_urls = {
            "http": "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
            "https": "https://api.proxyscrape.com/v2/?request=get&protocol=https&timeout=10000&country=all&ssl=all&anonymity=all",
            "socks4": "https://api.proxyscrape.com/v2/?request=get&protocol=socks4&timeout=10000&country=all&ssl=all&anonymity=all",
            "socks5": "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all&ssl=all&anonymity=all",
        }

    def _parse_proxies(self, text: str, protocol: str) -> List[ProxyRaw]:
        """Parse text containing one ip:port entry per line."""
        proxies = []
        for line in text.splitlines():
            line = line.strip()
            if not line or ":" not in line:
                continue
            # rpartition splits on the last colon, so everything before it is kept as the host
            ip, _, port_str = line.rpartition(":")
            ip = ip.strip()
            port_str = port_str.strip()
            if port_str.isdigit() and 1 <= int(port_str) <= 65535:
                try:
                    proxies.append(ProxyRaw(ip, int(port_str), protocol))
                except ValueError:
                    continue
        return proxies

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        protocols = [protocol for protocol, _ in self.urls]
        urls = [url for _, url in self.urls]
        fetch_timeout = 28.0
        # 1. GitHub raw: relax the overall wait so that during crawl-all this plugin
        #    does not collectively time out while competing with other plugins for bandwidth
        tasks = [
            asyncio.create_task(self.fetch(url, timeout=fetch_timeout))
            for url in urls
        ]
        done, pending = await asyncio.wait(tasks, timeout=45)
        for task in pending:
            task.cancel()
        htmls: list[str] = []
        done_protocols: set[str] = set()
        for i, task in enumerate(tasks):
            try:
                if task in done:
                    htmls.append(task.result())
                    done_protocols.add(protocols[i])
                else:
                    htmls.append("")
            except Exception:
                htmls.append("")
        need_mirror: list[str] = []
        for protocol, html in zip(protocols, htmls):
            proxies = self._parse_proxies(html or "", protocol) if html else []
            if proxies:
                logger.info(
                    f"ProxyScrape {protocol.upper()}: got {len(proxies)} proxies from GitHub raw"
                )
                results.extend(proxies)
            else:
                if protocol in done_protocols:
                    logger.warning(
                        f"ProxyScrape {protocol.upper()}: GitHub raw returned empty or invalid data, trying mirror and API"
                    )
                else:
                    logger.warning(
                        f"ProxyScrape {protocol.upper()}: GitHub raw request timed out, trying mirror and API"
                    )
                need_mirror.append(protocol)
        # 2. jsDelivr mirror (sequential requests to avoid stacking transient concurrency on top of other plugins)
        still_need_api: list[str] = []
        for protocol in need_mirror:
            mirror_url = f"{self._mirror_prefix}{protocol}.txt"
            text = await self.fetch(mirror_url, timeout=fetch_timeout, retries=2)
            proxies = self._parse_proxies(text or "", protocol) if text else []
            if proxies:
                logger.info(
                    f"ProxyScrape {protocol.upper()}: got {len(proxies)} proxies from the jsDelivr mirror"
                )
                results.extend(proxies)
            else:
                still_need_api.append(protocol)
        # 3. ProxyScrape official API
        if still_need_api:
            fallback_urls = [self.api_urls[p] for p in still_need_api]
            try:
                api_htmls = await asyncio.wait_for(
                    self.fetch_all(fallback_urls, timeout=25), timeout=35
                )
            except asyncio.TimeoutError:
                logger.warning(
                    f"ProxyScrape API fallback batch request timed out, skipping {len(still_need_api)} protocols"
                )
                api_htmls = [""] * len(still_need_api)
            for protocol, api_html in zip(still_need_api, api_htmls):
                proxies = (
                    self._parse_proxies(api_html or "", protocol) if api_html else []
                )
                if proxies:
                    logger.info(
                        f"ProxyScrape {protocol.upper()}: got {len(proxies)} proxies from the API"
                    )
                    results.extend(proxies)
                else:
                    logger.warning(f"ProxyScrape {protocol.upper()}: API returned empty or invalid data")
        if results:
            logger.info(f"ProxyScrape: fetched {len(results)} proxies in total")
        else:
            logger.warning("ProxyScrape: all real sources unavailable, returning an empty list")
        return results