Files
ProxyPool/app/plugins/proxylist_download.py
祀梦 f09a8e16c4 fix: 修复爬虫网络层、验证队列卡死及 API 500 错误
- 修复 BaseHTTPPlugin 连接池、并发控制、异常日志、超时策略
- 修复/增强 8 个爬虫插件的稳定性和 fallback 机制
- 清理 validation_tasks 表 4 万+ pending 任务,避免队列卡死
- 修复 app/api/main.py 缺失全局 app 实例导致的 500 错误
- 提升前端 Axios 超时到 120 秒,避免请求断开
- 修复插件统计持久化和调度器生命周期问题
2026-04-04 19:27:36 +08:00

96 lines
4.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class ProxyListDownloadPlugin(BaseHTTPPlugin):
    """Crawl proxies from public GitHub proxy lists.

    Each source has a primary GitHub raw URL plus fallback mirrors
    (jsDelivr CDN, ProxyScrape API) that are tried in order whenever the
    primary returns an empty or whitespace-only body.
    """

    default_config = {"max_pages": 5}
    name = "proxylist_download"
    display_name = "ProxyListDownload"
    description = "从 GitHub 公开代理列表获取代理"

    def __init__(self):
        super().__init__()
        # Primary GitHub raw URL + fallback sources: jsDelivr CDN mirror and
        # the ProxyScrape v2 API, per protocol.
        self.sources = [
            {
                "primary": "https://raw.githubusercontent.com/komutan234/Proxy-List-Free/main/proxies/http.txt",
                "fallbacks": [
                    "https://cdn.jsdelivr.net/gh/komutan234/Proxy-List-Free@main/proxies/http.txt",
                    "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
                ],
                "protocol": "http",
            },
            {
                "primary": "https://raw.githubusercontent.com/komutan234/Proxy-List-Free/main/proxies/socks4.txt",
                "fallbacks": [
                    "https://cdn.jsdelivr.net/gh/komutan234/Proxy-List-Free@main/proxies/socks4.txt",
                    "https://api.proxyscrape.com/v2/?request=get&protocol=socks4&timeout=10000&country=all",
                ],
                "protocol": "socks4",
            },
            {
                "primary": "https://raw.githubusercontent.com/komutan234/Proxy-List-Free/main/proxies/socks5.txt",
                "fallbacks": [
                    "https://cdn.jsdelivr.net/gh/komutan234/Proxy-List-Free@main/proxies/socks5.txt",
                    "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all",
                ],
                "protocol": "socks5",
            },
        ]

    def _detect_protocol(self, url: str) -> str:
        """Infer the proxy protocol from *url*.

        Deliberately keyed on "socks4"/"socks5" substrings rather than the
        URL scheme — every source URL here is served over https://, which
        says nothing about the proxy protocol. Anything else (including
        "/http.txt" and "protocol=http" URLs) defaults to "http".
        """
        if "socks4" in url:
            return "socks4"
        if "socks5" in url:
            return "socks5"
        return "http"

    def _parse_lines(self, html: str, protocol: str) -> List[ProxyRaw]:
        """Parse an "ip:port"-per-line proxy list into ProxyRaw entries.

        Normalizes CRLF/CR line endings to LF first, skips blank lines and
        lines without a colon, and rejects malformed ports.
        """
        results: List[ProxyRaw] = []
        # Normalize all line endings to LF before splitting.
        text = html.replace("\r\n", "\n").replace("\r", "\n")
        for line in text.split("\n"):
            line = line.strip()
            if not line or ":" not in line:
                continue
            parts = line.split(":")
            if len(parts) >= 2:
                ip = parts[0].strip()
                port = parts[1].strip()
                # Fix: isdigit() alone accepted out-of-range ports such as
                # "0" or "99999"; only 1-65535 is a valid TCP port.
                if ip and port.isdigit() and 1 <= int(port) <= 65535:
                    results.append(ProxyRaw(ip, int(port), protocol))
        return results

    async def crawl(self) -> List[ProxyRaw]:
        """Fetch all primary sources concurrently, falling back per source.

        Returns the combined list of parsed proxies; always returns a list
        (possibly empty), matching the annotated return type.
        """
        results: List[ProxyRaw] = []
        # Fetch every primary URL concurrently in one batch.
        primary_urls = [s["primary"] for s in self.sources]
        primary_htmls = await self.fetch_all(primary_urls, timeout=15)
        for idx, html in enumerate(primary_htmls):
            source = self.sources[idx]
            protocol = source.get("protocol") or self._detect_protocol(source["primary"])
            if html and html.strip():
                results.extend(self._parse_lines(html, protocol))
                continue
            # Primary came back empty or whitespace-only: try fallbacks in order.
            logger.warning(f"{self.display_name} 主源返回空,尝试 fallback: {source['primary']}")
            for fallback_url in source["fallbacks"]:
                fallback_html = await self.fetch(fallback_url, timeout=15)
                if fallback_html and fallback_html.strip():
                    fb_protocol = source.get("protocol") or self._detect_protocol(fallback_url)
                    results.extend(self._parse_lines(fallback_html, fb_protocol))
                    break
        if results:
            logger.info(f"{self.display_name} 解析完成,获得 {len(results)} 个潜在代理")
        # Fix: return unconditionally so an empty crawl yields [] rather than None.
        return results