Files
ProxyPool/app/plugins/yundaili.py
祀梦 f09a8e16c4 fix: repair the crawler network layer, a stuck validation queue, and API 500 errors
- Fixed BaseHTTPPlugin connection pooling, concurrency control, exception logging, and timeout strategy
- Fixed and hardened the stability and fallback mechanisms of 8 crawler plugins
- Cleared 40k+ pending tasks from the validation_tasks table to keep the queue from stalling (see the hypothetical sketch below)
- Fixed the 500 error caused by the missing global app instance in app/api/main.py
- Raised the frontend Axios timeout to 120 seconds to avoid dropped requests
- Fixed plugin statistics persistence and scheduler lifecycle issues
2026-04-04 19:27:36 +08:00
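
The validation_tasks cleanup mentioned in the commit message happens outside this file. Purely as a hypothetical illustration, a one-off cleanup of stale pending rows could look like the following; the SQLite backend, database filename, and status column are all assumptions for the sketch, not details taken from ProxyPool:

# Hypothetical one-off cleanup; the table name comes from the commit
# message, but the SQLite store, file name, and `status` column are assumed.
import sqlite3

conn = sqlite3.connect("proxypool.db")
deleted = conn.execute(
    "DELETE FROM validation_tasks WHERE status = 'pending'"
).rowcount
conn.commit()
conn.close()
print(f"removed {deleted} stale pending tasks")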

82 lines
3.4 KiB
Python


import re
from typing import List

from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger

VALID_PROTOCOLS = ("http", "https", "socks4", "socks5")


class YunDaiLiPlugin(BaseHTTPPlugin):
    default_config = {"max_pages": 5}
    name = "yundaili"
    display_name = "云代理"
    description = "Fetch free proxies from public proxy lists on GitHub"

    def __init__(self):
        super().__init__()
        # Primary data source: GitHub raw
        self.urls = [
            ("http", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/http.txt"),
            ("socks4", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks4.txt"),
            ("socks5", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks5.txt"),
        ]
        # Fallback: the same lists served through the jsDelivr CDN
        self.fallback_urls = [
            ("http", "https://cdn.jsdelivr.net/gh/mmpx12/proxy-list@master/http.txt"),
            ("socks4", "https://cdn.jsdelivr.net/gh/mmpx12/proxy-list@master/socks4.txt"),
            ("socks5", "https://cdn.jsdelivr.net/gh/mmpx12/proxy-list@master/socks5.txt"),
        ]

    def _parse_htmls(self, htmls: List[str], url_mapping: List[tuple]) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        for (protocol, _), html in zip(url_mapping, htmls):
            if not html:
                logger.warning(
                    f"{self.display_name} {protocol.upper()} returned empty content; "
                    "the network may be restricted or the source may be gone"
                )
                continue
            count = 0
            # Each useful line is expected to be in "ip:port" form
            for line in html.splitlines():
                line = line.strip()
                if not line or ":" not in line:
                    continue
                parts = line.split(":")
                if len(parts) < 2:
                    continue
                ip = parts[0].strip()
                port_str = parts[1].strip()
                # Coarse shape check only; octet ranges are not validated
                if not re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip):
                    continue
                if not port_str.isdigit() or not (1 <= int(port_str) <= 65535):
                    continue
                final_protocol = protocol if protocol in VALID_PROTOCOLS else "http"
                results.append(ProxyRaw(ip, int(port_str), final_protocol))
                count += 1
            if count:
                logger.info(f"{self.display_name} {protocol.upper()} parsed {count} candidate proxies")
        return results

    async def crawl(self) -> List[ProxyRaw]:
        results: List[ProxyRaw] = []
        # Request the primary sources sequentially so one stalled URL
        # cannot drag down the whole run
        for protocol, url in self.urls:
            html = await self.fetch(url, timeout=12)
            if html:
                results.extend(self._parse_htmls([html], [(protocol, url)]))
        # If the primary sources all came back empty, try the fallbacks
        # (also sequentially)
        if not results:
            logger.warning(f"{self.display_name} all GitHub primary sources returned empty, trying jsdelivr fallback")
            for protocol, url in self.fallback_urls:
                html = await self.fetch(url, timeout=12)
                if html:
                    results.extend(self._parse_htmls([html], [(protocol, url)]))
        if results:
            logger.info(f"{self.display_name} parsed {len(results)} candidate proxies in total")
        else:
            logger.warning(f"{self.display_name} fetched no proxies at all")
        return results
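
For context, a minimal standalone driver for this plugin might look like the sketch below. It assumes YunDaiLiPlugin can be constructed and its async crawl() awaited outside the ProxyPool scheduler; whether BaseHTTPPlugin needs extra setup or teardown (e.g. opening or closing an HTTP session) is not shown in this file.

import asyncio

from app.plugins.yundaili import YunDaiLiPlugin


async def main() -> None:
    plugin = YunDaiLiPlugin()
    proxies = await plugin.crawl()
    # The ProxyRaw construction above suggests (ip, port, protocol) fields
    print(f"collected {len(proxies)} candidate proxies")


asyncio.run(main())

Requesting the three lists sequentially trades a little latency for predictability: a hung URL is cut off by the 12-second per-request timeout instead of stalling a combined await, and the jsDelivr fallback pass only runs when every primary source comes back empty.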