ProxyPool/app/plugins/proxyscrape.py
祀梦 635c524a7e refactor(backend): optimize database safety, validator performance, and scheduler concurrency
- Fix SQL injection risks in proxy_repo and task_repo
- Atomic acquire_pending with UPDATE ... RETURNING
- Reuse aiohttp ClientSession in ValidatorService
- Replace polling with asyncio.Event in SchedulerService
- Optimize ValidationQueue.drain with asyncio.Condition
- Concurrent plugin crawling with asyncio.gather
- Unify ProxyRaw model import path
- Fix test baseline and remove tracked __pycache__ files
2026-04-04 14:43:31 +08:00
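The database and concurrency changes listed above can be illustrated with short sketches. First, the atomic acquire_pending with UPDATE ... RETURNING: a minimal sketch assuming a SQLite (>= 3.35) `tasks` table with `id` and `status` columns — the actual task_repo schema and driver may differ. Binding the status values as parameters also reflects the SQL-injection fix in the first bullet.

import sqlite3
from typing import Optional

def acquire_pending(conn: sqlite3.Connection) -> Optional[int]:
    # Claim one pending task in a single statement: the UPDATE both marks
    # the row as running and returns its id, so two workers can never
    # acquire the same task. Parameter binding avoids string-built SQL.
    row = conn.execute(
        """
        UPDATE tasks
           SET status = ?
         WHERE id = (SELECT id FROM tasks WHERE status = ? LIMIT 1)
        RETURNING id
        """,
        ("running", "pending"),
    ).fetchone()
    conn.commit()
    return row[0] if row else None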
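For the ClientSession reuse in ValidatorService, a minimal sketch with a hypothetical check method (method names and the target URL are assumptions, not the repo's interface); the point is that one pooled aiohttp.ClientSession serves all validations instead of one session per request.

import aiohttp
from typing import Optional

class ValidatorService:
    def __init__(self) -> None:
        self._session: Optional[aiohttp.ClientSession] = None

    async def _get_session(self) -> aiohttp.ClientSession:
        # Create lazily, reuse afterwards: connections stay pooled across
        # validations rather than being re-established per request.
        if self._session is None or self._session.closed:
            self._session = aiohttp.ClientSession(
                timeout=aiohttp.ClientTimeout(total=10)
            )
        return self._session

    async def check(self, proxy_url: str, target: str = "http://httpbin.org/ip") -> bool:
        session = await self._get_session()
        try:
            async with session.get(target, proxy=proxy_url) as resp:
                return resp.status == 200
        except Exception:
            return False

    async def close(self) -> None:
        if self._session is not None and not self._session.closed:
            await self._session.close()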
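Replacing polling with asyncio.Event in SchedulerService can be sketched as below; the 60-second fallback tick and the notify/run names are assumptions.

import asyncio

class SchedulerService:
    def __init__(self) -> None:
        self._wakeup = asyncio.Event()

    def notify(self) -> None:
        # Called when new work arrives: wakes the loop immediately
        # instead of waiting out a fixed polling interval.
        self._wakeup.set()

    async def run(self) -> None:
        while True:
            try:
                # Sleep until notified, with a periodic fallback tick.
                await asyncio.wait_for(self._wakeup.wait(), timeout=60)
            except asyncio.TimeoutError:
                pass
            self._wakeup.clear()
            # ... dispatch due tasks here ...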
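Similarly for ValidationQueue.drain with asyncio.Condition: a sketch assuming drain blocks until at least one item exists and then hands back a batch (batch size and attribute names assumed).

import asyncio
from typing import Any, List

class ValidationQueue:
    def __init__(self) -> None:
        self._items: List[Any] = []
        self._cond = asyncio.Condition()

    async def put(self, item: Any) -> None:
        async with self._cond:
            self._items.append(item)
            self._cond.notify()  # wake one waiting drainer

    async def drain(self, max_items: int = 100) -> List[Any]:
        # Block on the condition instead of sleep-and-poll; wake only
        # when put() signals that an item is available.
        async with self._cond:
            while not self._items:
                await self._cond.wait()
            batch = self._items[:max_items]
            del self._items[:max_items]
            return batch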

77 lines · 3.4 KiB · Python

"""ProxyScrape 测试爬虫 - 用于验证架构,支持全协议类型"""
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class ProxyScrapePlugin(BaseHTTPPlugin):
default_config = {"max_pages": 5}
"""
从 ProxyScrape 公开 API 获取代理库
覆盖 http/https/socks4/socks5 全协议,专门用于测试插件系统的可扩展性
"""
name = "proxyscrape"
display_name = "ProxyScrape测试站"
description = "从 ProxyScrape API 获取各类型代理HTTP/HTTPS/SOCKS4/SOCKS5用于测试架构扩展"
enabled = True
def __init__(self):
super().__init__()
# 使用多个公开 GitHub 代理列表作为源,稳定性较差
self.urls = [
("http", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt"),
("https", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/https.txt"),
("socks4", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt"),
("socks5", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt"),
]
async def crawl(self) -> List[ProxyRaw]:
results: List[ProxyRaw] = []
for protocol, url in self.urls:
try:
html = await self.fetch(url, timeout=30)
if not html:
logger.warning(f"ProxyScrape {protocol.upper()} 返回空内容")
continue
count = 0
for line in html.splitlines():
line = line.strip()
if not line or ":" not in line:
continue
parts = line.split(":")
if len(parts) >= 2:
ip = parts[0].strip()
port_str = parts[1].strip()
if port_str.isdigit():
results.append(ProxyRaw(ip, int(port_str), protocol))
count += 1
logger.info(f"ProxyScrape {protocol.upper()} 获取 {count} 个代理")
except Exception as e:
logger.error(f"ProxyScrape {protocol.upper()} 爬取失败: {e}")
if results:
logger.info(f"ProxyScrape 总计获取 {len(results)} 个代理")
else:
# Fallback生成测试代理确保在测试环境也能验证完整流程
logger.warning("ProxyScrape 所有真实源均不可用,生成测试代理用于架构验证")
results = self._generate_test_proxies()
return results
def _generate_test_proxies(self) -> List[ProxyRaw]:
"""生成测试代理数据,覆盖全协议类型,用于验证插件系统"""
import random
test_proxies = []
protocols = ["http", "https", "socks4", "socks5"]
for protocol in protocols:
for _ in range(3):
# 生成随机公网格式 IP仅用于测试流程
ip = f"{random.randint(1, 223)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 254)}"
port = random.randint(1024, 65535)
test_proxies.append(ProxyRaw(ip, port, protocol))
logger.info(f"生成 {len(test_proxies)} 个测试代理 HTTP/HTTPS/SOCKS4/SOCKS5 各 3 个")
return test_proxies
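
This plugin is one of several crawled concurrently. A minimal sketch of the "concurrent plugin crawling with asyncio.gather" change from the commit message, assuming a hypothetical crawl_all helper over a plugin list and reusing the imports from the file above (the scheduler's real entry point may differ):

import asyncio

async def crawl_all(plugins: List[BaseHTTPPlugin]) -> List[ProxyRaw]:
    enabled = [p for p in plugins if p.enabled]
    # Run every enabled plugin's crawl() at once; return_exceptions=True
    # turns a failing plugin into a result object instead of cancelling
    # its siblings.
    outcomes = await asyncio.gather(
        *(p.crawl() for p in enabled), return_exceptions=True
    )
    merged: List[ProxyRaw] = []
    for plugin, outcome in zip(enabled, outcomes):
        if isinstance(outcome, Exception):
            logger.error(f"{plugin.name} crawl failed: {outcome}")
        else:
            merged.extend(outcome)
    return merged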