Files
ProxyPool/app/plugins/fpw_premproxy.py
祀梦 957cee3100 fix(crawl): throttle concurrent CrawlJobs and relax fpw/proxyscrape HTTP
- CrawlJob waits on crawl_slot before JobExecutor semaphore so crawl-all does not fill slots while queued
- BaseHTTPPlugin: longer connect budget for slow international links
- proxyscrape: jsDelivr mirror + longer GitHub/API phases
- fpw_*: higher timeouts/retries; lower internal concurrency on heavy multi-URL plugins

Made-with: Cursor
2026-04-05 13:48:41 +08:00

65 lines
2.1 KiB
Python

"""premproxy.com 列表页表格。"""
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwPremproxyPlugin(BaseHTTPPlugin):
    """Crawl the premproxy.com HTTP and SOCKS list pages for proxies.

    Every configured page is fetched through ``self.fetch_all`` (not defined
    in this class; presumably provided by ``BaseHTTPPlugin``), parsed as HTML,
    and each table row whose first two cells hold an IPv4 address and a port
    is turned into one ``ProxyRaw``.
    """

    name = "fpw_premproxy"
    display_name = "PremProxy"
    description = "premproxy.com HTTP/SOCKS 列表页"

    # Compiled once instead of per table row. re.ASCII stops ``\d`` from
    # matching non-ASCII Unicode digits, which must not end up in an address.
    _IPV4_RE = re.compile(r"(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})", re.ASCII)
    _PORT_RE = re.compile(r"\d+", re.ASCII)

    def __init__(self):
        super().__init__()
        self.urls = [
            "https://premproxy.com/list/",
            "https://premproxy.com/socks-list/",
        ]

    @classmethod
    def _is_ipv4(cls, text: str) -> bool:
        """Return True when *text* is a dotted quad with every octet in 0-255."""
        match = cls._IPV4_RE.fullmatch(text)
        if match is None:
            return False
        return all(int(octet) <= 255 for octet in match.groups())

    @classmethod
    def _parse_port(cls, text: str) -> int:
        """Return *text* as a port in 1-65535, or 0 when it is not a valid port."""
        # str.isdigit() would accept characters such as "²" that int() rejects,
        # so only plain ASCII digit strings are allowed through.
        if cls._PORT_RE.fullmatch(text) is None:
            return 0
        try:
            port = int(text)
        except ValueError:
            # Extremely long digit strings can exceed the interpreter's
            # int <-> str conversion limit; treat them as "not a port".
            return 0
        return port if 1 <= port <= 65535 else 0

    @staticmethod
    def _detect_protocol(row_text: str) -> str:
        """Pick the protocol from keywords found in the lower-cased row text."""
        if "socks5" in row_text:
            return "socks5"
        # Any other mention of "socks" (including "socks4") maps to socks4.
        if "socks" in row_text:
            return "socks4"
        if "https" in row_text:
            return "https"
        return "http"

    def _parse_html(self, html: str) -> List[ProxyRaw]:
        """Extract proxies from every ``<tr>`` of *html*.

        Rows with fewer than two ``<td>`` cells, a malformed IPv4 address or
        an out-of-range port are skipped, as are rows for which ``ProxyRaw``
        raises ``ValueError``.

        Args:
            html: Markup of one list page.

        Returns:
            The proxies found, in document order.
        """
        soup = BeautifulSoup(html, "lxml")
        results: List[ProxyRaw] = []
        for tr in soup.find_all("tr"):
            tds = tr.find_all("td")
            if len(tds) < 2:
                continue
            ip = tds[0].get_text(strip=True)
            if not self._is_ipv4(ip):
                continue
            port = self._parse_port(tds[1].get_text(strip=True))
            if not port:
                continue
            proto = self._detect_protocol(tr.get_text(" ", strip=True).lower())
            try:
                results.append(ProxyRaw(ip, port, proto))
            except ValueError:
                # Best effort: a row rejected by ProxyRaw must not abort the page.
                continue
        return results

    async def crawl(self) -> List[ProxyRaw]:
        """Fetch all configured pages and return the proxies parsed from them.

        Pages with an empty fetch result are skipped. The per-page count and
        the overall total are logged when non-empty.

        Returns:
            Proxies from all pages, concatenated in ``self.urls`` order.
        """
        merged: List[ProxyRaw] = []
        htmls = await self.fetch_all(self.urls, timeout=25, retries=2)
        # NOTE(review): assumes fetch_all returns one entry per URL in the
        # same order as self.urls - confirm against BaseHTTPPlugin.
        for url, html in zip(self.urls, htmls):
            if not html:
                continue
            batch = self._parse_html(html)
            if batch:
                merged.extend(batch)
                logger.info(f"{self.display_name} {url}: {len(batch)}")
        if merged:
            logger.info(f"{self.display_name} 合计 {len(merged)}")
        return merged