feat: fpw plugins, validation/crawl perf, WS stats, test DB isolation

- Add Free_Proxy_Website-style fpw_* plugins and register them
- Per-plugin crawl timeout (crawl_timeout_seconds=120); remove global crawl_timeout setting
- Validator: fix connect vs total timeout on save; SOCKS session LRU cache; drop redundant semaphore
- Validation handler uses a single DB connection per proxy; crawled proxies are batch-upserted as pending after each crawl; WorkerPool.submit tries put_nowait before awaiting put
- Remove unused max_retries from settings API/UI; settings maintenance SQL + init_db cleanup of deprecated keys
- WebSocket dashboard stats; ProxyList pool_filter and API alignment
- POST /api/proxies/delete-one for IPv6-safe deletes; task poll stops on 404
- pytest uses PROXYPOOL_DB_PATH=db/proxies.test.sqlite so tests do not wipe production DB
- .gitignore: explicit proxies.test.sqlite patterns; fix plugin_service ValidationException import
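
The pytest isolation above hinges on the backend resolving PROXYPOOL_DB_PATH before it opens any SQLite file (the config change further down wires that env var into Settings via AliasChoices). A minimal conftest.py sketch under that assumption; the file itself is not part of this commit, only the env-var name and the test DB path are taken from it:

# conftest.py, illustrative sketch only (not part of this diff)
import os

# Must run before any `app.*` import: app.core.config builds its module-level
# Settings instance at import time, which is when PROXYPOOL_DB_PATH is read.
os.environ.setdefault("PROXYPOOL_DB_PATH", "db/proxies.test.sqlite")

Running PROXYPOOL_DB_PATH=db/proxies.test.sqlite pytest from the shell has the same effect, which is why .gitignore now excludes proxies.test.sqlite explicitly.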

Made-with: Cursor
祀梦 committed 2026-04-05 13:39:19 +08:00
parent 92c7fa19e2
commit 0131c8b408
63 changed files with 2331 additions and 531 deletions

.gitignore
View File

@@ -30,6 +30,9 @@ env/
*.sqlite *.sqlite
*.sqlite3 *.sqlite3
*.db *.db
# pytest 隔离库PROXYPOOL_DB_PATH=db/proxies.test.sqlite勿提交
**/proxies.test.sqlite
proxies.test.sqlite
*.db-shm *.db-shm
*.db-wal *.db-wal

View File

@@ -221,10 +221,15 @@ POST /api/settings
- **验证超时**: 3-30秒默认 5秒 - **验证超时**: 3-30秒默认 5秒
- **验证并发数**: 10-200默认 50 - **验证并发数**: 10-200默认 50
### 评分机制 ### 待验证与可用
- **爬取**:代理默认以「待验证」入库(`validated=0`,分数为 0不会立刻参与随机/导出。
- **验证**:在设置页「立即验证全部」或开启自动验证后,会**先验证待验证队列**,再按检查时间**复检已入库代理**;通过后标记为已验证并赋予分数。
- **设置**:「爬取后立即验证」默认关闭;开启后爬取完成会像旧版一样立刻排队验证。
### 评分机制(仅针对已验证入池的代理)
- **验证成功**: +10 分 - **验证成功**: +10 分
- **验证失败**: -5 分 - **验证失败**: -5 分
- **分数为 0**: 自动删除 - **分数为 0**: 自动删除(待验证阶段验证失败则直接丢弃该条)
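
To make the pending/available split concrete: the new validated column (see the db migration further down) separates the two pools, and pool_filter on the list API selects between them. The repository's actual WHERE clauses are not shown in this diff, so the mapping below is only an assumed sketch consistent with the README wording:

# Assumed pool_filter -> SQL mapping (not the actual ProxyRepository code)
POOL_FILTER_SQL = {
    "pending": "SELECT * FROM proxies WHERE validated = 0",                  # crawled, score 0, not yet checked
    "available": "SELECT * FROM proxies WHERE validated = 1 AND score > 0",  # validated and above the deletion threshold
}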
## 🔧 常见问题 ## 🔧 常见问题

View File

@@ -64,7 +64,8 @@ export const proxiesAPI = {
getProxies: (params, signal) => getProxies: (params, signal) =>
api.post('/api/proxies', cleanParams(params), { signal }), api.post('/api/proxies', cleanParams(params), { signal }),
deleteProxy: (ip, port) => api.delete(`/api/proxies/${ip}/${port}`), deleteProxy: (ip, port) =>
api.post('/api/proxies/delete-one', { ip, port }),
batchDeleteProxies: (proxies) => api.post('/api/proxies/batch-delete', { proxies }), batchDeleteProxies: (proxies) => api.post('/api/proxies/batch-delete', { proxies }),

View File

@@ -24,7 +24,8 @@ const props = defineProps({
type: { type: {
type: String, type: String,
default: 'default', default: 'default',
validator: (value) => ['default', 'total', 'available', 'new', 'score'].includes(value) validator: (value) =>
['default', 'total', 'pending', 'available', 'new', 'score'].includes(value)
}, },
/** 图标组件 */ /** 图标组件 */
icon: { icon: {
@@ -79,6 +80,11 @@ const displayValue = computed(() => {
filter: drop-shadow(0 0 8px rgba(34, 197, 94, 0.4)); filter: drop-shadow(0 0 8px rgba(34, 197, 94, 0.4));
} }
.stat-card.pending .stat-icon {
color: var(--warning);
filter: drop-shadow(0 0 8px rgba(250, 204, 21, 0.45));
}
.stat-card.new .stat-icon { .stat-card.new .stat-icon {
color: var(--warning); color: var(--warning);
filter: drop-shadow(0 0 8px rgba(245, 158, 11, 0.4)); filter: drop-shadow(0 0 8px rgba(245, 158, 11, 0.4));

View File

@@ -0,0 +1,134 @@
import { onUnmounted } from 'vue'
import { useProxyStore } from '../stores/proxy'
const MAX_DELAY_MS = 30000
const INITIAL_DELAY_MS = 1000
/**
* 由 API Base 推导统计 WebSocket URL/api/ws
* @returns {string}
*/
export function resolveWebSocketStatsUrl() {
const explicit = import.meta.env.VITE_WS_URL
if (explicit) {
const t = String(explicit).trim().replace(/\/$/, '')
return t.endsWith('/api/ws') ? t : `${t}/api/ws`
}
const api = import.meta.env.VITE_API_BASE_URL || 'http://localhost:18080'
const u = new URL(api)
u.protocol = u.protocol === 'https:' ? 'wss:' : 'ws:'
u.pathname = '/api/ws'
u.search = ''
u.hash = ''
return u.toString()
}
/**
* 连接后端 WebSocket 接收实时统计;指数退避重连;页签隐藏时暂停连接。
*/
export function useStatsWebSocket() {
const store = useProxyStore()
let ws = null
let reconnectTimer = null
let attempt = 0
let stopped = false
let paused = false
function backoffDelayMs() {
return Math.min(INITIAL_DELAY_MS * 2 ** attempt, MAX_DELAY_MS)
}
function clearReconnectTimer() {
if (reconnectTimer) {
clearTimeout(reconnectTimer)
reconnectTimer = null
}
}
function connect() {
if (stopped || paused) return
clearReconnectTimer()
const url = resolveWebSocketStatsUrl()
ws = new WebSocket(url)
ws.onopen = () => {
attempt = 0
}
ws.onmessage = (ev) => {
try {
const msg = JSON.parse(ev.data)
if (msg.type === 'stats' && msg.data) {
store.applyStats(msg.data)
} else if (msg.type === 'pong') {
// optional heartbeat
}
} catch {
// ignore malformed
}
}
ws.onclose = () => {
ws = null
if (stopped || paused) return
attempt += 1
reconnectTimer = setTimeout(connect, backoffDelayMs())
}
ws.onerror = () => {
try {
ws?.close()
} catch {
// ignore
}
}
}
function handleVisibility() {
if (document.hidden) {
paused = true
clearReconnectTimer()
if (ws) {
const s = ws
ws = null
s.onclose = null
try {
s.close()
} catch {
// ignore
}
}
} else {
paused = false
if (!stopped) {
attempt = 0
connect()
}
}
}
function start() {
stopped = false
paused = false
attempt = 0
document.addEventListener('visibilitychange', handleVisibility)
connect()
}
function disconnect() {
stopped = true
paused = false
document.removeEventListener('visibilitychange', handleVisibility)
clearReconnectTimer()
if (ws) {
const s = ws
ws = null
s.onclose = null
try {
s.close()
} catch {
// ignore
}
}
}
onUnmounted(disconnect)
return { start, disconnect }
}

View File

@@ -1,7 +1,8 @@
import { tasksAPI } from '../api' import { tasksAPI } from '../api'
const POLL_INTERVAL = 1000 const POLL_INTERVAL = 1000
const MAX_POLL_ATTEMPTS = 30 /** 大批量爬取可能超过 30s适当放宽避免误报「任务进行中」 */
const MAX_POLL_ATTEMPTS = 300
/** /**
* 轮询任务状态直到完成或失败 * 轮询任务状态直到完成或失败
@@ -21,7 +22,14 @@ export async function pollTaskStatus(taskId) {
return response return response
} }
} catch (error) { } catch (error) {
// 网络异常时继续轮询,不中断 const status = error.response?.status
if (status === 404) {
return {
code: 404,
message: error.response?.data?.message || '任务不存在',
data: { task_id: taskId, status: 'failed', error: 'not_found' }
}
}
console.warn('轮询任务状态失败:', error) console.warn('轮询任务状态失败:', error)
} }
} }

View File

@@ -32,6 +32,12 @@ export const useProxyStore = defineStore('proxy', () => {
* 获取统计信息 * 获取统计信息
* @returns {Promise<boolean>} * @returns {Promise<boolean>}
*/ */
function applyStats(data) {
if (data && typeof data === 'object') {
stats.value = { ...data }
}
}
async function fetchStats() { async function fetchStats() {
try { try {
const response = await proxyService.getStats() const response = await proxyService.getStats()
@@ -174,6 +180,7 @@ export const useProxyStore = defineStore('proxy', () => {
isEmpty, isEmpty,
// Actions // Actions
fetchStats, fetchStats,
applyStats,
fetchProxies, fetchProxies,
deleteProxy, deleteProxy,
batchDeleteProxies, batchDeleteProxies,

View File

@@ -2,40 +2,38 @@
<div class="page-container"> <div class="page-container">
<PageHeader title="代理池管理系统" :icon="MagicStick" /> <PageHeader title="代理池管理系统" :icon="MagicStick" />
<el-row :gutter="20" class="stats-row"> <div class="stats-grid">
<el-col :xs="24" :sm="12" :md="12" :lg="6" :xl="6">
<StatCard <StatCard
type="total" type="total"
:icon="DataLine" :icon="DataLine"
:value="stats.total || 0" :value="stats.total || 0"
label="总代理数" label="总代理数"
/> />
</el-col> <StatCard
<el-col :xs="24" :sm="12" :md="12" :lg="6" :xl="6"> type="pending"
:icon="Clock"
:value="stats.pending || 0"
label="待验证"
/>
<StatCard <StatCard
type="available" type="available"
:icon="CircleCheck" :icon="CircleCheck"
:value="stats.available || 0" :value="stats.available || 0"
label="可用数量" label="可用数量"
/> />
</el-col>
<el-col :xs="24" :sm="12" :md="12" :lg="6" :xl="6">
<StatCard <StatCard
type="new" type="new"
:icon="Timer" :icon="Timer"
:value="stats.today_new || 0" :value="stats.today_new || 0"
label="今日新增" label="今日新增"
/> />
</el-col>
<el-col :xs="24" :sm="12" :md="12" :lg="6" :xl="6">
<StatCard <StatCard
type="score" type="score"
:icon="StarFilled" :icon="StarFilled"
:value="avgScore" :value="avgScore"
label="平均分数" label="平均分数"
/> />
</el-col> </div>
</el-row>
<el-row :gutter="20" class="charts-row"> <el-row :gutter="20" class="charts-row">
<el-col :xs="24" :lg="16"> <el-col :xs="24" :lg="16">
@@ -88,7 +86,7 @@
</template> </template>
<script setup> <script setup>
import { computed, onMounted, onUnmounted } from 'vue' import { computed, onMounted } from 'vue'
import { ElMessage, ElMessageBox } from 'element-plus' import { ElMessage, ElMessageBox } from 'element-plus'
import { import {
MagicStick, MagicStick,
@@ -96,7 +94,8 @@ import {
CircleCheck, CircleCheck,
Timer, Timer,
StarFilled, StarFilled,
InfoFilled InfoFilled,
Clock
} from '@element-plus/icons-vue' } from '@element-plus/icons-vue'
import { useProxyStore } from '../stores/proxy' import { useProxyStore } from '../stores/proxy'
import { formatNumber } from '../utils/format' import { formatNumber } from '../utils/format'
@@ -104,26 +103,16 @@ import StatCard from '../components/StatCard.vue'
import ProtocolChart from '../components/ProtocolChart.vue' import ProtocolChart from '../components/ProtocolChart.vue'
import QuickActions from '../components/QuickActions.vue' import QuickActions from '../components/QuickActions.vue'
import PageHeader from '../components/PageHeader.vue' import PageHeader from '../components/PageHeader.vue'
import { useStatsWebSocket } from '../composables/useStatsWebSocket'
// ==================== Store ==================== // ==================== Store ====================
const proxyStore = useProxyStore() const proxyStore = useProxyStore()
const { start: startStatsWs } = useStatsWebSocket()
// ==================== 计算属性 ==================== // ==================== 计算属性 ====================
const stats = computed(() => proxyStore.stats) const stats = computed(() => proxyStore.stats)
const avgScore = computed(() => formatNumber(stats.value.avg_score || 0, 1)) const avgScore = computed(() => formatNumber(stats.value.avg_score || 0, 1))
// ==================== 定时刷新 ====================
const REFRESH_INTERVAL = 5000
let refreshTimer = null
let isPageVisible = true
function handleVisibilityChange() {
isPageVisible = !document.hidden
if (isPageVisible) {
refreshData()
}
}
async function refreshData() { async function refreshData() {
await proxyStore.fetchStats() await proxyStore.fetchStats()
} }
@@ -165,26 +154,15 @@ async function handleClean() {
// ==================== 生命周期 ==================== // ==================== 生命周期 ====================
onMounted(async () => { onMounted(async () => {
await refreshData() await refreshData()
startStatsWs()
document.addEventListener('visibilitychange', handleVisibilityChange)
refreshTimer = setInterval(() => {
if (isPageVisible) {
refreshData()
}
}, REFRESH_INTERVAL)
})
onUnmounted(() => {
if (refreshTimer) {
clearInterval(refreshTimer)
refreshTimer = null
}
document.removeEventListener('visibilitychange', handleVisibilityChange)
}) })
</script> </script>
<style scoped> <style scoped>
.stats-row { .stats-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
gap: 20px;
margin-bottom: 20px; margin-bottom: 20px;
} }
@@ -237,14 +215,6 @@ onUnmounted(() => {
} }
@media (max-width: 768px) { @media (max-width: 768px) {
.stats-row .el-col {
margin-bottom: 16px;
}
.stats-row .el-col:last-child {
margin-bottom: 0;
}
.status-list { .status-list {
flex-direction: column; flex-direction: column;
gap: 16px; gap: 16px;

View File

@@ -15,7 +15,7 @@
</el-tag> </el-tag>
</div> </div>
<div class="header-actions"> <div class="header-actions">
<el-button type="success" @click="handleCrawlAll" size="large" :loading="crawlingAll"> <el-button type="success" @click="handleCrawlAll" size="large" :loading="crawlAllMask">
<el-icon class="btn-icon"><Promotion /></el-icon> <el-icon class="btn-icon"><Promotion /></el-icon>
全部爬取 全部爬取
</el-button> </el-button>
@@ -53,12 +53,12 @@
</template> </template>
</el-table-column> </el-table-column>
<el-table-column label="统计" width="180"> <el-table-column label="上次爬取" width="200">
<template #default="{ row }"> <template #default="{ row }">
<div class="plugin-stats"> <div class="plugin-stats" title="绿色为最近一轮爬到的代理条数红色为最近一轮是否失败0 成功 / 1 失败),不是验证通过数">
<div class="stat-item"> <div class="stat-item">
<el-icon class="stat-icon success"><CircleCheck /></el-icon> <el-icon class="stat-icon success"><CircleCheck /></el-icon>
<span class="stat-value success">{{ row.success_count || 0 }}</span> <span class="stat-value success">{{ row.success_count || 0 }} </span>
</div> </div>
<div class="stat-item"> <div class="stat-item">
<el-icon class="stat-icon failed"><CircleClose /></el-icon> <el-icon class="stat-icon failed"><CircleClose /></el-icon>
@@ -74,7 +74,35 @@
</template> </template>
</el-table-column> </el-table-column>
<el-table-column label="操作" width="220" fixed="right" align="center"> <el-table-column label="最近爬取" min-width="340" align="left">
<template #default="{ row }">
<div v-if="crawlAllMask && row.enabled" class="crawl-running-row">
<el-icon class="is-loading crawl-spin"><Loading /></el-icon>
<span>正在爬取</span>
</div>
<div v-else-if="crawlResults[row.id]" class="result-panel" :class="crawlResults[row.id].type">
<div class="result-panel-head">
<el-icon v-if="crawlResults[row.id].type === 'success'" class="result-head-icon success"><CircleCheck /></el-icon>
<el-icon v-else class="result-head-icon failed"><CircleClose /></el-icon>
<span class="result-panel-title">{{ crawlResults[row.id].message }}</span>
<el-icon class="result-close" @click="clearCrawlResult(row.id)"><Close /></el-icon>
</div>
<div class="result-panel-body">
<template v-if="crawlResults[row.id].data && crawlResults[row.id].data.proxy_count !== undefined">
<span class="result-pill fetched">爬取 {{ crawlResults[row.id].data.proxy_count }} </span>
</template>
<template v-if="crawlResults[row.id].data?.crawl_failed">
<div class="result-error-block" :title="crawlResults[row.id].data.error || ''">
{{ crawlResults[row.id].data.error || '爬取失败' }}
</div>
</template>
</div>
</div>
<span v-else class="result-placeholder"></span>
</template>
</el-table-column>
<el-table-column label="操作" width="200" fixed="right" align="center">
<template #default="{ row }"> <template #default="{ row }">
<div class="plugin-actions"> <div class="plugin-actions">
<el-button <el-button
@@ -89,27 +117,13 @@
type="success" type="success"
size="small" size="small"
@click="handleCrawl(row.id)" @click="handleCrawl(row.id)"
:loading="crawlingPlugins.has(row.id)" :loading="crawlingPlugins.has(row.id) || (crawlAllMask && row.enabled)"
:disabled="!row.enabled" :disabled="!row.enabled"
> >
<el-icon class="btn-icon"><Promotion /></el-icon> <el-icon class="btn-icon"><Promotion /></el-icon>
爬取 爬取
</el-button> </el-button>
</div> </div>
<div v-if="crawlResults[row.id]" class="plugin-crawl-result">
<div class="result-mini" :class="crawlResults[row.id].type">
<el-icon v-if="crawlResults[row.id].type === 'success'" class="result-icon success"><CircleCheck /></el-icon>
<el-icon v-else class="result-icon failed"><CircleClose /></el-icon>
<span class="result-text">{{ crawlResults[row.id].message }}</span>
<span v-if="crawlResults[row.id].data?.success_count !== undefined" class="result-count valid">
有效 {{ crawlResults[row.id].data.success_count }}
</span>
<span v-if="crawlResults[row.id].data?.failure_count !== undefined" class="result-count invalid">
无效 {{ crawlResults[row.id].data.failure_count }}
</span>
<el-icon class="result-close" @click="clearCrawlResult(row.id)"><Close /></el-icon>
</div>
</div>
</template> </template>
</el-table-column> </el-table-column>
</el-table> </el-table>
@@ -130,18 +144,37 @@
@close="allCrawlResult = null" @close="allCrawlResult = null"
> >
<template v-if="allCrawlResult.data"> <template v-if="allCrawlResult.data">
<div class="crawl-stats"> <div class="crawl-stats crawl-stats-summary">
<span v-if="allCrawlResult.data.total_crawled !== undefined"> <span v-if="allCrawlResult.data.total_crawled !== undefined">
爬取: {{ allCrawlResult.data.total_crawled }} 合计爬取: <strong>{{ allCrawlResult.data.total_crawled }}</strong>
</span> </span>
<span
<span v-if="allCrawlResult.data.valid_count !== undefined" class="valid-count"> v-if="allCrawlResult.data.plugins_failed !== undefined"
有效: {{ allCrawlResult.data.valid_count }} class="invalid-count"
</span> >
<span v-if="allCrawlResult.data.invalid_count !== undefined" class="invalid-count"> 失败插件: <strong>{{ allCrawlResult.data.plugins_failed }}</strong>
无效: {{ allCrawlResult.data.invalid_count }}
</span> </span>
</div> </div>
<ul
v-if="allCrawlResult.data.per_plugin?.length"
class="per-plugin-breakdown"
>
<li
v-for="(item, idx) in allCrawlResult.data.per_plugin"
:key="item.plugin_id || `pp-${idx}`"
class="per-plugin-line"
>
<span class="pp-name">{{ pluginDisplayName(item.plugin_id) }}</span>
<template v-if="item.crawl_failed">
<el-tag type="danger" size="small" effect="light">失败</el-tag>
<span class="pp-detail err">{{ item.error || '未知错误' }}</span>
</template>
<template v-else>
<el-tag type="success" size="small" effect="light">完成</el-tag>
<span class="pp-detail">爬取 <strong>{{ item.proxy_count }}</strong> </span>
</template>
</li>
</ul>
</template> </template>
</el-alert> </el-alert>
</el-card> </el-card>
@@ -198,7 +231,8 @@ import {
CircleClose, CircleClose,
Box, Box,
Setting, Setting,
Close Close,
Loading
} from '@element-plus/icons-vue' } from '@element-plus/icons-vue'
import { usePluginsStore } from '../stores/plugins' import { usePluginsStore } from '../stores/plugins'
import { pluginService } from '../services/pluginService' import { pluginService } from '../services/pluginService'
@@ -207,10 +241,17 @@ import PageHeader from '../components/PageHeader.vue'
const pluginsStore = usePluginsStore() const pluginsStore = usePluginsStore()
const crawlingPlugins = ref(new Set()) const crawlingPlugins = ref(new Set())
const crawlingAll = ref(false) /** 全部爬取进行中:各启用插件行显示「正在爬取」与按钮 loading */
const crawlAllMask = ref(false)
const crawlResults = ref({}) const crawlResults = ref({})
const allCrawlResult = ref(null) const allCrawlResult = ref(null)
function pluginDisplayName(pluginId) {
if (!pluginId) return '(未知插件)'
const p = pluginsStore.plugins.find((x) => x.id === pluginId)
return p?.name || pluginId
}
// 配置对话框 // 配置对话框
const configDialogVisible = ref(false) const configDialogVisible = ref(false)
const currentPlugin = ref(null) const currentPlugin = ref(null)
@@ -273,29 +314,40 @@ async function handleCrawl(pluginId) {
const response = await pluginService.crawlPlugin(pluginId) const response = await pluginService.crawlPlugin(pluginId)
if (response.code === 200) { if (response.code === 200) {
crawlResults.value[pluginId] = { crawlResults.value = {
...crawlResults.value,
[pluginId]: {
type: 'success', type: 'success',
message: response.message, message: response.message,
data: response.data data: response.data
} }
}
} else { } else {
crawlResults.value[pluginId] = { crawlResults.value = {
...crawlResults.value,
[pluginId]: {
type: 'error', type: 'error',
message: response.message || '爬取失败' message: response.message || '爬取失败'
} }
} }
}
} catch (error) { } catch (error) {
crawlResults.value[pluginId] = { crawlResults.value = {
...crawlResults.value,
[pluginId]: {
type: 'error', type: 'error',
message: '爬取过程出错' message: '爬取过程出错'
} }
}
} finally { } finally {
crawlingPlugins.value.delete(pluginId) crawlingPlugins.value.delete(pluginId)
} }
} }
function clearCrawlResult(pluginId) { function clearCrawlResult(pluginId) {
delete crawlResults.value[pluginId] const next = { ...crawlResults.value }
delete next[pluginId]
crawlResults.value = next
} }
async function handleCrawlAll() { async function handleCrawlAll() {
@@ -307,7 +359,7 @@ async function handleCrawlAll() {
} }
await ElMessageBox.confirm( await ElMessageBox.confirm(
`确定要运行所有 ${enabledPlugins.length} 个启用的插件吗?这将爬取并验证所有代理。`, `确定要运行所有 ${enabledPlugins.length} 个启用的插件吗?代理将先以「待验证」入库,需再执行「全部验证」后才会变为可用(除非已开启「爬取后立即验证」)`,
'批量爬取确认', '批量爬取确认',
{ {
confirmButtonText: '开始爬取', confirmButtonText: '开始爬取',
@@ -316,20 +368,46 @@ async function handleCrawlAll() {
} }
) )
crawlingAll.value = true
allCrawlResult.value = null allCrawlResult.value = null
{
const cleared = { ...crawlResults.value }
for (const p of enabledPlugins) {
delete cleared[p.id]
}
crawlResults.value = cleared
}
crawlAllMask.value = true
const response = await pluginService.crawlAll() const response = await pluginService.crawlAll()
if (response.code === 200) { if (response.code === 200) {
const data = response.data || {}
allCrawlResult.value = { allCrawlResult.value = {
type: response.data?.cancelled ? 'info' : 'success', type: data.cancelled ? 'info' : 'success',
message: response.message, message: response.message,
data: response.data data
} }
if (!response.data?.cancelled) { if (Array.isArray(data.per_plugin) && data.per_plugin.length) {
const merged = { ...crawlResults.value }
for (const item of data.per_plugin) {
if (!item.plugin_id) continue
merged[item.plugin_id] = {
type: item.crawl_failed ? 'error' : 'success',
message: '获取任务状态成功',
data: {
proxy_count: item.proxy_count,
crawl_failed: item.crawl_failed,
error: item.error
}
}
}
crawlResults.value = merged
}
if (!data.cancelled) {
ElMessage.success('批量爬取完成') ElMessage.success('批量爬取完成')
} }
await pluginsStore.fetchPlugins()
} else { } else {
allCrawlResult.value = { allCrawlResult.value = {
type: 'error', type: 'error',
@@ -345,7 +423,7 @@ async function handleCrawlAll() {
} }
} }
} finally { } finally {
crawlingAll.value = false crawlAllMask.value = false
} }
} }
@@ -487,66 +565,167 @@ onMounted(async () => {
.plugin-actions { .plugin-actions {
display: flex; display: flex;
justify-content: center; justify-content: center;
flex-wrap: wrap;
gap: 8px; gap: 8px;
} }
.plugin-crawl-result { .crawl-running-row {
margin-top: 8px; display: flex;
}
.result-mini {
display: inline-flex;
align-items: center; align-items: center;
gap: 6px; gap: 8px;
padding: 4px 8px; padding: 10px 12px;
border-radius: 4px; font-size: 14px;
font-size: 12px; color: var(--primary);
line-height: 1.4; background: var(--surface-2);
border-radius: var(--radius-md, 8px);
border: 1px solid var(--border);
} }
.result-mini.success { .crawl-spin {
background: rgba(103, 194, 58, 0.15); font-size: 18px;
animation: plugin-crawl-spin 1s linear infinite;
}
@keyframes plugin-crawl-spin {
to {
transform: rotate(360deg);
}
}
.result-placeholder {
color: var(--text-muted);
font-size: 14px;
}
.result-panel {
padding: 12px 14px;
border-radius: var(--radius-md, 8px);
border: 1px solid var(--border);
background: var(--surface-2);
min-height: 72px;
}
.result-panel.success {
border-color: rgba(103, 194, 58, 0.35);
}
.result-panel.error {
border-color: rgba(245, 108, 108, 0.35);
}
.result-panel-head {
display: flex;
align-items: center;
gap: 8px;
margin-bottom: 8px;
}
.result-head-icon {
font-size: 18px;
flex-shrink: 0;
}
.result-head-icon.success {
color: var(--success); color: var(--success);
} }
.result-mini.error { .result-head-icon.failed {
background: rgba(245, 108, 108, 0.15);
color: var(--danger); color: var(--danger);
} }
.result-icon { .result-panel-title {
flex: 1;
font-size: 13px; font-size: 13px;
}
.result-text {
font-weight: 500;
}
.result-count {
font-weight: 600; font-weight: 600;
padding: 0 4px; color: var(--text-secondary);
border-radius: 3px; line-height: 1.4;
} }
.result-count.valid { .result-panel-body {
display: flex;
flex-direction: column;
gap: 8px;
align-items: flex-start;
}
.result-pill {
display: inline-block;
padding: 4px 12px;
border-radius: 6px;
font-size: 14px;
font-weight: 600;
}
.result-pill.fetched {
background: rgba(103, 194, 58, 0.2); background: rgba(103, 194, 58, 0.2);
color: var(--success); color: var(--success);
} }
.result-count.invalid { .result-error-block {
background: rgba(245, 108, 108, 0.2); font-size: 13px;
line-height: 1.5;
color: var(--danger); color: var(--danger);
word-break: break-word;
white-space: pre-wrap;
max-width: 100%;
} }
.result-close { .result-close {
margin-left: 4px; margin-left: auto;
cursor: pointer; cursor: pointer;
font-size: 12px; font-size: 16px;
opacity: 0.7; opacity: 0.55;
flex-shrink: 0;
transition: opacity 0.2s; transition: opacity 0.2s;
} }
.result-close:hover { .result-close:hover {
opacity: 1; opacity: 1;
} }
.crawl-stats-summary {
flex-wrap: wrap;
font-size: 14px;
}
.per-plugin-breakdown {
list-style: none;
margin: 12px 0 0;
padding: 0;
max-height: 360px;
overflow-y: auto;
border-top: 1px solid var(--border);
padding-top: 12px;
}
.per-plugin-line {
display: flex;
flex-wrap: wrap;
align-items: center;
gap: 8px 12px;
padding: 8px 0;
border-bottom: 1px solid var(--border);
font-size: 14px;
}
.per-plugin-line:last-child {
border-bottom: none;
}
.pp-name {
font-weight: 600;
color: var(--text-primary);
min-width: 140px;
}
.pp-detail {
color: var(--text-secondary);
}
.pp-detail.err {
color: var(--danger);
flex: 1;
min-width: 120px;
word-break: break-word;
}
</style> </style>

View File

@@ -4,6 +4,18 @@
<el-card class="filter-card" shadow="hover"> <el-card class="filter-card" shadow="hover">
<el-form :inline="true" :model="filterForm" class="form-row"> <el-form :inline="true" :model="filterForm" class="form-row">
<el-form-item label="池范围">
<el-select
v-model="filterForm.poolFilter"
placeholder="全部"
style="width: 140px"
@change="handleSearch"
>
<el-option label="全部" value="all" />
<el-option label="待验证" value="pending" />
<el-option label="已验证可用" value="available" />
</el-select>
</el-form-item>
<el-form-item label="协议类型"> <el-form-item label="协议类型">
<el-select <el-select
v-model="filterForm.protocol" v-model="filterForm.protocol"
@@ -84,6 +96,16 @@
<el-table-column type="selection" width="55" /> <el-table-column type="selection" width="55" />
<el-table-column prop="ip" label="IP地址" width="150" /> <el-table-column prop="ip" label="IP地址" width="150" />
<el-table-column prop="port" label="端口" width="100" /> <el-table-column prop="port" label="端口" width="100" />
<el-table-column label="状态" width="100">
<template #default="{ row }">
<el-tag v-if="row.validated === 0" type="warning" effect="light" size="small">
待验证
</el-tag>
<el-tag v-else type="success" effect="light" size="small">
已验证
</el-tag>
</template>
</el-table-column>
<el-table-column prop="protocol" label="协议" width="100"> <el-table-column prop="protocol" label="协议" width="100">
<template #default="{ row }"> <template #default="{ row }">
<el-tag :type="getProtocolType(row.protocol)" effect="light" size="small"> <el-tag :type="getProtocolType(row.protocol)" effect="light" size="small">
@@ -164,6 +186,7 @@ const selectedProxies = ref([])
let abortController = null let abortController = null
const filterForm = reactive({ const filterForm = reactive({
poolFilter: 'all',
protocol: '', protocol: '',
minScore: 0, minScore: 0,
sortBy: 'last_check', sortBy: 'last_check',
@@ -194,6 +217,7 @@ async function fetchProxies() {
const success = await proxyStore.fetchProxies({ const success = await proxyStore.fetchProxies({
page: currentPage.value, page: currentPage.value,
page_size: pageSize.value, page_size: pageSize.value,
pool_filter: filterForm.poolFilter === 'all' ? null : filterForm.poolFilter,
protocol: filterForm.protocol || null, protocol: filterForm.protocol || null,
min_score: filterForm.minScore, min_score: filterForm.minScore,
sort_by: filterForm.sortBy, sort_by: filterForm.sortBy,
@@ -237,6 +261,7 @@ async function handleDelete(proxy) {
if (!confirmed) return if (!confirmed) return
const filters = { const filters = {
pool_filter: filterForm.poolFilter === 'all' ? null : filterForm.poolFilter,
protocol: filterForm.protocol || null, protocol: filterForm.protocol || null,
min_score: filterForm.minScore, min_score: filterForm.minScore,
sort_by: filterForm.sortBy, sort_by: filterForm.sortBy,
@@ -256,6 +281,7 @@ async function handleBatchDelete() {
if (!confirmed) return if (!confirmed) return
const filters = { const filters = {
pool_filter: filterForm.poolFilter === 'all' ? null : filterForm.poolFilter,
protocol: filterForm.protocol || null, protocol: filterForm.protocol || null,
min_score: filterForm.minScore, min_score: filterForm.minScore,
sort_by: filterForm.sortBy, sort_by: filterForm.sortBy,

View File

@@ -86,26 +86,9 @@
ref="formRef" ref="formRef"
> >
<el-divider content-position="left">爬虫配置</el-divider> <el-divider content-position="left">爬虫配置</el-divider>
<p class="setting-hint" style="margin: -8px 0 16px 0">
<el-form-item label="爬取超时" prop="crawl_timeout"> 每个爬虫插件单独限时 120 互不影响此处不再配置全局爬取超时
<el-input-number </p>
v-model="settings.crawl_timeout"
:min="5"
:max="120"
:step="5"
class="setting-input"
/>
<span class="setting-suffix"></span>
</el-form-item>
<el-form-item label="最大重试次数" prop="max_retries">
<el-input-number
v-model="settings.max_retries"
:min="0"
:max="10"
class="setting-input"
/>
</el-form-item>
<el-divider content-position="left">验证配置</el-divider> <el-divider content-position="left">验证配置</el-divider>
@@ -124,7 +107,7 @@
<el-input-number <el-input-number
v-model="settings.default_concurrency" v-model="settings.default_concurrency"
:min="10" :min="10"
:max="200" :max="400"
:step="10" :step="10"
class="setting-input" class="setting-input"
/> />
@@ -170,6 +153,15 @@
/> />
</el-form-item> </el-form-item>
<el-form-item label="爬取后立即验证" prop="auto_validate_after_crawl">
<el-switch
v-model="settings.auto_validate_after_crawl"
active-text="开启"
inactive-text="关闭"
/>
<span class="setting-hint">关闭时爬取仅入库为待验证需手动或定时全部验证消化队列推荐</span>
</el-form-item>
<el-divider content-position="left">代理评分配置</el-divider> <el-divider content-position="left">代理评分配置</el-divider>
<el-form-item label="最低代理分数" prop="min_proxy_score"> <el-form-item label="最低代理分数" prop="min_proxy_score">
@@ -232,13 +224,12 @@ const saving = ref(false)
const formRef = ref(null) const formRef = ref(null)
const settings = reactive({ const settings = reactive({
crawl_timeout: 30, validation_timeout: 6,
validation_timeout: 10, default_concurrency: 120,
max_retries: 3,
default_concurrency: 50,
min_proxy_score: 0, min_proxy_score: 0,
proxy_expiry_days: 7, proxy_expiry_days: 7,
auto_validate: true, auto_validate: true,
auto_validate_after_crawl: false,
validate_interval_minutes: 30, validate_interval_minutes: 30,
validation_targets: [] validation_targets: []
}) })
@@ -255,18 +246,15 @@ const defaultValidationTargets = [
// ==================== 计算属性 ==================== // ==================== 计算属性 ====================
const schedulerInfo = computed(() => { const schedulerInfo = computed(() => {
if (schedulerRunning.value) { if (schedulerRunning.value) {
return `验证调度器正在运行,每 ${settings.validate_interval_minutes} 分钟自动验证一次所有代理` return `验证调度器正在运行,每 ${settings.validate_interval_minutes} 分钟执行一次:优先验证待验证代理,再按检查时间复检已入库代理`
} else {
return '验证调度器已停止,代理不会自动验证,建议定期手动验证或开启自动验证'
} }
return '验证调度器已停止,待验证代理不会自动检查;可在下方开启自动验证或点击「立即验证全部」'
}) })
// ==================== 表单验证规则 ==================== // ==================== 表单验证规则 ====================
const formRules = { const formRules = {
crawl_timeout: [{ type: 'number', min: 5, max: 120, message: '范围 5-120 秒', trigger: 'blur' }],
validation_timeout: [{ type: 'number', min: 3, max: 60, message: '范围 3-60 秒', trigger: 'blur' }], validation_timeout: [{ type: 'number', min: 3, max: 60, message: '范围 3-60 秒', trigger: 'blur' }],
max_retries: [{ type: 'number', min: 0, max: 10, message: '范围 0-10', trigger: 'blur' }], default_concurrency: [{ type: 'number', min: 10, max: 400, message: '范围 10-400', trigger: 'blur' }],
default_concurrency: [{ type: 'number', min: 10, max: 200, message: '范围 10-200', trigger: 'blur' }],
validate_interval_minutes: [{ type: 'number', min: 5, max: 1440, message: '范围 5-1440 分钟', trigger: 'blur' }], validate_interval_minutes: [{ type: 'number', min: 5, max: 1440, message: '范围 5-1440 分钟', trigger: 'blur' }],
min_proxy_score: [{ type: 'number', min: 0, max: 100, message: '范围 0-100', trigger: 'blur' }], min_proxy_score: [{ type: 'number', min: 0, max: 100, message: '范围 0-100', trigger: 'blur' }],
proxy_expiry_days: [{ type: 'number', min: 1, max: 30, message: '范围 1-30 天', trigger: 'blur' }] proxy_expiry_days: [{ type: 'number', min: 1, max: 30, message: '范围 1-30 天', trigger: 'blur' }]
@@ -306,7 +294,7 @@ async function handleStopScheduler() {
async function handleValidateNow() { async function handleValidateNow() {
try { try {
await ElMessageBox.confirm( await ElMessageBox.confirm(
'确定要立即验证所有代理吗?这可能需要一些时间。', '将按顺序验证:先处理「待验证」代理,再复检已入库代理。任务在后台执行,可能需要较长时间。',
'确认验证', '确认验证',
{ {
confirmButtonText: '开始验证', confirmButtonText: '开始验证',

View File

@@ -25,6 +25,7 @@ def format_proxy(proxy) -> dict:
"score": proxy.score, "score": proxy.score,
"response_time_ms": proxy.response_time_ms, "response_time_ms": proxy.response_time_ms,
"last_check": proxy.last_check.isoformat() if proxy.last_check else None, "last_check": proxy.last_check.isoformat() if proxy.last_check else None,
"validated": getattr(proxy, "validated", 0),
} }

View File

@@ -3,7 +3,7 @@ import asyncio
from contextlib import AsyncExitStack, asynccontextmanager from contextlib import AsyncExitStack, asynccontextmanager
from fastapi import FastAPI from fastapi import FastAPI
from app.core.db import init_db, get_db from app.core.db import init_db, get_db, get_db_connection
from app.core.config import settings as app_settings from app.core.config import settings as app_settings
from app.core.log import logger from app.core.log import logger
from app.core.execution import AsyncWorkerPool, JobExecutor from app.core.execution import AsyncWorkerPool, JobExecutor
@@ -13,6 +13,8 @@ from app.repositories.settings_repo import SettingsRepository, DEFAULT_SETTINGS
from app.services.validator_service import ValidatorService from app.services.validator_service import ValidatorService
from app.services.plugin_runner import PluginRunner from app.services.plugin_runner import PluginRunner
from app.services.scheduler_service import SchedulerService from app.services.scheduler_service import SchedulerService
from app.api.ws_manager import ConnectionManager
from app.api.realtime import stats_broadcaster_loop
settings_repo = SettingsRepository() settings_repo = SettingsRepository()
proxy_repo = ProxyRepository() proxy_repo = ProxyRepository()
@@ -46,21 +48,49 @@ async def lifespan(app: FastAPI):
# 验证 WorkerPool # 验证 WorkerPool
async def validation_handler(proxy): async def validation_handler(proxy):
from app.models.domain import ProxyRaw async with get_db_connection() as db:
existing = await proxy_repo.get_by_ip_port(db, proxy.ip, proxy.port)
is_valid, latency = await validator.validate( is_valid, latency = await validator.validate(
proxy.ip, proxy.port, proxy.protocol proxy.ip, proxy.port, proxy.protocol
) )
async with get_db() as db: if not existing:
return
if existing.validated == 0:
if is_valid: if is_valid:
await proxy_repo.insert_or_update( await proxy_repo.insert_or_update(
db, proxy.ip, proxy.port, proxy.protocol, score=app_settings.score_valid db,
proxy.ip,
proxy.port,
proxy.protocol,
score=app_settings.score_valid,
) )
if latency: if latency:
await proxy_repo.update_response_time(db, proxy.ip, proxy.port, latency) await proxy_repo.update_response_time(
db, proxy.ip, proxy.port, latency
)
else:
await proxy_repo.delete(db, proxy.ip, proxy.port)
else:
if is_valid:
await proxy_repo.insert_or_update(
db,
proxy.ip,
proxy.port,
proxy.protocol,
score=app_settings.score_valid,
)
if latency:
await proxy_repo.update_response_time(
db, proxy.ip, proxy.port, latency
)
else: else:
await proxy_repo.update_score( await proxy_repo.update_score(
db, proxy.ip, proxy.port, app_settings.score_invalid, db,
app_settings.score_min, app_settings.score_max proxy.ip,
proxy.port,
app_settings.score_invalid,
app_settings.score_min,
app_settings.score_max,
) )
worker_pool = AsyncWorkerPool( worker_pool = AsyncWorkerPool(
@@ -75,7 +105,7 @@ async def lifespan(app: FastAPI):
await stack.enter_async_context(executor) await stack.enter_async_context(executor)
# 插件运行器 # 插件运行器
plugin_runner = PluginRunner(timeout=db_settings.get("crawl_timeout", 30)) plugin_runner = PluginRunner()
# 调度器 # 调度器
scheduler = SchedulerService( scheduler = SchedulerService(
@@ -91,6 +121,9 @@ async def lifespan(app: FastAPI):
app.state.plugin_runner = plugin_runner app.state.plugin_runner = plugin_runner
app.state.scheduler = scheduler app.state.scheduler = scheduler
app.state.ws_manager = ConnectionManager()
app.state.stats_broadcaster_task = asyncio.create_task(stats_broadcaster_loop(app))
# 启动调度器 # 启动调度器
if db_settings.get("auto_validate", True): if db_settings.get("auto_validate", True):
try: try:
@@ -101,6 +134,13 @@ async def lifespan(app: FastAPI):
logger.info("API server started") logger.info("API server started")
yield yield
app.state.stats_broadcaster_task.cancel()
try:
await app.state.stats_broadcaster_task
except asyncio.CancelledError:
pass
await app.state.ws_manager.disconnect_all()
# 停止调度器 # 停止调度器
await scheduler.stop() await scheduler.stop()

View File

@@ -1,4 +1,11 @@
"""FastAPI 应用工厂""" """FastAPI 应用工厂"""
import asyncio
import sys
# Windows 上默认 Proactor 事件循环易导致 httpx 异步出站 ConnectTimeout与同步请求表现不一致
if sys.platform == "win32":
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
from fastapi import FastAPI from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from app.api.lifespan import lifespan from app.api.lifespan import lifespan

app/api/realtime.py (new file)
View File

@@ -0,0 +1,25 @@
"""实时统计广播后台任务"""
import asyncio
from fastapi import FastAPI
from app.core.config import settings
from app.core.log import logger
from app.services.dashboard_stats import get_dashboard_stats
async def stats_broadcaster_loop(app: FastAPI) -> None:
manager = app.state.ws_manager
interval = settings.ws_stats_interval_seconds
while True:
try:
await asyncio.sleep(interval)
if manager.connection_count == 0:
continue
scheduler = app.state.scheduler
stats = await get_dashboard_stats(scheduler.running)
await manager.broadcast_json({"type": "stats", "data": stats})
except asyncio.CancelledError:
break
except Exception:
logger.exception("stats broadcaster tick failed")

View File

@@ -1,9 +1,10 @@
"""路由包""" """路由包"""
from fastapi import APIRouter from fastapi import APIRouter
from app.api.routes import proxies, plugins, scheduler, settings, tasks from app.api.routes import proxies, plugins, scheduler, settings, tasks, ws
api_router = APIRouter() api_router = APIRouter()
api_router.include_router(proxies.router) api_router.include_router(proxies.router)
api_router.include_router(ws.router)
api_router.include_router(plugins.router) api_router.include_router(plugins.router)
api_router.include_router(scheduler.router) api_router.include_router(scheduler.router)
api_router.include_router(settings.router) api_router.include_router(settings.router)

View File

@@ -113,8 +113,8 @@ def _create_crawl_all_aggregator(job_ids, executor):
class CrawlAllAggregator(Job): class CrawlAllAggregator(Job):
async def run(self): async def run(self):
self._set_running() self._set_running()
# 等待所有子 job 完成(最多等 30 秒 # 等待所有子 job 完成(最多约 5 分钟,与前端轮询一致
for _ in range(300): for _ in range(3000):
if self.is_cancelled: if self.is_cancelled:
break break
all_done = all( all_done = all(
@@ -125,15 +125,56 @@ def _create_crawl_all_aggregator(job_ids, executor):
break break
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
total = 0 total = 0
valid = 0 plugins_failed = 0
invalid = 0 per_plugin = []
for jid in job_ids: for jid in job_ids:
job = executor.get_job(jid) job = executor.get_job(jid)
if job and job.result: plugin_id = getattr(job, "plugin_id", "") if job else ""
total += job.result.get("proxy_count", 0) proxy_count = 0
valid += job.result.get("success_count", 0) crawl_failed = False
invalid += job.result.get("failure_count", 0) err_msg = None
result = {"total_crawled": total, "valid_count": valid, "invalid_count": invalid} job_status = job.status.value if job else "missing"
if not job:
per_plugin.append({
"plugin_id": plugin_id,
"proxy_count": 0,
"crawl_failed": True,
"error": "任务不存在",
"job_status": job_status,
})
plugins_failed += 1
continue
if job.status.value == "failed":
crawl_failed = True
plugins_failed += 1
err_msg = job.error or "任务失败"
elif job.result:
r = job.result
plugin_id = r.get("plugin_id") or plugin_id
proxy_count = r.get("proxy_count", 0)
total += proxy_count
if r.get("crawl_failed") or r.get("failure_count", 0) > 0:
crawl_failed = True
plugins_failed += 1
err_msg = r.get("error")
else:
total += 0
per_plugin.append({
"plugin_id": plugin_id,
"proxy_count": proxy_count,
"crawl_failed": crawl_failed,
"error": err_msg,
"job_status": job_status,
})
result = {
"total_crawled": total,
"plugins_failed": plugins_failed,
"per_plugin": per_plugin,
}
if self.is_cancelled: if self.is_cancelled:
result["cancelled"] = True result["cancelled"] = True
return result return result

View File

@@ -5,7 +5,8 @@ from fastapi.responses import StreamingResponse
from app.services.proxy_service import ProxyService from app.services.proxy_service import ProxyService
from app.services.scheduler_service import SchedulerService from app.services.scheduler_service import SchedulerService
from app.models.schemas import ProxyListRequest, BatchDeleteRequest from app.services.dashboard_stats import get_dashboard_stats
from app.models.schemas import ProxyListRequest, BatchDeleteRequest, ProxyDeleteItem
from app.api.deps import get_proxy_service, get_scheduler_service from app.api.deps import get_proxy_service, get_scheduler_service
from app.api.common import success_response, format_proxy from app.api.common import success_response, format_proxy
from app.core.exceptions import ProxyPoolException, ProxyNotFoundException from app.core.exceptions import ProxyPoolException, ProxyNotFoundException
@@ -15,11 +16,9 @@ router = APIRouter(prefix="/api/proxies", tags=["proxies"])
@router.get("/stats") @router.get("/stats")
async def get_stats( async def get_stats(
proxy_service: ProxyService = Depends(get_proxy_service),
scheduler_service: SchedulerService = Depends(get_scheduler_service), scheduler_service: SchedulerService = Depends(get_scheduler_service),
): ):
stats = await proxy_service.get_stats() stats = await get_dashboard_stats(scheduler_service.running)
stats["scheduler_running"] = scheduler_service.running
return success_response("获取统计信息成功", stats) return success_response("获取统计信息成功", stats)
@@ -36,6 +35,7 @@ async def list_proxies(
max_score=request.max_score, max_score=request.max_score,
sort_by=request.sort_by, sort_by=request.sort_by,
sort_order=request.sort_order, sort_order=request.sort_order,
pool_filter=request.pool_filter,
) )
return success_response( return success_response(
"获取代理列表成功", "获取代理列表成功",
@@ -75,6 +75,16 @@ async def export_proxies(
) )
@router.post("/delete-one")
async def delete_proxy_one(
item: ProxyDeleteItem,
service: ProxyService = Depends(get_proxy_service),
):
"""JSON 删除推荐IPv6 等含冒号 IP 不受路径分段影响。"""
await service.delete_proxy(item.ip, item.port)
return success_response("删除代理成功")
@router.delete("/{ip}/{port}") @router.delete("/{ip}/{port}")
async def delete_proxy(ip: str, port: int, service: ProxyService = Depends(get_proxy_service)): async def delete_proxy(ip: str, port: int, service: ProxyService = Depends(get_proxy_service)):
await service.delete_proxy(ip, port) await service.delete_proxy(ip, port)

View File

@@ -1,10 +1,13 @@
"""设置相关路由""" """设置相关路由"""
import asyncio
from fastapi import APIRouter, Request, Depends from fastapi import APIRouter, Request, Depends
from app.core.db import get_db from app.core.db import get_db
from app.repositories.settings_repo import SettingsRepository from app.repositories.settings_repo import SettingsRepository
from app.models.schemas import SettingsSchema from app.models.schemas import SettingsSchema
from app.api.common import success_response from app.api.common import success_response
from app.api.deps import get_settings_repo from app.api.deps import get_settings_repo
from app.core.config import settings as app_settings
from app.core.exceptions import ProxyPoolException from app.core.exceptions import ProxyPoolException
from app.core.log import logger from app.core.log import logger
@@ -47,17 +50,21 @@ async def save_settings(
# 热更新验证器超时和并发(下次验证时生效) # 热更新验证器超时和并发(下次验证时生效)
if validator: if validator:
validator._init_timeout = request.validation_timeout vt = float(request.validation_timeout)
validator._init_connect_timeout = request.validation_timeout validator._init_timeout = vt
# 连接阶段单独收紧:勿与 total 等同,否则死代理会在 connect 上耗满整段超时
validator._init_connect_timeout = min(
float(app_settings.validator_connect_timeout), vt
)
validator._init_max_concurrency = request.default_concurrency validator._init_max_concurrency = request.default_concurrency
if request.validation_targets is not None: if request.validation_targets is not None:
validator.update_test_urls(request.validation_targets) validator.update_test_urls(request.validation_targets)
# 延迟关闭旧 session让正在验证的代理继续使用旧 session # 延迟关闭旧 session让正在验证的代理继续使用旧 session
# 新请求会通过 _ensure_session() 自动创建使用新配置的 session # 新请求会通过 _ensure_session() 自动创建使用新配置的 session
await validator.close_socks_sessions()
old_session = validator._http_session old_session = validator._http_session
validator._http_session = None validator._http_session = None
validator._http_connector = None validator._http_connector = None
validator._semaphore = None
if old_session and not old_session.closed: if old_session and not old_session.closed:
asyncio.create_task(old_session.close()) asyncio.create_task(old_session.close())
logger.info(f"Validator config updated: timeout={request.validation_timeout}, concurrency={request.default_concurrency}, targets={request.validation_targets}") logger.info(f"Validator config updated: timeout={request.validation_timeout}, concurrency={request.default_concurrency}, targets={request.validation_targets}")

app/api/routes/ws.py (new file)
View File

@@ -0,0 +1,32 @@
"""WebSocket 实时推送"""
import json
from fastapi import APIRouter, WebSocket
from starlette.websockets import WebSocketDisconnect
from app.services.dashboard_stats import get_dashboard_stats
router = APIRouter(prefix="/api", tags=["websocket"])
@router.websocket("/ws")
async def websocket_dashboard(websocket: WebSocket):
app = websocket.app
await websocket.accept()
manager = app.state.ws_manager
await manager.connect(websocket)
try:
stats = await get_dashboard_stats(app.state.scheduler.running)
await websocket.send_json({"type": "stats", "data": stats})
while True:
raw = await websocket.receive_text()
try:
msg = json.loads(raw)
except json.JSONDecodeError:
continue
if msg.get("type") == "ping":
await websocket.send_json({"type": "pong"})
except WebSocketDisconnect:
pass
finally:
await manager.disconnect(websocket)

app/api/ws_manager.py (new file)
View File

@@ -0,0 +1,52 @@
"""WebSocket 连接管理与广播"""
import asyncio
from typing import List
from starlette.websockets import WebSocket, WebSocketState
class ConnectionManager:
def __init__(self) -> None:
self._connections: List[WebSocket] = []
self._lock = asyncio.Lock()
@property
def connection_count(self) -> int:
return len(self._connections)
async def connect(self, websocket: WebSocket) -> None:
async with self._lock:
self._connections.append(websocket)
async def disconnect(self, websocket: WebSocket) -> None:
async with self._lock:
if websocket in self._connections:
self._connections.remove(websocket)
async def broadcast_json(self, payload: dict) -> None:
async with self._lock:
targets = list(self._connections)
stale: List[WebSocket] = []
for ws in targets:
try:
if ws.client_state != WebSocketState.CONNECTED:
stale.append(ws)
continue
await ws.send_json(payload)
except Exception:
stale.append(ws)
if stale:
async with self._lock:
for ws in stale:
if ws in self._connections:
self._connections.remove(ws)
async def disconnect_all(self) -> None:
async with self._lock:
targets = list(self._connections)
self._connections.clear()
for ws in targets:
try:
await ws.close()
except Exception:
pass

View File

@@ -1,6 +1,7 @@
"""全局配置 - 使用 Pydantic Settings 支持环境变量和 .env 文件""" """全局配置 - 使用 Pydantic Settings 支持环境变量和 .env 文件"""
import os import os
from typing import List from typing import List
from pydantic import AliasChoices, Field
from pydantic_settings import BaseSettings, SettingsConfigDict from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -11,8 +12,11 @@ class Settings(BaseSettings):
extra="ignore", extra="ignore",
) )
# 数据库配置 # 数据库配置(环境变量 PROXYPOOL_DB_PATH 优先,供 pytest 与生产隔离)
db_path: str = "db/proxies.sqlite" db_path: str = Field(
default="db/proxies.sqlite",
validation_alias=AliasChoices("PROXYPOOL_DB_PATH", "DB_PATH", "db_path"),
)
# API 服务配置 # API 服务配置
host: str = "127.0.0.1" host: str = "127.0.0.1"
@@ -31,6 +35,9 @@ class Settings(BaseSettings):
log_level: str = "INFO" log_level: str = "INFO"
log_dir: str = "logs" log_dir: str = "logs"
# WebSocket统计广播间隔无连接时不查库
ws_stats_interval_seconds: int = 1
# 导出配置 # 导出配置
export_max_records: int = 10000 export_max_records: int = 10000

View File

@@ -54,10 +54,23 @@ async def init_db():
await db.execute("UPDATE proxies SET created_at = CURRENT_TIMESTAMP WHERE created_at IS NULL") await db.execute("UPDATE proxies SET created_at = CURRENT_TIMESTAMP WHERE created_at IS NULL")
logger.info("Migrated: added created_at column") logger.info("Migrated: added created_at column")
# 迁移validated 0=待验证 1=已验证入池(参与分数维护)
try:
await db.execute("SELECT validated FROM proxies LIMIT 1")
except Exception:
await db.execute(
"ALTER TABLE proxies ADD COLUMN validated INTEGER NOT NULL DEFAULT 0"
)
await db.execute(
"UPDATE proxies SET validated = 1 WHERE score > 0"
)
logger.info("Migrated: added validated column")
await db.execute("CREATE INDEX IF NOT EXISTS idx_score ON proxies(score)") await db.execute("CREATE INDEX IF NOT EXISTS idx_score ON proxies(score)")
await db.execute("CREATE INDEX IF NOT EXISTS idx_protocol ON proxies(protocol)") await db.execute("CREATE INDEX IF NOT EXISTS idx_protocol ON proxies(protocol)")
await db.execute("CREATE INDEX IF NOT EXISTS idx_last_check ON proxies(last_check)") await db.execute("CREATE INDEX IF NOT EXISTS idx_last_check ON proxies(last_check)")
await db.execute("CREATE INDEX IF NOT EXISTS idx_ip_port ON proxies(ip, port)") await db.execute("CREATE INDEX IF NOT EXISTS idx_ip_port ON proxies(ip, port)")
await db.execute("CREATE INDEX IF NOT EXISTS idx_validated ON proxies(validated)")
# 插件设置表 # 插件设置表
await db.execute(""" await db.execute("""
@@ -94,6 +107,10 @@ async def init_db():
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
) )
""") """)
# 仅移除已废弃设置键,不碰 proxies 表数据
await db.execute(
"DELETE FROM settings WHERE key IN ('crawl_timeout', 'max_retries')"
)
await db.commit() await db.commit()
logger.info("Database initialized") logger.info("Database initialized")
@@ -112,6 +129,19 @@ async def get_db() -> AsyncIterator[aiosqlite.Connection]:
await db.close() await db.close()
@asynccontextmanager
async def get_db_connection() -> AsyncIterator[aiosqlite.Connection]:
"""单连接贯穿「读库 → await 网络 I/O → 写库」,减少验证 worker 每条代理两次 connect。"""
ensure_db_dir()
db = await aiosqlite.connect(DB_PATH)
try:
await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA synchronous=NORMAL")
yield db
finally:
await db.close()
@asynccontextmanager @asynccontextmanager
async def transaction() -> AsyncIterator[aiosqlite.Connection]: async def transaction() -> AsyncIterator[aiosqlite.Connection]:
"""获取带有显式事务控制的数据库连接 """获取带有显式事务控制的数据库连接

View File

@@ -101,17 +101,51 @@ class CrawlJob(Job):
result = await self.plugin_runner.run(plugin) result = await self.plugin_runner.run(plugin)
proxies: List[ProxyRaw] = result.proxies if result else [] proxies: List[ProxyRaw] = result.proxies if result else []
if proxies and self.validator_pool: if proxies:
await self.validator_pool.submit(proxies) from app.core.db import transaction
logger.info(f"CrawlJob {self.id}: submitted {len(proxies)} proxies for validation") from app.repositories.proxy_repo import ProxyRepository
try:
async with transaction() as db:
await ProxyRepository.upsert_many_from_crawl(db, proxies, 0)
logger.info(
f"CrawlJob {self.id}: persisted {len(proxies)} crawled proxies as pending"
)
except Exception as e:
logger.error(
f"CrawlJob {self.id}: failed to persist crawled proxies: {e}",
exc_info=True,
)
raise
if proxies and self.validator_pool:
from app.core.db import get_db as _get_db
from app.repositories.settings_repo import (
SettingsRepository,
DEFAULT_SETTINGS,
)
async with _get_db() as db:
db_settings = await SettingsRepository.get_all(db)
if db_settings.get(
"auto_validate_after_crawl",
DEFAULT_SETTINGS["auto_validate_after_crawl"],
):
await self.validator_pool.submit(proxies)
logger.info(
f"CrawlJob {self.id}: submitted {len(proxies)} proxies for immediate validation"
)
crawl_failed = bool(result and (result.failure_count > 0 or result.error))
payload = { payload = {
"plugin_id": self.plugin_id, "plugin_id": self.plugin_id,
"proxy_count": len(proxies), "proxy_count": len(proxies),
"crawl_failed": crawl_failed,
"error": result.error if result else None,
# 与持久化统计一致success_count=本次爬到的条数failure_count=是否失败(0/1)
"success_count": len(proxies),
"failure_count": result.failure_count if result else 0,
} }
if result:
payload["success_count"] = result.success_count
payload["failure_count"] = result.failure_count
self._set_completed(payload) self._set_completed(payload)
return payload return payload
@@ -133,7 +167,7 @@ class ValidateAllJob(Job):
repo = self.proxy_repo or ProxyRepository() repo = self.proxy_repo or ProxyRepository()
async with get_db() as db: async with get_db() as db:
proxies = await repo.list_all(db) proxies = await repo.list_for_validation(db)
if not proxies: if not proxies:
self._set_completed({"total": 0, "submitted": 0}) self._set_completed({"total": 0, "submitted": 0})

View File

@@ -65,8 +65,11 @@ class AsyncWorkerPool:
logger.info(f"{self.name} stopped") logger.info(f"{self.name} stopped")
async def submit(self, items: List[T]) -> None: async def submit(self, items: List[T]) -> None:
"""提交一批任务到队列(阻塞直到有空位,天然背压""" """提交一批任务到队列(优先 put_nowait队列满时再 await put"""
for item in items: for item in items:
try:
self._queue.put_nowait(item)
except asyncio.QueueFull:
await self._queue.put(item) await self._queue.put(item)
async def drain(self) -> None: async def drain(self) -> None:

View File

@@ -18,6 +18,8 @@ class BaseCrawlerPlugin(ABC):
description: str = "" description: str = ""
enabled: bool = True enabled: bool = True
default_config: Dict[str, Any] = {} default_config: Dict[str, Any] = {}
#: 单插件整段 crawl() 的 asyncio.wait_for 上限(秒),彼此独立、互不影响
crawl_timeout_seconds: float = 120.0
def __init__(self): def __init__(self):
self._config: Dict[str, Any] = dict(self.default_config or {}) self._config: Dict[str, Any] = dict(self.default_config or {})
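
The attribute above implies each plugin's crawl() now runs under its own asyncio.wait_for budget. The matching PluginRunner change is not included in this excerpt, so the following is only a sketch of that assumption:

# Assumed shape of PluginRunner.run() once the global crawl_timeout is gone
import asyncio

class PluginRunner:
    async def run(self, plugin):
        try:
            # independent per-plugin cap (default 120s via crawl_timeout_seconds)
            return await asyncio.wait_for(plugin.crawl(), timeout=plugin.crawl_timeout_seconds)
        except asyncio.TimeoutError:
            return None  # one slow source no longer stalls the other plugins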

View File

@@ -22,6 +22,7 @@ class ProxyRaw:
@dataclass @dataclass
class Proxy: class Proxy:
"""数据库中的代理实体""" """数据库中的代理实体"""
ip: str ip: str
port: int port: int
protocol: str protocol: str
@@ -29,6 +30,7 @@ class Proxy:
response_time_ms: Optional[float] = None response_time_ms: Optional[float] = None
last_check: Optional[datetime] = None last_check: Optional[datetime] = None
created_at: Optional[datetime] = None created_at: Optional[datetime] = None
validated: int = 0 # 0 待验证 1 已验证(可参与分数与对外取用)
@dataclass @dataclass
@@ -46,7 +48,12 @@ class PluginInfo:
@dataclass @dataclass
class CrawlResult: class CrawlResult:
"""插件爬取结果""" """插件爬取结果
success_count: 最近一轮成功爬取到的代理条数(去重后),非「验证通过数」
failure_count: 最近一轮是否爬取失败(健康检查/超时/异常为 1否则为 0
"""
plugin_name: str plugin_name: str
proxies: List[ProxyRaw] = field(default_factory=list) proxies: List[ProxyRaw] = field(default_factory=list)
success_count: int = 0 success_count: int = 0

View File

@@ -1,5 +1,5 @@
"""Pydantic 模型 - 用于 API 请求/响应校验""" """Pydantic 模型 - 用于 API 请求/响应校验"""
from pydantic import BaseModel, Field, field_validator from pydantic import BaseModel, Field, field_validator, ConfigDict
from typing import Optional, List from typing import Optional, List
@@ -25,6 +25,7 @@ class ProxyResponse(BaseModel):
score: int score: int
response_time_ms: Optional[float] = None response_time_ms: Optional[float] = None
last_check: Optional[str] = None last_check: Optional[str] = None
validated: int = 0
class PluginResponse(BaseModel): class PluginResponse(BaseModel):
@@ -39,13 +40,14 @@ class PluginResponse(BaseModel):
class SettingsSchema(BaseModel): class SettingsSchema(BaseModel):
crawl_timeout: int = Field(default=30, ge=5, le=120) model_config = ConfigDict(extra="ignore")
validation_timeout: int = Field(default=10, ge=3, le=60)
max_retries: int = Field(default=3, ge=0, le=10) validation_timeout: int = Field(default=6, ge=3, le=60)
default_concurrency: int = Field(default=50, ge=10, le=200) default_concurrency: int = Field(default=120, ge=10, le=400)
min_proxy_score: int = Field(default=0, ge=0, le=100) min_proxy_score: int = Field(default=0, ge=0, le=100)
proxy_expiry_days: int = Field(default=7, ge=1, le=30) proxy_expiry_days: int = Field(default=7, ge=1, le=30)
auto_validate: bool = True auto_validate: bool = True
auto_validate_after_crawl: bool = False
validate_interval_minutes: int = Field(default=30, ge=5, le=1440) validate_interval_minutes: int = Field(default=30, ge=5, le=1440)
validation_targets: List[str] = Field( validation_targets: List[str] = Field(
default=[ default=[
@@ -60,10 +62,14 @@ class SettingsSchema(BaseModel):
class CrawlSummarySchema(BaseModel): class CrawlSummarySchema(BaseModel):
"""单次爬取任务结果(与 CrawlJob 返回的 result 对齐)"""
plugin_id: str plugin_id: str
proxy_count: int proxy_count: int
valid_count: int crawl_failed: bool = False
invalid_count: int = 0 error: Optional[str] = None
success_count: int = 0 # 与 proxy_count 相同,兼容旧前端
failure_count: int = 0
class ProxyListRequest(BaseModel): class ProxyListRequest(BaseModel):
@@ -74,6 +80,20 @@ class ProxyListRequest(BaseModel):
max_score: Optional[int] = Field(default=None, ge=0) max_score: Optional[int] = Field(default=None, ge=0)
sort_by: str = "last_check" sort_by: str = "last_check"
sort_order: str = "DESC" sort_order: str = "DESC"
pool_filter: Optional[str] = Field(
default=None,
description="all 或不传=全部pending=待验证available=已验证且可用",
)
@field_validator("pool_filter")
@classmethod
def validate_pool_filter(cls, v: Optional[str]):
if v is None or v == "" or v == "all":
return None
allowed = ("pending", "available")
if v not in allowed:
raise ValueError(f"pool_filter 必须是 {allowed} 之一或 all")
return v
@field_validator("protocol") @field_validator("protocol")
@classmethod @classmethod

View File

@@ -9,6 +9,15 @@ from .kuaidaili import KuaiDaiLiPlugin
from .speedx import SpeedXPlugin from .speedx import SpeedXPlugin
from .yundaili import YunDaiLiPlugin from .yundaili import YunDaiLiPlugin
from .proxyscrape import ProxyScrapePlugin from .proxyscrape import ProxyScrapePlugin
from .fpw_proxy_list_download import FpwProxyListDownloadPlugin
from .fpw_socks_ssl_proxy import FpwSocksSslProxyPlugin
from .fpw_spys_one import FpwSpysOnePlugin
from .fpw_proxynova import FpwProxynovaPlugin
from .fpw_hidemy import FpwHidemyPlugin
from .fpw_premproxy import FpwPremproxyPlugin
from .fpw_freeproxylists import FpwFreeproxylistsPlugin
from .fpw_gatherproxy import FpwGatherproxyPlugin
from .fpw_checkerproxy import FpwCheckerproxyPlugin
# 显式注册所有插件 # 显式注册所有插件
registry.register(Fate0Plugin) registry.register(Fate0Plugin)
@@ -19,3 +28,12 @@ registry.register(KuaiDaiLiPlugin)
registry.register(SpeedXPlugin) registry.register(SpeedXPlugin)
registry.register(YunDaiLiPlugin) registry.register(YunDaiLiPlugin)
registry.register(ProxyScrapePlugin) registry.register(ProxyScrapePlugin)
registry.register(FpwProxyListDownloadPlugin)
registry.register(FpwSocksSslProxyPlugin)
registry.register(FpwSpysOnePlugin)
registry.register(FpwProxynovaPlugin)
registry.register(FpwHidemyPlugin)
registry.register(FpwPremproxyPlugin)
registry.register(FpwFreeproxylistsPlugin)
registry.register(FpwGatherproxyPlugin)
registry.register(FpwCheckerproxyPlugin)

View File

@@ -3,7 +3,7 @@ import re
import random import random
import asyncio import asyncio
import httpx import httpx
from typing import List, Optional from typing import Dict, List, Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from app.core.plugin_system import BaseCrawlerPlugin from app.core.plugin_system import BaseCrawlerPlugin
from app.models.domain import ProxyRaw from app.models.domain import ProxyRaw
@@ -43,9 +43,56 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
self._client = httpx.AsyncClient( self._client = httpx.AsyncClient(
transport=transport, transport=transport,
follow_redirects=True, follow_redirects=True,
# 忽略系统 HTTP(S)_PROXY,避免误配导致列表站全部连接失败
trust_env=False,
) )
return self._client return self._client
@staticmethod
def _http_timeout(seconds: float) -> httpx.Timeout:
"""连接阶段单独收紧,避免 AsyncClient 在部分环境下长时间卡在 connect。"""
t = max(2.0, float(seconds))
c = min(6.0, max(3.0, t * 0.35))
return httpx.Timeout(t, connect=c)
@staticmethod
def _decode_response_body(response: httpx.Response) -> str:
content = response.content
encoding = response.encoding
if encoding == "utf-8" or not encoding:
try:
return content.decode("utf-8")
except UnicodeDecodeError:
return content.decode("gbk", errors="ignore")
return content.decode(encoding, errors="ignore")
def _sync_get(self, url: str, timeout: float, headers: dict) -> str:
"""同步 GET部分站点在 Windows 上 AsyncClient 易 ConnectTimeout同步 Client 正常)。"""
to = BaseHTTPPlugin._http_timeout(timeout)
with httpx.Client(
transport=httpx.HTTPTransport(retries=0),
follow_redirects=True,
trust_env=False,
) as c:
r = c.get(url, headers=headers, timeout=to)
if r.status_code != 200:
return ""
return self._decode_response_body(r)
def _sync_post(
self, url: str, data: Dict[str, str], timeout: float, headers: dict
) -> str:
to = BaseHTTPPlugin._http_timeout(timeout)
with httpx.Client(
transport=httpx.HTTPTransport(retries=0),
follow_redirects=True,
trust_env=False,
) as c:
r = c.post(url, headers=headers, data=data, timeout=to)
if r.status_code != 200:
return ""
return self._decode_response_body(r)
async def fetch( async def fetch(
self, self,
url: str, url: str,
@@ -56,35 +103,81 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
"""异步抓取指定 URL 的 HTML 内容""" """异步抓取指定 URL 的 HTML 内容"""
from app.core.log import logger from app.core.log import logger
client = self._get_client() client = self._get_client()
to = self._http_timeout(timeout)
for attempt in range(retries): for attempt in range(retries):
try: try:
response = await client.get(url, headers=self.get_headers(), timeout=timeout) response = await client.get(url, headers=self.get_headers(), timeout=to)
if raise_for_status: if raise_for_status:
response.raise_for_status() response.raise_for_status()
if response.status_code == 200: if response.status_code == 200:
content = response.content return self._decode_response_body(response)
encoding = response.encoding
if encoding == "utf-8" or not encoding:
try:
return content.decode("utf-8")
except UnicodeDecodeError:
return content.decode("gbk", errors="ignore")
return content.decode(encoding, errors="ignore")
else:
logger.warning(f"Fetch {url} returned status {response.status_code}") logger.warning(f"Fetch {url} returned status {response.status_code}")
except Exception as e: except Exception as e:
logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}") logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
if attempt < retries - 1: if attempt < retries - 1:
await asyncio.sleep(random.uniform(1, 3)) await asyncio.sleep(random.uniform(1, 3))
try:
text = await asyncio.to_thread(
self._sync_get, url, timeout, self.get_headers()
)
if text:
logger.info(f"Fetch {url} 使用同步回退成功")
return text
except Exception as e:
logger.warning(f"Fetch {url} 同步回退失败: {e}")
return "" return ""
async def fetch_all(self, urls: List[str], timeout: float = 15.0) -> List[str]: async def fetch_post(
self,
url: str,
data: Optional[Dict[str, str]] = None,
timeout: float = 15.0,
retries: int = 2,
) -> str:
"""POST application/x-www-form-urlencoded用于 spys.one 等表单页。"""
from app.core.log import logger
client = self._get_client()
payload = data or {}
to = self._http_timeout(timeout)
for attempt in range(retries):
try:
response = await client.post(
url,
headers=self.get_headers(),
data=payload,
timeout=to,
)
if response.status_code == 200:
return self._decode_response_body(response)
logger.warning(f"POST {url} returned status {response.status_code}")
except Exception as e:
logger.warning(f"POST {url} failed (attempt {attempt + 1}/{retries}): {e}")
if attempt < retries - 1:
await asyncio.sleep(random.uniform(1, 3))
try:
text = await asyncio.to_thread(
self._sync_post, url, payload, timeout, self.get_headers()
)
if text:
logger.info(f"POST {url} 使用同步回退成功")
return text
except Exception as e:
logger.warning(f"POST {url} 同步回退失败: {e}")
return ""
async def fetch_all(
self,
urls: List[str],
timeout: float = 15.0,
retries: int = 2,
) -> List[str]:
"""并发抓取多个 URL限制单个插件内部并发""" """并发抓取多个 URL限制单个插件内部并发"""
semaphore = asyncio.Semaphore(self.max_concurrency) semaphore = asyncio.Semaphore(self.max_concurrency)
async def _fetch_limited(url: str): async def _fetch_limited(url: str):
async with semaphore: async with semaphore:
return await self.fetch(url, timeout=timeout) return await self.fetch(url, timeout=timeout, retries=retries)
tasks = [_fetch_limited(url) for url in urls] tasks = [_fetch_limited(url) for url in urls]
return await asyncio.gather(*tasks) return await asyncio.gather(*tasks)
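The fetch changes combine a split connect/total timeout with a synchronous fallback in a worker thread. A self-contained sketch of that idea follows; the example URL and numbers are assumptions, not the plugin base class's exact behavior:

```python
# Standalone sketch of the async-then-sync-fallback fetch idea (example URL assumed).
import asyncio
import httpx

def _split_timeout(seconds: float) -> httpx.Timeout:
    # Tighter connect budget than the total budget, mirroring the plugin base class.
    total = max(2.0, float(seconds))
    connect = min(6.0, max(3.0, total * 0.35))
    return httpx.Timeout(total, connect=connect)

def _sync_get(url: str, seconds: float) -> str:
    with httpx.Client(follow_redirects=True, trust_env=False) as client:
        resp = client.get(url, timeout=_split_timeout(seconds))
        return resp.text if resp.status_code == 200 else ""

async def fetch_with_fallback(url: str, seconds: float = 10.0) -> str:
    try:
        async with httpx.AsyncClient(follow_redirects=True, trust_env=False) as client:
            resp = await client.get(url, timeout=_split_timeout(seconds))
            if resp.status_code == 200:
                return resp.text
    except httpx.HTTPError:
        pass
    # Fallback: run a synchronous client in a worker thread.
    return await asyncio.to_thread(_sync_get, url, seconds)

if __name__ == "__main__":
    print(len(asyncio.run(fetch_with_fallback("https://example.com"))))
```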

View File

@@ -0,0 +1,65 @@
"""checkerproxy.net尝试常见导出路径 + 正文中的 ip:port排除示例占位"""
import re
from typing import List, Set, Tuple
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwCheckerproxyPlugin(BaseHTTPPlugin):
name = "fpw_checkerproxy"
display_name = "CheckerProxy.net"
description = "checkerproxy.net无稳定公开 API 时可能为空;多路径尝试)"
def __init__(self):
super().__init__()
self.urls = [
"https://checkerproxy.net/",
"https://checkerproxy.net/export",
"https://checkerproxy.net/api/export",
]
@staticmethod
def _parse_ip_ports(text: str) -> List[ProxyRaw]:
bad = {"123.123.123.123", "127.0.0.1", "0.0.0.0"}
seen: Set[Tuple[str, int]] = set()
out: List[ProxyRaw] = []
for m in re.finditer(
r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})\b",
text,
):
ip, ps = m.group(1), m.group(2)
if ip in bad:
continue
if not ps.isdigit() or not (1 <= int(ps) <= 65535):
continue
key = (ip, int(ps))
if key in seen:
continue
seen.add(key)
try:
out.append(ProxyRaw(ip, int(ps), "http"))
except ValueError:
continue
return out
async def crawl(self) -> List[ProxyRaw]:
merged: List[ProxyRaw] = []
seen: Set[Tuple[str, int, str]] = set()
htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
for html in htmls:
if not html or len(html) < 200:
continue
for p in self._parse_ip_ports(html):
k = (p.ip, p.port, p.protocol)
if k not in seen:
seen.add(k)
merged.append(p)
if len(merged) >= 50:
break
if merged:
logger.info(f"{self.display_name} 解析 {len(merged)}")
else:
logger.warning(f"{self.display_name} 未解析到代理(站点可能仅提供在线检测)")
return merged

View File

@@ -0,0 +1,69 @@
"""freeproxylists.net 及常见镜像路径(表格 / 纯文本)。"""
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwFreeproxylistsPlugin(BaseHTTPPlugin):
name = "fpw_freeproxylists"
display_name = "FreeProxyLists"
description = "freeproxylists.net 系列页面(易被 403多 URL 尝试)"
def __init__(self):
super().__init__()
self.urls = [
"http://www.freeproxylists.net/",
"http://freeproxylists.net/",
"http://www.freeproxylists.net/en/http-txt.html",
]
def _parse_any(self, html: str) -> List[ProxyRaw]:
ipport = re.findall(
r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})\b",
html,
)
if len(ipport) >= 5:
out: List[ProxyRaw] = []
for ip, ps in ipport:
if ps.isdigit() and 1 <= int(ps) <= 65535:
try:
out.append(ProxyRaw(ip, int(ps), "http"))
except ValueError:
pass
return out
soup = BeautifulSoup(html, "lxml")
results: List[ProxyRaw] = []
for tr in soup.find_all("tr"):
tds = tr.find_all("td")
if len(tds) < 2:
continue
ip = tds[0].get_text(strip=True)
port = tds[1].get_text(strip=True)
if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip) and port.isdigit():
if 1 <= int(port) <= 65535:
try:
results.append(ProxyRaw(ip, int(port), "http"))
except ValueError:
pass
return results
async def crawl(self) -> List[ProxyRaw]:
seen = set()
out: List[ProxyRaw] = []
htmls = await self.fetch_all(self.urls, timeout=10, retries=1)
for url, html in zip(self.urls, htmls):
if not html:
continue
for p in self._parse_any(html):
key = (p.ip, p.port, p.protocol)
if key not in seen:
seen.add(key)
out.append(p)
if out:
logger.info(f"{self.display_name}{url} 累计 {len(out)}")
return out

View File

@@ -0,0 +1,61 @@
"""gatherproxy.com 页面内嵌 JSONPROXY_IP / PROXY_PORT"""
import re
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwGatherproxyPlugin(BaseHTTPPlugin):
name = "fpw_gatherproxy"
display_name = "GatherProxy"
description = "gatherproxy.com 内嵌代理 JSON站点常有限流"
def __init__(self):
super().__init__()
self.urls = [
"http://www.gatherproxy.com/proxylist/anonymity/?t=Elite",
"http://www.gatherproxy.com/proxylist/country/?c=United%20States",
]
def _extract_from_text(self, text: str) -> List[ProxyRaw]:
results: List[ProxyRaw] = []
for m in re.finditer(
r"PROXY_IP['\"]?\s*:\s*['\"]([\d.]+)['\"].{0,120}?PROXY_PORT['\"]?\s*:\s*['\"](\d+)['\"]",
text,
re.DOTALL | re.IGNORECASE,
):
ip, port = m.group(1), m.group(2)
if port.isdigit() and 1 <= int(port) <= 65535:
try:
results.append(ProxyRaw(ip, int(port), "http"))
except ValueError:
continue
for m in re.finditer(
r"\{[^{}]*\"PROXY_IP\"\s*:\s*\"([\d.]+)\"[^{}]*\"PROXY_PORT\"\s*:\s*\"(\d+)\"[^{}]*\}",
text,
):
ip, port = m.group(1), m.group(2)
if port.isdigit() and 1 <= int(port) <= 65535:
try:
results.append(ProxyRaw(ip, int(port), "http"))
except ValueError:
continue
return results
async def crawl(self) -> List[ProxyRaw]:
seen = set()
out: List[ProxyRaw] = []
htmls = await self.fetch_all(self.urls, timeout=10, retries=1)
for url, html in zip(self.urls, htmls):
if not html:
continue
for p in self._extract_from_text(html):
k = (p.ip, p.port)
if k not in seen:
seen.add(k)
out.append(p)
if out:
logger.info(f"{self.display_name}{url} 累计 {len(out)}")
return out

38
app/plugins/fpw_hidemy.py Normal file
View File

@@ -0,0 +1,38 @@
"""hidemyna.me 免费代理列表表格。"""
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwHidemyPlugin(BaseHTTPPlugin):
name = "fpw_hidemy"
display_name = "HideMy.name"
description = "hidemyna.me 英文代理列表HTTP/HTTPS/SOCKS"
def __init__(self):
super().__init__()
self.urls = [
"https://hidemyna.me/en/proxy-list/",
"https://hidemyna.me/en/proxy-list/?type=hs",
"https://hidemyna.me/en/proxy-list/?type=socks4",
]
async def crawl(self) -> List[ProxyRaw]:
results: List[ProxyRaw] = []
htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
for url, html in zip(self.urls, htmls):
if not html:
continue
batch = self.parse_html_table(
html,
column_map={"ip": 0, "port": 1, "protocol": 4},
protocol="http",
)
if batch:
results.extend(batch)
logger.info(f"{self.display_name} {url}: {len(batch)}")
if results:
logger.info(f"{self.display_name} 合计 {len(results)}")
return results

View File

@@ -0,0 +1,64 @@
"""premproxy.com 列表页表格。"""
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwPremproxyPlugin(BaseHTTPPlugin):
name = "fpw_premproxy"
display_name = "PremProxy"
description = "premproxy.com HTTP/SOCKS 列表页"
def __init__(self):
super().__init__()
self.urls = [
"https://premproxy.com/list/",
"https://premproxy.com/socks-list/",
]
def _parse_html(self, html: str) -> List[ProxyRaw]:
soup = BeautifulSoup(html, "lxml")
results: List[ProxyRaw] = []
for tr in soup.find_all("tr"):
tds = tr.find_all("td")
if len(tds) < 2:
continue
ip = tds[0].get_text(strip=True)
port = tds[1].get_text(strip=True)
if not re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip):
continue
if not port.isdigit() or not (1 <= int(port) <= 65535):
continue
row = tr.get_text(" ", strip=True).lower()
if "socks5" in row:
proto = "socks5"
elif "socks4" in row or "socks" in row:
proto = "socks4"
elif "https" in row:
proto = "https"
else:
proto = "http"
try:
results.append(ProxyRaw(ip, int(port), proto))
except ValueError:
continue
return results
async def crawl(self) -> List[ProxyRaw]:
merged: List[ProxyRaw] = []
htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
for url, html in zip(self.urls, htmls):
if not html:
continue
batch = self._parse_html(html)
if batch:
merged.extend(batch)
logger.info(f"{self.display_name} {url}: {len(batch)}")
if merged:
logger.info(f"{self.display_name} 合计 {len(merged)}")
return merged

View File

@@ -0,0 +1,54 @@
"""www.proxy-list.download 公开 APIREADME: Free_Proxy_Website"""
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwProxyListDownloadPlugin(BaseHTTPPlugin):
name = "fpw_proxy_list_download"
display_name = "Proxy-List.download"
description = "proxy-list.download 官方 APIhttp/https/socks4/socks5"
def __init__(self):
super().__init__()
self.max_concurrency = 8
self.api_pairs = [
("http", "https://www.proxy-list.download/api/v1/get?type=http"),
("https", "https://www.proxy-list.download/api/v1/get?type=https"),
("socks4", "https://www.proxy-list.download/api/v1/get?type=socks4"),
("socks5", "https://www.proxy-list.download/api/v1/get?type=socks5"),
]
self.fallback_pairs = [
("http", "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all"),
("https", "https://api.proxyscrape.com/v2/?request=get&protocol=https&timeout=10000&country=all&ssl=all&anonymity=all"),
("socks4", "https://api.proxyscrape.com/v2/?request=get&protocol=socks4&timeout=10000&country=all&ssl=all&anonymity=all"),
("socks5", "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all&ssl=all&anonymity=all"),
]
async def crawl(self) -> List[ProxyRaw]:
results: List[ProxyRaw] = []
urls = [u for _, u in self.api_pairs]
htmls = await self.fetch_all(urls, timeout=10, retries=1)
for (protocol, _), text in zip(self.api_pairs, htmls):
if not text:
continue
batch = self.parse_text_proxies(text, protocol)
if batch:
results.extend(batch)
logger.info(f"{self.display_name} {protocol}: {len(batch)}")
if not results:
logger.warning(f"{self.display_name} 主 API 无数据,尝试 ProxyScrape 备用")
fb_urls = [u for _, u in self.fallback_pairs]
fb_htmls = await self.fetch_all(fb_urls, timeout=10, retries=1)
for (protocol, _), text in zip(self.fallback_pairs, fb_htmls):
if not text:
continue
batch = self.parse_text_proxies(text, protocol)
if batch:
results.extend(batch)
logger.info(f"{self.display_name} fallback {protocol}: {len(batch)}")
if results:
logger.info(f"{self.display_name} 合计 {len(results)}")
return results

View File

@@ -0,0 +1,74 @@
"""proxynova.com 表格内 JS 混淆 IP + 明文端口。"""
import re
from typing import List, Optional
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwProxynovaPlugin(BaseHTTPPlugin):
name = "fpw_proxynova"
display_name = "ProxyNova"
description = "proxynova.com 代理列表(解析 document.write 混淆 IP"
def __init__(self):
super().__init__()
self.urls = ["https://www.proxynova.com/proxy-server-list/"]
@staticmethod
def _decode_proxynova_ip(script_inner: str) -> Optional[str]:
"""解析 document.write(\".081.301\".split(\"\").reverse()...concat(\"118.174\"...))"""
m1 = re.search(r'document\.write\("([^"]+)"\.split', script_inner)
m2 = re.search(r'\.concat\("([^"]+)"', script_inner)
if not m1 or not m2:
return None
a, b = m1.group(1), m2.group(1)
part1 = "".join(reversed(a))
return part1 + b
def _parse_rows(self, html: str) -> List[ProxyRaw]:
soup = BeautifulSoup(html, "lxml")
tbody = soup.find("tbody")
if not tbody:
return []
out: List[ProxyRaw] = []
for tr in tbody.find_all("tr"):
tds = tr.find_all("td")
if len(tds) < 2:
continue
script = tds[0].find("script")
if not script or not script.string:
continue
ip = self._decode_proxynova_ip(script.string)
port_txt = tds[1].get_text(strip=True)
if not ip or not port_txt.isdigit():
continue
port = int(port_txt)
if not (1 <= port <= 65535):
continue
row_text = tr.get_text(" ", strip=True).upper()
if "SOCKS5" in row_text:
proto = "socks5"
elif "SOCKS4" in row_text:
proto = "socks4"
elif "HTTPS" in row_text:
proto = "https"
else:
proto = "http"
try:
out.append(ProxyRaw(ip, port, proto))
except ValueError:
continue
return out
async def crawl(self) -> List[ProxyRaw]:
html = await self.fetch(self.urls[0], timeout=14, retries=1)
if not html:
return []
results = self._parse_rows(html)
if results:
logger.info(f"{self.display_name} 解析 {len(results)}")
return results
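To make the obfuscation concrete, here is a small worked example of the decode step using the sample string from the docstring above (a standalone sketch, not code from the commit):

```python
# Worked example of the ProxyNova decode step: first string reversed, second appended.
import re
from typing import Optional

def decode_proxynova_ip(script_inner: str) -> Optional[str]:
    m1 = re.search(r'document\.write\("([^"]+)"\.split', script_inner)
    m2 = re.search(r'\.concat\("([^"]+)"', script_inner)
    if not m1 or not m2:
        return None
    return "".join(reversed(m1.group(1))) + m2.group(1)

snippet = 'document.write(".081.301".split("").reverse().join("").concat("118.174"))'
print(decode_proxynova_ip(snippet))  # -> 103.180.118.174
```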

View File

@@ -0,0 +1,56 @@
"""socks-proxy.net / sslproxies.org 表格README 参考 GetProxyFromSocks-proxy.py"""
import re
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwSocksSslProxyPlugin(BaseHTTPPlugin):
name = "fpw_socks_ssl_proxy"
display_name = "Socks-Proxy / SSLProxies"
description = "socks-proxy.net 与 sslproxies.org 首页表格HTTP/HTTPS 列表)"
def __init__(self):
super().__init__()
self.max_concurrency = 6
# 与 sslproxies 同模板的镜像站较多;socks-proxy 在部分网络下不稳定,多源提高成功率
self.urls = [
"https://www.sslproxies.org/",
"https://free-proxy-list.net/",
"https://www.us-proxy.org/",
"https://www.socks-proxy.net/",
]
def _parse_page(self, html: str, default_protocol: str) -> List[ProxyRaw]:
results = []
pattern = re.compile(
r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>\s*<td[^>]*>\s*(\d+)",
re.I,
)
for ip, port in pattern.findall(html):
if port.isdigit() and 1 <= int(port) <= 65535:
try:
results.append(ProxyRaw(ip, int(port), default_protocol))
except ValueError:
continue
return results
async def crawl(self) -> List[ProxyRaw]:
results: List[ProxyRaw] = []
htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
for url, html in zip(self.urls, htmls):
if not html:
continue
if "socks-proxy" in url:
proto = "socks4"
else:
proto = "http"
batch = self._parse_page(html, proto)
results.extend(batch)
if batch:
logger.info(f"{self.display_name} {url}: {len(batch)}")
if results:
logger.info(f"{self.display_name} 合计 {len(results)}")
return results

148
app/plugins/fpw_spys_one.py Normal file
View File

@@ -0,0 +1,148 @@
"""spys.one 表单 POST + 端口 XOR 解码README: GetProxyFromSPYSONE.py"""
import asyncio
import re
from typing import Dict, List, Tuple
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwSpysOnePlugin(BaseHTTPPlugin):
name = "fpw_spys_one"
display_name = "Spys.one"
description = "spys.one HTTP/SOCKS 列表POST 筛选 + XOR 端口解码)"
def __init__(self):
super().__init__()
self.pages: List[Tuple[str, str, str]] = [
("http", "http://spys.one/en/http-proxy-list/", "1"),
("socks5", "http://spys.one/en/socks-proxy-list/", "2"),
]
@staticmethod
def _exec_spys_decoder(body: str) -> Dict[str, int]:
body = re.sub(r"\s+", "", body)
stmts = [s.strip() for s in body.split(";") if s.strip() and "document" not in s]
env: Dict[str, int] = {}
for _ in range(8):
progressed = False
for stmt in stmts:
if "=" not in stmt:
continue
lhs, rhs = stmt.split("=", 1)
lhs = lhs.strip()
rhs = rhs.strip()
if lhs in env:
continue
if "^" not in rhs:
if rhs.isdigit():
env[lhs] = int(rhs)
progressed = True
continue
a, b = rhs.split("^", 1)
a, b = a.strip(), b.strip()
def gv(x: str) -> int:
if x.isdigit():
return int(x)
return env[x]
try:
env[lhs] = gv(a) ^ gv(b)
progressed = True
except KeyError:
continue
if not progressed:
break
return env
def _decoder_env_from_html(self, html: str) -> Dict[str, int]:
best: Dict[str, int] = {}
for m in re.finditer(r"<script[^>]*>([\s\S]*?)</script>", html, re.IGNORECASE):
chunk = m.group(1).strip()
if "document.write" in chunk:
continue
xor_assigns = len(re.findall(r"\w+=\d+\^\w+", chunk))
if xor_assigns < 4:
continue
env = self._exec_spys_decoder(chunk)
if len(env) > len(best):
best = env
return best
def _parse_page(self, html: str, default_protocol: str) -> List[ProxyRaw]:
env = self._decoder_env_from_html(html)
if not env:
logger.warning(f"{self.display_name} 未解析到 XOR 变量表")
return []
results: List[ProxyRaw] = []
for m in re.finditer(
r"class=spy14>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})<script[^>]*>([\s\S]+?)</script>",
html,
re.IGNORECASE,
):
ip = m.group(1)
inner = m.group(2)
dw = re.search(
r'document\.write\("[^"]*"\+((?:\(\w+\^\w+\)\+?)+)\)',
inner,
)
if not dw:
continue
pairs = re.findall(r"\((\w+)\^(\w+)\)", dw.group(1))
if not pairs:
continue
try:
digits = "".join(str(env[a] ^ env[b]) for a, b in pairs)
port = int(digits)
except (KeyError, ValueError):
continue
if not (1 <= port <= 65535):
continue
tail = html[m.end() : m.end() + 2000]
u = tail.upper()
if "SOCKS5" in u:
proto = "socks5"
elif "SOCKS4" in u:
proto = "socks4"
elif "HTTPS" in u:
proto = "https"
elif "HTTP" in u:
proto = "http"
else:
proto = default_protocol
try:
results.append(ProxyRaw(ip, port, proto))
except ValueError:
continue
return results
async def crawl(self) -> List[ProxyRaw]:
results: List[ProxyRaw] = []
form_base = {
"xpp": "3",
"xf1": "0",
"xf2": "0",
"xf4": "0",
}
async def _one(proto: str, url: str, xf5: str) -> Tuple[str, str]:
data = {**form_base, "xf5": xf5}
html = await self.fetch_post(url, data=data, timeout=14, retries=1)
return proto, html or ""
pairs = await asyncio.gather(
*(_one(proto, url, xf5) for proto, url, xf5 in self.pages)
)
for proto, html in pairs:
if not html:
continue
batch = self._parse_page(html, proto)
if batch:
results.extend(batch)
logger.info(f"{self.display_name} ({proto}): {len(batch)}")
if results:
logger.info(f"{self.display_name} 合计 {len(results)}")
return results
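For readers following the decoder, a tiny worked example of the final port reconstruction; the variable table and XOR pairs below are invented purely for illustration, not taken from a real spys.one page:

```python
# Minimal illustration of the spys.one port decoding (values are made up for the example).
env = {"a": 3, "b": 11, "c": 8, "d": 0, "e": 1, "f": 9}   # stand-in for the XOR variable table
pairs = [("a", "b"), ("c", "d"), ("e", "f"), ("c", "d")]  # stand-in for (a^b)+(c^d)+... in document.write

digits = "".join(str(env[x] ^ env[y]) for x, y in pairs)  # each XOR yields one port digit
print(int(digits))  # 8888
```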

View File

@@ -18,17 +18,19 @@ class KuaiDaiLiPlugin(BaseHTTPPlugin):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
# 减少页数,降低被反爬概率,确保至少能拿到数据 # fps/dps 列表页目前仍可 200,inha/intr 常返回 567,反爬作末位兜底
self.urls = [ self.urls = [
"https://www.kuaidaili.com/free/fps/",
"https://www.kuaidaili.com/free/dps/",
"https://www.kuaidaili.com/free/inha/1/", "https://www.kuaidaili.com/free/inha/1/",
"https://www.kuaidaili.com/free/intr/1/", "https://www.kuaidaili.com/free/intr/1/",
] ]
def get_headers(self) -> dict: def get_headers(self) -> dict:
headers = super().get_headers() headers = super().get_headers()
headers["Referer"] = "https://www.kuaidaili.com/free/inha/" headers["Referer"] = "https://www.kuaidaili.com/free/"
headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
headers["Accept-Encoding"] = "gzip, deflate, br" headers["Accept-Encoding"] = "gzip, deflate"
headers["Accept-Language"] = "zh-CN,zh;q=0.9,en;q=0.8" headers["Accept-Language"] = "zh-CN,zh;q=0.9,en;q=0.8"
headers["Sec-Fetch-Dest"] = "document" headers["Sec-Fetch-Dest"] = "document"
headers["Sec-Fetch-Mode"] = "navigate" headers["Sec-Fetch-Mode"] = "navigate"
@@ -36,15 +38,56 @@ class KuaiDaiLiPlugin(BaseHTTPPlugin):
headers["Upgrade-Insecure-Requests"] = "1" headers["Upgrade-Insecure-Requests"] = "1"
return headers return headers
@staticmethod
def _infer_protocol(texts: List[str]) -> str:
"""从一行单元格文本中推断协议(兼容 fps / dps / inha 等版式)。"""
for t in texts[2:]:
tl = t.lower().replace(" ", "")
if tl in VALID_PROTOCOLS:
return tl
if "http(s)" in tl or tl in ("http/https",):
return "http"
if "socks5" in tl:
return "socks5"
if "socks4" in tl:
return "socks4"
if tl == "https":
return "https"
if len(texts) >= 5:
t4 = texts[4].lower().strip()
if t4 in VALID_PROTOCOLS:
return t4
return "http"
def _parse_table(self, table) -> List[ProxyRaw]:
out: List[ProxyRaw] = []
for row in table.find_all("tr"):
tds = row.find_all("td")
if len(tds) < 2:
continue
texts = [td.get_text(strip=True) for td in tds]
ip = texts[0]
port_s = texts[1]
if not re.match(r"^\d+\.\d+\.\d+\.\d+$", ip):
continue
if not port_s.isdigit() or not (1 <= int(port_s) <= 65535):
continue
protocol = self._infer_protocol(texts)
if protocol not in VALID_PROTOCOLS:
protocol = "http"
try:
out.append(ProxyRaw(ip, int(port_s), protocol))
except ValueError:
continue
return out
async def crawl(self) -> List[ProxyRaw]: async def crawl(self) -> List[ProxyRaw]:
results = [] results = []
# 先访问首页预热会话,获取 cookie降低被反爬概率 await self.fetch("https://www.kuaidaili.com/free/", timeout=10)
await self.fetch("https://www.kuaidaili.com/", timeout=10) await asyncio.sleep(random.uniform(1, 2))
await asyncio.sleep(random.uniform(2, 4))
# 顺序请求免费代理页面
for url in self.urls: for url in self.urls:
html = await self.fetch(url, timeout=10) html = await self.fetch(url, timeout=15)
if not html: if not html:
continue continue
soup = BeautifulSoup(html, "lxml") soup = BeautifulSoup(html, "lxml")
@@ -53,20 +96,11 @@ class KuaiDaiLiPlugin(BaseHTTPPlugin):
logger.warning(f"{self.display_name} 未能找到表格,可能是触发了反爬: {url}") logger.warning(f"{self.display_name} 未能找到表格,可能是触发了反爬: {url}")
continue continue
for row in table.find_all("tr"): batch = self._parse_table(table)
tds = row.find_all("td") if batch:
if len(tds) >= 5: results.extend(batch)
ip = tds[0].get_text(strip=True) logger.info(f"{self.display_name} {url} 解析 {len(batch)}")
port = tds[1].get_text(strip=True) await asyncio.sleep(random.uniform(1, 2))
protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else "http"
if protocol not in VALID_PROTOCOLS:
protocol = "http"
if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit() and 1 <= int(port) <= 65535:
try:
results.append(ProxyRaw(ip, int(port), protocol))
except ValueError:
continue
await asyncio.sleep(random.uniform(5, 8))
if results: if results:
logger.info(f"{self.display_name} 解析完成,获取 {len(results)} 个潜在代理") logger.info(f"{self.display_name} 解析完成,获取 {len(results)} 个潜在代理")

View File

@@ -109,21 +109,5 @@ class ProxyScrapePlugin(BaseHTTPPlugin):
if results: if results:
logger.info(f"ProxyScrape 总计获取 {len(results)} 个代理") logger.info(f"ProxyScrape 总计获取 {len(results)} 个代理")
else: else:
# Fallback生成测试代理确保在测试环境也能验证完整流程 logger.warning("ProxyScrape 所有真实源均不可用,返回空列表")
logger.warning("ProxyScrape 所有真实源均不可用,生成测试代理用于架构验证")
results = self._generate_test_proxies()
return results return results
def _generate_test_proxies(self) -> List[ProxyRaw]:
"""生成测试代理数据,覆盖全协议类型,用于验证插件系统"""
import random
test_proxies = []
protocols = ["http", "https", "socks4", "socks5"]
for protocol in protocols:
for _ in range(3):
# 生成随机公网格式 IP仅用于测试流程
ip = f"{random.randint(1, 223)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 254)}"
port = random.randint(1024, 65535)
test_proxies.append(ProxyRaw(ip, port, protocol))
logger.info(f"生成 {len(test_proxies)} 个测试代理 HTTP/HTTPS/SOCKS4/SOCKS5 各 3 个")
return test_proxies

View File

@@ -2,7 +2,8 @@
import aiosqlite import aiosqlite
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
from app.models.domain import Proxy
from app.models.domain import Proxy, ProxyRaw
from app.core.log import logger from app.core.log import logger
@@ -32,9 +33,15 @@ def _row_to_proxy(row: Tuple) -> Proxy:
response_time_ms=row[4], response_time_ms=row[4],
last_check=_to_datetime(row[5]), last_check=_to_datetime(row[5]),
created_at=_to_datetime(row[6]), created_at=_to_datetime(row[6]),
validated=int(row[7]) if len(row) > 7 and row[7] is not None else 0,
) )
_SELECT_PROXY_COLS = (
"ip, port, protocol, score, response_time_ms, last_check, created_at, validated"
)
class ProxyRepository: class ProxyRepository:
"""代理 Repository""" """代理 Repository"""
@@ -51,12 +58,13 @@ class ProxyRepository:
try: try:
await db.execute( await db.execute(
""" """
INSERT INTO proxies (ip, port, protocol, score, last_check, created_at) INSERT INTO proxies (ip, port, protocol, score, last_check, created_at, validated)
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP) VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, 1)
ON CONFLICT(ip, port) DO UPDATE SET ON CONFLICT(ip, port) DO UPDATE SET
protocol = excluded.protocol, protocol = excluded.protocol,
score = excluded.score, score = excluded.score,
last_check = CURRENT_TIMESTAMP last_check = CURRENT_TIMESTAMP,
validated = 1
""", """,
(ip, port, protocol, score), (ip, port, protocol, score),
) )
@@ -66,6 +74,56 @@ class ProxyRepository:
logger.error(f"insert_or_update proxy failed: {e}", exc_info=True) logger.error(f"insert_or_update proxy failed: {e}", exc_info=True)
return False return False
@staticmethod
async def upsert_from_crawl(
db: aiosqlite.Connection,
ip: str,
port: int,
protocol: str = "http",
initial_score: int = 0,
) -> None:
"""爬取入库待验证状态validated=0, score=0再次爬取同一条则重置为待验证。"""
if protocol not in VALID_PROTOCOLS:
protocol = "http"
await db.execute(
"""
INSERT INTO proxies (ip, port, protocol, score, last_check, created_at, validated)
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, 0)
ON CONFLICT(ip, port) DO UPDATE SET
protocol = excluded.protocol,
score = excluded.score,
last_check = CURRENT_TIMESTAMP,
validated = 0
""",
(ip, port, protocol, initial_score),
)
@staticmethod
async def upsert_many_from_crawl(
db: aiosqlite.Connection,
proxies: List[ProxyRaw],
initial_score: int = 0,
) -> None:
"""批量爬取入库;不 commit由外层 transaction 提交。"""
if not proxies:
return
rows = []
for p in proxies:
proto = p.protocol if p.protocol in VALID_PROTOCOLS else "http"
rows.append((p.ip, p.port, proto, initial_score))
await db.executemany(
"""
INSERT INTO proxies (ip, port, protocol, score, last_check, created_at, validated)
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, 0)
ON CONFLICT(ip, port) DO UPDATE SET
protocol = excluded.protocol,
score = excluded.score,
last_check = CURRENT_TIMESTAMP,
validated = 0
""",
rows,
)
@staticmethod @staticmethod
async def update_score( async def update_score(
db: aiosqlite.Connection, db: aiosqlite.Connection,
@@ -86,9 +144,12 @@ class ProxyRepository:
""", """,
(min_score, max_score, delta, ip, port), (min_score, max_score, delta, ip, port),
) )
# 删除分数已降至 0 及以下的代理 # 删除已入池且分数耗尽者;待验证(score=0)不经过此路径
await db.execute( await db.execute(
"DELETE FROM proxies WHERE ip = ? AND port = ? AND score <= ?", """
DELETE FROM proxies
WHERE ip = ? AND port = ? AND score <= ? AND validated = 1
""",
(ip, port, min_score), (ip, port, min_score),
) )
await db.commit() await db.commit()
@@ -134,7 +195,7 @@ class ProxyRepository:
db: aiosqlite.Connection, ip: str, port: int db: aiosqlite.Connection, ip: str, port: int
) -> Optional[Proxy]: ) -> Optional[Proxy]:
async with db.execute( async with db.execute(
"SELECT ip, port, protocol, score, response_time_ms, last_check, created_at FROM proxies WHERE ip = ? AND port = ?", f"SELECT {_SELECT_PROXY_COLS} FROM proxies WHERE ip = ? AND port = ?",
(ip, port), (ip, port),
) as cursor: ) as cursor:
row = await cursor.fetchone() row = await cursor.fetchone()
@@ -145,7 +206,11 @@ class ProxyRepository:
@staticmethod @staticmethod
async def get_random(db: aiosqlite.Connection) -> Optional[Proxy]: async def get_random(db: aiosqlite.Connection) -> Optional[Proxy]:
async with db.execute( async with db.execute(
"SELECT ip, port, protocol, score, response_time_ms, last_check, created_at FROM proxies WHERE score > 0 ORDER BY RANDOM() LIMIT 1" f"""
SELECT {_SELECT_PROXY_COLS} FROM proxies
WHERE validated = 1 AND score > 0
ORDER BY RANDOM() LIMIT 1
"""
) as cursor: ) as cursor:
row = await cursor.fetchone() row = await cursor.fetchone()
if row: if row:
@@ -158,12 +223,19 @@ class ProxyRepository:
protocol: Optional[str] = None, protocol: Optional[str] = None,
limit: int = 100000, limit: int = 100000,
offset: int = 0, offset: int = 0,
validated: Optional[int] = None,
) -> List[Proxy]: ) -> List[Proxy]:
query = "SELECT ip, port, protocol, score, response_time_ms, last_check, created_at FROM proxies" query = f"SELECT {_SELECT_PROXY_COLS} FROM proxies"
params: List = [] params: List = []
clauses = []
if protocol: if protocol:
query += " WHERE protocol = ?" clauses.append("protocol = ?")
params.append(protocol.lower()) params.append(protocol.lower())
if validated is not None:
clauses.append("validated = ?")
params.append(int(validated))
if clauses:
query += " WHERE " + " AND ".join(clauses)
query += " LIMIT ? OFFSET ?" query += " LIMIT ? OFFSET ?"
params.extend([limit, offset]) params.extend([limit, offset])
@@ -171,21 +243,77 @@ class ProxyRepository:
rows = await cursor.fetchall() rows = await cursor.fetchall()
return [_row_to_proxy(row) for row in rows] return [_row_to_proxy(row) for row in rows]
@staticmethod
async def list_for_validation(
db: aiosqlite.Connection,
protocol: Optional[str] = None,
) -> List[Proxy]:
"""待验证优先,其次已验证按 last_check 升序(用于全量/调度复检)。"""
pending: List[Proxy] = []
q = f"SELECT {_SELECT_PROXY_COLS} FROM proxies WHERE validated = 0"
params: List = []
if protocol:
q += " AND protocol = ?"
params.append(protocol.lower())
q += " ORDER BY created_at ASC"
async with db.execute(q, params) as cursor:
rows_p = await cursor.fetchall()
pending = [_row_to_proxy(r) for r in rows_p]
rest_q = f"SELECT {_SELECT_PROXY_COLS} FROM proxies WHERE validated = 1"
rparams: List = []
if protocol:
rest_q += " AND protocol = ?"
rparams.append(protocol.lower())
rest_q += " ORDER BY last_check ASC"
async with db.execute(rest_q, rparams) as cursor:
rows_r = await cursor.fetchall()
rest = [_row_to_proxy(r) for r in rows_r]
return pending + rest
@staticmethod @staticmethod
async def iter_batches( async def iter_batches(
db: aiosqlite.Connection, db: aiosqlite.Connection,
protocol: Optional[str] = None, protocol: Optional[str] = None,
batch_size: int = 1000, batch_size: int = 1000,
only_usable: bool = False,
): ):
"""流式分批读取代理,避免一次性加载大量数据到内存""" """流式分批读取代理,避免一次性加载大量数据到内存"""
offset = 0 offset = 0
while True: while True:
batch = await ProxyRepository.list_all(db, protocol, batch_size, offset) batch = await ProxyRepository._list_batch_offset(
db, protocol, batch_size, offset, only_usable=only_usable
)
if not batch: if not batch:
break break
yield batch yield batch
offset += batch_size offset += batch_size
@staticmethod
async def _list_batch_offset(
db: aiosqlite.Connection,
protocol: Optional[str],
batch_size: int,
offset: int,
only_usable: bool,
) -> List[Proxy]:
query = f"SELECT {_SELECT_PROXY_COLS} FROM proxies"
params: List = []
clauses = []
if only_usable:
clauses.append("validated = 1 AND score > 0")
if protocol:
clauses.append("protocol = ?")
params.append(protocol.lower())
if clauses:
query += " WHERE " + " AND ".join(clauses)
query += " LIMIT ? OFFSET ?"
params.extend([batch_size, offset])
async with db.execute(query, params) as cursor:
rows = await cursor.fetchall()
return [_row_to_proxy(row) for row in rows]
@staticmethod @staticmethod
async def list_paginated( async def list_paginated(
db: aiosqlite.Connection, db: aiosqlite.Connection,
@@ -196,6 +324,7 @@ class ProxyRepository:
max_score: Optional[int] = None, max_score: Optional[int] = None,
sort_by: str = "last_check", sort_by: str = "last_check",
sort_order: str = "DESC", sort_order: str = "DESC",
pool_filter: Optional[str] = None,
) -> Tuple[List[Proxy], int]: ) -> Tuple[List[Proxy], int]:
conditions = ["score >= ?"] conditions = ["score >= ?"]
params: List = [min_score] params: List = [min_score]
@@ -206,6 +335,10 @@ class ProxyRepository:
if max_score is not None: if max_score is not None:
conditions.append("score <= ?") conditions.append("score <= ?")
params.append(max_score) params.append(max_score)
if pool_filter == "pending":
conditions.append("validated = 0")
elif pool_filter == "available":
conditions.append("validated = 1 AND score > 0")
where_clause = " AND ".join(conditions) where_clause = " AND ".join(conditions)
allowed_sort_by = {"ip", "port", "protocol", "score", "last_check"} allowed_sort_by = {"ip", "port", "protocol", "score", "last_check"}
@@ -222,7 +355,7 @@ class ProxyRepository:
total = row[0] if row else 0 total = row[0] if row else 0
data_query = f""" data_query = f"""
SELECT ip, port, protocol, score, response_time_ms, last_check, created_at SELECT {_SELECT_PROXY_COLS}
FROM proxies FROM proxies
WHERE {where_clause} WHERE {where_clause}
ORDER BY {order_clause} ORDER BY {order_clause}
@@ -239,8 +372,9 @@ class ProxyRepository:
query = """ query = """
SELECT SELECT
COUNT(*) as total, COUNT(*) as total,
COUNT(CASE WHEN score > 0 THEN 1 END) as available, COUNT(CASE WHEN validated = 0 THEN 1 END) as pending,
AVG(score) as avg_score, COUNT(CASE WHEN validated = 1 AND score > 0 THEN 1 END) as available,
(SELECT AVG(score) FROM proxies WHERE validated = 1 AND score > 0) as avg_score,
COUNT(CASE WHEN protocol = 'http' THEN 1 END) as http_count, COUNT(CASE WHEN protocol = 'http' THEN 1 END) as http_count,
COUNT(CASE WHEN protocol = 'https' THEN 1 END) as https_count, COUNT(CASE WHEN protocol = 'https' THEN 1 END) as https_count,
COUNT(CASE WHEN protocol = 'socks4' THEN 1 END) as socks4_count, COUNT(CASE WHEN protocol = 'socks4' THEN 1 END) as socks4_count,
@@ -252,15 +386,17 @@ class ProxyRepository:
if row: if row:
return { return {
"total": row[0] or 0, "total": row[0] or 0,
"available": row[1] or 0, "pending": row[1] or 0,
"avg_score": round(row[2], 2) if row[2] else 0, "available": row[2] or 0,
"http_count": row[3] or 0, "avg_score": round(row[3], 2) if row[3] is not None else 0,
"https_count": row[4] or 0, "http_count": row[4] or 0,
"socks4_count": row[5] or 0, "https_count": row[5] or 0,
"socks5_count": row[6] or 0, "socks4_count": row[6] or 0,
"socks5_count": row[7] or 0,
} }
return { return {
"total": 0, "total": 0,
"pending": 0,
"available": 0, "available": 0,
"avg_score": 0, "avg_score": 0,
"http_count": 0, "http_count": 0,
@@ -271,9 +407,15 @@ class ProxyRepository:
@staticmethod @staticmethod
async def get_today_new_count(db: aiosqlite.Connection) -> int: async def get_today_new_count(db: aiosqlite.Connection) -> int:
"""今日新增:仅统计今日入库且已验证可用(与 get_stats.available 语义一致)。"""
try: try:
async with db.execute( async with db.execute(
"SELECT COUNT(*) FROM proxies WHERE DATE(created_at) = DATE('now', 'localtime')" """
SELECT COUNT(*) FROM proxies
WHERE DATE(created_at) = DATE('now', 'localtime')
AND validated = 1
AND score > 0
"""
) as cursor: ) as cursor:
row = await cursor.fetchone() row = await cursor.fetchone()
return row[0] if row else 0 return row[0] if row else 0
@@ -283,7 +425,9 @@ class ProxyRepository:
@staticmethod @staticmethod
async def clean_invalid(db: aiosqlite.Connection) -> int: async def clean_invalid(db: aiosqlite.Connection) -> int:
await db.execute("DELETE FROM proxies WHERE score <= 0") await db.execute(
"DELETE FROM proxies WHERE validated = 1 AND score <= 0"
)
await db.commit() await db.commit()
return db.total_changes return db.total_changes
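For orientation, a hedged sketch of the crawl-to-validation flow these repository methods enable; the repository module path, imports, and transaction handling here are assumptions, only the method names follow the diff:

```python
# Sketch only: crawled rows enter as pending (validated=0) and are picked up for validation later.
from typing import List
from app.core.db import get_db                            # shown in the diff
from app.models.domain import ProxyRaw                    # shown in the diff
from app.repositories.proxy_repo import ProxyRepository   # module path assumed

async def ingest_and_queue(raws: List[ProxyRaw]) -> List:
    async with get_db() as db:
        # Batch upsert from a crawl: rows are (re)marked validated=0, score=0.
        await ProxyRepository.upsert_many_from_crawl(db, raws)
        await db.commit()
        # Validation order: pending first, then validated rows by oldest last_check.
        return await ProxyRepository.list_for_validation(db)
```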

View File

@@ -6,13 +6,12 @@ from app.core.log import logger
DEFAULT_SETTINGS = { DEFAULT_SETTINGS = {
"crawl_timeout": 30, "validation_timeout": 6,
"validation_timeout": 10, "default_concurrency": 120,
"max_retries": 3,
"default_concurrency": 50,
"min_proxy_score": 0, "min_proxy_score": 0,
"proxy_expiry_days": 7, "proxy_expiry_days": 7,
"auto_validate": True, "auto_validate": True,
"auto_validate_after_crawl": False,
"validate_interval_minutes": 30, "validate_interval_minutes": 30,
"validation_targets": [ "validation_targets": [
"http://httpbin.org/ip", "http://httpbin.org/ip",
@@ -50,6 +49,8 @@ class SettingsRepository:
settings[key] = value settings[key] = value
except Exception as e: except Exception as e:
logger.error(f"get_all settings failed: {e}") logger.error(f"get_all settings failed: {e}")
# 已废弃:爬取限时改为每插件 crawl_timeout_seconds,不再存全局项
settings.pop("crawl_timeout", None)
return settings return settings
@staticmethod @staticmethod

View File

@@ -0,0 +1,9 @@
"""首页 / 仪表盘统计快照(供 REST 与 WebSocket 复用)"""
from app.services.proxy_service import ProxyService
async def get_dashboard_stats(scheduler_running: bool) -> dict:
proxy_service = ProxyService()
stats = await proxy_service.get_stats()
stats["scheduler_running"] = scheduler_running
return stats
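A hedged sketch of how this snapshot could be pushed over a FastAPI WebSocket; the route path, push interval, and module path are assumptions rather than the commit's actual endpoint:

```python
# Hedged sketch: periodically pushing the dashboard snapshot over a WebSocket.
import asyncio
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
from app.services.stats_service import get_dashboard_stats  # module path assumed

router = APIRouter()

@router.websocket("/api/ws")  # route path assumed
async def dashboard_ws(websocket: WebSocket) -> None:
    await websocket.accept()
    try:
        while True:
            stats = await get_dashboard_stats(scheduler_running=True)  # real flag would come from the scheduler
            await websocket.send_json(stats)
            await asyncio.sleep(5)  # push interval assumed
    except WebSocketDisconnect:
        pass
```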

View File

@@ -4,7 +4,6 @@ from datetime import datetime
from typing import Optional from typing import Optional
from app.core.plugin_system.base import BaseCrawlerPlugin from app.core.plugin_system.base import BaseCrawlerPlugin
from app.core.config import settings as app_settings
from app.core.log import logger from app.core.log import logger
from app.models.domain import CrawlResult, ProxyRaw from app.models.domain import CrawlResult, ProxyRaw
@@ -12,14 +11,13 @@ from app.models.domain import CrawlResult, ProxyRaw
class PluginRunner: class PluginRunner:
"""统一插件执行器 """统一插件执行器
- 超时控制(从 settings 读取 crawl_timeout) - 超时:每插件独立,使用 plugin.crawl_timeout_seconds(默认 120s)
- 异常捕获和统计更新 - 可选 crawl_timeout_override(仅用于测试等场景,覆盖插件自身限时)
- 可选的健康检查前置 - 异常捕获和统计更新、健康检查前置、结果去重
- 结果去重
""" """
def __init__(self, timeout: Optional[float] = None): def __init__(self, crawl_timeout_override: Optional[float] = None):
self.timeout = timeout if timeout is not None else getattr(app_settings, "crawler_timeout", 30) self.crawl_timeout_override = crawl_timeout_override
async def run(self, plugin: BaseCrawlerPlugin) -> CrawlResult: async def run(self, plugin: BaseCrawlerPlugin) -> CrawlResult:
"""执行单个插件爬取""" """执行单个插件爬取"""
@@ -42,19 +40,22 @@ class PluginRunner:
await self._save_stats(plugin, result) await self._save_stats(plugin, result)
return result return result
# 执行爬取 crawl_limit = float(getattr(plugin, "crawl_timeout_seconds", 120.0))
if self.crawl_timeout_override is not None:
crawl_limit = float(self.crawl_timeout_override)
try: try:
proxies = await asyncio.wait_for( proxies = await asyncio.wait_for(
plugin.crawl(), plugin.crawl(),
timeout=self.timeout, timeout=crawl_limit,
) )
result.proxies = self._dedup(proxies) result.proxies = self._dedup(proxies)
result.success_count = 1 if result.proxies else 0 result.success_count = len(result.proxies)
logger.info( logger.info(
f"Plugin {plugin.name} crawled {len(result.proxies)} unique proxies" f"Plugin {plugin.name} crawled {len(result.proxies)} unique proxies"
) )
except asyncio.TimeoutError: except asyncio.TimeoutError:
result.error = f"crawl timeout after {self.timeout}s" result.error = f"crawl timeout after {crawl_limit}s"
result.failure_count = 1 result.failure_count = 1
logger.error(f"Plugin {plugin.name} crawl timeout") logger.error(f"Plugin {plugin.name} crawl timeout")
except Exception as e: except Exception as e:
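A usage sketch of the override, e.g. from a test; the module path and the plugin class passed in are placeholders, not names defined in this commit:

```python
# Usage sketch (test-style); PluginRunner import path is assumed.
from app.services.plugin_runner import PluginRunner  # module path assumed

async def quick_smoke(plugin_cls) -> None:
    runner = PluginRunner(crawl_timeout_override=10.0)  # clamp this run to 10s regardless of plugin defaults
    result = await runner.run(plugin_cls())
    print(result.plugin_name, result.success_count, result.failure_count, result.error)
```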

View File

@@ -5,7 +5,7 @@ from typing import List, Optional
from app.core.db import get_db from app.core.db import get_db
from app.core.plugin_system.registry import registry from app.core.plugin_system.registry import registry
from app.core.plugin_system.base import BaseCrawlerPlugin from app.core.plugin_system.base import BaseCrawlerPlugin
from app.core.exceptions import PluginNotFoundException from app.core.exceptions import PluginNotFoundException, ValidationException
from app.repositories.settings_repo import PluginSettingsRepository from app.repositories.settings_repo import PluginSettingsRepository
from app.models.domain import PluginInfo, ProxyRaw, CrawlResult from app.models.domain import PluginInfo, ProxyRaw, CrawlResult
from app.core.log import logger from app.core.log import logger

View File

@@ -30,10 +30,19 @@ class ProxyService:
max_score: Optional[int] = None, max_score: Optional[int] = None,
sort_by: str = "last_check", sort_by: str = "last_check",
sort_order: str = "DESC", sort_order: str = "DESC",
pool_filter: Optional[str] = None,
) -> Tuple[List[Proxy], int]: ) -> Tuple[List[Proxy], int]:
async with get_db() as db: async with get_db() as db:
return await self.proxy_repo.list_paginated( return await self.proxy_repo.list_paginated(
db, page, page_size, protocol, min_score, max_score, sort_by, sort_order db,
page,
page_size,
protocol,
min_score,
max_score,
sort_by,
sort_order,
pool_filter=pool_filter,
) )
async def get_random_proxy(self) -> Optional[Proxy]: async def get_random_proxy(self) -> Optional[Proxy]:
@@ -72,7 +81,9 @@ class ProxyService:
exported = 0 exported = 0
async with get_db() as db: async with get_db() as db:
async for batch in self.proxy_repo.iter_batches(db, protocol=protocol, batch_size=1000): async for batch in self.proxy_repo.iter_batches(
db, protocol=protocol, batch_size=1000, only_usable=True
):
for p in batch: for p in batch:
if exported >= limit: if exported >= limit:
break break

View File

@@ -2,9 +2,11 @@
import asyncio import asyncio
import random import random
import time import time
from collections import OrderedDict
from typing import Tuple, Optional, List
import aiohttp import aiohttp
import aiohttp_socks import aiohttp_socks
from typing import Tuple, Optional, List
from app.core.config import settings as app_settings from app.core.config import settings as app_settings
from app.core.log import logger from app.core.log import logger
@@ -14,6 +16,7 @@ class ValidatorService:
"""代理验证器 """代理验证器
支持动态读取配置,实现设置热更新。 支持动态读取配置,实现设置热更新。
并发由 AsyncWorkerPool.worker_count 限制,此处不再套 Semaphore。
""" """
# 测试 URL 默认池 # 测试 URL 默认池
@@ -32,23 +35,30 @@ class ValidatorService:
], ],
} }
_SOCKS_CACHE_CAP = 128
def __init__( def __init__(
self, self,
timeout: Optional[float] = None, timeout: Optional[float] = None,
connect_timeout: Optional[float] = None, connect_timeout: Optional[float] = None,
max_concurrency: Optional[int] = None, max_concurrency: Optional[int] = None,
): ):
# 初始化时使用传入值或默认值,但运行期会动态读取 settings
self._init_timeout = timeout if timeout is not None else app_settings.validator_timeout self._init_timeout = timeout if timeout is not None else app_settings.validator_timeout
self._init_connect_timeout = connect_timeout if connect_timeout is not None else app_settings.validator_connect_timeout self._init_connect_timeout = (
self._init_max_concurrency = max_concurrency if max_concurrency is not None else app_settings.validator_max_concurrency connect_timeout if connect_timeout is not None else app_settings.validator_connect_timeout
)
self._init_max_concurrency = (
max_concurrency if max_concurrency is not None else app_settings.validator_max_concurrency
)
self._http_connector: Optional[aiohttp.TCPConnector] = None self._http_connector: Optional[aiohttp.TCPConnector] = None
self._http_session: Optional[aiohttp.ClientSession] = None self._http_session: Optional[aiohttp.ClientSession] = None
self._semaphore: Optional[asyncio.Semaphore] = None
self._lock = asyncio.Lock() self._lock = asyncio.Lock()
self._test_urls: Optional[List[str]] = None self._test_urls: Optional[List[str]] = None
self._socks_sessions: "OrderedDict[Tuple[str, str, int], aiohttp.ClientSession]" = OrderedDict()
self._socks_lock = asyncio.Lock()
@property @property
def timeout(self) -> float: def timeout(self) -> float:
return float(self._init_timeout) return float(self._init_timeout)
@@ -61,11 +71,16 @@ class ValidatorService:
def max_concurrency(self) -> int: def max_concurrency(self) -> int:
return int(self._init_max_concurrency) return int(self._init_max_concurrency)
def _client_timeout(self) -> aiohttp.ClientTimeout:
t = float(self.timeout)
c = min(float(self.connect_timeout), t)
sock_read = min(t, max(2.0, t * 0.85))
return aiohttp.ClientTimeout(total=t, connect=c, sock_read=sock_read)
async def _ensure_session(self) -> aiohttp.ClientSession: async def _ensure_session(self) -> aiohttp.ClientSession:
"""懒加载共享 HTTP session""" """懒加载共享 HTTP session"""
if self._http_session is None or self._http_session.closed: if self._http_session is None or self._http_session.closed:
async with self._lock: async with self._lock:
# 双重检查,避免多个协程在获取锁后重复创建
if self._http_session is None or self._http_session.closed: if self._http_session is None or self._http_session.closed:
connector = aiohttp.TCPConnector( connector = aiohttp.TCPConnector(
ssl=False, ssl=False,
@@ -73,28 +88,18 @@ class ValidatorService:
limit_per_host=self.max_concurrency, limit_per_host=self.max_concurrency,
force_close=False, force_close=False,
) )
timeout = aiohttp.ClientTimeout(
total=self.timeout, connect=self.connect_timeout
)
self._http_connector = connector self._http_connector = connector
self._http_session = aiohttp.ClientSession( self._http_session = aiohttp.ClientSession(
connector=connector, connector=connector,
timeout=timeout, timeout=self._client_timeout(),
) )
return self._http_session return self._http_session
def _ensure_semaphore(self) -> asyncio.Semaphore:
if self._semaphore is None:
self._semaphore = asyncio.Semaphore(self.max_concurrency)
return self._semaphore
def _get_test_url(self, protocol: str) -> str: def _get_test_url(self, protocol: str) -> str:
custom_urls = self._test_urls custom_urls = self._test_urls
if not custom_urls: if not custom_urls:
from app.core.config import settings as app_settings
custom_urls = getattr(app_settings, "validator_test_urls", None) custom_urls = getattr(app_settings, "validator_test_urls", None)
if custom_urls and isinstance(custom_urls, list) and len(custom_urls) > 0: if custom_urls and isinstance(custom_urls, list) and len(custom_urls) > 0:
# 按协议过滤自定义 URL,如果没有匹配的则使用全部
filtered = [u for u in custom_urls if u.lower().startswith(protocol.lower())] filtered = [u for u in custom_urls if u.lower().startswith(protocol.lower())]
if filtered: if filtered:
return random.choice(filtered) return random.choice(filtered)
@@ -105,14 +110,10 @@ class ValidatorService:
async def validate(self, ip: str, port: int, protocol: str = "http") -> Tuple[bool, float]: async def validate(self, ip: str, port: int, protocol: str = "http") -> Tuple[bool, float]:
"""验证单个代理,返回 (是否有效, 延迟毫秒)""" """验证单个代理,返回 (是否有效, 延迟毫秒)"""
protocol = protocol.lower() protocol = protocol.lower()
semaphore = self._ensure_semaphore()
async with semaphore:
start = time.time() start = time.time()
try: try:
if protocol in ("socks4", "socks5"): if protocol in ("socks4", "socks5"):
return await self._validate_socks(ip, port, protocol, start) return await self._validate_socks(ip, port, protocol, start)
else:
return await self._validate_http(ip, port, protocol, start) return await self._validate_http(ip, port, protocol, start)
except asyncio.TimeoutError: except asyncio.TimeoutError:
logger.debug(f"Validation timeout: {ip}:{port} ({protocol})") logger.debug(f"Validation timeout: {ip}:{port} ({protocol})")
@@ -129,11 +130,24 @@ class ValidatorService:
async with session.get(test_url, proxy=proxy_url, allow_redirects=True) as response: async with session.get(test_url, proxy=proxy_url, allow_redirects=True) as response:
if response.status in (200, 301, 302): if response.status in (200, 301, 302):
latency = round((time.time() - start) * 1000, 2) latency = round((time.time() - start) * 1000, 2)
logger.info(f"HTTP valid: {ip}:{port} ({protocol}) {latency}ms") logger.debug(f"HTTP valid: {ip}:{port} ({protocol}) {latency}ms")
return True, latency return True, latency
return False, 0.0 return False, 0.0
async def _validate_socks(self, ip: str, port: int, protocol: str, start: float) -> Tuple[bool, float]: async def _get_socks_session(self, protocol: str, ip: str, port: int) -> aiohttp.ClientSession:
key = (protocol, ip, port)
async with self._socks_lock:
sess = self._socks_sessions.get(key)
if sess is not None:
if sess.closed:
del self._socks_sessions[key]
else:
self._socks_sessions.move_to_end(key)
return sess
while len(self._socks_sessions) >= self._SOCKS_CACHE_CAP:
_, old = self._socks_sessions.popitem(last=False)
if old is not None and not old.closed:
await old.close()
proxy_type = ( proxy_type = (
aiohttp_socks.ProxyType.SOCKS4 aiohttp_socks.ProxyType.SOCKS4
if protocol == "socks4" if protocol == "socks4"
@@ -146,23 +160,33 @@ class ValidatorService:
rdns=True, rdns=True,
ssl=False, ssl=False,
) )
timeout = aiohttp.ClientTimeout(total=self.timeout, connect=self.connect_timeout) sess = aiohttp.ClientSession(connector=connector, timeout=self._client_timeout())
test_url = self._get_test_url("http") self._socks_sessions[key] = sess
return sess
async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: async def _validate_socks(self, ip: str, port: int, protocol: str, start: float) -> Tuple[bool, float]:
test_url = self._get_test_url("http")
session = await self._get_socks_session(protocol, ip, port)
async with session.get(test_url, allow_redirects=True) as response: async with session.get(test_url, allow_redirects=True) as response:
if response.status in (200, 301, 302): if response.status in (200, 301, 302):
latency = round((time.time() - start) * 1000, 2) latency = round((time.time() - start) * 1000, 2)
logger.info(f"SOCKS valid: {ip}:{port} ({protocol}) {latency}ms") logger.debug(f"SOCKS valid: {ip}:{port} ({protocol}) {latency}ms")
return True, latency return True, latency
return False, 0.0 return False, 0.0
async def close_socks_sessions(self) -> None:
"""关闭 SOCKS 会话缓存(设置热更新或进程退出时调用)。"""
async with self._socks_lock:
for s in list(self._socks_sessions.values()):
if not s.closed:
await s.close()
self._socks_sessions.clear()
def update_test_urls(self, urls: List[str]) -> None: def update_test_urls(self, urls: List[str]) -> None:
"""运行时更新验证目标 URL 列表"""
self._test_urls = list(urls) if urls else None self._test_urls = list(urls) if urls else None
async def close(self) -> None: async def close(self) -> None:
"""关闭共享的 HTTP ClientSession""" await self.close_socks_sessions()
if self._http_session and not self._http_session.closed: if self._http_session and not self._http_session.closed:
await self._http_session.close() await self._http_session.close()
self._http_session = None self._http_session = None
View File
@@ -1,4 +1,4 @@
[tool:pytest] [pytest]
testpaths = tests testpaths = tests
python_files = test_*.py python_files = test_*.py
python_classes = Test* python_classes = Test*
@@ -13,5 +13,6 @@ markers =
integration: 集成测试 integration: 集成测试
e2e: 端到端测试 e2e: 端到端测试
slow: 慢速测试 slow: 慢速测试
network: 需要出站网络(真实爬取/验证)
async_test: 异步测试 async_test: 异步测试
asyncio_default_fixture_loop_scope = function asyncio_default_fixture_loop_scope = function
View File
@@ -0,0 +1,35 @@
"""对 SQLite settings 表执行维护 SQL见 db_optimize_settings.sql
使用当前应用配置的数据库路径app.core.db.DB_PATH。pytest 使用 PROXYPOOL_DB_PATH
指向 db/proxies.test.sqlite勿在生产库路径上误跑测试夹具。
"""
import asyncio
import os
import sys
# 保证可 import app
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
async def main() -> None:
from app.core.db import DB_PATH, ensure_db_dir
import aiosqlite
sql_path = os.path.join(os.path.dirname(__file__), "db_optimize_settings.sql")
with open(sql_path, encoding="utf-8") as f:
script = f.read()
ensure_db_dir()
if not os.path.isfile(DB_PATH):
print(f"数据库不存在,跳过: {DB_PATH}")
return
async with aiosqlite.connect(DB_PATH) as db:
await db.executescript(script)
await db.commit()
print(f"已执行设置维护: {DB_PATH}")
print("请重启应用或在 WebUI 保存一次设置以使并发/超时生效。")
if __name__ == "__main__":
asyncio.run(main())
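The script writes to whatever database `app.core.db.DB_PATH` resolves to. A minimal sketch of the environment override described in the docstring — the default path and the exact resolution logic inside `app.core.db` are assumptions here, only the variable name comes from this commit:

```python
import os

# Assumed default; app.core.db may resolve or normalize the path differently.
DB_PATH = os.environ.get("PROXYPOOL_DB_PATH", "db/proxies.sqlite")

if __name__ == "__main__":
    # Under pytest this prints db/proxies.test.sqlite (set in tests/conftest.py).
    print(DB_PATH)
```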
View File
@@ -0,0 +1,14 @@
-- ProxyPool设置表维护负优化清理 + 推荐验证参数)
-- 用法:在停服或确认无并发写入时执行;或运行 python scripts/apply_settings_maintenance.py
-- 注意:改库后需「重启应用」或在 WebUI「保存设置」才会让运行中的 WorkerPool / Validator 重载并发与超时。
-- 废弃键
DELETE FROM settings WHERE key = 'crawl_timeout';
DELETE FROM settings WHERE key = 'max_retries';
-- 推荐验证参数(可按机器与网络再调大 default_concurrency
INSERT INTO settings (key, value, updated_at) VALUES ('validation_timeout', '6', CURRENT_TIMESTAMP)
ON CONFLICT(key) DO UPDATE SET value = excluded.value, updated_at = CURRENT_TIMESTAMP;
INSERT INTO settings (key, value, updated_at) VALUES ('default_concurrency', '120', CURRENT_TIMESTAMP)
ON CONFLICT(key) DO UPDATE SET value = excluded.value, updated_at = CURRENT_TIMESTAMP;
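To confirm the upsert took effect after running the script, a quick read-back like the following works; this is a sketch that assumes the default `db/proxies.sqlite` path and the key/value `settings` layout used above:

```python
import sqlite3

con = sqlite3.connect("db/proxies.sqlite")
rows = con.execute(
    "SELECT key, value FROM settings "
    "WHERE key IN ('validation_timeout', 'default_concurrency', 'crawl_timeout', 'max_retries')"
).fetchall()
print(dict(rows))  # expected: {'validation_timeout': '6', 'default_concurrency': '120'}
con.close()
```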
View File
@@ -5,6 +5,8 @@
``` ```
tests/ tests/
├── conftest.py # pytest 配置和 fixtures ├── conftest.py # pytest 配置和 fixtures
├── task_utils.py # 异步任务轮询(集成/E2E 共用)
├── support/ # 测试专用插件类等(非 mock
├── README.md # 本文件 ├── README.md # 本文件
├── unit/ # 单元测试 ├── unit/ # 单元测试
│ ├── test_models.py # 模型测试 │ ├── test_models.py # 模型测试
@@ -12,6 +14,7 @@ tests/
├── integration/ # 集成测试 ├── integration/ # 集成测试
│ ├── test_proxies_api.py # 代理 API 测试 │ ├── test_proxies_api.py # 代理 API 测试
│ ├── test_plugins_api.py # 插件 API 测试 │ ├── test_plugins_api.py # 插件 API 测试
│ ├── test_plugins_live_crawl.py # 各插件真实爬取验收(须外网)
│ ├── test_scheduler_api.py # 调度器 API 测试 │ ├── test_scheduler_api.py # 调度器 API 测试
│ ├── test_settings_api.py # 设置 API 测试 │ ├── test_settings_api.py # 设置 API 测试
│ └── test_health_api.py # 健康检查测试 │ └── test_health_api.py # 健康检查测试
@@ -19,6 +22,25 @@ tests/
└── test_full_workflow.py # 完整工作流测试 └── test_full_workflow.py # 完整工作流测试
``` ```
## 网络与真实调用
集成测试与 E2E **不再 mock** `PluginRunner` / `ValidatorService`:会发起真实 HTTP 爬取与代理验证(视设置而定)。运行全量 `pytest` 需要 **可用的出站网络**,且含 `network` / `slow` 标记的用例可能耗时数分钟。
跳过需外网的用例(例如离线快速检查):
```bash
pytest -m "not network"
```
**插件爬取验收**`test_plugins_live_crawl.py`
- 核心 8 插件:必须至少 1 条代理且无 Runner 失败。
- `fpw_*`:对照 [Free_Proxy_Website](https://github.com/cyubuchen/Free_Proxy_Website) 的公开源,允许 0 条(国际网络差异),使用更长超时。
```bash
pytest tests/integration/test_plugins_live_crawl.py -v
```
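Network-bound cases opt in through the markers registered in pytest.ini. A minimal sketch of such a test — the plugin id and the assertion are illustrative, not taken from the suite:

```python
import pytest


@pytest.mark.network
@pytest.mark.slow
@pytest.mark.asyncio
async def test_live_crawl_smoke(client):
    # Real outbound crawl; excluded when running `pytest -m "not network"`.
    response = await client.post("/api/plugins/fate0/crawl")
    assert response.status_code == 200
```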
## 运行测试 ## 运行测试
### 安装测试依赖 ### 安装测试依赖
View File
@@ -1,5 +1,15 @@
"""pytest 配置文件和 fixtures""" """pytest 配置文件和 fixtures"""
# 必须在任何 app.* 导入之前:下方 app fixture 会清空表,不可与生产共用 db/proxies.sqlite
import os
os.environ["PROXYPOOL_DB_PATH"] = "db/proxies.test.sqlite"
import asyncio import asyncio
import sys
if sys.platform == "win32":
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from typing import AsyncGenerator from typing import AsyncGenerator
@@ -17,22 +27,28 @@ from app.plugins import (
SpeedXPlugin, SpeedXPlugin,
YunDaiLiPlugin, YunDaiLiPlugin,
ProxyScrapePlugin, ProxyScrapePlugin,
FpwProxyListDownloadPlugin,
FpwSocksSslProxyPlugin,
FpwSpysOnePlugin,
FpwProxynovaPlugin,
FpwHidemyPlugin,
FpwPremproxyPlugin,
FpwFreeproxylistsPlugin,
FpwGatherproxyPlugin,
FpwCheckerproxyPlugin,
) )
from app.repositories.proxy_repo import ProxyRepository from app.repositories.proxy_repo import ProxyRepository
from app.models.domain import ProxyRaw
@pytest_asyncio.fixture(scope="function") @pytest_asyncio.fixture(scope="function")
async def app(): async def app():
"""创建应用实例""" """创建应用实例"""
# 初始化测试数据库并清空历史数据
await init_db() await init_db()
async with get_db() as db: async with get_db() as db:
await db.execute("DELETE FROM proxies") await db.execute("DELETE FROM proxies")
await db.execute("DELETE FROM settings") await db.execute("DELETE FROM settings")
await db.commit() await db.commit()
# 清理并重新注册插件,防止跨测试污染
registry.clear() registry.clear()
for plugin_cls in [ for plugin_cls in [
Fate0Plugin, Fate0Plugin,
@@ -43,6 +59,15 @@ async def app():
SpeedXPlugin, SpeedXPlugin,
YunDaiLiPlugin, YunDaiLiPlugin,
ProxyScrapePlugin, ProxyScrapePlugin,
FpwProxyListDownloadPlugin,
FpwSocksSslProxyPlugin,
FpwSpysOnePlugin,
FpwProxynovaPlugin,
FpwHidemyPlugin,
FpwPremproxyPlugin,
FpwFreeproxylistsPlugin,
FpwGatherproxyPlugin,
FpwCheckerproxyPlugin,
]: ]:
registry.register(plugin_cls) registry.register(plugin_cls)
@@ -50,7 +75,6 @@ async def app():
async with test_app.router.lifespan_context(test_app): async with test_app.router.lifespan_context(test_app):
yield test_app yield test_app
# 给 aiosqlite / aiohttp 后台线程留出收尾时间
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
@@ -80,32 +104,4 @@ async def sample_proxy(db, proxy_repo):
"""创建一个测试代理""" """创建一个测试代理"""
await proxy_repo.insert_or_update(db, "192.168.1.1", 8080, "http", 50) await proxy_repo.insert_or_update(db, "192.168.1.1", 8080, "http", 50)
yield {"ip": "192.168.1.1", "port": 8080, "protocol": "http", "score": 50} yield {"ip": "192.168.1.1", "port": 8080, "protocol": "http", "score": 50}
# 清理
await proxy_repo.delete(db, "192.168.1.1", 8080) await proxy_repo.delete(db, "192.168.1.1", 8080)
@pytest_asyncio.fixture(autouse=True)
async def mock_external_requests(monkeypatch, request):
"""
自动在集成/E2E 测试中 mock 外部网络请求:
1. 插件爬取返回固定测试代理,避免真实 HTTP 请求
2. 代理验证瞬间成功,避免连接超时等待
"""
if "/unit/" in request.node.nodeid:
return
from app.services.plugin_runner import PluginRunner
from app.services.validator_service import ValidatorService
async def _mock_run(self, plugin):
from app.models.domain import CrawlResult
return CrawlResult(
plugin_name=plugin.name,
proxies=[ProxyRaw("192.168.100.10", 8080, "http")],
success_count=1,
)
async def _mock_validate(self, ip: str, port: int, protocol: str = "http"):
return True, 1.23
monkeypatch.setattr(PluginRunner, "run", _mock_run)
monkeypatch.setattr(ValidatorService, "validate", _mock_validate)

View File

@@ -4,10 +4,14 @@
""" """
import pytest import pytest
from tests.task_utils import poll_task_until_terminal
class TestFullWorkflow: class TestFullWorkflow:
"""测试完整工作流""" """测试完整工作流"""
@pytest.mark.network
@pytest.mark.slow
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_proxy_management_workflow(self, client): async def test_proxy_management_workflow(self, client):
"""测试代理管理完整工作流 """测试代理管理完整工作流
@@ -35,11 +39,17 @@ class TestFullWorkflow:
# 3. 触发所有插件爬取 # 3. 触发所有插件爬取
response = await client.post("/api/plugins/crawl-all") response = await client.post("/api/plugins/crawl-all")
assert response.status_code == 200 assert response.status_code == 200
crawl_result = response.json()["data"] task_id = response.json()["data"]["task_id"]
task_data = await poll_task_until_terminal(
client, task_id, max_rounds=400, interval=0.5
)
assert task_data is not None
assert task_data["status"] in ("completed", "failed", "cancelled")
# 4. 获取更新后的统计 # 4. 获取更新后的统计
response = await client.get("/api/proxies/stats") response = await client.get("/api/proxies/stats")
updated_stats = response.json()["data"] updated_stats = response.json()["data"]
assert "total" in initial_stats and "total" in updated_stats
# 5. 导出代理(所有格式) # 5. 导出代理(所有格式)
for fmt in ["csv", "txt", "json"]: for fmt in ["csv", "txt", "json"]:
@@ -50,6 +60,8 @@ class TestFullWorkflow:
response = await client.delete("/api/proxies/clean-invalid") response = await client.delete("/api/proxies/clean-invalid")
assert response.status_code == 200 assert response.status_code == 200
@pytest.mark.network
@pytest.mark.slow
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_plugin_management_workflow(self, client): async def test_plugin_management_workflow(self, client):
"""测试插件管理完整工作流 """测试插件管理完整工作流
@@ -93,6 +105,12 @@ class TestFullWorkflow:
# 6. 触发爬取 # 6. 触发爬取
response = await client.post(f"/api/plugins/{plugin_id}/crawl") response = await client.post(f"/api/plugins/{plugin_id}/crawl")
assert response.status_code == 200 assert response.status_code == 200
crawl_task_id = response.json()["data"]["task_id"]
crawl_task = await poll_task_until_terminal(
client, crawl_task_id, max_rounds=140, interval=0.5
)
assert crawl_task is not None
assert crawl_task["status"] in ("completed", "failed", "cancelled")
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_scheduler_workflow(self, client): async def test_scheduler_workflow(self, client):
View File
@@ -1,6 +1,8 @@
"""插件 API 集成测试 - 测试 /api/plugins/* 所有接口""" """插件 API 集成测试 - 测试 /api/plugins/* 所有接口"""
import pytest import pytest
from tests.task_utils import poll_task_until_terminal
class TestPluginsAPI: class TestPluginsAPI:
"""测试插件相关 API""" """测试插件相关 API"""
@@ -116,10 +118,11 @@ class TestPluginsAPI:
data = response.json() data = response.json()
assert data["code"] == 200 assert data["code"] == 200
@pytest.mark.network
@pytest.mark.slow
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_crawl_plugin(self, client): async def test_crawl_plugin(self, client):
"""测试 POST /api/plugins/{id}/crawl - 异步任务模式""" """测试 POST /api/plugins/{id}/crawl - 异步任务模式"""
import asyncio
response = await client.get("/api/plugins") response = await client.get("/api/plugins")
plugins = response.json()["data"]["plugins"] plugins = response.json()["data"]["plugins"]
if not plugins: if not plugins:
@@ -133,18 +136,11 @@ class TestPluginsAPI:
assert "task_id" in data["data"] assert "task_id" in data["data"]
task_id = data["data"]["task_id"] task_id = data["data"]["task_id"]
# 轮询任务状态 task_data = await poll_task_until_terminal(
task_data = None client, task_id, max_rounds=140, interval=0.5
for _ in range(10): )
await asyncio.sleep(0.3)
res = await client.get(f"/api/tasks/{task_id}")
assert res.status_code == 200
task_data = res.json()["data"]
if task_data["status"] in ("completed", "failed", "cancelled"):
break
assert task_data is not None assert task_data is not None
assert task_data["status"] in ("completed", "cancelled") assert task_data["status"] in ("completed", "failed", "cancelled")
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_crawl_nonexistent_plugin(self, client): async def test_crawl_nonexistent_plugin(self, client):
@@ -152,10 +148,11 @@ class TestPluginsAPI:
response = await client.post("/api/plugins/nonexistent_plugin/crawl") response = await client.post("/api/plugins/nonexistent_plugin/crawl")
assert response.status_code == 404 assert response.status_code == 404
@pytest.mark.network
@pytest.mark.slow
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_crawl_all_plugins(self, client): async def test_crawl_all_plugins(self, client):
"""测试 POST /api/plugins/crawl-all - 异步任务模式""" """测试 POST /api/plugins/crawl-all - 异步任务模式"""
import asyncio
response = await client.post("/api/plugins/crawl-all") response = await client.post("/api/plugins/crawl-all")
assert response.status_code == 200 assert response.status_code == 200
data = response.json() data = response.json()
@@ -163,15 +160,8 @@ class TestPluginsAPI:
assert "task_id" in data["data"] assert "task_id" in data["data"]
task_id = data["data"]["task_id"] task_id = data["data"]["task_id"]
# 轮询任务状态 task_data = await poll_task_until_terminal(
task_data = None client, task_id, max_rounds=400, interval=0.5
for _ in range(10): )
await asyncio.sleep(0.3)
res = await client.get(f"/api/tasks/{task_id}")
assert res.status_code == 200
task_data = res.json()["data"]
if task_data["status"] in ("completed", "failed", "cancelled"):
break
assert task_data is not None assert task_data is not None
assert task_data["status"] in ("completed", "cancelled") assert task_data["status"] in ("completed", "failed", "cancelled")
View File
@@ -14,6 +14,7 @@ class TestProxiesAPI:
assert data["code"] == 200 assert data["code"] == 200
assert "data" in data assert "data" in data
assert "total" in data["data"] assert "total" in data["data"]
assert "pending" in data["data"]
assert "available" in data["data"] assert "available" in data["data"]
assert "scheduler_running" in data["data"] assert "scheduler_running" in data["data"]
@@ -68,6 +69,17 @@ class TestProxiesAPI:
# 可能返回 200(有数据) 或 404(无数据) # 可能返回 200(有数据) 或 404(无数据)
assert response.status_code in [200, 404] assert response.status_code in [200, 404]
@pytest.mark.asyncio
async def test_delete_proxy_post_json(self, client, sample_proxy):
"""测试 POST /api/proxies/delete-one前端默认路径兼容 IPv6"""
response = await client.post(
"/api/proxies/delete-one",
json={"ip": sample_proxy["ip"], "port": sample_proxy["port"]},
)
assert response.status_code == 200
data = response.json()
assert data["code"] == 200
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_delete_proxy(self, client, sample_proxy): async def test_delete_proxy(self, client, sample_proxy):
"""测试 DELETE /api/proxies/{ip}/{port}""" """测试 DELETE /api/proxies/{ip}/{port}"""
@@ -76,6 +88,19 @@ class TestProxiesAPI:
data = response.json() data = response.json()
assert data["code"] == 200 assert data["code"] == 200
@pytest.mark.asyncio
async def test_delete_one_ipv6(self, client, db, proxy_repo):
"""POST delete-one 可删除含冒号的 IP路径 DELETE 无法可靠表达)"""
await proxy_repo.insert_or_update(db, "2001:db8::1", 18080, "http", 40)
r = await client.post(
"/api/proxies/delete-one",
json={"ip": "2001:db8::1", "port": 18080},
)
assert r.status_code == 200
assert r.json()["code"] == 200
left = await proxy_repo.get_by_ip_port(db, "2001:db8::1", 18080)
assert left is None
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_delete_nonexistent_proxy(self, client): async def test_delete_nonexistent_proxy(self, client):
"""测试 DELETE /api/proxies/{ip}/{port} - 不存在的代理""" """测试 DELETE /api/proxies/{ip}/{port} - 不存在的代理"""
View File
@@ -1,6 +1,17 @@
"""调度器 API 集成测试 - 测试 /api/scheduler/* 所有接口""" """调度器 API 集成测试 - 测试 /api/scheduler/* 所有接口"""
import pytest import pytest
from app.api.deps import get_settings_repo
from app.repositories.settings_repo import SettingsRepository
class FailingSettingsRepository(SettingsRepository):
"""save 恒为 False用于覆盖「设置保存失败」分支非 MagicMock。"""
@staticmethod
async def save(db, settings):
return False
class TestSchedulerAPI: class TestSchedulerAPI:
"""测试调度器相关 API""" """测试调度器相关 API"""
@@ -93,18 +104,17 @@ class TestSchedulerAPI:
assert job is not None assert job is not None
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_start_scheduler_db_save_failure(self, client, monkeypatch): async def test_start_scheduler_db_save_failure(self, client, app):
"""测试启动调度器时数据库保存失败应返回 running=False""" """测试启动调度器时数据库保存失败应返回 running=False"""
from app.repositories.settings_repo import SettingsRepository
# lifespan 启动时调度器可能已自动启动,先停止它 # lifespan 启动时调度器可能已自动启动,先停止它
await client.post("/api/scheduler/stop") await client.post("/api/scheduler/stop")
async def mock_save(*args, **kwargs): app.dependency_overrides[get_settings_repo] = lambda: FailingSettingsRepository()
return False try:
monkeypatch.setattr(SettingsRepository, "save", mock_save)
response = await client.post("/api/scheduler/start") response = await client.post("/api/scheduler/start")
finally:
app.dependency_overrides.pop(get_settings_repo, None)
assert response.status_code == 200 assert response.status_code == 200
data = response.json() data = response.json()
assert data["code"] == 200 assert data["code"] == 200
View File
@@ -12,7 +12,7 @@ class TestSettingsAPI:
assert response.status_code == 200 assert response.status_code == 200
data = response.json() data = response.json()
assert data["code"] == 200 assert data["code"] == 200
assert "crawl_timeout" in data["data"] assert "crawl_timeout" not in data["data"]
assert "validation_timeout" in data["data"] assert "validation_timeout" in data["data"]
assert "auto_validate" in data["data"] assert "auto_validate" in data["data"]
@@ -22,16 +22,15 @@ class TestSettingsAPI:
response = await client.get("/api/settings") response = await client.get("/api/settings")
data = response.json()["data"] data = response.json()["data"]
# 验证所有预期的设置项
expected_keys = [ expected_keys = [
"crawl_timeout",
"validation_timeout", "validation_timeout",
"max_retries",
"default_concurrency", "default_concurrency",
"min_proxy_score", "min_proxy_score",
"proxy_expiry_days", "proxy_expiry_days",
"auto_validate", "auto_validate",
"auto_validate_after_crawl",
"validate_interval_minutes", "validate_interval_minutes",
"validation_targets",
] ]
for key in expected_keys: for key in expected_keys:
assert key in data, f"缺少设置项: {key}" assert key in data, f"缺少设置项: {key}"
@@ -40,65 +39,45 @@ class TestSettingsAPI:
async def test_save_settings(self, client): async def test_save_settings(self, client):
"""测试 POST /api/settings""" """测试 POST /api/settings"""
settings = { settings = {
"crawl_timeout": 45,
"validation_timeout": 15, "validation_timeout": 15,
"max_retries": 5,
"default_concurrency": 100, "default_concurrency": 100,
"min_proxy_score": 10, "min_proxy_score": 10,
"proxy_expiry_days": 14, "proxy_expiry_days": 14,
"auto_validate": True, "auto_validate": True,
"auto_validate_after_crawl": False,
"validate_interval_minutes": 60, "validate_interval_minutes": 60,
"validation_targets": [
"http://httpbin.org/ip",
],
} }
response = await client.post("/api/settings", json=settings) response = await client.post("/api/settings", json=settings)
assert response.status_code == 200 assert response.status_code == 200
data = response.json() data = response.json()
assert data["code"] == 200 assert data["code"] == 200
# 验证返回的数据与提交的一致
for key, value in settings.items(): for key, value in settings.items():
assert data["data"][key] == value assert data["data"][key] == value
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_save_settings_partial(self, client): async def test_save_settings_partial(self, client):
"""测试 POST /api/settings - 部分更新(实际上会替换所有)""" """测试 POST /api/settings - 部分更新(实际上会替换所有)"""
# 先获取当前设置
response = await client.get("/api/settings") response = await client.get("/api/settings")
current_settings = response.json()["data"] current_settings = response.json()["data"]
# 修改部分设置
new_settings = current_settings.copy() new_settings = current_settings.copy()
new_settings["crawl_timeout"] = 60 new_settings["validation_timeout"] = 25
new_settings["auto_validate"] = False new_settings["auto_validate"] = False
response = await client.post("/api/settings", json=new_settings) response = await client.post("/api/settings", json=new_settings)
assert response.status_code == 200 assert response.status_code == 200
data = response.json() data = response.json()
assert data["data"]["crawl_timeout"] == 60 assert data["data"]["validation_timeout"] == 25
assert data["data"]["auto_validate"] is False assert data["data"]["auto_validate"] is False
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_save_settings_validation_error(self, client): async def test_save_settings_validation_error(self, client):
"""测试 POST /api/settings - 验证错误""" """测试 POST /api/settings - 验证错误"""
# crawl_timeout 必须在 5-120 之间
invalid_settings = { invalid_settings = {
"crawl_timeout": 200, # 超出范围 "validation_timeout": 100,
"validation_timeout": 10,
"max_retries": 3,
"default_concurrency": 50,
"min_proxy_score": 0,
"proxy_expiry_days": 7,
"auto_validate": True,
"validate_interval_minutes": 30,
}
response = await client.post("/api/settings", json=invalid_settings)
assert response.status_code == 422 # 验证错误
@pytest.mark.asyncio
async def test_save_settings_invalid_type(self, client):
"""测试 POST /api/settings - 无效类型"""
invalid_settings = {
"crawl_timeout": "invalid", # 应该是整数
"validation_timeout": 10,
"max_retries": 3,
"default_concurrency": 50, "default_concurrency": 50,
"min_proxy_score": 0, "min_proxy_score": 0,
"proxy_expiry_days": 7, "proxy_expiry_days": 7,
@@ -108,15 +87,49 @@ class TestSettingsAPI:
response = await client.post("/api/settings", json=invalid_settings) response = await client.post("/api/settings", json=invalid_settings)
assert response.status_code == 422 assert response.status_code == 422
@pytest.mark.asyncio
async def test_save_settings_invalid_type(self, client):
"""测试 POST /api/settings - 无效类型"""
invalid_settings = {
"validation_timeout": 10,
"default_concurrency": "invalid",
"min_proxy_score": 0,
"proxy_expiry_days": 7,
"auto_validate": True,
"validate_interval_minutes": 30,
}
response = await client.post("/api/settings", json=invalid_settings)
assert response.status_code == 422
@pytest.mark.asyncio
async def test_save_settings_ignores_deprecated_crawl_timeout(self, client):
"""旧客户端若仍提交 crawl_timeout应忽略且保存成功"""
response = await client.get("/api/settings")
base = response.json()["data"]
payload = {**base, "crawl_timeout": 999}
response = await client.post("/api/settings", json=payload)
assert response.status_code == 200
again = (await client.get("/api/settings")).json()["data"]
assert "crawl_timeout" not in again
@pytest.mark.asyncio
async def test_save_settings_ignores_obsolete_max_retries(self, client):
"""已移除的 max_retries 键若仍被提交,应忽略。"""
response = await client.get("/api/settings")
base = response.json()["data"]
payload = {**base, "max_retries": 9}
response = await client.post("/api/settings", json=payload)
assert response.status_code == 200
again = (await client.get("/api/settings")).json()["data"]
assert "max_retries" not in again
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_settings_roundtrip(self, client): async def test_settings_roundtrip(self, client):
"""测试设置读写一致性""" """测试设置读写一致性"""
# 生成随机但有效的设置
import random import random
test_settings = { test_settings = {
"crawl_timeout": random.randint(10, 60),
"validation_timeout": random.randint(5, 30), "validation_timeout": random.randint(5, 30),
"max_retries": random.randint(1, 5),
"default_concurrency": random.randint(20, 100), "default_concurrency": random.randint(20, 100),
"min_proxy_score": random.randint(0, 50), "min_proxy_score": random.randint(0, 50),
"proxy_expiry_days": random.randint(1, 14), "proxy_expiry_days": random.randint(1, 14),
@@ -124,15 +137,12 @@ class TestSettingsAPI:
"validate_interval_minutes": random.randint(10, 120), "validate_interval_minutes": random.randint(10, 120),
} }
# 写入设置
response = await client.post("/api/settings", json=test_settings) response = await client.post("/api/settings", json=test_settings)
assert response.status_code == 200 assert response.status_code == 200
# 读取设置
response = await client.get("/api/settings") response = await client.get("/api/settings")
saved_settings = response.json()["data"] saved_settings = response.json()["data"]
# 验证一致性
for key, value in test_settings.items(): for key, value in test_settings.items():
assert saved_settings[key] == value, f"设置项 {key} 不一致" assert saved_settings[key] == value, f"设置项 {key} 不一致"
@@ -140,9 +150,7 @@ class TestSettingsAPI:
async def test_settings_roundtrip_with_validation_targets(self, client): async def test_settings_roundtrip_with_validation_targets(self, client):
"""测试设置读写一致性 - 包含数组类型的 validation_targets""" """测试设置读写一致性 - 包含数组类型的 validation_targets"""
test_settings = { test_settings = {
"crawl_timeout": 30,
"validation_timeout": 10, "validation_timeout": 10,
"max_retries": 3,
"default_concurrency": 50, "default_concurrency": 50,
"min_proxy_score": 0, "min_proxy_score": 0,
"proxy_expiry_days": 7, "proxy_expiry_days": 7,
@@ -154,13 +162,11 @@ class TestSettingsAPI:
], ],
} }
# 写入设置
response = await client.post("/api/settings", json=test_settings) response = await client.post("/api/settings", json=test_settings)
assert response.status_code == 200 assert response.status_code == 200
data = response.json() data = response.json()
assert data["data"]["validation_targets"] == test_settings["validation_targets"] assert data["data"]["validation_targets"] == test_settings["validation_targets"]
# 读取设置
response = await client.get("/api/settings") response = await client.get("/api/settings")
saved_settings = response.json()["data"] saved_settings = response.json()["data"]
assert saved_settings["validation_targets"] == test_settings["validation_targets"] assert saved_settings["validation_targets"] == test_settings["validation_targets"]
@@ -179,7 +185,6 @@ class TestSettingsAPI:
data = response.json() data = response.json()
assert data["data"]["validation_targets"] == [] assert data["data"]["validation_targets"] == []
# 读取确认
response = await client.get("/api/settings") response = await client.get("/api/settings")
saved_settings = response.json()["data"] saved_settings = response.json()["data"]
assert saved_settings["validation_targets"] == [] assert saved_settings["validation_targets"] == []
View File
@@ -0,0 +1 @@
# Test support package (non-mock plugin doubles, etc.)
View File
@@ -0,0 +1,19 @@
"""供 PluginRunner 等测试使用的真实插件子类(非 unittest.mock"""
from typing import List
from app.core.plugin_system.base import BaseCrawlerPlugin
from app.models.domain import ProxyRaw
class UnhealthyPlugin(BaseCrawlerPlugin):
"""health_check 抛错,用于验证 Runner 对异常的统计与落库。"""
name = "test_unhealthy_runner"
display_name = "TestUnhealthy"
description = "PluginRunner health_check failure test double"
async def crawl(self) -> List[ProxyRaw]:
return []
async def health_check(self) -> bool:
raise RuntimeError("network down")
22
tests/task_utils.py Normal file
View File
@@ -0,0 +1,22 @@
"""测试用异步任务轮询工具"""
import asyncio
from typing import Any, Dict, Optional
async def poll_task_until_terminal(
client,
task_id: str,
*,
max_rounds: int,
interval: float,
) -> Optional[Dict[str, Any]]:
"""轮询任务直到终态或超时。返回最后一次 task data。"""
task_data = None
for _ in range(max_rounds):
await asyncio.sleep(interval)
res = await client.get(f"/api/tasks/{task_id}")
assert res.status_code == 200
task_data = res.json()["data"]
if task_data["status"] in ("completed", "failed", "cancelled"):
break
return task_data
View File
@@ -106,6 +106,14 @@ class TestProxyListRequest:
assert request.page_size == 50 assert request.page_size == 50
assert request.protocol == "https" assert request.protocol == "https"
def test_pool_filter_pending_available(self):
r1 = ProxyListRequest(pool_filter="pending")
assert r1.pool_filter == "pending"
r2 = ProxyListRequest(pool_filter="all")
assert r2.pool_filter is None
with pytest.raises(Exception):
ProxyListRequest(pool_filter="invalid")
class TestSettingsSchema: class TestSettingsSchema:
"""测试 SettingsSchema""" """测试 SettingsSchema"""
@@ -113,16 +121,22 @@ class TestSettingsSchema:
def test_default_settings(self): def test_default_settings(self):
"""测试默认设置""" """测试默认设置"""
settings = SettingsSchema() settings = SettingsSchema()
assert settings.crawl_timeout == 30 assert settings.validation_timeout == 6
assert settings.validation_timeout == 10 assert settings.default_concurrency == 120
assert settings.auto_validate is True assert settings.auto_validate is True
assert settings.auto_validate_after_crawl is False
def test_custom_settings(self): def test_custom_settings(self):
"""测试自定义设置""" """测试自定义设置"""
settings = SettingsSchema(crawl_timeout=60, auto_validate=False) settings = SettingsSchema(validation_timeout=25, auto_validate=False)
assert settings.crawl_timeout == 60 assert settings.validation_timeout == 25
assert settings.auto_validate is False assert settings.auto_validate is False
def test_settings_schema_ignores_unknown_fields(self):
s = SettingsSchema.model_validate({"validation_timeout": 10, "crawl_timeout": 99})
assert "crawl_timeout" not in s.model_dump()
assert s.validation_timeout == 10
class TestBatchDeleteRequest: class TestBatchDeleteRequest:
"""测试 BatchDeleteRequest""" """测试 BatchDeleteRequest"""
View File
@@ -81,23 +81,25 @@ class TestProxyRepository:
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_iter_batches(self, db, proxy_repo): async def test_iter_batches(self, db, proxy_repo):
"""测试流式分批读取""" """测试流式分批读取(与库内已有数据共存,只校验增量与分批形状)"""
# 插入 5 条测试数据 async with db.execute("SELECT COUNT(*) FROM proxies") as c:
before = (await c.fetchone())[0]
for i in range(5): for i in range(5):
await proxy_repo.insert_or_update(db, f"192.168.1.{i}", 8000 + i, "http", 10) await proxy_repo.insert_or_update(db, f"192.168.99.{i}", 8000 + i, "http", 10)
async with db.execute("SELECT COUNT(*) FROM proxies") as c:
after = (await c.fetchone())[0]
assert after == before + 5
batches = [] batches = []
async for batch in proxy_repo.iter_batches(db, batch_size=2): async for batch in proxy_repo.iter_batches(db, batch_size=2):
batches.append(batch) batches.append(batch)
assert len(batches) == 3 assert sum(len(b) for b in batches) == after
assert len(batches[0]) == 2 assert len(batches[-1]) in (1, 2)
assert len(batches[1]) == 2 assert all(len(b) <= 2 for b in batches)
assert len(batches[2]) == 1
# 清理
for i in range(5): for i in range(5):
await proxy_repo.delete(db, f"192.168.1.{i}", 8000 + i) await proxy_repo.delete(db, f"192.168.99.{i}", 8000 + i)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_batch_delete(self, db, proxy_repo): async def test_batch_delete(self, db, proxy_repo):
@@ -121,6 +123,38 @@ class TestProxyRepository:
"""测试获取统计信息""" """测试获取统计信息"""
stats = await proxy_repo.get_stats(db) stats = await proxy_repo.get_stats(db)
assert "total" in stats assert "total" in stats
assert "pending" in stats
assert "available" in stats assert "available" in stats
assert "avg_score" in stats assert "avg_score" in stats
assert "http_count" in stats assert "http_count" in stats
@pytest.mark.asyncio
async def test_get_today_new_count_only_validated_available(self, db, proxy_repo):
"""今日新增不计待验证;仅今日创建且 validated=1、score>0"""
base = await proxy_repo.get_today_new_count(db)
await proxy_repo.upsert_from_crawl(db, "192.168.88.20", 9020, "http", 0)
assert await proxy_repo.get_today_new_count(db) == base
await proxy_repo.insert_or_update(db, "192.168.88.21", 9021, "http", 55)
assert await proxy_repo.get_today_new_count(db) == base + 1
await proxy_repo.delete(db, "192.168.88.20", 9020)
await proxy_repo.delete(db, "192.168.88.21", 9021)
@pytest.mark.asyncio
async def test_upsert_many_from_crawl(self, db, proxy_repo):
from app.models.domain import ProxyRaw
raws = [
ProxyRaw("10.0.0.1", 18080, "http"),
ProxyRaw("10.0.0.2", 18081, "socks5"),
]
await proxy_repo.upsert_many_from_crawl(db, raws, 0)
await db.commit()
p1 = await proxy_repo.get_by_ip_port(db, "10.0.0.1", 18080)
assert p1 is not None
assert p1.validated == 0
p2 = await proxy_repo.get_by_ip_port(db, "10.0.0.2", 18081)
assert p2.protocol == "socks5"
await proxy_repo.delete(db, "10.0.0.1", 18080)
await proxy_repo.delete(db, "10.0.0.2", 18081)
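The batch path exercised by `test_upsert_many_from_crawl` maps naturally onto a single `executemany` with an `ON CONFLICT` clause. A standalone sketch against an in-memory table — the column set is simplified and is not the real schema:

```python
import sqlite3

rows = [("10.0.0.1", 18080, "http"), ("10.0.0.2", 18081, "socks5")]

con = sqlite3.connect(":memory:")
con.execute(
    "CREATE TABLE proxies ("
    " ip TEXT NOT NULL, port INTEGER NOT NULL, protocol TEXT,"
    " score INTEGER DEFAULT 0, validated INTEGER DEFAULT 0,"
    " UNIQUE(ip, port))"
)
con.executemany(
    "INSERT INTO proxies (ip, port, protocol) VALUES (?, ?, ?) "
    "ON CONFLICT(ip, port) DO UPDATE SET protocol = excluded.protocol",
    rows,
)
con.commit()
print(con.execute("SELECT ip, port, protocol, validated FROM proxies").fetchall())
```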