feat: fpw plugins, validation/crawl perf, WS stats, test DB isolation

- Add Free_Proxy_Website-style fpw_* plugins and register them
- Per-plugin crawl timeout (crawl_timeout_seconds=120); remove global crawl_timeout setting
- Validator: fix connect vs total timeout on save; SOCKS session LRU cache; drop redundant semaphore
- Validation handler uses single DB connection; batch upsert after crawl; WorkerPool put_nowait
- Remove unused max_retries from settings API/UI; settings maintenance SQL + init_db cleanup of deprecated keys
- WebSocket dashboard stats; ProxyList pool_filter and API alignment
- POST /api/proxies/delete-one for IPv6-safe deletes; task poll stops on 404
- pytest uses PROXYPOOL_DB_PATH=db/proxies.test.sqlite so tests do not wipe production DB
- .gitignore: explicit proxies.test.sqlite patterns; fix plugin_service ValidationException import

Made-with: Cursor
祀梦
2026-04-05 13:39:19 +08:00
parent 92c7fa19e2
commit 0131c8b408
63 changed files with 2331 additions and 531 deletions

.gitignore
View File

@@ -30,6 +30,9 @@ env/
*.sqlite
*.sqlite3
*.db
# pytest isolation DB (PROXYPOOL_DB_PATH=db/proxies.test.sqlite); do not commit
**/proxies.test.sqlite
proxies.test.sqlite
*.db-shm
*.db-wal

View File

@@ -221,10 +221,15 @@ POST /api/settings
- **Validation timeout**: 3-30 seconds (default 5 seconds)
- **Validation concurrency**: 10-200 (default 50)
### Scoring
### Pending vs. available
- **Crawling**: proxies are stored as "pending" by default (`validated=0`, score 0) and do not immediately take part in random pick or export; see the request sketch below.
- **Validation**: after "Validate all now" on the settings page, or once auto-validation is enabled, the pending queue is **validated first**, then pooled proxies are **re-checked in last-check order**; proxies that pass are marked validated and given a score.
- **Setting**: "Validate immediately after crawl" is off by default; when enabled, finishing a crawl queues validation right away, as in older versions.
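A sketch of pulling the pending pool through the list API added in this commit (field names follow ProxyListRequest; the base URL assumes the frontend's default http://localhost:18080):

import httpx

# First page of proxies that are still awaiting validation (validated=0).
resp = httpx.post(
    "http://localhost:18080/api/proxies",
    json={"page": 1, "page_size": 20, "pool_filter": "pending"},
)
resp.raise_for_status()
print(resp.json())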
### Scoring (only for proxies validated into the pool)
- **Validation success**: +10 points
- **Validation failure**: -5 points
- **Score reaches 0**: deleted automatically
- **Score reaches 0**: deleted automatically (a pending proxy that fails validation is dropped outright)
## 🔧 FAQ

View File

@@ -64,7 +64,8 @@ export const proxiesAPI = {
getProxies: (params, signal) =>
api.post('/api/proxies', cleanParams(params), { signal }),
deleteProxy: (ip, port) => api.delete(`/api/proxies/${ip}/${port}`),
deleteProxy: (ip, port) =>
api.post('/api/proxies/delete-one', { ip, port }),
batchDeleteProxies: (proxies) => api.post('/api/proxies/batch-delete', { proxies }),

View File

@@ -24,7 +24,8 @@ const props = defineProps({
type: {
type: String,
default: 'default',
validator: (value) => ['default', 'total', 'available', 'new', 'score'].includes(value)
validator: (value) =>
['default', 'total', 'pending', 'available', 'new', 'score'].includes(value)
},
/** 图标组件 */
icon: {
@@ -79,6 +80,11 @@ const displayValue = computed(() => {
filter: drop-shadow(0 0 8px rgba(34, 197, 94, 0.4));
}
.stat-card.pending .stat-icon {
color: var(--warning);
filter: drop-shadow(0 0 8px rgba(250, 204, 21, 0.45));
}
.stat-card.new .stat-icon {
color: var(--warning);
filter: drop-shadow(0 0 8px rgba(245, 158, 11, 0.4));

View File

@@ -0,0 +1,134 @@
import { onUnmounted } from 'vue'
import { useProxyStore } from '../stores/proxy'
const MAX_DELAY_MS = 30000
const INITIAL_DELAY_MS = 1000
/**
 * Derive the stats WebSocket URL (/api/ws) from the API base URL
* @returns {string}
*/
export function resolveWebSocketStatsUrl() {
const explicit = import.meta.env.VITE_WS_URL
if (explicit) {
const t = String(explicit).trim().replace(/\/$/, '')
return t.endsWith('/api/ws') ? t : `${t}/api/ws`
}
const api = import.meta.env.VITE_API_BASE_URL || 'http://localhost:18080'
const u = new URL(api)
u.protocol = u.protocol === 'https:' ? 'wss:' : 'ws:'
u.pathname = '/api/ws'
u.search = ''
u.hash = ''
return u.toString()
}
/**
 * Connect to the backend WebSocket for realtime stats; reconnect with exponential backoff; pause while the tab is hidden.
*/
export function useStatsWebSocket() {
const store = useProxyStore()
let ws = null
let reconnectTimer = null
let attempt = 0
let stopped = false
let paused = false
function backoffDelayMs() {
return Math.min(INITIAL_DELAY_MS * 2 ** attempt, MAX_DELAY_MS)
}
function clearReconnectTimer() {
if (reconnectTimer) {
clearTimeout(reconnectTimer)
reconnectTimer = null
}
}
function connect() {
if (stopped || paused) return
clearReconnectTimer()
const url = resolveWebSocketStatsUrl()
ws = new WebSocket(url)
ws.onopen = () => {
attempt = 0
}
ws.onmessage = (ev) => {
try {
const msg = JSON.parse(ev.data)
if (msg.type === 'stats' && msg.data) {
store.applyStats(msg.data)
} else if (msg.type === 'pong') {
// optional heartbeat
}
} catch {
// ignore malformed
}
}
ws.onclose = () => {
ws = null
if (stopped || paused) return
attempt += 1
reconnectTimer = setTimeout(connect, backoffDelayMs())
}
ws.onerror = () => {
try {
ws?.close()
} catch {
// ignore
}
}
}
function handleVisibility() {
if (document.hidden) {
paused = true
clearReconnectTimer()
if (ws) {
const s = ws
ws = null
s.onclose = null
try {
s.close()
} catch {
// ignore
}
}
} else {
paused = false
if (!stopped) {
attempt = 0
connect()
}
}
}
function start() {
stopped = false
paused = false
attempt = 0
document.addEventListener('visibilitychange', handleVisibility)
connect()
}
function disconnect() {
stopped = true
paused = false
document.removeEventListener('visibilitychange', handleVisibility)
clearReconnectTimer()
if (ws) {
const s = ws
ws = null
s.onclose = null
try {
s.close()
} catch {
// ignore
}
}
}
onUnmounted(disconnect)
return { start, disconnect }
}

View File

@@ -1,7 +1,8 @@
import { tasksAPI } from '../api'
const POLL_INTERVAL = 1000
const MAX_POLL_ATTEMPTS = 30
/** A large batch crawl can take longer than 30s; relax the limit to avoid falsely reporting "task still in progress" */
const MAX_POLL_ATTEMPTS = 300
/**
 * Poll the task status until it completes or fails
@@ -21,7 +22,14 @@ export async function pollTaskStatus(taskId) {
return response
}
} catch (error) {
// Keep polling through transient network errors instead of aborting
const status = error.response?.status
if (status === 404) {
return {
code: 404,
message: error.response?.data?.message || '任务不存在',
data: { task_id: taskId, status: 'failed', error: 'not_found' }
}
}
console.warn('轮询任务状态失败:', error)
}
}

View File

@@ -32,6 +32,12 @@ export const useProxyStore = defineStore('proxy', () => {
 * Fetch statistics
* @returns {Promise<boolean>}
*/
function applyStats(data) {
if (data && typeof data === 'object') {
stats.value = { ...data }
}
}
async function fetchStats() {
try {
const response = await proxyService.getStats()
@@ -174,6 +180,7 @@ export const useProxyStore = defineStore('proxy', () => {
isEmpty,
// Actions
fetchStats,
applyStats,
fetchProxies,
deleteProxy,
batchDeleteProxies,

View File

@@ -2,40 +2,38 @@
<div class="page-container">
<PageHeader title="代理池管理系统" :icon="MagicStick" />
<el-row :gutter="20" class="stats-row">
<el-col :xs="24" :sm="12" :md="12" :lg="6" :xl="6">
<div class="stats-grid">
<StatCard
type="total"
:icon="DataLine"
:value="stats.total || 0"
label="总代理数"
/>
</el-col>
<el-col :xs="24" :sm="12" :md="12" :lg="6" :xl="6">
<StatCard
type="pending"
:icon="Clock"
:value="stats.pending || 0"
label="待验证"
/>
<StatCard
type="available"
:icon="CircleCheck"
:value="stats.available || 0"
label="可用数量"
/>
</el-col>
<el-col :xs="24" :sm="12" :md="12" :lg="6" :xl="6">
<StatCard
type="new"
:icon="Timer"
:value="stats.today_new || 0"
label="今日新增"
/>
</el-col>
<el-col :xs="24" :sm="12" :md="12" :lg="6" :xl="6">
<StatCard
type="score"
:icon="StarFilled"
:value="avgScore"
label="平均分数"
/>
</el-col>
</el-row>
</div>
<el-row :gutter="20" class="charts-row">
<el-col :xs="24" :lg="16">
@@ -88,7 +86,7 @@
</template>
<script setup>
import { computed, onMounted, onUnmounted } from 'vue'
import { computed, onMounted } from 'vue'
import { ElMessage, ElMessageBox } from 'element-plus'
import {
MagicStick,
@@ -96,7 +94,8 @@ import {
CircleCheck,
Timer,
StarFilled,
InfoFilled
InfoFilled,
Clock
} from '@element-plus/icons-vue'
import { useProxyStore } from '../stores/proxy'
import { formatNumber } from '../utils/format'
@@ -104,26 +103,16 @@ import StatCard from '../components/StatCard.vue'
import ProtocolChart from '../components/ProtocolChart.vue'
import QuickActions from '../components/QuickActions.vue'
import PageHeader from '../components/PageHeader.vue'
import { useStatsWebSocket } from '../composables/useStatsWebSocket'
// ==================== Store ====================
const proxyStore = useProxyStore()
const { start: startStatsWs } = useStatsWebSocket()
// ==================== Computed ====================
const stats = computed(() => proxyStore.stats)
const avgScore = computed(() => formatNumber(stats.value.avg_score || 0, 1))
// ==================== Timed refresh ====================
const REFRESH_INTERVAL = 5000
let refreshTimer = null
let isPageVisible = true
function handleVisibilityChange() {
isPageVisible = !document.hidden
if (isPageVisible) {
refreshData()
}
}
async function refreshData() {
await proxyStore.fetchStats()
}
@@ -165,26 +154,15 @@ async function handleClean() {
// ==================== Lifecycle ====================
onMounted(async () => {
await refreshData()
document.addEventListener('visibilitychange', handleVisibilityChange)
refreshTimer = setInterval(() => {
if (isPageVisible) {
refreshData()
}
}, REFRESH_INTERVAL)
})
onUnmounted(() => {
if (refreshTimer) {
clearInterval(refreshTimer)
refreshTimer = null
}
document.removeEventListener('visibilitychange', handleVisibilityChange)
startStatsWs()
})
</script>
<style scoped>
.stats-row {
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
gap: 20px;
margin-bottom: 20px;
}
@@ -237,14 +215,6 @@ onUnmounted(() => {
}
@media (max-width: 768px) {
.stats-row .el-col {
margin-bottom: 16px;
}
.stats-row .el-col:last-child {
margin-bottom: 0;
}
.status-list {
flex-direction: column;
gap: 16px;

View File

@@ -15,7 +15,7 @@
</el-tag>
</div>
<div class="header-actions">
<el-button type="success" @click="handleCrawlAll" size="large" :loading="crawlingAll">
<el-button type="success" @click="handleCrawlAll" size="large" :loading="crawlAllMask">
<el-icon class="btn-icon"><Promotion /></el-icon>
全部爬取
</el-button>
@@ -53,12 +53,12 @@
</template>
</el-table-column>
<el-table-column label="统计" width="180">
<el-table-column label="上次爬取" width="200">
<template #default="{ row }">
<div class="plugin-stats">
<div class="plugin-stats" title="绿色为最近一轮爬到的代理条数红色为最近一轮是否失败0 成功 / 1 失败),不是验证通过数">
<div class="stat-item">
<el-icon class="stat-icon success"><CircleCheck /></el-icon>
<span class="stat-value success">{{ row.success_count || 0 }}</span>
<span class="stat-value success">{{ row.success_count || 0 }} </span>
</div>
<div class="stat-item">
<el-icon class="stat-icon failed"><CircleClose /></el-icon>
@@ -74,7 +74,35 @@
</template>
</el-table-column>
<el-table-column label="操作" width="220" fixed="right" align="center">
<el-table-column label="最近爬取" min-width="340" align="left">
<template #default="{ row }">
<div v-if="crawlAllMask && row.enabled" class="crawl-running-row">
<el-icon class="is-loading crawl-spin"><Loading /></el-icon>
<span>正在爬取</span>
</div>
<div v-else-if="crawlResults[row.id]" class="result-panel" :class="crawlResults[row.id].type">
<div class="result-panel-head">
<el-icon v-if="crawlResults[row.id].type === 'success'" class="result-head-icon success"><CircleCheck /></el-icon>
<el-icon v-else class="result-head-icon failed"><CircleClose /></el-icon>
<span class="result-panel-title">{{ crawlResults[row.id].message }}</span>
<el-icon class="result-close" @click="clearCrawlResult(row.id)"><Close /></el-icon>
</div>
<div class="result-panel-body">
<template v-if="crawlResults[row.id].data && crawlResults[row.id].data.proxy_count !== undefined">
<span class="result-pill fetched">爬取 {{ crawlResults[row.id].data.proxy_count }} </span>
</template>
<template v-if="crawlResults[row.id].data?.crawl_failed">
<div class="result-error-block" :title="crawlResults[row.id].data.error || ''">
{{ crawlResults[row.id].data.error || '爬取失败' }}
</div>
</template>
</div>
</div>
<span v-else class="result-placeholder"></span>
</template>
</el-table-column>
<el-table-column label="操作" width="200" fixed="right" align="center">
<template #default="{ row }">
<div class="plugin-actions">
<el-button
@@ -89,27 +117,13 @@
type="success"
size="small"
@click="handleCrawl(row.id)"
:loading="crawlingPlugins.has(row.id)"
:loading="crawlingPlugins.has(row.id) || (crawlAllMask && row.enabled)"
:disabled="!row.enabled"
>
<el-icon class="btn-icon"><Promotion /></el-icon>
爬取
</el-button>
</div>
<div v-if="crawlResults[row.id]" class="plugin-crawl-result">
<div class="result-mini" :class="crawlResults[row.id].type">
<el-icon v-if="crawlResults[row.id].type === 'success'" class="result-icon success"><CircleCheck /></el-icon>
<el-icon v-else class="result-icon failed"><CircleClose /></el-icon>
<span class="result-text">{{ crawlResults[row.id].message }}</span>
<span v-if="crawlResults[row.id].data?.success_count !== undefined" class="result-count valid">
有效 {{ crawlResults[row.id].data.success_count }}
</span>
<span v-if="crawlResults[row.id].data?.failure_count !== undefined" class="result-count invalid">
无效 {{ crawlResults[row.id].data.failure_count }}
</span>
<el-icon class="result-close" @click="clearCrawlResult(row.id)"><Close /></el-icon>
</div>
</div>
</template>
</el-table-column>
</el-table>
@@ -130,18 +144,37 @@
@close="allCrawlResult = null"
>
<template v-if="allCrawlResult.data">
<div class="crawl-stats">
<div class="crawl-stats crawl-stats-summary">
<span v-if="allCrawlResult.data.total_crawled !== undefined">
爬取: {{ allCrawlResult.data.total_crawled }}
合计爬取: <strong>{{ allCrawlResult.data.total_crawled }}</strong>
</span>
<span v-if="allCrawlResult.data.valid_count !== undefined" class="valid-count">
有效: {{ allCrawlResult.data.valid_count }}
</span>
<span v-if="allCrawlResult.data.invalid_count !== undefined" class="invalid-count">
无效: {{ allCrawlResult.data.invalid_count }}
<span
v-if="allCrawlResult.data.plugins_failed !== undefined"
class="invalid-count"
>
失败插件: <strong>{{ allCrawlResult.data.plugins_failed }}</strong>
</span>
</div>
<ul
v-if="allCrawlResult.data.per_plugin?.length"
class="per-plugin-breakdown"
>
<li
v-for="(item, idx) in allCrawlResult.data.per_plugin"
:key="item.plugin_id || `pp-${idx}`"
class="per-plugin-line"
>
<span class="pp-name">{{ pluginDisplayName(item.plugin_id) }}</span>
<template v-if="item.crawl_failed">
<el-tag type="danger" size="small" effect="light">失败</el-tag>
<span class="pp-detail err">{{ item.error || '未知错误' }}</span>
</template>
<template v-else>
<el-tag type="success" size="small" effect="light">完成</el-tag>
<span class="pp-detail">爬取 <strong>{{ item.proxy_count }}</strong> </span>
</template>
</li>
</ul>
</template>
</el-alert>
</el-card>
@@ -198,7 +231,8 @@ import {
CircleClose,
Box,
Setting,
Close
Close,
Loading
} from '@element-plus/icons-vue'
import { usePluginsStore } from '../stores/plugins'
import { pluginService } from '../services/pluginService'
@@ -207,10 +241,17 @@ import PageHeader from '../components/PageHeader.vue'
const pluginsStore = usePluginsStore()
const crawlingPlugins = ref(new Set())
const crawlingAll = ref(false)
/** A crawl-all run is in progress: every enabled plugin row shows a crawling indicator and its crawl button is loading */
const crawlAllMask = ref(false)
const crawlResults = ref({})
const allCrawlResult = ref(null)
function pluginDisplayName(pluginId) {
if (!pluginId) return '(未知插件)'
const p = pluginsStore.plugins.find((x) => x.id === pluginId)
return p?.name || pluginId
}
// Config dialog
const configDialogVisible = ref(false)
const currentPlugin = ref(null)
@@ -273,29 +314,40 @@ async function handleCrawl(pluginId) {
const response = await pluginService.crawlPlugin(pluginId)
if (response.code === 200) {
crawlResults.value[pluginId] = {
crawlResults.value = {
...crawlResults.value,
[pluginId]: {
type: 'success',
message: response.message,
data: response.data
}
}
} else {
crawlResults.value[pluginId] = {
crawlResults.value = {
...crawlResults.value,
[pluginId]: {
type: 'error',
message: response.message || '爬取失败'
}
}
}
} catch (error) {
crawlResults.value[pluginId] = {
crawlResults.value = {
...crawlResults.value,
[pluginId]: {
type: 'error',
message: '爬取过程出错'
}
}
} finally {
crawlingPlugins.value.delete(pluginId)
}
}
function clearCrawlResult(pluginId) {
delete crawlResults.value[pluginId]
const next = { ...crawlResults.value }
delete next[pluginId]
crawlResults.value = next
}
async function handleCrawlAll() {
@@ -307,7 +359,7 @@ async function handleCrawlAll() {
}
await ElMessageBox.confirm(
`确定要运行所有 ${enabledPlugins.length} 个启用的插件吗?这将爬取并验证所有代理。`,
`确定要运行所有 ${enabledPlugins.length} 个启用的插件吗?代理将先以「待验证」入库,需再执行「全部验证」后才会变为可用(除非已开启「爬取后立即验证」)`,
'批量爬取确认',
{
confirmButtonText: '开始爬取',
@@ -316,20 +368,46 @@ async function handleCrawlAll() {
}
)
crawlingAll.value = true
allCrawlResult.value = null
{
const cleared = { ...crawlResults.value }
for (const p of enabledPlugins) {
delete cleared[p.id]
}
crawlResults.value = cleared
}
crawlAllMask.value = true
const response = await pluginService.crawlAll()
if (response.code === 200) {
const data = response.data || {}
allCrawlResult.value = {
type: response.data?.cancelled ? 'info' : 'success',
type: data.cancelled ? 'info' : 'success',
message: response.message,
data: response.data
data
}
if (!response.data?.cancelled) {
if (Array.isArray(data.per_plugin) && data.per_plugin.length) {
const merged = { ...crawlResults.value }
for (const item of data.per_plugin) {
if (!item.plugin_id) continue
merged[item.plugin_id] = {
type: item.crawl_failed ? 'error' : 'success',
message: '获取任务状态成功',
data: {
proxy_count: item.proxy_count,
crawl_failed: item.crawl_failed,
error: item.error
}
}
}
crawlResults.value = merged
}
if (!data.cancelled) {
ElMessage.success('批量爬取完成')
}
await pluginsStore.fetchPlugins()
} else {
allCrawlResult.value = {
type: 'error',
@@ -345,7 +423,7 @@ async function handleCrawlAll() {
}
}
} finally {
crawlingAll.value = false
crawlAllMask.value = false
}
}
@@ -487,66 +565,167 @@ onMounted(async () => {
.plugin-actions {
display: flex;
justify-content: center;
flex-wrap: wrap;
gap: 8px;
}
.plugin-crawl-result {
margin-top: 8px;
}
.result-mini {
display: inline-flex;
.crawl-running-row {
display: flex;
align-items: center;
gap: 6px;
padding: 4px 8px;
border-radius: 4px;
font-size: 12px;
line-height: 1.4;
gap: 8px;
padding: 10px 12px;
font-size: 14px;
color: var(--primary);
background: var(--surface-2);
border-radius: var(--radius-md, 8px);
border: 1px solid var(--border);
}
.result-mini.success {
background: rgba(103, 194, 58, 0.15);
.crawl-spin {
font-size: 18px;
animation: plugin-crawl-spin 1s linear infinite;
}
@keyframes plugin-crawl-spin {
to {
transform: rotate(360deg);
}
}
.result-placeholder {
color: var(--text-muted);
font-size: 14px;
}
.result-panel {
padding: 12px 14px;
border-radius: var(--radius-md, 8px);
border: 1px solid var(--border);
background: var(--surface-2);
min-height: 72px;
}
.result-panel.success {
border-color: rgba(103, 194, 58, 0.35);
}
.result-panel.error {
border-color: rgba(245, 108, 108, 0.35);
}
.result-panel-head {
display: flex;
align-items: center;
gap: 8px;
margin-bottom: 8px;
}
.result-head-icon {
font-size: 18px;
flex-shrink: 0;
}
.result-head-icon.success {
color: var(--success);
}
.result-mini.error {
background: rgba(245, 108, 108, 0.15);
.result-head-icon.failed {
color: var(--danger);
}
.result-icon {
.result-panel-title {
flex: 1;
font-size: 13px;
}
.result-text {
font-weight: 500;
}
.result-count {
font-weight: 600;
padding: 0 4px;
border-radius: 3px;
color: var(--text-secondary);
line-height: 1.4;
}
.result-count.valid {
.result-panel-body {
display: flex;
flex-direction: column;
gap: 8px;
align-items: flex-start;
}
.result-pill {
display: inline-block;
padding: 4px 12px;
border-radius: 6px;
font-size: 14px;
font-weight: 600;
}
.result-pill.fetched {
background: rgba(103, 194, 58, 0.2);
color: var(--success);
}
.result-count.invalid {
background: rgba(245, 108, 108, 0.2);
.result-error-block {
font-size: 13px;
line-height: 1.5;
color: var(--danger);
word-break: break-word;
white-space: pre-wrap;
max-width: 100%;
}
.result-close {
margin-left: 4px;
margin-left: auto;
cursor: pointer;
font-size: 12px;
opacity: 0.7;
font-size: 16px;
opacity: 0.55;
flex-shrink: 0;
transition: opacity 0.2s;
}
.result-close:hover {
opacity: 1;
}
.crawl-stats-summary {
flex-wrap: wrap;
font-size: 14px;
}
.per-plugin-breakdown {
list-style: none;
margin: 12px 0 0;
padding: 0;
max-height: 360px;
overflow-y: auto;
border-top: 1px solid var(--border);
padding-top: 12px;
}
.per-plugin-line {
display: flex;
flex-wrap: wrap;
align-items: center;
gap: 8px 12px;
padding: 8px 0;
border-bottom: 1px solid var(--border);
font-size: 14px;
}
.per-plugin-line:last-child {
border-bottom: none;
}
.pp-name {
font-weight: 600;
color: var(--text-primary);
min-width: 140px;
}
.pp-detail {
color: var(--text-secondary);
}
.pp-detail.err {
color: var(--danger);
flex: 1;
min-width: 120px;
word-break: break-word;
}
</style>

View File

@@ -4,6 +4,18 @@
<el-card class="filter-card" shadow="hover">
<el-form :inline="true" :model="filterForm" class="form-row">
<el-form-item label="池范围">
<el-select
v-model="filterForm.poolFilter"
placeholder="全部"
style="width: 140px"
@change="handleSearch"
>
<el-option label="全部" value="all" />
<el-option label="待验证" value="pending" />
<el-option label="已验证可用" value="available" />
</el-select>
</el-form-item>
<el-form-item label="协议类型">
<el-select
v-model="filterForm.protocol"
@@ -84,6 +96,16 @@
<el-table-column type="selection" width="55" />
<el-table-column prop="ip" label="IP地址" width="150" />
<el-table-column prop="port" label="端口" width="100" />
<el-table-column label="状态" width="100">
<template #default="{ row }">
<el-tag v-if="row.validated === 0" type="warning" effect="light" size="small">
待验证
</el-tag>
<el-tag v-else type="success" effect="light" size="small">
已验证
</el-tag>
</template>
</el-table-column>
<el-table-column prop="protocol" label="协议" width="100">
<template #default="{ row }">
<el-tag :type="getProtocolType(row.protocol)" effect="light" size="small">
@@ -164,6 +186,7 @@ const selectedProxies = ref([])
let abortController = null
const filterForm = reactive({
poolFilter: 'all',
protocol: '',
minScore: 0,
sortBy: 'last_check',
@@ -194,6 +217,7 @@ async function fetchProxies() {
const success = await proxyStore.fetchProxies({
page: currentPage.value,
page_size: pageSize.value,
pool_filter: filterForm.poolFilter === 'all' ? null : filterForm.poolFilter,
protocol: filterForm.protocol || null,
min_score: filterForm.minScore,
sort_by: filterForm.sortBy,
@@ -237,6 +261,7 @@ async function handleDelete(proxy) {
if (!confirmed) return
const filters = {
pool_filter: filterForm.poolFilter === 'all' ? null : filterForm.poolFilter,
protocol: filterForm.protocol || null,
min_score: filterForm.minScore,
sort_by: filterForm.sortBy,
@@ -256,6 +281,7 @@ async function handleBatchDelete() {
if (!confirmed) return
const filters = {
pool_filter: filterForm.poolFilter === 'all' ? null : filterForm.poolFilter,
protocol: filterForm.protocol || null,
min_score: filterForm.minScore,
sort_by: filterForm.sortBy,

View File

@@ -86,26 +86,9 @@
ref="formRef"
>
<el-divider content-position="left">爬虫配置</el-divider>
<el-form-item label="爬取超时" prop="crawl_timeout">
<el-input-number
v-model="settings.crawl_timeout"
:min="5"
:max="120"
:step="5"
class="setting-input"
/>
<span class="setting-suffix"></span>
</el-form-item>
<el-form-item label="最大重试次数" prop="max_retries">
<el-input-number
v-model="settings.max_retries"
:min="0"
:max="10"
class="setting-input"
/>
</el-form-item>
<p class="setting-hint" style="margin: -8px 0 16px 0">
每个爬虫插件单独限时 120 秒，互不影响；此处不再配置全局爬取超时
</p>
<el-divider content-position="left">验证配置</el-divider>
@@ -124,7 +107,7 @@
<el-input-number
v-model="settings.default_concurrency"
:min="10"
:max="200"
:max="400"
:step="10"
class="setting-input"
/>
@@ -170,6 +153,15 @@
/>
</el-form-item>
<el-form-item label="爬取后立即验证" prop="auto_validate_after_crawl">
<el-switch
v-model="settings.auto_validate_after_crawl"
active-text="开启"
inactive-text="关闭"
/>
<span class="setting-hint">关闭时爬取仅入库为待验证需手动或定时全部验证消化队列推荐</span>
</el-form-item>
<el-divider content-position="left">代理评分配置</el-divider>
<el-form-item label="最低代理分数" prop="min_proxy_score">
@@ -232,13 +224,12 @@ const saving = ref(false)
const formRef = ref(null)
const settings = reactive({
crawl_timeout: 30,
validation_timeout: 10,
max_retries: 3,
default_concurrency: 50,
validation_timeout: 6,
default_concurrency: 120,
min_proxy_score: 0,
proxy_expiry_days: 7,
auto_validate: true,
auto_validate_after_crawl: false,
validate_interval_minutes: 30,
validation_targets: []
})
@@ -255,18 +246,15 @@ const defaultValidationTargets = [
// ==================== Computed ====================
const schedulerInfo = computed(() => {
if (schedulerRunning.value) {
return `验证调度器正在运行,每 ${settings.validate_interval_minutes} 分钟自动验证一次所有代理`
} else {
return '验证调度器已停止,代理不会自动验证,建议定期手动验证或开启自动验证'
return `验证调度器正在运行,每 ${settings.validate_interval_minutes} 分钟执行一次:优先验证待验证代理,再按检查时间复检已入库代理`
}
return '验证调度器已停止,待验证代理不会自动检查;可在下方开启自动验证或点击「立即验证全部」'
})
// ==================== Form validation rules ====================
const formRules = {
crawl_timeout: [{ type: 'number', min: 5, max: 120, message: '范围 5-120 秒', trigger: 'blur' }],
validation_timeout: [{ type: 'number', min: 3, max: 60, message: '范围 3-60 秒', trigger: 'blur' }],
max_retries: [{ type: 'number', min: 0, max: 10, message: '范围 0-10', trigger: 'blur' }],
default_concurrency: [{ type: 'number', min: 10, max: 200, message: '范围 10-200', trigger: 'blur' }],
default_concurrency: [{ type: 'number', min: 10, max: 400, message: '范围 10-400', trigger: 'blur' }],
validate_interval_minutes: [{ type: 'number', min: 5, max: 1440, message: '范围 5-1440 分钟', trigger: 'blur' }],
min_proxy_score: [{ type: 'number', min: 0, max: 100, message: '范围 0-100', trigger: 'blur' }],
proxy_expiry_days: [{ type: 'number', min: 1, max: 30, message: '范围 1-30 天', trigger: 'blur' }]
@@ -306,7 +294,7 @@ async function handleStopScheduler() {
async function handleValidateNow() {
try {
await ElMessageBox.confirm(
'确定要立即验证所有代理吗?这可能需要一些时间。',
'将按顺序验证:先处理「待验证」代理,再复检已入库代理。任务在后台执行,可能需要较长时间。',
'确认验证',
{
confirmButtonText: '开始验证',

View File

@@ -25,6 +25,7 @@ def format_proxy(proxy) -> dict:
"score": proxy.score,
"response_time_ms": proxy.response_time_ms,
"last_check": proxy.last_check.isoformat() if proxy.last_check else None,
"validated": getattr(proxy, "validated", 0),
}

View File

@@ -3,7 +3,7 @@ import asyncio
from contextlib import AsyncExitStack, asynccontextmanager
from fastapi import FastAPI
from app.core.db import init_db, get_db
from app.core.db import init_db, get_db, get_db_connection
from app.core.config import settings as app_settings
from app.core.log import logger
from app.core.execution import AsyncWorkerPool, JobExecutor
@@ -13,6 +13,8 @@ from app.repositories.settings_repo import SettingsRepository, DEFAULT_SETTINGS
from app.services.validator_service import ValidatorService
from app.services.plugin_runner import PluginRunner
from app.services.scheduler_service import SchedulerService
from app.api.ws_manager import ConnectionManager
from app.api.realtime import stats_broadcaster_loop
settings_repo = SettingsRepository()
proxy_repo = ProxyRepository()
@@ -46,21 +48,49 @@ async def lifespan(app: FastAPI):
# Validation WorkerPool
async def validation_handler(proxy):
from app.models.domain import ProxyRaw
async with get_db_connection() as db:
existing = await proxy_repo.get_by_ip_port(db, proxy.ip, proxy.port)
is_valid, latency = await validator.validate(
proxy.ip, proxy.port, proxy.protocol
)
async with get_db() as db:
if not existing:
return
if existing.validated == 0:
if is_valid:
await proxy_repo.insert_or_update(
db, proxy.ip, proxy.port, proxy.protocol, score=app_settings.score_valid
db,
proxy.ip,
proxy.port,
proxy.protocol,
score=app_settings.score_valid,
)
if latency:
await proxy_repo.update_response_time(db, proxy.ip, proxy.port, latency)
await proxy_repo.update_response_time(
db, proxy.ip, proxy.port, latency
)
else:
await proxy_repo.delete(db, proxy.ip, proxy.port)
else:
if is_valid:
await proxy_repo.insert_or_update(
db,
proxy.ip,
proxy.port,
proxy.protocol,
score=app_settings.score_valid,
)
if latency:
await proxy_repo.update_response_time(
db, proxy.ip, proxy.port, latency
)
else:
await proxy_repo.update_score(
db, proxy.ip, proxy.port, app_settings.score_invalid,
app_settings.score_min, app_settings.score_max
db,
proxy.ip,
proxy.port,
app_settings.score_invalid,
app_settings.score_min,
app_settings.score_max,
)
worker_pool = AsyncWorkerPool(
@@ -75,7 +105,7 @@ async def lifespan(app: FastAPI):
await stack.enter_async_context(executor)
# Plugin runner
plugin_runner = PluginRunner(timeout=db_settings.get("crawl_timeout", 30))
plugin_runner = PluginRunner()
# Scheduler
scheduler = SchedulerService(
@@ -91,6 +121,9 @@ async def lifespan(app: FastAPI):
app.state.plugin_runner = plugin_runner
app.state.scheduler = scheduler
app.state.ws_manager = ConnectionManager()
app.state.stats_broadcaster_task = asyncio.create_task(stats_broadcaster_loop(app))
# Start the scheduler
if db_settings.get("auto_validate", True):
try:
@@ -101,6 +134,13 @@ async def lifespan(app: FastAPI):
logger.info("API server started")
yield
app.state.stats_broadcaster_task.cancel()
try:
await app.state.stats_broadcaster_task
except asyncio.CancelledError:
pass
await app.state.ws_manager.disconnect_all()
# Stop the scheduler
await scheduler.stop()

View File

@@ -1,4 +1,11 @@
"""FastAPI 应用工厂"""
import asyncio
import sys
# On Windows the default Proactor event loop tends to trigger ConnectTimeout on async httpx outbound requests, behaving differently from sync requests
if sys.platform == "win32":
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.api.lifespan import lifespan

app/api/realtime.py
View File

@@ -0,0 +1,25 @@
"""实时统计广播后台任务"""
import asyncio
from fastapi import FastAPI
from app.core.config import settings
from app.core.log import logger
from app.services.dashboard_stats import get_dashboard_stats
async def stats_broadcaster_loop(app: FastAPI) -> None:
manager = app.state.ws_manager
interval = settings.ws_stats_interval_seconds
while True:
try:
await asyncio.sleep(interval)
if manager.connection_count == 0:
continue
scheduler = app.state.scheduler
stats = await get_dashboard_stats(scheduler.running)
await manager.broadcast_json({"type": "stats", "data": stats})
except asyncio.CancelledError:
break
except Exception:
logger.exception("stats broadcaster tick failed")

View File

@@ -1,9 +1,10 @@
"""路由包"""
from fastapi import APIRouter
from app.api.routes import proxies, plugins, scheduler, settings, tasks
from app.api.routes import proxies, plugins, scheduler, settings, tasks, ws
api_router = APIRouter()
api_router.include_router(proxies.router)
api_router.include_router(ws.router)
api_router.include_router(plugins.router)
api_router.include_router(scheduler.router)
api_router.include_router(settings.router)

View File

@@ -113,8 +113,8 @@ def _create_crawl_all_aggregator(job_ids, executor):
class CrawlAllAggregator(Job):
async def run(self):
self._set_running()
# Wait for all child jobs to finish (at most 30 seconds)
for _ in range(300):
# Wait for all child jobs to finish (roughly 5 minutes at most, matching the frontend polling)
for _ in range(3000):
if self.is_cancelled:
break
all_done = all(
@@ -125,15 +125,56 @@ def _create_crawl_all_aggregator(job_ids, executor):
break
await asyncio.sleep(0.1)
total = 0
valid = 0
invalid = 0
plugins_failed = 0
per_plugin = []
for jid in job_ids:
job = executor.get_job(jid)
if job and job.result:
total += job.result.get("proxy_count", 0)
valid += job.result.get("success_count", 0)
invalid += job.result.get("failure_count", 0)
result = {"total_crawled": total, "valid_count": valid, "invalid_count": invalid}
plugin_id = getattr(job, "plugin_id", "") if job else ""
proxy_count = 0
crawl_failed = False
err_msg = None
job_status = job.status.value if job else "missing"
if not job:
per_plugin.append({
"plugin_id": plugin_id,
"proxy_count": 0,
"crawl_failed": True,
"error": "任务不存在",
"job_status": job_status,
})
plugins_failed += 1
continue
if job.status.value == "failed":
crawl_failed = True
plugins_failed += 1
err_msg = job.error or "任务失败"
elif job.result:
r = job.result
plugin_id = r.get("plugin_id") or plugin_id
proxy_count = r.get("proxy_count", 0)
total += proxy_count
if r.get("crawl_failed") or r.get("failure_count", 0) > 0:
crawl_failed = True
plugins_failed += 1
err_msg = r.get("error")
else:
total += 0
per_plugin.append({
"plugin_id": plugin_id,
"proxy_count": proxy_count,
"crawl_failed": crawl_failed,
"error": err_msg,
"job_status": job_status,
})
result = {
"total_crawled": total,
"plugins_failed": plugins_failed,
"per_plugin": per_plugin,
}
if self.is_cancelled:
result["cancelled"] = True
return result

View File

@@ -5,7 +5,8 @@ from fastapi.responses import StreamingResponse
from app.services.proxy_service import ProxyService
from app.services.scheduler_service import SchedulerService
from app.models.schemas import ProxyListRequest, BatchDeleteRequest
from app.services.dashboard_stats import get_dashboard_stats
from app.models.schemas import ProxyListRequest, BatchDeleteRequest, ProxyDeleteItem
from app.api.deps import get_proxy_service, get_scheduler_service
from app.api.common import success_response, format_proxy
from app.core.exceptions import ProxyPoolException, ProxyNotFoundException
@@ -15,11 +16,9 @@ router = APIRouter(prefix="/api/proxies", tags=["proxies"])
@router.get("/stats")
async def get_stats(
proxy_service: ProxyService = Depends(get_proxy_service),
scheduler_service: SchedulerService = Depends(get_scheduler_service),
):
stats = await proxy_service.get_stats()
stats["scheduler_running"] = scheduler_service.running
stats = await get_dashboard_stats(scheduler_service.running)
return success_response("获取统计信息成功", stats)
@@ -36,6 +35,7 @@ async def list_proxies(
max_score=request.max_score,
sort_by=request.sort_by,
sort_order=request.sort_order,
pool_filter=request.pool_filter,
)
return success_response(
"获取代理列表成功",
@@ -75,6 +75,16 @@ async def export_proxies(
)
@router.post("/delete-one")
async def delete_proxy_one(
item: ProxyDeleteItem,
service: ProxyService = Depends(get_proxy_service),
):
"""JSON 删除推荐IPv6 等含冒号 IP 不受路径分段影响。"""
await service.delete_proxy(item.ip, item.port)
return success_response("删除代理成功")
@router.delete("/{ip}/{port}")
async def delete_proxy(ip: str, port: int, service: ProxyService = Depends(get_proxy_service)):
await service.delete_proxy(ip, port)
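A sketch of the JSON-body delete for an IPv6 proxy, where the path form DELETE /api/proxies/{ip}/{port} is awkward because the address itself contains colons (base URL assumed as above):

import httpx

# The body carries ip/port as plain JSON fields (ProxyDeleteItem), so IPv6
# addresses need no escaping or path splitting.
httpx.post(
    "http://localhost:18080/api/proxies/delete-one",
    json={"ip": "2001:db8::1", "port": 8080},
).raise_for_status()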

View File

@@ -1,10 +1,13 @@
"""设置相关路由"""
import asyncio
from fastapi import APIRouter, Request, Depends
from app.core.db import get_db
from app.repositories.settings_repo import SettingsRepository
from app.models.schemas import SettingsSchema
from app.api.common import success_response
from app.api.deps import get_settings_repo
from app.core.config import settings as app_settings
from app.core.exceptions import ProxyPoolException
from app.core.log import logger
@@ -47,17 +50,21 @@ async def save_settings(
# Hot-update validator timeout and concurrency (takes effect on the next validation run)
if validator:
validator._init_timeout = request.validation_timeout
validator._init_connect_timeout = request.validation_timeout
vt = float(request.validation_timeout)
validator._init_timeout = vt
# Tighten the connect phase separately; never equate it with the total timeout, or dead proxies burn the whole budget on connect
validator._init_connect_timeout = min(
float(app_settings.validator_connect_timeout), vt
)
validator._init_max_concurrency = request.default_concurrency
if request.validation_targets is not None:
validator.update_test_urls(request.validation_targets)
# Close the old session lazily so in-flight validations keep using it
# New requests create a session with the new config via _ensure_session()
await validator.close_socks_sessions()
old_session = validator._http_session
validator._http_session = None
validator._http_connector = None
validator._semaphore = None
if old_session and not old_session.closed:
asyncio.create_task(old_session.close())
logger.info(f"Validator config updated: timeout={request.validation_timeout}, concurrency={request.default_concurrency}, targets={request.validation_targets}")

app/api/routes/ws.py
View File

@@ -0,0 +1,32 @@
"""WebSocket 实时推送"""
import json
from fastapi import APIRouter, WebSocket
from starlette.websockets import WebSocketDisconnect
from app.services.dashboard_stats import get_dashboard_stats
router = APIRouter(prefix="/api", tags=["websocket"])
@router.websocket("/ws")
async def websocket_dashboard(websocket: WebSocket):
app = websocket.app
await websocket.accept()
manager = app.state.ws_manager
await manager.connect(websocket)
try:
stats = await get_dashboard_stats(app.state.scheduler.running)
await websocket.send_json({"type": "stats", "data": stats})
while True:
raw = await websocket.receive_text()
try:
msg = json.loads(raw)
except json.JSONDecodeError:
continue
if msg.get("type") == "ping":
await websocket.send_json({"type": "pong"})
except WebSocketDisconnect:
pass
finally:
await manager.disconnect(websocket)
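A sketch of exercising the endpoint above from a standalone script, using the third-party websockets package as the client (an assumption; any WebSocket client behaves the same way against this route):

import asyncio
import json
import websockets

async def main():
    async with websockets.connect("ws://localhost:18080/api/ws") as ws:
        print(json.loads(await ws.recv()))           # initial {"type": "stats", ...} push
        await ws.send(json.dumps({"type": "ping"}))
        print(json.loads(await ws.recv()))           # {"type": "pong"}

asyncio.run(main())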

app/api/ws_manager.py
View File

@@ -0,0 +1,52 @@
"""WebSocket 连接管理与广播"""
import asyncio
from typing import List
from starlette.websockets import WebSocket, WebSocketState
class ConnectionManager:
def __init__(self) -> None:
self._connections: List[WebSocket] = []
self._lock = asyncio.Lock()
@property
def connection_count(self) -> int:
return len(self._connections)
async def connect(self, websocket: WebSocket) -> None:
async with self._lock:
self._connections.append(websocket)
async def disconnect(self, websocket: WebSocket) -> None:
async with self._lock:
if websocket in self._connections:
self._connections.remove(websocket)
async def broadcast_json(self, payload: dict) -> None:
async with self._lock:
targets = list(self._connections)
stale: List[WebSocket] = []
for ws in targets:
try:
if ws.client_state != WebSocketState.CONNECTED:
stale.append(ws)
continue
await ws.send_json(payload)
except Exception:
stale.append(ws)
if stale:
async with self._lock:
for ws in stale:
if ws in self._connections:
self._connections.remove(ws)
async def disconnect_all(self) -> None:
async with self._lock:
targets = list(self._connections)
self._connections.clear()
for ws in targets:
try:
await ws.close()
except Exception:
pass

View File

@@ -1,6 +1,7 @@
"""全局配置 - 使用 Pydantic Settings 支持环境变量和 .env 文件"""
import os
from typing import List
from pydantic import AliasChoices, Field
from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -11,8 +12,11 @@ class Settings(BaseSettings):
extra="ignore",
)
# Database configuration
db_path: str = "db/proxies.sqlite"
# Database configuration (env var PROXYPOOL_DB_PATH takes precedence, keeping pytest isolated from production)
db_path: str = Field(
default="db/proxies.sqlite",
validation_alias=AliasChoices("PROXYPOOL_DB_PATH", "DB_PATH", "db_path"),
)
# API server configuration
host: str = "127.0.0.1"
@@ -31,6 +35,9 @@ class Settings(BaseSettings):
log_level: str = "INFO"
log_dir: str = "logs"
# WebSocket: stats broadcast interval in seconds (no DB query while there are no connections)
ws_stats_interval_seconds: int = 1
# Export configuration
export_max_records: int = 10000

View File

@@ -54,10 +54,23 @@ async def init_db():
await db.execute("UPDATE proxies SET created_at = CURRENT_TIMESTAMP WHERE created_at IS NULL")
logger.info("Migrated: added created_at column")
# Migration: validated 0 = pending, 1 = validated into the pool (participates in score maintenance)
try:
await db.execute("SELECT validated FROM proxies LIMIT 1")
except Exception:
await db.execute(
"ALTER TABLE proxies ADD COLUMN validated INTEGER NOT NULL DEFAULT 0"
)
await db.execute(
"UPDATE proxies SET validated = 1 WHERE score > 0"
)
logger.info("Migrated: added validated column")
await db.execute("CREATE INDEX IF NOT EXISTS idx_score ON proxies(score)")
await db.execute("CREATE INDEX IF NOT EXISTS idx_protocol ON proxies(protocol)")
await db.execute("CREATE INDEX IF NOT EXISTS idx_last_check ON proxies(last_check)")
await db.execute("CREATE INDEX IF NOT EXISTS idx_ip_port ON proxies(ip, port)")
await db.execute("CREATE INDEX IF NOT EXISTS idx_validated ON proxies(validated)")
# Plugin settings table
await db.execute("""
@@ -94,6 +107,10 @@ async def init_db():
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# Only remove deprecated settings keys; do not touch proxies table data
await db.execute(
"DELETE FROM settings WHERE key IN ('crawl_timeout', 'max_retries')"
)
await db.commit()
logger.info("Database initialized")
@@ -112,6 +129,19 @@ async def get_db() -> AsyncIterator[aiosqlite.Connection]:
await db.close()
@asynccontextmanager
async def get_db_connection() -> AsyncIterator[aiosqlite.Connection]:
"""单连接贯穿「读库 → await 网络 I/O → 写库」,减少验证 worker 每条代理两次 connect。"""
ensure_db_dir()
db = await aiosqlite.connect(DB_PATH)
try:
await db.execute("PRAGMA journal_mode=WAL")
await db.execute("PRAGMA synchronous=NORMAL")
yield db
finally:
await db.close()
@asynccontextmanager
async def transaction() -> AsyncIterator[aiosqlite.Connection]:
"""获取带有显式事务控制的数据库连接

View File

@@ -101,17 +101,51 @@ class CrawlJob(Job):
result = await self.plugin_runner.run(plugin)
proxies: List[ProxyRaw] = result.proxies if result else []
if proxies and self.validator_pool:
await self.validator_pool.submit(proxies)
logger.info(f"CrawlJob {self.id}: submitted {len(proxies)} proxies for validation")
if proxies:
from app.core.db import transaction
from app.repositories.proxy_repo import ProxyRepository
try:
async with transaction() as db:
await ProxyRepository.upsert_many_from_crawl(db, proxies, 0)
logger.info(
f"CrawlJob {self.id}: persisted {len(proxies)} crawled proxies as pending"
)
except Exception as e:
logger.error(
f"CrawlJob {self.id}: failed to persist crawled proxies: {e}",
exc_info=True,
)
raise
if proxies and self.validator_pool:
from app.core.db import get_db as _get_db
from app.repositories.settings_repo import (
SettingsRepository,
DEFAULT_SETTINGS,
)
async with _get_db() as db:
db_settings = await SettingsRepository.get_all(db)
if db_settings.get(
"auto_validate_after_crawl",
DEFAULT_SETTINGS["auto_validate_after_crawl"],
):
await self.validator_pool.submit(proxies)
logger.info(
f"CrawlJob {self.id}: submitted {len(proxies)} proxies for immediate validation"
)
crawl_failed = bool(result and (result.failure_count > 0 or result.error))
payload = {
"plugin_id": self.plugin_id,
"proxy_count": len(proxies),
"crawl_failed": crawl_failed,
"error": result.error if result else None,
# Consistent with the persisted stats: success_count = proxies fetched this round, failure_count = whether the crawl failed (0/1)
"success_count": len(proxies),
"failure_count": result.failure_count if result else 0,
}
if result:
payload["success_count"] = result.success_count
payload["failure_count"] = result.failure_count
self._set_completed(payload)
return payload
@@ -133,7 +167,7 @@ class ValidateAllJob(Job):
repo = self.proxy_repo or ProxyRepository()
async with get_db() as db:
proxies = await repo.list_all(db)
proxies = await repo.list_for_validation(db)
if not proxies:
self._set_completed({"total": 0, "submitted": 0})

View File

@@ -65,8 +65,11 @@ class AsyncWorkerPool:
logger.info(f"{self.name} stopped")
async def submit(self, items: List[T]) -> None:
"""提交一批任务到队列(阻塞直到有空位,天然背压"""
"""提交一批任务到队列(优先 put_nowait队列满时再 await put"""
for item in items:
try:
self._queue.put_nowait(item)
except asyncio.QueueFull:
await self._queue.put(item)
async def drain(self) -> None:

View File

@@ -18,6 +18,8 @@ class BaseCrawlerPlugin(ABC):
description: str = ""
enabled: bool = True
default_config: Dict[str, Any] = {}
#: asyncio.wait_for cap (seconds) on a single plugin's entire crawl(); each plugin's budget is independent of the others
crawl_timeout_seconds: float = 120.0
def __init__(self):
self._config: Dict[str, Any] = dict(self.default_config or {})
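A sketch of how the runner can enforce this cap, assuming it wraps each plugin's crawl() call independently (the actual PluginRunner change is not part of this hunk):

import asyncio
from typing import List

async def run_with_cap(plugin) -> List:
    # Each plugin gets its own asyncio.wait_for budget, so one slow site
    # cannot consume time that belongs to the other plugins.
    try:
        return await asyncio.wait_for(plugin.crawl(), plugin.crawl_timeout_seconds)
    except asyncio.TimeoutError:
        return []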

View File

@@ -22,6 +22,7 @@ class ProxyRaw:
@dataclass
class Proxy:
"""数据库中的代理实体"""
ip: str
port: int
protocol: str
@@ -29,6 +30,7 @@ class Proxy:
response_time_ms: Optional[float] = None
last_check: Optional[datetime] = None
created_at: Optional[datetime] = None
validated: int = 0  # 0 = pending, 1 = validated (eligible for scoring and external use)
@dataclass
@@ -46,7 +48,12 @@ class PluginInfo:
@dataclass
class CrawlResult:
"""插件爬取结果"""
"""插件爬取结果
success_count: 最近一轮成功爬取到的代理条数(去重后),非「验证通过数」
failure_count: 最近一轮是否爬取失败(健康检查/超时/异常为 1否则为 0
"""
plugin_name: str
proxies: List[ProxyRaw] = field(default_factory=list)
success_count: int = 0

View File

@@ -1,5 +1,5 @@
"""Pydantic 模型 - 用于 API 请求/响应校验"""
from pydantic import BaseModel, Field, field_validator
from pydantic import BaseModel, Field, field_validator, ConfigDict
from typing import Optional, List
@@ -25,6 +25,7 @@ class ProxyResponse(BaseModel):
score: int
response_time_ms: Optional[float] = None
last_check: Optional[str] = None
validated: int = 0
class PluginResponse(BaseModel):
@@ -39,13 +40,14 @@ class PluginResponse(BaseModel):
class SettingsSchema(BaseModel):
crawl_timeout: int = Field(default=30, ge=5, le=120)
validation_timeout: int = Field(default=10, ge=3, le=60)
max_retries: int = Field(default=3, ge=0, le=10)
default_concurrency: int = Field(default=50, ge=10, le=200)
model_config = ConfigDict(extra="ignore")
validation_timeout: int = Field(default=6, ge=3, le=60)
default_concurrency: int = Field(default=120, ge=10, le=400)
min_proxy_score: int = Field(default=0, ge=0, le=100)
proxy_expiry_days: int = Field(default=7, ge=1, le=30)
auto_validate: bool = True
auto_validate_after_crawl: bool = False
validate_interval_minutes: int = Field(default=30, ge=5, le=1440)
validation_targets: List[str] = Field(
default=[
@@ -60,10 +62,14 @@ class SettingsSchema(BaseModel):
class CrawlSummarySchema(BaseModel):
"""单次爬取任务结果(与 CrawlJob 返回的 result 对齐)"""
plugin_id: str
proxy_count: int
valid_count: int
invalid_count: int = 0
crawl_failed: bool = False
error: Optional[str] = None
success_count: int = 0  # same as proxy_count, kept for old-frontend compatibility
failure_count: int = 0
class ProxyListRequest(BaseModel):
@@ -74,6 +80,20 @@ class ProxyListRequest(BaseModel):
max_score: Optional[int] = Field(default=None, ge=0)
sort_by: str = "last_check"
sort_order: str = "DESC"
pool_filter: Optional[str] = Field(
default=None,
description="all 或不传=全部pending=待验证available=已验证且可用",
)
@field_validator("pool_filter")
@classmethod
def validate_pool_filter(cls, v: Optional[str]):
if v is None or v == "" or v == "all":
return None
allowed = ("pending", "available")
if v not in allowed:
raise ValueError(f"pool_filter 必须是 {allowed} 之一或 all")
return v
@field_validator("protocol")
@classmethod

View File

@@ -9,6 +9,15 @@ from .kuaidaili import KuaiDaiLiPlugin
from .speedx import SpeedXPlugin
from .yundaili import YunDaiLiPlugin
from .proxyscrape import ProxyScrapePlugin
from .fpw_proxy_list_download import FpwProxyListDownloadPlugin
from .fpw_socks_ssl_proxy import FpwSocksSslProxyPlugin
from .fpw_spys_one import FpwSpysOnePlugin
from .fpw_proxynova import FpwProxynovaPlugin
from .fpw_hidemy import FpwHidemyPlugin
from .fpw_premproxy import FpwPremproxyPlugin
from .fpw_freeproxylists import FpwFreeproxylistsPlugin
from .fpw_gatherproxy import FpwGatherproxyPlugin
from .fpw_checkerproxy import FpwCheckerproxyPlugin
# Explicitly register all plugins
registry.register(Fate0Plugin)
@@ -19,3 +28,12 @@ registry.register(KuaiDaiLiPlugin)
registry.register(SpeedXPlugin)
registry.register(YunDaiLiPlugin)
registry.register(ProxyScrapePlugin)
registry.register(FpwProxyListDownloadPlugin)
registry.register(FpwSocksSslProxyPlugin)
registry.register(FpwSpysOnePlugin)
registry.register(FpwProxynovaPlugin)
registry.register(FpwHidemyPlugin)
registry.register(FpwPremproxyPlugin)
registry.register(FpwFreeproxylistsPlugin)
registry.register(FpwGatherproxyPlugin)
registry.register(FpwCheckerproxyPlugin)

View File

@@ -3,7 +3,7 @@ import re
import random
import asyncio
import httpx
from typing import List, Optional
from typing import Dict, List, Optional
from bs4 import BeautifulSoup
from app.core.plugin_system import BaseCrawlerPlugin
from app.models.domain import ProxyRaw
@@ -43,9 +43,56 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
self._client = httpx.AsyncClient(
transport=transport,
follow_redirects=True,
# Ignore system HTTP(S)_PROXY so a misconfigured proxy does not break every list site
trust_env=False,
)
return self._client
@staticmethod
def _http_timeout(seconds: float) -> httpx.Timeout:
"""连接阶段单独收紧,避免 AsyncClient 在部分环境下长时间卡在 connect。"""
t = max(2.0, float(seconds))
c = min(6.0, max(3.0, t * 0.35))
return httpx.Timeout(t, connect=c)
@staticmethod
def _decode_response_body(response: httpx.Response) -> str:
content = response.content
encoding = response.encoding
if encoding == "utf-8" or not encoding:
try:
return content.decode("utf-8")
except UnicodeDecodeError:
return content.decode("gbk", errors="ignore")
return content.decode(encoding, errors="ignore")
def _sync_get(self, url: str, timeout: float, headers: dict) -> str:
"""同步 GET部分站点在 Windows 上 AsyncClient 易 ConnectTimeout同步 Client 正常)。"""
to = BaseHTTPPlugin._http_timeout(timeout)
with httpx.Client(
transport=httpx.HTTPTransport(retries=0),
follow_redirects=True,
trust_env=False,
) as c:
r = c.get(url, headers=headers, timeout=to)
if r.status_code != 200:
return ""
return self._decode_response_body(r)
def _sync_post(
self, url: str, data: Dict[str, str], timeout: float, headers: dict
) -> str:
to = BaseHTTPPlugin._http_timeout(timeout)
with httpx.Client(
transport=httpx.HTTPTransport(retries=0),
follow_redirects=True,
trust_env=False,
) as c:
r = c.post(url, headers=headers, data=data, timeout=to)
if r.status_code != 200:
return ""
return self._decode_response_body(r)
async def fetch(
self,
url: str,
@@ -56,35 +103,81 @@ class BaseHTTPPlugin(BaseCrawlerPlugin):
"""异步抓取指定 URL 的 HTML 内容"""
from app.core.log import logger
client = self._get_client()
to = self._http_timeout(timeout)
for attempt in range(retries):
try:
response = await client.get(url, headers=self.get_headers(), timeout=timeout)
response = await client.get(url, headers=self.get_headers(), timeout=to)
if raise_for_status:
response.raise_for_status()
if response.status_code == 200:
content = response.content
encoding = response.encoding
if encoding == "utf-8" or not encoding:
try:
return content.decode("utf-8")
except UnicodeDecodeError:
return content.decode("gbk", errors="ignore")
return content.decode(encoding, errors="ignore")
else:
return self._decode_response_body(response)
logger.warning(f"Fetch {url} returned status {response.status_code}")
except Exception as e:
logger.warning(f"Fetch {url} failed (attempt {attempt + 1}/{retries}): {e}")
if attempt < retries - 1:
await asyncio.sleep(random.uniform(1, 3))
try:
text = await asyncio.to_thread(
self._sync_get, url, timeout, self.get_headers()
)
if text:
logger.info(f"Fetch {url} 使用同步回退成功")
return text
except Exception as e:
logger.warning(f"Fetch {url} 同步回退失败: {e}")
return ""
async def fetch_all(self, urls: List[str], timeout: float = 15.0) -> List[str]:
async def fetch_post(
self,
url: str,
data: Optional[Dict[str, str]] = None,
timeout: float = 15.0,
retries: int = 2,
) -> str:
"""POST application/x-www-form-urlencoded用于 spys.one 等表单页。"""
from app.core.log import logger
client = self._get_client()
payload = data or {}
to = self._http_timeout(timeout)
for attempt in range(retries):
try:
response = await client.post(
url,
headers=self.get_headers(),
data=payload,
timeout=to,
)
if response.status_code == 200:
return self._decode_response_body(response)
logger.warning(f"POST {url} returned status {response.status_code}")
except Exception as e:
logger.warning(f"POST {url} failed (attempt {attempt + 1}/{retries}): {e}")
if attempt < retries - 1:
await asyncio.sleep(random.uniform(1, 3))
try:
text = await asyncio.to_thread(
self._sync_post, url, payload, timeout, self.get_headers()
)
if text:
logger.info(f"POST {url} 使用同步回退成功")
return text
except Exception as e:
logger.warning(f"POST {url} 同步回退失败: {e}")
return ""
async def fetch_all(
self,
urls: List[str],
timeout: float = 15.0,
retries: int = 2,
) -> List[str]:
"""并发抓取多个 URL限制单个插件内部并发"""
semaphore = asyncio.Semaphore(self.max_concurrency)
async def _fetch_limited(url: str):
async with semaphore:
return await self.fetch(url, timeout=timeout)
return await self.fetch(url, timeout=timeout, retries=retries)
tasks = [_fetch_limited(url) for url in urls]
return await asyncio.gather(*tasks)

View File

@@ -0,0 +1,65 @@
"""checkerproxy.net尝试常见导出路径 + 正文中的 ip:port排除示例占位"""
import re
from typing import List, Set, Tuple
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwCheckerproxyPlugin(BaseHTTPPlugin):
name = "fpw_checkerproxy"
display_name = "CheckerProxy.net"
description = "checkerproxy.net无稳定公开 API 时可能为空;多路径尝试)"
def __init__(self):
super().__init__()
self.urls = [
"https://checkerproxy.net/",
"https://checkerproxy.net/export",
"https://checkerproxy.net/api/export",
]
@staticmethod
def _parse_ip_ports(text: str) -> List[ProxyRaw]:
bad = {"123.123.123.123", "127.0.0.1", "0.0.0.0"}
seen: Set[Tuple[str, int]] = set()
out: List[ProxyRaw] = []
for m in re.finditer(
r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})\b",
text,
):
ip, ps = m.group(1), m.group(2)
if ip in bad:
continue
if not ps.isdigit() or not (1 <= int(ps) <= 65535):
continue
key = (ip, int(ps))
if key in seen:
continue
seen.add(key)
try:
out.append(ProxyRaw(ip, int(ps), "http"))
except ValueError:
continue
return out
async def crawl(self) -> List[ProxyRaw]:
merged: List[ProxyRaw] = []
seen: Set[Tuple[str, int, str]] = set()
htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
for html in htmls:
if not html or len(html) < 200:
continue
for p in self._parse_ip_ports(html):
k = (p.ip, p.port, p.protocol)
if k not in seen:
seen.add(k)
merged.append(p)
if len(merged) >= 50:
break
if merged:
logger.info(f"{self.display_name} 解析 {len(merged)}")
else:
logger.warning(f"{self.display_name} 未解析到代理(站点可能仅提供在线检测)")
return merged

View File

@@ -0,0 +1,69 @@
"""freeproxylists.net 及常见镜像路径(表格 / 纯文本)。"""
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwFreeproxylistsPlugin(BaseHTTPPlugin):
name = "fpw_freeproxylists"
display_name = "FreeProxyLists"
description = "freeproxylists.net 系列页面(易被 403多 URL 尝试)"
def __init__(self):
super().__init__()
self.urls = [
"http://www.freeproxylists.net/",
"http://freeproxylists.net/",
"http://www.freeproxylists.net/en/http-txt.html",
]
def _parse_any(self, html: str) -> List[ProxyRaw]:
ipport = re.findall(
r"\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{2,5})\b",
html,
)
if len(ipport) >= 5:
out: List[ProxyRaw] = []
for ip, ps in ipport:
if ps.isdigit() and 1 <= int(ps) <= 65535:
try:
out.append(ProxyRaw(ip, int(ps), "http"))
except ValueError:
pass
return out
soup = BeautifulSoup(html, "lxml")
results: List[ProxyRaw] = []
for tr in soup.find_all("tr"):
tds = tr.find_all("td")
if len(tds) < 2:
continue
ip = tds[0].get_text(strip=True)
port = tds[1].get_text(strip=True)
if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip) and port.isdigit():
if 1 <= int(port) <= 65535:
try:
results.append(ProxyRaw(ip, int(port), "http"))
except ValueError:
pass
return results
async def crawl(self) -> List[ProxyRaw]:
seen = set()
out: List[ProxyRaw] = []
htmls = await self.fetch_all(self.urls, timeout=10, retries=1)
for url, html in zip(self.urls, htmls):
if not html:
continue
for p in self._parse_any(html):
key = (p.ip, p.port, p.protocol)
if key not in seen:
seen.add(key)
out.append(p)
if out:
logger.info(f"{self.display_name}{url} 累计 {len(out)}")
return out

View File

@@ -0,0 +1,61 @@
"""gatherproxy.com 页面内嵌 JSONPROXY_IP / PROXY_PORT"""
import re
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwGatherproxyPlugin(BaseHTTPPlugin):
name = "fpw_gatherproxy"
display_name = "GatherProxy"
description = "gatherproxy.com 内嵌代理 JSON站点常有限流"
def __init__(self):
super().__init__()
self.urls = [
"http://www.gatherproxy.com/proxylist/anonymity/?t=Elite",
"http://www.gatherproxy.com/proxylist/country/?c=United%20States",
]
def _extract_from_text(self, text: str) -> List[ProxyRaw]:
results: List[ProxyRaw] = []
for m in re.finditer(
r"PROXY_IP['\"]?\s*:\s*['\"]([\d.]+)['\"].{0,120}?PROXY_PORT['\"]?\s*:\s*['\"](\d+)['\"]",
text,
re.DOTALL | re.IGNORECASE,
):
ip, port = m.group(1), m.group(2)
if port.isdigit() and 1 <= int(port) <= 65535:
try:
results.append(ProxyRaw(ip, int(port), "http"))
except ValueError:
continue
for m in re.finditer(
r"\{[^{}]*\"PROXY_IP\"\s*:\s*\"([\d.]+)\"[^{}]*\"PROXY_PORT\"\s*:\s*\"(\d+)\"[^{}]*\}",
text,
):
ip, port = m.group(1), m.group(2)
if port.isdigit() and 1 <= int(port) <= 65535:
try:
results.append(ProxyRaw(ip, int(port), "http"))
except ValueError:
continue
return results
async def crawl(self) -> List[ProxyRaw]:
seen = set()
out: List[ProxyRaw] = []
htmls = await self.fetch_all(self.urls, timeout=10, retries=1)
for url, html in zip(self.urls, htmls):
if not html:
continue
for p in self._extract_from_text(html):
k = (p.ip, p.port)
if k not in seen:
seen.add(k)
out.append(p)
if out:
logger.info(f"{self.display_name}{url} 累计 {len(out)}")
return out

app/plugins/fpw_hidemy.py
View File

@@ -0,0 +1,38 @@
"""hidemyna.me 免费代理列表表格。"""
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwHidemyPlugin(BaseHTTPPlugin):
name = "fpw_hidemy"
display_name = "HideMy.name"
description = "hidemyna.me 英文代理列表HTTP/HTTPS/SOCKS"
def __init__(self):
super().__init__()
self.urls = [
"https://hidemyna.me/en/proxy-list/",
"https://hidemyna.me/en/proxy-list/?type=hs",
"https://hidemyna.me/en/proxy-list/?type=socks4",
]
async def crawl(self) -> List[ProxyRaw]:
results: List[ProxyRaw] = []
htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
for url, html in zip(self.urls, htmls):
if not html:
continue
batch = self.parse_html_table(
html,
column_map={"ip": 0, "port": 1, "protocol": 4},
protocol="http",
)
if batch:
results.extend(batch)
logger.info(f"{self.display_name} {url}: {len(batch)}")
if results:
logger.info(f"{self.display_name} 合计 {len(results)}")
return results


@@ -0,0 +1,64 @@
"""premproxy.com 列表页表格。"""
import re
from typing import List
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwPremproxyPlugin(BaseHTTPPlugin):
name = "fpw_premproxy"
display_name = "PremProxy"
description = "premproxy.com HTTP/SOCKS 列表页"
def __init__(self):
super().__init__()
self.urls = [
"https://premproxy.com/list/",
"https://premproxy.com/socks-list/",
]
def _parse_html(self, html: str) -> List[ProxyRaw]:
soup = BeautifulSoup(html, "lxml")
results: List[ProxyRaw] = []
for tr in soup.find_all("tr"):
tds = tr.find_all("td")
if len(tds) < 2:
continue
ip = tds[0].get_text(strip=True)
port = tds[1].get_text(strip=True)
if not re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip):
continue
if not port.isdigit() or not (1 <= int(port) <= 65535):
continue
row = tr.get_text(" ", strip=True).lower()
if "socks5" in row:
proto = "socks5"
elif "socks4" in row or "socks" in row:
proto = "socks4"
elif "https" in row:
proto = "https"
else:
proto = "http"
try:
results.append(ProxyRaw(ip, int(port), proto))
except ValueError:
continue
return results
async def crawl(self) -> List[ProxyRaw]:
merged: List[ProxyRaw] = []
htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
for url, html in zip(self.urls, htmls):
if not html:
continue
batch = self._parse_html(html)
if batch:
merged.extend(batch)
logger.info(f"{self.display_name} {url}: {len(batch)}")
if merged:
logger.info(f"{self.display_name} 合计 {len(merged)}")
return merged


@@ -0,0 +1,54 @@
"""www.proxy-list.download 公开 APIREADME: Free_Proxy_Website"""
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwProxyListDownloadPlugin(BaseHTTPPlugin):
name = "fpw_proxy_list_download"
display_name = "Proxy-List.download"
description = "proxy-list.download 官方 APIhttp/https/socks4/socks5"
def __init__(self):
super().__init__()
self.max_concurrency = 8
self.api_pairs = [
("http", "https://www.proxy-list.download/api/v1/get?type=http"),
("https", "https://www.proxy-list.download/api/v1/get?type=https"),
("socks4", "https://www.proxy-list.download/api/v1/get?type=socks4"),
("socks5", "https://www.proxy-list.download/api/v1/get?type=socks5"),
]
self.fallback_pairs = [
("http", "https://api.proxyscrape.com/v2/?request=get&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all"),
("https", "https://api.proxyscrape.com/v2/?request=get&protocol=https&timeout=10000&country=all&ssl=all&anonymity=all"),
("socks4", "https://api.proxyscrape.com/v2/?request=get&protocol=socks4&timeout=10000&country=all&ssl=all&anonymity=all"),
("socks5", "https://api.proxyscrape.com/v2/?request=get&protocol=socks5&timeout=10000&country=all&ssl=all&anonymity=all"),
]
async def crawl(self) -> List[ProxyRaw]:
results: List[ProxyRaw] = []
urls = [u for _, u in self.api_pairs]
htmls = await self.fetch_all(urls, timeout=10, retries=1)
for (protocol, _), text in zip(self.api_pairs, htmls):
if not text:
continue
batch = self.parse_text_proxies(text, protocol)
if batch:
results.extend(batch)
logger.info(f"{self.display_name} {protocol}: {len(batch)}")
if not results:
logger.warning(f"{self.display_name} 主 API 无数据,尝试 ProxyScrape 备用")
fb_urls = [u for _, u in self.fallback_pairs]
fb_htmls = await self.fetch_all(fb_urls, timeout=10, retries=1)
for (protocol, _), text in zip(self.fallback_pairs, fb_htmls):
if not text:
continue
batch = self.parse_text_proxies(text, protocol)
if batch:
results.extend(batch)
logger.info(f"{self.display_name} fallback {protocol}: {len(batch)}")
if results:
logger.info(f"{self.display_name} 合计 {len(results)}")
return results


@@ -0,0 +1,74 @@
"""proxynova.com 表格内 JS 混淆 IP + 明文端口。"""
import re
from typing import List, Optional
from bs4 import BeautifulSoup
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwProxynovaPlugin(BaseHTTPPlugin):
name = "fpw_proxynova"
display_name = "ProxyNova"
description = "proxynova.com 代理列表(解析 document.write 混淆 IP"
def __init__(self):
super().__init__()
self.urls = ["https://www.proxynova.com/proxy-server-list/"]
@staticmethod
def _decode_proxynova_ip(script_inner: str) -> Optional[str]:
"""解析 document.write(\".081.301\".split(\"\").reverse()...concat(\"118.174\"...))"""
m1 = re.search(r'document\.write\("([^"]+)"\.split', script_inner)
m2 = re.search(r'\.concat\("([^"]+)"', script_inner)
if not m1 or not m2:
return None
a, b = m1.group(1), m2.group(1)
part1 = "".join(reversed(a))
return part1 + b
def _parse_rows(self, html: str) -> List[ProxyRaw]:
soup = BeautifulSoup(html, "lxml")
tbody = soup.find("tbody")
if not tbody:
return []
out: List[ProxyRaw] = []
for tr in tbody.find_all("tr"):
tds = tr.find_all("td")
if len(tds) < 2:
continue
script = tds[0].find("script")
if not script or not script.string:
continue
ip = self._decode_proxynova_ip(script.string)
port_txt = tds[1].get_text(strip=True)
if not ip or not port_txt.isdigit():
continue
port = int(port_txt)
if not (1 <= port <= 65535):
continue
row_text = tr.get_text(" ", strip=True).upper()
if "SOCKS5" in row_text:
proto = "socks5"
elif "SOCKS4" in row_text:
proto = "socks4"
elif "HTTPS" in row_text:
proto = "https"
else:
proto = "http"
try:
out.append(ProxyRaw(ip, port, proto))
except ValueError:
continue
return out
async def crawl(self) -> List[ProxyRaw]:
html = await self.fetch(self.urls[0], timeout=14, retries=1)
if not html:
return []
results = self._parse_rows(html)
if results:
logger.info(f"{self.display_name} 解析 {len(results)}")
return results


@@ -0,0 +1,56 @@
"""socks-proxy.net / sslproxies.org 表格README 参考 GetProxyFromSocks-proxy.py"""
import re
from typing import List
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwSocksSslProxyPlugin(BaseHTTPPlugin):
name = "fpw_socks_ssl_proxy"
display_name = "Socks-Proxy / SSLProxies"
description = "socks-proxy.net 与 sslproxies.org 首页表格HTTP/HTTPS 列表)"
def __init__(self):
super().__init__()
self.max_concurrency = 6
# 与 sslproxies 同模板的镜像站较多socks-proxy 在部分网络下不稳定,多源提高成功率
self.urls = [
"https://www.sslproxies.org/",
"https://free-proxy-list.net/",
"https://www.us-proxy.org/",
"https://www.socks-proxy.net/",
]
def _parse_page(self, html: str, default_protocol: str) -> List[ProxyRaw]:
results = []
pattern = re.compile(
r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>\s*<td[^>]*>\s*(\d+)",
re.I,
)
for ip, port in pattern.findall(html):
if port.isdigit() and 1 <= int(port) <= 65535:
try:
results.append(ProxyRaw(ip, int(port), default_protocol))
except ValueError:
continue
return results
async def crawl(self) -> List[ProxyRaw]:
results: List[ProxyRaw] = []
htmls = await self.fetch_all(self.urls, timeout=12, retries=1)
for url, html in zip(self.urls, htmls):
if not html:
continue
if "socks-proxy" in url:
proto = "socks4"
else:
proto = "http"
batch = self._parse_page(html, proto)
results.extend(batch)
if batch:
logger.info(f"{self.display_name} {url}: {len(batch)}")
if results:
logger.info(f"{self.display_name} 合计 {len(results)}")
return results

app/plugins/fpw_spys_one.py Normal file

@@ -0,0 +1,148 @@
"""spys.one 表单 POST + 端口 XOR 解码README: GetProxyFromSPYSONE.py"""
import asyncio
import re
from typing import Dict, List, Tuple
from app.core.plugin_system import ProxyRaw
from app.plugins.base import BaseHTTPPlugin
from app.core.log import logger
class FpwSpysOnePlugin(BaseHTTPPlugin):
name = "fpw_spys_one"
display_name = "Spys.one"
description = "spys.one HTTP/SOCKS 列表POST 筛选 + XOR 端口解码)"
def __init__(self):
super().__init__()
self.pages: List[Tuple[str, str, str]] = [
("http", "http://spys.one/en/http-proxy-list/", "1"),
("socks5", "http://spys.one/en/socks-proxy-list/", "2"),
]
@staticmethod
def _exec_spys_decoder(body: str) -> Dict[str, int]:
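# 例(假设的混淆片段):"a1b2=7;c3d4=a1b2^5;document.write(...)" 求值得 env={"a1b2": 7, "c3d4": 2}7^5=2含 document 的语句被跳过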
body = re.sub(r"\s+", "", body)
stmts = [s.strip() for s in body.split(";") if s.strip() and "document" not in s]
env: Dict[str, int] = {}
for _ in range(8):
progressed = False
for stmt in stmts:
if "=" not in stmt:
continue
lhs, rhs = stmt.split("=", 1)
lhs = lhs.strip()
rhs = rhs.strip()
if lhs in env:
continue
if "^" not in rhs:
if rhs.isdigit():
env[lhs] = int(rhs)
progressed = True
continue
a, b = rhs.split("^", 1)
a, b = a.strip(), b.strip()
def gv(x: str) -> int:
if x.isdigit():
return int(x)
return env[x]
try:
env[lhs] = gv(a) ^ gv(b)
progressed = True
except KeyError:
continue
if not progressed:
break
return env
def _decoder_env_from_html(self, html: str) -> Dict[str, int]:
best: Dict[str, int] = {}
for m in re.finditer(r"<script[^>]*>([\s\S]*?)</script>", html, re.IGNORECASE):
chunk = m.group(1).strip()
if "document.write" in chunk:
continue
xor_assigns = len(re.findall(r"\w+=\d+\^\w+", chunk))
if xor_assigns < 4:
continue
env = self._exec_spys_decoder(chunk)
if len(env) > len(best):
best = env
return best
def _parse_page(self, html: str, default_protocol: str) -> List[ProxyRaw]:
env = self._decoder_env_from_html(html)
if not env:
logger.warning(f"{self.display_name} 未解析到 XOR 变量表")
return []
results: List[ProxyRaw] = []
for m in re.finditer(
r"class=spy14>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})<script[^>]*>([\s\S]+?)</script>",
html,
re.IGNORECASE,
):
ip = m.group(1)
inner = m.group(2)
dw = re.search(
r'document\.write\("[^"]*"\+((?:\(\w+\^\w+\)\+?)+)\)',
inner,
)
if not dw:
continue
pairs = re.findall(r"\((\w+)\^(\w+)\)", dw.group(1))
if not pairs:
continue
try:
digits = "".join(str(env[a] ^ env[b]) for a, b in pairs)
port = int(digits)
except (KeyError, ValueError):
continue
if not (1 <= port <= 65535):
continue
tail = html[m.end() : m.end() + 2000]
u = tail.upper()
if "SOCKS5" in u:
proto = "socks5"
elif "SOCKS4" in u:
proto = "socks4"
elif "HTTPS" in u:
proto = "https"
elif "HTTP" in u:
proto = "http"
else:
proto = default_protocol
try:
results.append(ProxyRaw(ip, port, proto))
except ValueError:
continue
return results
async def crawl(self) -> List[ProxyRaw]:
results: List[ProxyRaw] = []
form_base = {
"xpp": "3",
"xf1": "0",
"xf2": "0",
"xf4": "0",
}
async def _one(proto: str, url: str, xf5: str) -> Tuple[str, str]:
data = {**form_base, "xf5": xf5}
html = await self.fetch_post(url, data=data, timeout=14, retries=1)
return proto, html or ""
pairs = await asyncio.gather(
*(_one(proto, url, xf5) for proto, url, xf5 in self.pages)
)
for proto, html in pairs:
if not html:
continue
batch = self._parse_page(html, proto)
if batch:
results.extend(batch)
logger.info(f"{self.display_name} ({proto}): {len(batch)}")
if results:
logger.info(f"{self.display_name} 合计 {len(results)}")
return results


@@ -18,17 +18,19 @@ class KuaiDaiLiPlugin(BaseHTTPPlugin):
def __init__(self):
super().__init__()
# 减少页数,降低被反爬概率,确保至少能拿到数据
# fps/dps 列表页目前仍可 200inha/intr 常返回 567反爬作末位兜底
self.urls = [
"https://www.kuaidaili.com/free/fps/",
"https://www.kuaidaili.com/free/dps/",
"https://www.kuaidaili.com/free/inha/1/",
"https://www.kuaidaili.com/free/intr/1/",
]
def get_headers(self) -> dict:
headers = super().get_headers()
headers["Referer"] = "https://www.kuaidaili.com/free/inha/"
headers["Referer"] = "https://www.kuaidaili.com/free/"
headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
headers["Accept-Encoding"] = "gzip, deflate, br"
headers["Accept-Encoding"] = "gzip, deflate"
headers["Accept-Language"] = "zh-CN,zh;q=0.9,en;q=0.8"
headers["Sec-Fetch-Dest"] = "document"
headers["Sec-Fetch-Mode"] = "navigate"
@@ -36,15 +38,56 @@ class KuaiDaiLiPlugin(BaseHTTPPlugin):
headers["Upgrade-Insecure-Requests"] = "1"
return headers
@staticmethod
def _infer_protocol(texts: List[str]) -> str:
"""从一行单元格文本中推断协议(兼容 fps / dps / inha 等版式)。"""
for t in texts[2:]:
tl = t.lower().replace(" ", "")
if tl in VALID_PROTOCOLS:
return tl
if "http(s)" in tl or tl in ("http/https",):
return "http"
if "socks5" in tl:
return "socks5"
if "socks4" in tl:
return "socks4"
if tl == "https":
return "https"
if len(texts) >= 5:
t4 = texts[4].lower().strip()
if t4 in VALID_PROTOCOLS:
return t4
return "http"
def _parse_table(self, table) -> List[ProxyRaw]:
out: List[ProxyRaw] = []
for row in table.find_all("tr"):
tds = row.find_all("td")
if len(tds) < 2:
continue
texts = [td.get_text(strip=True) for td in tds]
ip = texts[0]
port_s = texts[1]
if not re.match(r"^\d+\.\d+\.\d+\.\d+$", ip):
continue
if not port_s.isdigit() or not (1 <= int(port_s) <= 65535):
continue
protocol = self._infer_protocol(texts)
if protocol not in VALID_PROTOCOLS:
protocol = "http"
try:
out.append(ProxyRaw(ip, int(port_s), protocol))
except ValueError:
continue
return out
async def crawl(self) -> List[ProxyRaw]:
results = []
# 先访问首页预热会话,获取 cookie降低被反爬概率
await self.fetch("https://www.kuaidaili.com/", timeout=10)
await asyncio.sleep(random.uniform(2, 4))
await self.fetch("https://www.kuaidaili.com/free/", timeout=10)
await asyncio.sleep(random.uniform(1, 2))
# 顺序请求免费代理页面
for url in self.urls:
html = await self.fetch(url, timeout=10)
html = await self.fetch(url, timeout=15)
if not html:
continue
soup = BeautifulSoup(html, "lxml")
@@ -53,20 +96,11 @@ class KuaiDaiLiPlugin(BaseHTTPPlugin):
logger.warning(f"{self.display_name} 未能找到表格,可能是触发了反爬: {url}")
continue
for row in table.find_all("tr"):
tds = row.find_all("td")
if len(tds) >= 5:
ip = tds[0].get_text(strip=True)
port = tds[1].get_text(strip=True)
protocol = tds[4].get_text(strip=True).lower() if len(tds) > 4 else "http"
if protocol not in VALID_PROTOCOLS:
protocol = "http"
if re.match(r"^\d+\.\d+\.\d+\.\d+$", ip) and port.isdigit() and 1 <= int(port) <= 65535:
try:
results.append(ProxyRaw(ip, int(port), protocol))
except ValueError:
continue
await asyncio.sleep(random.uniform(5, 8))
batch = self._parse_table(table)
if batch:
results.extend(batch)
logger.info(f"{self.display_name} {url} 解析 {len(batch)}")
await asyncio.sleep(random.uniform(1, 2))
if results:
logger.info(f"{self.display_name} 解析完成,获取 {len(results)} 个潜在代理")


@@ -109,21 +109,5 @@ class ProxyScrapePlugin(BaseHTTPPlugin):
if results:
logger.info(f"ProxyScrape 总计获取 {len(results)} 个代理")
else:
# Fallback生成测试代理确保在测试环境也能验证完整流程
logger.warning("ProxyScrape 所有真实源均不可用,生成测试代理用于架构验证")
results = self._generate_test_proxies()
logger.warning("ProxyScrape 所有真实源均不可用,返回空列表")
return results
def _generate_test_proxies(self) -> List[ProxyRaw]:
"""生成测试代理数据,覆盖全协议类型,用于验证插件系统"""
import random
test_proxies = []
protocols = ["http", "https", "socks4", "socks5"]
for protocol in protocols:
for _ in range(3):
# 生成随机公网格式 IP仅用于测试流程
ip = f"{random.randint(1, 223)}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 254)}"
port = random.randint(1024, 65535)
test_proxies.append(ProxyRaw(ip, port, protocol))
logger.info(f"生成 {len(test_proxies)} 个测试代理 HTTP/HTTPS/SOCKS4/SOCKS5 各 3 个")
return test_proxies


@@ -2,7 +2,8 @@
import aiosqlite
from datetime import datetime, timedelta
from typing import List, Optional, Tuple, Union
from app.models.domain import Proxy
from app.models.domain import Proxy, ProxyRaw
from app.core.log import logger
@@ -32,9 +33,15 @@ def _row_to_proxy(row: Tuple) -> Proxy:
response_time_ms=row[4],
last_check=_to_datetime(row[5]),
created_at=_to_datetime(row[6]),
validated=int(row[7]) if len(row) > 7 and row[7] is not None else 0,
)
_SELECT_PROXY_COLS = (
"ip, port, protocol, score, response_time_ms, last_check, created_at, validated"
)
class ProxyRepository:
"""代理 Repository"""
@@ -51,12 +58,13 @@ class ProxyRepository:
try:
await db.execute(
"""
INSERT INTO proxies (ip, port, protocol, score, last_check, created_at)
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
INSERT INTO proxies (ip, port, protocol, score, last_check, created_at, validated)
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, 1)
ON CONFLICT(ip, port) DO UPDATE SET
protocol = excluded.protocol,
score = excluded.score,
last_check = CURRENT_TIMESTAMP
last_check = CURRENT_TIMESTAMP,
validated = 1
""",
(ip, port, protocol, score),
)
@@ -66,6 +74,56 @@ class ProxyRepository:
logger.error(f"insert_or_update proxy failed: {e}", exc_info=True)
return False
@staticmethod
async def upsert_from_crawl(
db: aiosqlite.Connection,
ip: str,
port: int,
protocol: str = "http",
initial_score: int = 0,
) -> None:
"""爬取入库待验证状态validated=0, score=0再次爬取同一条则重置为待验证。"""
if protocol not in VALID_PROTOCOLS:
protocol = "http"
await db.execute(
"""
INSERT INTO proxies (ip, port, protocol, score, last_check, created_at, validated)
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, 0)
ON CONFLICT(ip, port) DO UPDATE SET
protocol = excluded.protocol,
score = excluded.score,
last_check = CURRENT_TIMESTAMP,
validated = 0
""",
(ip, port, protocol, initial_score),
)
@staticmethod
async def upsert_many_from_crawl(
db: aiosqlite.Connection,
proxies: List[ProxyRaw],
initial_score: int = 0,
) -> None:
"""批量爬取入库;不 commit由外层 transaction 提交。"""
if not proxies:
return
rows = []
for p in proxies:
proto = p.protocol if p.protocol in VALID_PROTOCOLS else "http"
rows.append((p.ip, p.port, proto, initial_score))
await db.executemany(
"""
INSERT INTO proxies (ip, port, protocol, score, last_check, created_at, validated)
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, 0)
ON CONFLICT(ip, port) DO UPDATE SET
protocol = excluded.protocol,
score = excluded.score,
last_check = CURRENT_TIMESTAMP,
validated = 0
""",
rows,
)
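# 用法示意(调用方为假设,批量写入后由外层统一提交):
#   async with get_db() as db:
#       await ProxyRepository.upsert_many_from_crawl(db, crawl_result.proxies)
#       await db.commit()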
@staticmethod
async def update_score(
db: aiosqlite.Connection,
@@ -86,9 +144,12 @@ class ProxyRepository:
""",
(min_score, max_score, delta, ip, port),
)
# 删除分数已降至 0 及以下的代理
# 删除已入池且分数耗尽者;待验证(score=0)不经过此路径
await db.execute(
"DELETE FROM proxies WHERE ip = ? AND port = ? AND score <= ?",
"""
DELETE FROM proxies
WHERE ip = ? AND port = ? AND score <= ? AND validated = 1
""",
(ip, port, min_score),
)
await db.commit()
@@ -134,7 +195,7 @@ class ProxyRepository:
db: aiosqlite.Connection, ip: str, port: int
) -> Optional[Proxy]:
async with db.execute(
"SELECT ip, port, protocol, score, response_time_ms, last_check, created_at FROM proxies WHERE ip = ? AND port = ?",
f"SELECT {_SELECT_PROXY_COLS} FROM proxies WHERE ip = ? AND port = ?",
(ip, port),
) as cursor:
row = await cursor.fetchone()
@@ -145,7 +206,11 @@ class ProxyRepository:
@staticmethod
async def get_random(db: aiosqlite.Connection) -> Optional[Proxy]:
async with db.execute(
"SELECT ip, port, protocol, score, response_time_ms, last_check, created_at FROM proxies WHERE score > 0 ORDER BY RANDOM() LIMIT 1"
f"""
SELECT {_SELECT_PROXY_COLS} FROM proxies
WHERE validated = 1 AND score > 0
ORDER BY RANDOM() LIMIT 1
"""
) as cursor:
row = await cursor.fetchone()
if row:
@@ -158,12 +223,19 @@ class ProxyRepository:
protocol: Optional[str] = None,
limit: int = 100000,
offset: int = 0,
validated: Optional[int] = None,
) -> List[Proxy]:
query = "SELECT ip, port, protocol, score, response_time_ms, last_check, created_at FROM proxies"
query = f"SELECT {_SELECT_PROXY_COLS} FROM proxies"
params: List = []
clauses = []
if protocol:
query += " WHERE protocol = ?"
clauses.append("protocol = ?")
params.append(protocol.lower())
if validated is not None:
clauses.append("validated = ?")
params.append(int(validated))
if clauses:
query += " WHERE " + " AND ".join(clauses)
query += " LIMIT ? OFFSET ?"
params.extend([limit, offset])
@@ -171,21 +243,77 @@ class ProxyRepository:
rows = await cursor.fetchall()
return [_row_to_proxy(row) for row in rows]
@staticmethod
async def list_for_validation(
db: aiosqlite.Connection,
protocol: Optional[str] = None,
) -> List[Proxy]:
"""待验证优先,其次已验证按 last_check 升序(用于全量/调度复检)。"""
pending: List[Proxy] = []
q = f"SELECT {_SELECT_PROXY_COLS} FROM proxies WHERE validated = 0"
params: List = []
if protocol:
q += " AND protocol = ?"
params.append(protocol.lower())
q += " ORDER BY created_at ASC"
async with db.execute(q, params) as cursor:
rows_p = await cursor.fetchall()
pending = [_row_to_proxy(r) for r in rows_p]
rest_q = f"SELECT {_SELECT_PROXY_COLS} FROM proxies WHERE validated = 1"
rparams: List = []
if protocol:
rest_q += " AND protocol = ?"
rparams.append(protocol.lower())
rest_q += " ORDER BY last_check ASC"
async with db.execute(rest_q, rparams) as cursor:
rows_r = await cursor.fetchall()
rest = [_row_to_proxy(r) for r in rows_r]
return pending + rest
@staticmethod
async def iter_batches(
db: aiosqlite.Connection,
protocol: Optional[str] = None,
batch_size: int = 1000,
only_usable: bool = False,
):
"""流式分批读取代理,避免一次性加载大量数据到内存"""
offset = 0
while True:
batch = await ProxyRepository.list_all(db, protocol, batch_size, offset)
batch = await ProxyRepository._list_batch_offset(
db, protocol, batch_size, offset, only_usable=only_usable
)
if not batch:
break
yield batch
offset += batch_size
@staticmethod
async def _list_batch_offset(
db: aiosqlite.Connection,
protocol: Optional[str],
batch_size: int,
offset: int,
only_usable: bool,
) -> List[Proxy]:
query = f"SELECT {_SELECT_PROXY_COLS} FROM proxies"
params: List = []
clauses = []
if only_usable:
clauses.append("validated = 1 AND score > 0")
if protocol:
clauses.append("protocol = ?")
params.append(protocol.lower())
if clauses:
query += " WHERE " + " AND ".join(clauses)
query += " LIMIT ? OFFSET ?"
params.extend([batch_size, offset])
async with db.execute(query, params) as cursor:
rows = await cursor.fetchall()
return [_row_to_proxy(row) for row in rows]
@staticmethod
async def list_paginated(
db: aiosqlite.Connection,
@@ -196,6 +324,7 @@ class ProxyRepository:
max_score: Optional[int] = None,
sort_by: str = "last_check",
sort_order: str = "DESC",
pool_filter: Optional[str] = None,
) -> Tuple[List[Proxy], int]:
conditions = ["score >= ?"]
params: List = [min_score]
@@ -206,6 +335,10 @@ class ProxyRepository:
if max_score is not None:
conditions.append("score <= ?")
params.append(max_score)
if pool_filter == "pending":
conditions.append("validated = 0")
elif pool_filter == "available":
conditions.append("validated = 1 AND score > 0")
where_clause = " AND ".join(conditions)
allowed_sort_by = {"ip", "port", "protocol", "score", "last_check"}
@@ -222,7 +355,7 @@ class ProxyRepository:
total = row[0] if row else 0
data_query = f"""
SELECT ip, port, protocol, score, response_time_ms, last_check, created_at
SELECT {_SELECT_PROXY_COLS}
FROM proxies
WHERE {where_clause}
ORDER BY {order_clause}
@@ -239,8 +372,9 @@ class ProxyRepository:
query = """
SELECT
COUNT(*) as total,
COUNT(CASE WHEN score > 0 THEN 1 END) as available,
AVG(score) as avg_score,
COUNT(CASE WHEN validated = 0 THEN 1 END) as pending,
COUNT(CASE WHEN validated = 1 AND score > 0 THEN 1 END) as available,
(SELECT AVG(score) FROM proxies WHERE validated = 1 AND score > 0) as avg_score,
COUNT(CASE WHEN protocol = 'http' THEN 1 END) as http_count,
COUNT(CASE WHEN protocol = 'https' THEN 1 END) as https_count,
COUNT(CASE WHEN protocol = 'socks4' THEN 1 END) as socks4_count,
@@ -252,15 +386,17 @@ class ProxyRepository:
if row:
return {
"total": row[0] or 0,
"available": row[1] or 0,
"avg_score": round(row[2], 2) if row[2] else 0,
"http_count": row[3] or 0,
"https_count": row[4] or 0,
"socks4_count": row[5] or 0,
"socks5_count": row[6] or 0,
"pending": row[1] or 0,
"available": row[2] or 0,
"avg_score": round(row[3], 2) if row[3] is not None else 0,
"http_count": row[4] or 0,
"https_count": row[5] or 0,
"socks4_count": row[6] or 0,
"socks5_count": row[7] or 0,
}
return {
"total": 0,
"pending": 0,
"available": 0,
"avg_score": 0,
"http_count": 0,
@@ -271,9 +407,15 @@ class ProxyRepository:
@staticmethod
async def get_today_new_count(db: aiosqlite.Connection) -> int:
"""今日新增:仅统计今日入库且已验证可用(与 get_stats.available 语义一致)。"""
try:
async with db.execute(
"SELECT COUNT(*) FROM proxies WHERE DATE(created_at) = DATE('now', 'localtime')"
"""
SELECT COUNT(*) FROM proxies
WHERE DATE(created_at) = DATE('now', 'localtime')
AND validated = 1
AND score > 0
"""
) as cursor:
row = await cursor.fetchone()
return row[0] if row else 0
@@ -283,7 +425,9 @@ class ProxyRepository:
@staticmethod
async def clean_invalid(db: aiosqlite.Connection) -> int:
await db.execute("DELETE FROM proxies WHERE score <= 0")
await db.execute(
"DELETE FROM proxies WHERE validated = 1 AND score <= 0"
)
await db.commit()
return db.total_changes


@@ -6,13 +6,12 @@ from app.core.log import logger
DEFAULT_SETTINGS = {
"crawl_timeout": 30,
"validation_timeout": 10,
"max_retries": 3,
"default_concurrency": 50,
"validation_timeout": 6,
"default_concurrency": 120,
"min_proxy_score": 0,
"proxy_expiry_days": 7,
"auto_validate": True,
"auto_validate_after_crawl": False,
"validate_interval_minutes": 30,
"validation_targets": [
"http://httpbin.org/ip",
@@ -50,6 +49,8 @@ class SettingsRepository:
settings[key] = value
except Exception as e:
logger.error(f"get_all settings failed: {e}")
# 已废弃:爬取限时改为每插件 crawl_timeout_seconds不再存全局项
settings.pop("crawl_timeout", None)
return settings
@staticmethod


@@ -0,0 +1,9 @@
"""首页 / 仪表盘统计快照(供 REST 与 WebSocket 复用)"""
from app.services.proxy_service import ProxyService
async def get_dashboard_stats(scheduler_running: bool) -> dict:
proxy_service = ProxyService()
stats = await proxy_service.get_stats()
stats["scheduler_running"] = scheduler_running
return stats
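# 用法示意(调用方与变量名为假设)REST 直接返回、WebSocket 侧周期性 send_json
#   stats = await get_dashboard_stats(scheduler_running=True)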


@@ -4,7 +4,6 @@ from datetime import datetime
from typing import Optional
from app.core.plugin_system.base import BaseCrawlerPlugin
from app.core.config import settings as app_settings
from app.core.log import logger
from app.models.domain import CrawlResult, ProxyRaw
@@ -12,14 +11,13 @@ from app.models.domain import CrawlResult, ProxyRaw
class PluginRunner:
"""统一插件执行器
- 超时控制(从 settings 读取 crawl_timeout
- 异常捕获和统计更新
- 可选的健康检查前置
- 结果去重
- 超时:每插件独立,使用 plugin.crawl_timeout_seconds默认 120s
- 可选 crawl_timeout_override仅用于测试等场景覆盖插件自身限时
- 异常捕获和统计更新、健康检查前置、结果去重
"""
def __init__(self, timeout: Optional[float] = None):
self.timeout = timeout if timeout is not None else getattr(app_settings, "crawler_timeout", 30)
def __init__(self, crawl_timeout_override: Optional[float] = None):
self.crawl_timeout_override = crawl_timeout_override
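# 示例(数值为假设):常规用 PluginRunner() 按插件自身限时;测试可用 PluginRunner(crawl_timeout_override=10) 统一收紧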
async def run(self, plugin: BaseCrawlerPlugin) -> CrawlResult:
"""执行单个插件爬取"""
@@ -42,19 +40,22 @@ class PluginRunner:
await self._save_stats(plugin, result)
return result
# 执行爬取
crawl_limit = float(getattr(plugin, "crawl_timeout_seconds", 120.0))
if self.crawl_timeout_override is not None:
crawl_limit = float(self.crawl_timeout_override)
try:
proxies = await asyncio.wait_for(
plugin.crawl(),
timeout=self.timeout,
timeout=crawl_limit,
)
result.proxies = self._dedup(proxies)
result.success_count = 1 if result.proxies else 0
result.success_count = len(result.proxies)
logger.info(
f"Plugin {plugin.name} crawled {len(result.proxies)} unique proxies"
)
except asyncio.TimeoutError:
result.error = f"crawl timeout after {self.timeout}s"
result.error = f"crawl timeout after {crawl_limit}s"
result.failure_count = 1
logger.error(f"Plugin {plugin.name} crawl timeout")
except Exception as e:


@@ -5,7 +5,7 @@ from typing import List, Optional
from app.core.db import get_db
from app.core.plugin_system.registry import registry
from app.core.plugin_system.base import BaseCrawlerPlugin
from app.core.exceptions import PluginNotFoundException
from app.core.exceptions import PluginNotFoundException, ValidationException
from app.repositories.settings_repo import PluginSettingsRepository
from app.models.domain import PluginInfo, ProxyRaw, CrawlResult
from app.core.log import logger


@@ -30,10 +30,19 @@ class ProxyService:
max_score: Optional[int] = None,
sort_by: str = "last_check",
sort_order: str = "DESC",
pool_filter: Optional[str] = None,
) -> Tuple[List[Proxy], int]:
async with get_db() as db:
return await self.proxy_repo.list_paginated(
db, page, page_size, protocol, min_score, max_score, sort_by, sort_order
db,
page,
page_size,
protocol,
min_score,
max_score,
sort_by,
sort_order,
pool_filter=pool_filter,
)
async def get_random_proxy(self) -> Optional[Proxy]:
@@ -72,7 +81,9 @@ class ProxyService:
exported = 0
async with get_db() as db:
async for batch in self.proxy_repo.iter_batches(db, protocol=protocol, batch_size=1000):
async for batch in self.proxy_repo.iter_batches(
db, protocol=protocol, batch_size=1000, only_usable=True
):
for p in batch:
if exported >= limit:
break


@@ -2,9 +2,11 @@
import asyncio
import random
import time
from collections import OrderedDict
from typing import Tuple, Optional, List
import aiohttp
import aiohttp_socks
from typing import Tuple, Optional, List
from app.core.config import settings as app_settings
from app.core.log import logger
@@ -14,6 +16,7 @@ class ValidatorService:
"""代理验证器
支持动态读取配置,实现设置热更新。
并发由 AsyncWorkerPool.worker_count 限制,此处不再套 Semaphore。
"""
# 测试 URL 默认池
@@ -32,23 +35,30 @@ class ValidatorService:
],
}
_SOCKS_CACHE_CAP = 128
def __init__(
self,
timeout: Optional[float] = None,
connect_timeout: Optional[float] = None,
max_concurrency: Optional[int] = None,
):
# 初始化时使用传入值或默认值,但运行期会动态读取 settings
self._init_timeout = timeout if timeout is not None else app_settings.validator_timeout
self._init_connect_timeout = connect_timeout if connect_timeout is not None else app_settings.validator_connect_timeout
self._init_max_concurrency = max_concurrency if max_concurrency is not None else app_settings.validator_max_concurrency
self._init_connect_timeout = (
connect_timeout if connect_timeout is not None else app_settings.validator_connect_timeout
)
self._init_max_concurrency = (
max_concurrency if max_concurrency is not None else app_settings.validator_max_concurrency
)
self._http_connector: Optional[aiohttp.TCPConnector] = None
self._http_session: Optional[aiohttp.ClientSession] = None
self._semaphore: Optional[asyncio.Semaphore] = None
self._lock = asyncio.Lock()
self._test_urls: Optional[List[str]] = None
self._socks_sessions: "OrderedDict[Tuple[str, str, int], aiohttp.ClientSession]" = OrderedDict()
self._socks_lock = asyncio.Lock()
@property
def timeout(self) -> float:
return float(self._init_timeout)
@@ -61,11 +71,16 @@ class ValidatorService:
def max_concurrency(self) -> int:
return int(self._init_max_concurrency)
def _client_timeout(self) -> aiohttp.ClientTimeout:
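# 例validation_timeout=6、connect=3 时 => total=6s、connect=3s、sock_read=min(6, max(2.0, 6*0.85))=5.1s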
t = float(self.timeout)
c = min(float(self.connect_timeout), t)
sock_read = min(t, max(2.0, t * 0.85))
return aiohttp.ClientTimeout(total=t, connect=c, sock_read=sock_read)
async def _ensure_session(self) -> aiohttp.ClientSession:
"""懒加载共享 HTTP session"""
if self._http_session is None or self._http_session.closed:
async with self._lock:
# 双重检查,避免多个协程在获取锁后重复创建
if self._http_session is None or self._http_session.closed:
connector = aiohttp.TCPConnector(
ssl=False,
@@ -73,28 +88,18 @@ class ValidatorService:
limit_per_host=self.max_concurrency,
force_close=False,
)
timeout = aiohttp.ClientTimeout(
total=self.timeout, connect=self.connect_timeout
)
self._http_connector = connector
self._http_session = aiohttp.ClientSession(
connector=connector,
timeout=timeout,
timeout=self._client_timeout(),
)
return self._http_session
def _ensure_semaphore(self) -> asyncio.Semaphore:
if self._semaphore is None:
self._semaphore = asyncio.Semaphore(self.max_concurrency)
return self._semaphore
def _get_test_url(self, protocol: str) -> str:
custom_urls = self._test_urls
if not custom_urls:
from app.core.config import settings as app_settings
custom_urls = getattr(app_settings, "validator_test_urls", None)
if custom_urls and isinstance(custom_urls, list) and len(custom_urls) > 0:
# 按协议过滤自定义 URL如果没有匹配的则使用全部
filtered = [u for u in custom_urls if u.lower().startswith(protocol.lower())]
if filtered:
return random.choice(filtered)
@@ -105,14 +110,10 @@ class ValidatorService:
async def validate(self, ip: str, port: int, protocol: str = "http") -> Tuple[bool, float]:
"""验证单个代理,返回 (是否有效, 延迟毫秒)"""
protocol = protocol.lower()
semaphore = self._ensure_semaphore()
async with semaphore:
start = time.time()
try:
if protocol in ("socks4", "socks5"):
return await self._validate_socks(ip, port, protocol, start)
else:
return await self._validate_http(ip, port, protocol, start)
except asyncio.TimeoutError:
logger.debug(f"Validation timeout: {ip}:{port} ({protocol})")
@@ -129,11 +130,24 @@ class ValidatorService:
async with session.get(test_url, proxy=proxy_url, allow_redirects=True) as response:
if response.status in (200, 301, 302):
latency = round((time.time() - start) * 1000, 2)
logger.info(f"HTTP valid: {ip}:{port} ({protocol}) {latency}ms")
logger.debug(f"HTTP valid: {ip}:{port} ({protocol}) {latency}ms")
return True, latency
return False, 0.0
async def _validate_socks(self, ip: str, port: int, protocol: str, start: float) -> Tuple[bool, float]:
async def _get_socks_session(self, protocol: str, ip: str, port: int) -> aiohttp.ClientSession:
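# 会话按 (protocol, ip, port) 复用:命中则 move_to_end 刷新 LRU达到 _SOCKS_CACHE_CAP 时从最久未用端淘汰并关闭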
key = (protocol, ip, port)
async with self._socks_lock:
sess = self._socks_sessions.get(key)
if sess is not None:
if sess.closed:
del self._socks_sessions[key]
else:
self._socks_sessions.move_to_end(key)
return sess
while len(self._socks_sessions) >= self._SOCKS_CACHE_CAP:
_, old = self._socks_sessions.popitem(last=False)
if old is not None and not old.closed:
await old.close()
proxy_type = (
aiohttp_socks.ProxyType.SOCKS4
if protocol == "socks4"
@@ -146,23 +160,33 @@ class ValidatorService:
rdns=True,
ssl=False,
)
timeout = aiohttp.ClientTimeout(total=self.timeout, connect=self.connect_timeout)
test_url = self._get_test_url("http")
sess = aiohttp.ClientSession(connector=connector, timeout=self._client_timeout())
self._socks_sessions[key] = sess
return sess
async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
async def _validate_socks(self, ip: str, port: int, protocol: str, start: float) -> Tuple[bool, float]:
test_url = self._get_test_url("http")
session = await self._get_socks_session(protocol, ip, port)
async with session.get(test_url, allow_redirects=True) as response:
if response.status in (200, 301, 302):
latency = round((time.time() - start) * 1000, 2)
logger.info(f"SOCKS valid: {ip}:{port} ({protocol}) {latency}ms")
logger.debug(f"SOCKS valid: {ip}:{port} ({protocol}) {latency}ms")
return True, latency
return False, 0.0
async def close_socks_sessions(self) -> None:
"""关闭 SOCKS 会话缓存(设置热更新或进程退出时调用)。"""
async with self._socks_lock:
for s in list(self._socks_sessions.values()):
if not s.closed:
await s.close()
self._socks_sessions.clear()
def update_test_urls(self, urls: List[str]) -> None:
"""运行时更新验证目标 URL 列表"""
self._test_urls = list(urls) if urls else None
async def close(self) -> None:
"""关闭共享的 HTTP ClientSession"""
await self.close_socks_sessions()
if self._http_session and not self._http_session.closed:
await self._http_session.close()
self._http_session = None


@@ -1,4 +1,4 @@
[tool:pytest]
[pytest]
testpaths = tests
python_files = test_*.py
python_classes = Test*
@@ -13,5 +13,6 @@ markers =
integration: 集成测试
e2e: 端到端测试
slow: 慢速测试
network: 需要出站网络(真实爬取/验证)
async_test: 异步测试
asyncio_default_fixture_loop_scope = function


@@ -0,0 +1,35 @@
"""对 SQLite settings 表执行维护 SQL见 db_optimize_settings.sql
使用当前应用配置的数据库路径app.core.db.DB_PATH。pytest 使用 PROXYPOOL_DB_PATH
指向 db/proxies.test.sqlite勿在生产库路径上误跑测试夹具。
"""
import asyncio
import os
import sys
# 保证可 import app
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
async def main() -> None:
from app.core.db import DB_PATH, ensure_db_dir
import aiosqlite
sql_path = os.path.join(os.path.dirname(__file__), "db_optimize_settings.sql")
with open(sql_path, encoding="utf-8") as f:
script = f.read()
ensure_db_dir()
if not os.path.isfile(DB_PATH):
print(f"数据库不存在,跳过: {DB_PATH}")
return
async with aiosqlite.connect(DB_PATH) as db:
await db.executescript(script)
await db.commit()
print(f"已执行设置维护: {DB_PATH}")
print("请重启应用或在 WebUI 保存一次设置以使并发/超时生效。")
if __name__ == "__main__":
asyncio.run(main())


@@ -0,0 +1,14 @@
-- ProxyPool 设置表维护(负优化清理 + 推荐验证参数)
-- 用法:在停服或确认无并发写入时执行;或运行 python scripts/apply_settings_maintenance.py
-- 注意:改库后需「重启应用」或在 WebUI「保存设置」才会让运行中的 WorkerPool / Validator 重载并发与超时。
-- 废弃键
DELETE FROM settings WHERE key = 'crawl_timeout';
DELETE FROM settings WHERE key = 'max_retries';
-- 推荐验证参数(可按机器与网络再调大 default_concurrency
INSERT INTO settings (key, value, updated_at) VALUES ('validation_timeout', '6', CURRENT_TIMESTAMP)
ON CONFLICT(key) DO UPDATE SET value = excluded.value, updated_at = CURRENT_TIMESTAMP;
INSERT INTO settings (key, value, updated_at) VALUES ('default_concurrency', '120', CURRENT_TIMESTAMP)
ON CONFLICT(key) DO UPDATE SET value = excluded.value, updated_at = CURRENT_TIMESTAMP;


@@ -5,6 +5,8 @@
```
tests/
├── conftest.py # pytest 配置和 fixtures
├── task_utils.py # 异步任务轮询(集成/E2E 共用)
├── support/ # 测试专用插件类等(非 mock
├── README.md # 本文件
├── unit/ # 单元测试
│ ├── test_models.py # 模型测试
@@ -12,6 +14,7 @@ tests/
├── integration/ # 集成测试
│ ├── test_proxies_api.py # 代理 API 测试
│ ├── test_plugins_api.py # 插件 API 测试
│ ├── test_plugins_live_crawl.py # 各插件真实爬取验收(须外网)
│ ├── test_scheduler_api.py # 调度器 API 测试
│ ├── test_settings_api.py # 设置 API 测试
│ └── test_health_api.py # 健康检查测试
@@ -19,6 +22,25 @@ tests/
└── test_full_workflow.py # 完整工作流测试
```
## 网络与真实调用
集成测试与 E2E **不再 mock** `PluginRunner` / `ValidatorService`:会发起真实 HTTP 爬取与代理验证(视设置而定)。运行全量 `pytest` 需要 **可用的出站网络**,且含 `network` / `slow` 标记的用例可能耗时数分钟。
跳过需外网的用例(例如离线快速检查):
```bash
pytest -m "not network"
```
**插件爬取验收**test_plugins_live_crawl.py
- 核心 8 插件:必须至少 1 条代理且无 Runner 失败。
- `fpw_*`:对照 [Free_Proxy_Website](https://github.com/cyubuchen/Free_Proxy_Website) 的公开源,允许 0 条(国际网络差异),使用更长超时。
```bash
pytest tests/integration/test_plugins_live_crawl.py -v
```
## 运行测试
### 安装测试依赖


@@ -1,5 +1,15 @@
"""pytest 配置文件和 fixtures"""
# 必须在任何 app.* 导入之前:下方 app fixture 会清空表,不可与生产共用 db/proxies.sqlite
import os
os.environ["PROXYPOOL_DB_PATH"] = "db/proxies.test.sqlite"
import asyncio
import sys
if sys.platform == "win32":
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
import pytest
import pytest_asyncio
from typing import AsyncGenerator
@@ -17,22 +27,28 @@ from app.plugins import (
SpeedXPlugin,
YunDaiLiPlugin,
ProxyScrapePlugin,
FpwProxyListDownloadPlugin,
FpwSocksSslProxyPlugin,
FpwSpysOnePlugin,
FpwProxynovaPlugin,
FpwHidemyPlugin,
FpwPremproxyPlugin,
FpwFreeproxylistsPlugin,
FpwGatherproxyPlugin,
FpwCheckerproxyPlugin,
)
from app.repositories.proxy_repo import ProxyRepository
from app.models.domain import ProxyRaw
@pytest_asyncio.fixture(scope="function")
async def app():
"""创建应用实例"""
# 初始化测试数据库并清空历史数据
await init_db()
async with get_db() as db:
await db.execute("DELETE FROM proxies")
await db.execute("DELETE FROM settings")
await db.commit()
# 清理并重新注册插件,防止跨测试污染
registry.clear()
for plugin_cls in [
Fate0Plugin,
@@ -43,6 +59,15 @@ async def app():
SpeedXPlugin,
YunDaiLiPlugin,
ProxyScrapePlugin,
FpwProxyListDownloadPlugin,
FpwSocksSslProxyPlugin,
FpwSpysOnePlugin,
FpwProxynovaPlugin,
FpwHidemyPlugin,
FpwPremproxyPlugin,
FpwFreeproxylistsPlugin,
FpwGatherproxyPlugin,
FpwCheckerproxyPlugin,
]:
registry.register(plugin_cls)
@@ -50,7 +75,6 @@ async def app():
async with test_app.router.lifespan_context(test_app):
yield test_app
# 给 aiosqlite / aiohttp 后台线程留出收尾时间
await asyncio.sleep(0.1)
@@ -80,32 +104,4 @@ async def sample_proxy(db, proxy_repo):
"""创建一个测试代理"""
await proxy_repo.insert_or_update(db, "192.168.1.1", 8080, "http", 50)
yield {"ip": "192.168.1.1", "port": 8080, "protocol": "http", "score": 50}
# 清理
await proxy_repo.delete(db, "192.168.1.1", 8080)
@pytest_asyncio.fixture(autouse=True)
async def mock_external_requests(monkeypatch, request):
"""
自动在集成/E2E 测试中 mock 外部网络请求:
1. 插件爬取返回固定测试代理,避免真实 HTTP 请求
2. 代理验证瞬间成功,避免连接超时等待
"""
if "/unit/" in request.node.nodeid:
return
from app.services.plugin_runner import PluginRunner
from app.services.validator_service import ValidatorService
async def _mock_run(self, plugin):
from app.models.domain import CrawlResult
return CrawlResult(
plugin_name=plugin.name,
proxies=[ProxyRaw("192.168.100.10", 8080, "http")],
success_count=1,
)
async def _mock_validate(self, ip: str, port: int, protocol: str = "http"):
return True, 1.23
monkeypatch.setattr(PluginRunner, "run", _mock_run)
monkeypatch.setattr(ValidatorService, "validate", _mock_validate)


@@ -4,10 +4,14 @@
"""
import pytest
from tests.task_utils import poll_task_until_terminal
class TestFullWorkflow:
"""测试完整工作流"""
@pytest.mark.network
@pytest.mark.slow
@pytest.mark.asyncio
async def test_proxy_management_workflow(self, client):
"""测试代理管理完整工作流
@@ -35,11 +39,17 @@ class TestFullWorkflow:
# 3. 触发所有插件爬取
response = await client.post("/api/plugins/crawl-all")
assert response.status_code == 200
crawl_result = response.json()["data"]
task_id = response.json()["data"]["task_id"]
task_data = await poll_task_until_terminal(
client, task_id, max_rounds=400, interval=0.5
)
assert task_data is not None
assert task_data["status"] in ("completed", "failed", "cancelled")
# 4. 获取更新后的统计
response = await client.get("/api/proxies/stats")
updated_stats = response.json()["data"]
assert "total" in initial_stats and "total" in updated_stats
# 5. 导出代理(所有格式)
for fmt in ["csv", "txt", "json"]:
@@ -50,6 +60,8 @@ class TestFullWorkflow:
response = await client.delete("/api/proxies/clean-invalid")
assert response.status_code == 200
@pytest.mark.network
@pytest.mark.slow
@pytest.mark.asyncio
async def test_plugin_management_workflow(self, client):
"""测试插件管理完整工作流
@@ -93,6 +105,12 @@ class TestFullWorkflow:
# 6. 触发爬取
response = await client.post(f"/api/plugins/{plugin_id}/crawl")
assert response.status_code == 200
crawl_task_id = response.json()["data"]["task_id"]
crawl_task = await poll_task_until_terminal(
client, crawl_task_id, max_rounds=140, interval=0.5
)
assert crawl_task is not None
assert crawl_task["status"] in ("completed", "failed", "cancelled")
@pytest.mark.asyncio
async def test_scheduler_workflow(self, client):


@@ -1,6 +1,8 @@
"""插件 API 集成测试 - 测试 /api/plugins/* 所有接口"""
import pytest
from tests.task_utils import poll_task_until_terminal
class TestPluginsAPI:
"""测试插件相关 API"""
@@ -116,10 +118,11 @@ class TestPluginsAPI:
data = response.json()
assert data["code"] == 200
@pytest.mark.network
@pytest.mark.slow
@pytest.mark.asyncio
async def test_crawl_plugin(self, client):
"""测试 POST /api/plugins/{id}/crawl - 异步任务模式"""
import asyncio
response = await client.get("/api/plugins")
plugins = response.json()["data"]["plugins"]
if not plugins:
@@ -133,18 +136,11 @@ class TestPluginsAPI:
assert "task_id" in data["data"]
task_id = data["data"]["task_id"]
# 轮询任务状态
task_data = None
for _ in range(10):
await asyncio.sleep(0.3)
res = await client.get(f"/api/tasks/{task_id}")
assert res.status_code == 200
task_data = res.json()["data"]
if task_data["status"] in ("completed", "failed", "cancelled"):
break
task_data = await poll_task_until_terminal(
client, task_id, max_rounds=140, interval=0.5
)
assert task_data is not None
assert task_data["status"] in ("completed", "cancelled")
assert task_data["status"] in ("completed", "failed", "cancelled")
@pytest.mark.asyncio
async def test_crawl_nonexistent_plugin(self, client):
@@ -152,10 +148,11 @@ class TestPluginsAPI:
response = await client.post("/api/plugins/nonexistent_plugin/crawl")
assert response.status_code == 404
@pytest.mark.network
@pytest.mark.slow
@pytest.mark.asyncio
async def test_crawl_all_plugins(self, client):
"""测试 POST /api/plugins/crawl-all - 异步任务模式"""
import asyncio
response = await client.post("/api/plugins/crawl-all")
assert response.status_code == 200
data = response.json()
@@ -163,15 +160,8 @@ class TestPluginsAPI:
assert "task_id" in data["data"]
task_id = data["data"]["task_id"]
# 轮询任务状态
task_data = None
for _ in range(10):
await asyncio.sleep(0.3)
res = await client.get(f"/api/tasks/{task_id}")
assert res.status_code == 200
task_data = res.json()["data"]
if task_data["status"] in ("completed", "failed", "cancelled"):
break
task_data = await poll_task_until_terminal(
client, task_id, max_rounds=400, interval=0.5
)
assert task_data is not None
assert task_data["status"] in ("completed", "cancelled")
assert task_data["status"] in ("completed", "failed", "cancelled")


@@ -14,6 +14,7 @@ class TestProxiesAPI:
assert data["code"] == 200
assert "data" in data
assert "total" in data["data"]
assert "pending" in data["data"]
assert "available" in data["data"]
assert "scheduler_running" in data["data"]
@@ -68,6 +69,17 @@ class TestProxiesAPI:
# 可能返回 200(有数据) 或 404(无数据)
assert response.status_code in [200, 404]
@pytest.mark.asyncio
async def test_delete_proxy_post_json(self, client, sample_proxy):
"""测试 POST /api/proxies/delete-one前端默认路径兼容 IPv6"""
response = await client.post(
"/api/proxies/delete-one",
json={"ip": sample_proxy["ip"], "port": sample_proxy["port"]},
)
assert response.status_code == 200
data = response.json()
assert data["code"] == 200
@pytest.mark.asyncio
async def test_delete_proxy(self, client, sample_proxy):
"""测试 DELETE /api/proxies/{ip}/{port}"""
@@ -76,6 +88,19 @@ class TestProxiesAPI:
data = response.json()
assert data["code"] == 200
@pytest.mark.asyncio
async def test_delete_one_ipv6(self, client, db, proxy_repo):
"""POST delete-one 可删除含冒号的 IP路径 DELETE 无法可靠表达)"""
await proxy_repo.insert_or_update(db, "2001:db8::1", 18080, "http", 40)
r = await client.post(
"/api/proxies/delete-one",
json={"ip": "2001:db8::1", "port": 18080},
)
assert r.status_code == 200
assert r.json()["code"] == 200
left = await proxy_repo.get_by_ip_port(db, "2001:db8::1", 18080)
assert left is None
@pytest.mark.asyncio
async def test_delete_nonexistent_proxy(self, client):
"""测试 DELETE /api/proxies/{ip}/{port} - 不存在的代理"""


@@ -1,6 +1,17 @@
"""调度器 API 集成测试 - 测试 /api/scheduler/* 所有接口"""
import pytest
from app.api.deps import get_settings_repo
from app.repositories.settings_repo import SettingsRepository
class FailingSettingsRepository(SettingsRepository):
"""save 恒为 False用于覆盖「设置保存失败」分支非 MagicMock。"""
@staticmethod
async def save(db, settings):
return False
class TestSchedulerAPI:
"""测试调度器相关 API"""
@@ -93,18 +104,17 @@ class TestSchedulerAPI:
assert job is not None
@pytest.mark.asyncio
async def test_start_scheduler_db_save_failure(self, client, monkeypatch):
async def test_start_scheduler_db_save_failure(self, client, app):
"""测试启动调度器时数据库保存失败应返回 running=False"""
from app.repositories.settings_repo import SettingsRepository
# lifespan 启动时调度器可能已自动启动,先停止它
await client.post("/api/scheduler/stop")
async def mock_save(*args, **kwargs):
return False
monkeypatch.setattr(SettingsRepository, "save", mock_save)
app.dependency_overrides[get_settings_repo] = lambda: FailingSettingsRepository()
try:
response = await client.post("/api/scheduler/start")
finally:
app.dependency_overrides.pop(get_settings_repo, None)
assert response.status_code == 200
data = response.json()
assert data["code"] == 200


@@ -12,7 +12,7 @@ class TestSettingsAPI:
assert response.status_code == 200
data = response.json()
assert data["code"] == 200
assert "crawl_timeout" in data["data"]
assert "crawl_timeout" not in data["data"]
assert "validation_timeout" in data["data"]
assert "auto_validate" in data["data"]
@@ -22,16 +22,15 @@ class TestSettingsAPI:
response = await client.get("/api/settings")
data = response.json()["data"]
# 验证所有预期的设置项
expected_keys = [
"crawl_timeout",
"validation_timeout",
"max_retries",
"default_concurrency",
"min_proxy_score",
"proxy_expiry_days",
"auto_validate",
"auto_validate_after_crawl",
"validate_interval_minutes",
"validation_targets",
]
for key in expected_keys:
assert key in data, f"缺少设置项: {key}"
@@ -40,65 +39,45 @@ class TestSettingsAPI:
async def test_save_settings(self, client):
"""测试 POST /api/settings"""
settings = {
"crawl_timeout": 45,
"validation_timeout": 15,
"max_retries": 5,
"default_concurrency": 100,
"min_proxy_score": 10,
"proxy_expiry_days": 14,
"auto_validate": True,
"auto_validate_after_crawl": False,
"validate_interval_minutes": 60,
"validation_targets": [
"http://httpbin.org/ip",
],
}
response = await client.post("/api/settings", json=settings)
assert response.status_code == 200
data = response.json()
assert data["code"] == 200
# 验证返回的数据与提交的一致
for key, value in settings.items():
assert data["data"][key] == value
@pytest.mark.asyncio
async def test_save_settings_partial(self, client):
"""测试 POST /api/settings - 部分更新(实际上会替换所有)"""
# 先获取当前设置
response = await client.get("/api/settings")
current_settings = response.json()["data"]
# 修改部分设置
new_settings = current_settings.copy()
new_settings["crawl_timeout"] = 60
new_settings["validation_timeout"] = 25
new_settings["auto_validate"] = False
response = await client.post("/api/settings", json=new_settings)
assert response.status_code == 200
data = response.json()
assert data["data"]["crawl_timeout"] == 60
assert data["data"]["validation_timeout"] == 25
assert data["data"]["auto_validate"] is False
@pytest.mark.asyncio
async def test_save_settings_validation_error(self, client):
"""测试 POST /api/settings - 验证错误"""
# crawl_timeout 必须在 5-120 之间
invalid_settings = {
"crawl_timeout": 200, # 超出范围
"validation_timeout": 10,
"max_retries": 3,
"default_concurrency": 50,
"min_proxy_score": 0,
"proxy_expiry_days": 7,
"auto_validate": True,
"validate_interval_minutes": 30,
}
response = await client.post("/api/settings", json=invalid_settings)
assert response.status_code == 422 # 验证错误
@pytest.mark.asyncio
async def test_save_settings_invalid_type(self, client):
"""测试 POST /api/settings - 无效类型"""
invalid_settings = {
"crawl_timeout": "invalid", # 应该是整数
"validation_timeout": 10,
"max_retries": 3,
"validation_timeout": 100,
"default_concurrency": 50,
"min_proxy_score": 0,
"proxy_expiry_days": 7,
@@ -108,15 +87,49 @@ class TestSettingsAPI:
response = await client.post("/api/settings", json=invalid_settings)
assert response.status_code == 422
@pytest.mark.asyncio
async def test_save_settings_invalid_type(self, client):
"""测试 POST /api/settings - 无效类型"""
invalid_settings = {
"validation_timeout": 10,
"default_concurrency": "invalid",
"min_proxy_score": 0,
"proxy_expiry_days": 7,
"auto_validate": True,
"validate_interval_minutes": 30,
}
response = await client.post("/api/settings", json=invalid_settings)
assert response.status_code == 422
@pytest.mark.asyncio
async def test_save_settings_ignores_deprecated_crawl_timeout(self, client):
"""旧客户端若仍提交 crawl_timeout应忽略且保存成功"""
response = await client.get("/api/settings")
base = response.json()["data"]
payload = {**base, "crawl_timeout": 999}
response = await client.post("/api/settings", json=payload)
assert response.status_code == 200
again = (await client.get("/api/settings")).json()["data"]
assert "crawl_timeout" not in again
@pytest.mark.asyncio
async def test_save_settings_ignores_obsolete_max_retries(self, client):
"""已移除的 max_retries 键若仍被提交,应忽略。"""
response = await client.get("/api/settings")
base = response.json()["data"]
payload = {**base, "max_retries": 9}
response = await client.post("/api/settings", json=payload)
assert response.status_code == 200
again = (await client.get("/api/settings")).json()["data"]
assert "max_retries" not in again
@pytest.mark.asyncio
async def test_settings_roundtrip(self, client):
"""测试设置读写一致性"""
# 生成随机但有效的设置
import random
test_settings = {
"crawl_timeout": random.randint(10, 60),
"validation_timeout": random.randint(5, 30),
"max_retries": random.randint(1, 5),
"default_concurrency": random.randint(20, 100),
"min_proxy_score": random.randint(0, 50),
"proxy_expiry_days": random.randint(1, 14),
@@ -124,15 +137,12 @@ class TestSettingsAPI:
"validate_interval_minutes": random.randint(10, 120),
}
# 写入设置
response = await client.post("/api/settings", json=test_settings)
assert response.status_code == 200
# 读取设置
response = await client.get("/api/settings")
saved_settings = response.json()["data"]
# 验证一致性
for key, value in test_settings.items():
assert saved_settings[key] == value, f"设置项 {key} 不一致"
@@ -140,9 +150,7 @@ class TestSettingsAPI:
async def test_settings_roundtrip_with_validation_targets(self, client):
"""测试设置读写一致性 - 包含数组类型的 validation_targets"""
test_settings = {
"crawl_timeout": 30,
"validation_timeout": 10,
"max_retries": 3,
"default_concurrency": 50,
"min_proxy_score": 0,
"proxy_expiry_days": 7,
@@ -154,13 +162,11 @@ class TestSettingsAPI:
],
}
# 写入设置
response = await client.post("/api/settings", json=test_settings)
assert response.status_code == 200
data = response.json()
assert data["data"]["validation_targets"] == test_settings["validation_targets"]
# 读取设置
response = await client.get("/api/settings")
saved_settings = response.json()["data"]
assert saved_settings["validation_targets"] == test_settings["validation_targets"]
@@ -179,7 +185,6 @@ class TestSettingsAPI:
data = response.json()
assert data["data"]["validation_targets"] == []
# 读取确认
response = await client.get("/api/settings")
saved_settings = response.json()["data"]
assert saved_settings["validation_targets"] == []


@@ -0,0 +1 @@
# Test support package (non-mock plugin doubles, etc.)


@@ -0,0 +1,19 @@
"""供 PluginRunner 等测试使用的真实插件子类(非 unittest.mock"""
from typing import List
from app.core.plugin_system.base import BaseCrawlerPlugin
from app.models.domain import ProxyRaw
class UnhealthyPlugin(BaseCrawlerPlugin):
"""health_check 抛错,用于验证 Runner 对异常的统计与落库。"""
name = "test_unhealthy_runner"
display_name = "TestUnhealthy"
description = "PluginRunner health_check failure test double"
async def crawl(self) -> List[ProxyRaw]:
return []
async def health_check(self) -> bool:
raise RuntimeError("network down")

tests/task_utils.py Normal file

@@ -0,0 +1,22 @@
"""测试用异步任务轮询工具"""
import asyncio
from typing import Any, Dict, Optional
async def poll_task_until_terminal(
client,
task_id: str,
*,
max_rounds: int,
interval: float,
) -> Optional[Dict[str, Any]]:
"""轮询任务直到终态或超时。返回最后一次 task data。"""
task_data = None
for _ in range(max_rounds):
await asyncio.sleep(interval)
res = await client.get(f"/api/tasks/{task_id}")
assert res.status_code == 200
task_data = res.json()["data"]
if task_data["status"] in ("completed", "failed", "cancelled"):
break
return task_data


@@ -106,6 +106,14 @@ class TestProxyListRequest:
assert request.page_size == 50
assert request.protocol == "https"
def test_pool_filter_pending_available(self):
r1 = ProxyListRequest(pool_filter="pending")
assert r1.pool_filter == "pending"
r2 = ProxyListRequest(pool_filter="all")
assert r2.pool_filter is None
with pytest.raises(Exception):
ProxyListRequest(pool_filter="invalid")
class TestSettingsSchema:
"""测试 SettingsSchema"""
@@ -113,16 +121,22 @@ class TestSettingsSchema:
def test_default_settings(self):
"""测试默认设置"""
settings = SettingsSchema()
assert settings.crawl_timeout == 30
assert settings.validation_timeout == 10
assert settings.validation_timeout == 6
assert settings.default_concurrency == 120
assert settings.auto_validate is True
assert settings.auto_validate_after_crawl is False
def test_custom_settings(self):
"""测试自定义设置"""
settings = SettingsSchema(crawl_timeout=60, auto_validate=False)
assert settings.crawl_timeout == 60
settings = SettingsSchema(validation_timeout=25, auto_validate=False)
assert settings.validation_timeout == 25
assert settings.auto_validate is False
def test_settings_schema_ignores_unknown_fields(self):
s = SettingsSchema.model_validate({"validation_timeout": 10, "crawl_timeout": 99})
assert "crawl_timeout" not in s.model_dump()
assert s.validation_timeout == 10
class TestBatchDeleteRequest:
"""测试 BatchDeleteRequest"""


@@ -81,23 +81,25 @@ class TestProxyRepository:
@pytest.mark.asyncio
async def test_iter_batches(self, db, proxy_repo):
"""测试流式分批读取"""
# 插入 5 条测试数据
"""测试流式分批读取(与库内已有数据共存,只校验增量与分批形状)"""
async with db.execute("SELECT COUNT(*) FROM proxies") as c:
before = (await c.fetchone())[0]
for i in range(5):
await proxy_repo.insert_or_update(db, f"192.168.1.{i}", 8000 + i, "http", 10)
await proxy_repo.insert_or_update(db, f"192.168.99.{i}", 8000 + i, "http", 10)
async with db.execute("SELECT COUNT(*) FROM proxies") as c:
after = (await c.fetchone())[0]
assert after == before + 5
batches = []
async for batch in proxy_repo.iter_batches(db, batch_size=2):
batches.append(batch)
assert len(batches) == 3
assert len(batches[0]) == 2
assert len(batches[1]) == 2
assert len(batches[2]) == 1
assert sum(len(b) for b in batches) == after
assert len(batches[-1]) in (1, 2)
assert all(len(b) <= 2 for b in batches)
# 清理
for i in range(5):
await proxy_repo.delete(db, f"192.168.1.{i}", 8000 + i)
await proxy_repo.delete(db, f"192.168.99.{i}", 8000 + i)
@pytest.mark.asyncio
async def test_batch_delete(self, db, proxy_repo):
@@ -121,6 +123,38 @@ class TestProxyRepository:
"""测试获取统计信息"""
stats = await proxy_repo.get_stats(db)
assert "total" in stats
assert "pending" in stats
assert "available" in stats
assert "avg_score" in stats
assert "http_count" in stats
@pytest.mark.asyncio
async def test_get_today_new_count_only_validated_available(self, db, proxy_repo):
"""今日新增不计待验证;仅今日创建且 validated=1、score>0"""
base = await proxy_repo.get_today_new_count(db)
await proxy_repo.upsert_from_crawl(db, "192.168.88.20", 9020, "http", 0)
assert await proxy_repo.get_today_new_count(db) == base
await proxy_repo.insert_or_update(db, "192.168.88.21", 9021, "http", 55)
assert await proxy_repo.get_today_new_count(db) == base + 1
await proxy_repo.delete(db, "192.168.88.20", 9020)
await proxy_repo.delete(db, "192.168.88.21", 9021)
@pytest.mark.asyncio
async def test_upsert_many_from_crawl(self, db, proxy_repo):
from app.models.domain import ProxyRaw
raws = [
ProxyRaw("10.0.0.1", 18080, "http"),
ProxyRaw("10.0.0.2", 18081, "socks5"),
]
await proxy_repo.upsert_many_from_crawl(db, raws, 0)
await db.commit()
p1 = await proxy_repo.get_by_ip_port(db, "10.0.0.1", 18080)
assert p1 is not None
assert p1.validated == 0
p2 = await proxy_repo.get_by_ip_port(db, "10.0.0.2", 18081)
assert p2.protocol == "socks5"
await proxy_repo.delete(db, "10.0.0.1", 18080)
await proxy_repo.delete(db, "10.0.0.2", 18081)