Files
qnc-webview-v3/server/crawler-detector.js
2026-02-28 14:33:47 +08:00

171 lines
4.3 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* 爬虫检测模块
* 用于识别搜索引擎爬虫和社交媒体爬虫
*/
class CrawlerDetector {
constructor() {
// 常见搜索引擎爬虫User-Agent列表
this.crawlerPatterns = [
// 百度爬虫
'baiduspider',
'baiduspider-mobile',
'baiduspider-image',
'baiduspider-video',
'baiduspider-news',
'baiduboxapp',
// Google爬虫
'googlebot',
'googlebot-image',
'googlebot-news',
'googlebot-mobile',
'googlebot-video',
'google-web-snippet',
// 360搜索
'360spider',
'soha-agent',
'haosouspider',
// 搜狗搜索
'sogou spider',
'sogou news spider',
'sogou orion spider',
'sogou-blog',
// 必应
'bingbot',
'msnbot',
// 雅虎
'slurp',
// 搜搜
'sosospider',
'sosoimagespider',
// 有道
'youdaobot',
'yodaobot',
// 头条搜索
'bytedance-spider',
'toutiaospider',
// 社交媒体爬虫
'facebookexternalhit',
'facebookcatalog',
'twitterbot',
'linkedinbot',
'whatsapp',
'telegrambot',
'viber',
'line',
// 其他常见爬虫
'applebot',
'semrushbot',
'ahrefsbot',
'mj12bot',
'dotbot',
'crawler',
'spider',
'bot'
]
// 需要检测的头部字段
this.crawlerHeaders = ['x-bot', 'x-crawler', 'x-forwarded-for']
}
/**
* 检测请求是否来自爬虫
* @param {Object} req - HTTP请求对象
* @returns {Boolean} 是否为爬虫
*/
isCrawler(req) {
const userAgent = req.headers['user-agent']?.toLowerCase() || ''
const headers = req.headers
// 1. 通过User-Agent检测
if (this.checkUserAgent(userAgent)) {
console.log(`[CrawlerDetector] 检测到爬虫 UA: ${userAgent}`)
return true
}
// 2. 通过特定头部检测
if (this.checkHeaders(headers)) {
console.log(`[CrawlerDetector] 检测到爬虫 Headers`)
return true
}
// 3. 通过IP地址检测可选
// if (this.checkIP(req.connection.remoteAddress)) {
// return true
// }
return false
}
/**
* 检查User-Agent
* @param {String} userAgent
* @returns {Boolean}
*/
checkUserAgent(userAgent) {
if (!userAgent) return false
return this.crawlerPatterns.some(pattern => {
return userAgent.includes(pattern.toLowerCase())
})
}
/**
* 检查请求头
* @param {Object} headers
* @returns {Boolean}
*/
checkHeaders(headers) {
for (const header of this.crawlerHeaders) {
const headerValue = headers[header]?.toLowerCase()
if (headerValue && (headerValue.includes('bot') || headerValue.includes('crawler'))) {
return true
}
}
return false
}
/**
* 检查IP地址是否为已知爬虫IP
* @param {String} ip
* @returns {Boolean}
*/
checkIP(ip) {
// 这里可以添加已知爬虫IP段的检测
// 需要定期更新爬虫IP列表
return false
}
/**
* 获取爬虫类型
* @param {String} userAgent
* @returns {String} 爬虫类型
*/
getCrawlerType(userAgent) {
const ua = userAgent.toLowerCase()
if (ua.includes('baiduspider')) return 'baidu'
if (ua.includes('googlebot')) return 'google'
if (ua.includes('bingbot') || ua.includes('msnbot')) return 'bing'
if (ua.includes('360spider')) return '360'
if (ua.includes('sogou spider')) return 'sogou'
if (ua.includes('facebookexternalhit')) return 'facebook'
if (ua.includes('twitterbot')) return 'twitter'
if (ua.includes('linkedinbot')) return 'linkedin'
return 'unknown'
}
}
module.exports = CrawlerDetector