Files
tydata-webview-v2/server/crawler-detector.js

171 lines
4.3 KiB
JavaScript
Raw Permalink Normal View History

2026-02-28 12:00:02 +08:00
/**
* 爬虫检测模块
* 用于识别搜索引擎爬虫和社交媒体爬虫
*/
class CrawlerDetector {
constructor() {
// 常见搜索引擎爬虫User-Agent列表
this.crawlerPatterns = [
// 百度爬虫
'baiduspider',
'baiduspider-mobile',
'baiduspider-image',
'baiduspider-video',
'baiduspider-news',
'baiduboxapp',
// Google爬虫
'googlebot',
'googlebot-image',
'googlebot-news',
'googlebot-mobile',
'googlebot-video',
'google-web-snippet',
// 360搜索
'360spider',
'soha-agent',
'haosouspider',
// 搜狗搜索
'sogou spider',
'sogou news spider',
'sogou orion spider',
'sogou-blog',
// 必应
'bingbot',
'msnbot',
// 雅虎
'slurp',
// 搜搜
'sosospider',
'sosoimagespider',
// 有道
'youdaobot',
'yodaobot',
// 头条搜索
'bytedance-spider',
'toutiaospider',
// 社交媒体爬虫
'facebookexternalhit',
'facebookcatalog',
'twitterbot',
'linkedinbot',
'whatsapp',
'telegrambot',
'viber',
'line',
// 其他常见爬虫
'applebot',
'semrushbot',
'ahrefsbot',
'mj12bot',
'dotbot',
'crawler',
'spider',
'bot'
]
// 需要检测的头部字段
this.crawlerHeaders = ['x-bot', 'x-crawler', 'x-forwarded-for']
}
/**
* 检测请求是否来自爬虫
* @param {Object} req - HTTP请求对象
* @returns {Boolean} 是否为爬虫
*/
isCrawler(req) {
const userAgent = req.headers['user-agent']?.toLowerCase() || ''
const headers = req.headers
// 1. 通过User-Agent检测
if (this.checkUserAgent(userAgent)) {
console.log(`[CrawlerDetector] 检测到爬虫 UA: ${userAgent}`)
return true
}
// 2. 通过特定头部检测
if (this.checkHeaders(headers)) {
console.log(`[CrawlerDetector] 检测到爬虫 Headers`)
return true
}
// 3. 通过IP地址检测可选
// if (this.checkIP(req.connection.remoteAddress)) {
// return true
// }
return false
}
/**
* 检查User-Agent
* @param {String} userAgent
* @returns {Boolean}
*/
checkUserAgent(userAgent) {
if (!userAgent) return false
return this.crawlerPatterns.some(pattern => {
return userAgent.includes(pattern.toLowerCase())
})
}
/**
* 检查请求头
* @param {Object} headers
* @returns {Boolean}
*/
checkHeaders(headers) {
for (const header of this.crawlerHeaders) {
const headerValue = headers[header]?.toLowerCase()
if (headerValue && (headerValue.includes('bot') || headerValue.includes('crawler'))) {
return true
}
}
return false
}
/**
* 检查IP地址是否为已知爬虫IP
* @param {String} ip
* @returns {Boolean}
*/
checkIP(ip) {
// 这里可以添加已知爬虫IP段的检测
// 需要定期更新爬虫IP列表
return false
}
/**
* 获取爬虫类型
* @param {String} userAgent
* @returns {String} 爬虫类型
*/
getCrawlerType(userAgent) {
const ua = userAgent.toLowerCase()
if (ua.includes('baiduspider')) return 'baidu'
if (ua.includes('googlebot')) return 'google'
if (ua.includes('bingbot') || ua.includes('msnbot')) return 'bing'
if (ua.includes('360spider')) return '360'
if (ua.includes('sogou spider')) return 'sogou'
if (ua.includes('facebookexternalhit')) return 'facebook'
if (ua.includes('twitterbot')) return 'twitter'
if (ua.includes('linkedinbot')) return 'linkedin'
return 'unknown'
}
}
module.exports = CrawlerDetector