/**
 * 爬虫检测模块
 * 用于识别搜索引擎爬虫和社交媒体爬虫
 */

class CrawlerDetector {
  /**
   * Initializes the pattern list and the header names used for detection.
   */
  constructor() {
    // Known crawler User-Agent substrings. All entries are stored lowercase
    // and matched by substring against the lowercased User-Agent header.
    // NOTE(review): the trailing generic entries ('crawler', 'spider', 'bot',
    // 'line') are very broad and can flag legitimate clients whose UA happens
    // to contain them — confirm the intended false-positive tolerance.
    this.crawlerPatterns = [
      // Baidu crawlers
      'baiduspider',
      'baiduspider-mobile',
      'baiduspider-image',
      'baiduspider-video',
      'baiduspider-news',
      'baiduboxapp',

      // Google crawlers
      'googlebot',
      'googlebot-image',
      'googlebot-news',
      'googlebot-mobile',
      'googlebot-video',
      'google-web-snippet',

      // 360 Search
      '360spider',
      'soha-agent',
      'haosouspider',

      // Sogou
      'sogou spider',
      'sogou news spider',
      'sogou orion spider',
      'sogou-blog',

      // Bing
      'bingbot',
      'msnbot',

      // Yahoo
      'slurp',

      // Soso
      'sosospider',
      'sosoimagespider',

      // Youdao
      'youdaobot',
      'yodaobot',

      // Toutiao / ByteDance
      'bytedance-spider',
      'toutiaospider',

      // Social media crawlers
      'facebookexternalhit',
      'facebookcatalog',
      'twitterbot',
      'linkedinbot',
      'whatsapp',
      'telegrambot',
      'viber',
      'line',

      // Other common crawlers / SEO tools
      'applebot',
      'semrushbot',
      'ahrefsbot',
      'mj12bot',
      'dotbot',
      'crawler',
      'spider',
      'bot'
    ]

    // Header names inspected by checkHeaders().
    // NOTE(review): 'x-forwarded-for' normally carries IP addresses, so it is
    // unlikely to ever contain 'bot'/'crawler' — confirm this entry is intentional.
    this.crawlerHeaders = ['x-bot', 'x-crawler', 'x-forwarded-for']
  }

  /**
   * Detect whether a request comes from a crawler.
   * @param {Object} req - HTTP request object (must expose req.headers)
   * @returns {Boolean} true when the request looks like a crawler
   */
  isCrawler(req) {
    const userAgent = req.headers['user-agent']?.toLowerCase() || ''
    const headers = req.headers

    // 1. User-Agent based detection
    if (this.checkUserAgent(userAgent)) {
      console.log(`[CrawlerDetector] 检测到爬虫 UA: ${userAgent}`)
      return true
    }

    // 2. Dedicated crawler-hint headers
    if (this.checkHeaders(headers)) {
      console.log(`[CrawlerDetector] 检测到爬虫 Headers`)
      return true
    }

    // 3. IP based detection (optional, checkIP is currently a stub)
    // if (this.checkIP(req.connection.remoteAddress)) {
    //   return true
    // }

    return false
  }

  /**
   * Check a User-Agent string against the known crawler patterns.
   * @param {String} userAgent - User-Agent value (expected lowercase; isCrawler lowercases it)
   * @returns {Boolean} true when any pattern occurs as a substring
   */
  checkUserAgent(userAgent) {
    if (!userAgent) return false
    // Patterns are stored lowercase, so the per-call toLowerCase() the
    // original did on each pattern was redundant work.
    return this.crawlerPatterns.some(pattern => userAgent.includes(pattern))
  }

  /**
   * Check watched request headers for crawler hints.
   * @param {Object} headers - header name → value map (names assumed lowercase,
   *   as Node's http module normalizes them — TODO confirm for other servers)
   * @returns {Boolean} true when a watched header value mentions 'bot' or 'crawler'
   */
  checkHeaders(headers) {
    for (const header of this.crawlerHeaders) {
      const headerValue = headers[header]?.toLowerCase()
      if (headerValue && (headerValue.includes('bot') || headerValue.includes('crawler'))) {
        return true
      }
    }
    return false
  }

  /**
   * Check whether an IP address belongs to a known crawler.
   * Currently a stub: always returns false. A real implementation needs a
   * regularly refreshed list of crawler IP ranges.
   * @param {String} ip
   * @returns {Boolean}
   */
  checkIP(ip) {
    return false
  }

  /**
   * Identify the crawler family from a User-Agent string.
   * @param {String} userAgent - raw User-Agent value (any case; may be missing)
   * @returns {String} crawler type key ('baidu', 'google', ...) or 'unknown'
   */
  getCrawlerType(userAgent) {
    // Bug fix: the original called userAgent.toLowerCase() unguarded and
    // threw a TypeError when the User-Agent header was absent.
    if (!userAgent) return 'unknown'
    const ua = userAgent.toLowerCase()

    if (ua.includes('baiduspider')) return 'baidu'
    if (ua.includes('googlebot')) return 'google'
    if (ua.includes('bingbot') || ua.includes('msnbot')) return 'bing'
    if (ua.includes('360spider')) return '360'
    if (ua.includes('sogou spider')) return 'sogou'
    if (ua.includes('facebookexternalhit')) return 'facebook'
    if (ua.includes('twitterbot')) return 'twitter'
    if (ua.includes('linkedinbot')) return 'linkedin'

    return 'unknown'
  }
}

// CommonJS export: consumers obtain the class via require() and instantiate it themselves.
module.exports = CrawlerDetector