/**
* 爬虫检测中间件
* 用于识别和区分搜索引擎爬虫和正常用户访问
*/
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
// Lowercase User-Agent substrings that identify common search-engine crawlers.
// Entries are grouped per engine and flattened into a single list; the order
// of the resulting array is the order in which the groups are written here.
const SEARCH_ENGINE_BOTS = [
  // Google
  ['googlebot', 'googlebot-image', 'googlebot-news', 'mediapartners-google', 'adsbot-google'],
  // Baidu
  ['baiduspider', 'baiduspider-mobile'],
  // Bing
  ['bingbot', 'msnbot'],
  // 360 Search
  ['360spider'],
  // Sogou
  ['sogou spider', 'sogou-orion'],
  // Yahoo
  ['slurp', 'yahoo'],
  // Yandex
  ['yandex'],
  // DuckDuckGo
  ['duckduckbot'],
  // Toutiao (ByteDance)
  ['bytespider'],
  // Shenma
  ['yisouspider'],
  // Generic crawler / command-line client markers
  ['spider', 'crawl', 'bot', 'curl', 'wget'],
].flat();
/**
 * Tells whether a User-Agent string belongs to a known crawler.
 *
 * The check is a case-insensitive substring match against every entry of
 * SEARCH_ENGINE_BOTS.
 *
 * @param {unknown} userAgent - Raw User-Agent header value.
 * @returns {boolean} true when any known bot signature occurs in the value;
 *   false for empty, missing or non-string input.
 */
export function isCrawler(userAgent) {
  // Anything that is not a non-empty string cannot match a signature. The
  // typeof guard also keeps truthy non-strings (numbers, objects, arrays)
  // from throwing on .toLowerCase().
  if (typeof userAgent !== 'string' || userAgent === '') {
    return false;
  }

  const normalizedUa = userAgent.toLowerCase();
  return SEARCH_ENGINE_BOTS.some((signature) => normalizedUa.includes(signature));
}
/**
 * Builds an Express-style middleware that answers crawler requests with a
 * pre-rendered HTML page and lets every other request fall through.
 *
 * The request path is mapped to a file name by dropping the leading slash and
 * replacing every run of slashes and every colon with "-"; an empty result
 * becomes "index". The file `<name>.html` is then looked up inside the static
 * pages directory.
 *
 * @param {object} [options]
 * @param {string} [options.staticPagesPath='../static-pages'] - Directory of
 *   pre-rendered pages, joined onto the directory of this module.
 * @param {boolean} [options.debug=false] - Log detection and lookup results.
 *   (`options.fallbackPath` is still accepted but, as before, is not used.)
 * @returns {(req: object, res: object, next: Function) => void} middleware.
 */
export function crawlerMiddleware(options = {}) {
  const { staticPagesPath = '../static-pages', debug = false } = options;

  // This file is an ES module, where `require` and `__dirname` do not exist;
  // derive the module directory from import.meta.url instead. Resolved once
  // per middleware instance rather than once per request.
  const moduleDir = path.dirname(fileURLToPath(import.meta.url));
  const staticDir = path.join(moduleDir, staticPagesPath);
  const staticDirPrefix = staticDir.endsWith(path.sep) ? staticDir : `${staticDir}${path.sep}`;

  return (req, res, next) => {
    const userAgent = req.headers['user-agent'];

    if (!isCrawler(userAgent)) {
      next();
      return;
    }

    if (debug) {
      console.log(`🕷️ 检测到爬虫: ${userAgent}`);
      console.log(`📍 请求路径: ${req.path}`);
    }

    // Flatten the request path into a single file name.
    const filename =
      req.path
        .replace(/^\//, '')
        .replace(/\/+/g, '-')
        .replace(/:/g, '-') || 'index';
    const filepath = path.join(staticDir, `${filename}.html`);

    // The path comes from the client: never read outside the static directory
    // (backslashes are path separators on Windows and are not flattened above).
    if (!filepath.startsWith(staticDirPrefix)) {
      next();
      return;
    }

    let html;
    try {
      // Read directly instead of exists-then-read so a file removed between
      // the two calls cannot surface as an unexpected error.
      html = fs.readFileSync(filepath, 'utf-8');
    } catch (err) {
      if (err.code !== 'ENOENT' && err.code !== 'ENOTDIR') {
        throw err;
      }
      if (debug) {
        console.log(`⚠️ 静态页面不存在: ${filepath}`);
      }
      // No pre-rendered page for this route: continue with normal handling.
      next();
      return;
    }

    if (debug) {
      console.log(`📄 返回静态页面: ${filepath}`);
    }
    res.setHeader('Content-Type', 'text/html; charset=utf-8');
    res.send(html);
  };
}
/**
 * Generates an Nginx server configuration that serves the SPA to regular
 * visitors and pre-rendered static pages to crawlers.
 *
 * Layout of the generated server block:
 *  - bot detection sets `$is_bot` from the User-Agent;
 *  - `/api/` is proxied and declared with `^~` so regex locations cannot
 *    shadow it;
 *  - the static-asset regex location is declared before the catch-all regex
 *    location, because nginx uses the first matching regex location;
 *  - crawler requests are rewritten to an internal location that runs
 *    `try_files` against the static root. `try_files` is not permitted inside
 *    an `if` block, so the `if` blocks only issue `rewrite ... last`.
 *
 * @param {object} [options]
 * @param {string} [options.serverName='www.tianyuancha.cn'] - server_name, also used in log file names.
 * @param {string} [options.spaRoot='/var/www/tyc-webview-v2/dist'] - Root of the built SPA.
 * @param {string} [options.staticRoot='/var/www/tyc-webview-v2/static-pages'] - Root of the pre-rendered pages.
 * @param {boolean} [options.sslEnabled=false] - Listen on 443 with TLS and add an HTTP->HTTPS redirect server.
 * @param {string} [options.sslCertPath=''] - Certificate path (used when sslEnabled).
 * @param {string} [options.sslKeyPath=''] - Private key path (used when sslEnabled).
 * @returns {string} The configuration text, stamped with the generation time.
 */
export function generateNginxConfig(options = {}) {
  const {
    serverName = 'www.tianyuancha.cn',
    spaRoot = '/var/www/tyc-webview-v2/dist',
    staticRoot = '/var/www/tyc-webview-v2/static-pages',
    sslEnabled = false,
    sslCertPath = '',
    sslKeyPath = '',
  } = options;

  const listenPort = sslEnabled ? '443 ssl' : '80';

  const sslConfig = sslEnabled
    ? `
    ssl_certificate ${sslCertPath};
    ssl_certificate_key ${sslKeyPath};
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;
`
    : '';

  const httpRedirect = sslEnabled
    ? `
# HTTP 重定向到 HTTPS
server {
    listen 80;
    server_name ${serverName};
    return 301 https://$server_name$request_uri;
}
`
    : '';

  // Note: nginx variables such as $uri or $1 are literal text here; only
  // ${...} sequences are JavaScript interpolations.
  return `# Nginx 配置文件 - 天远查 SPA + SEO 静态页面
# 自动生成时间: ${new Date().toLocaleString('zh-CN')}
${httpRedirect}
server {
    listen ${listenPort};
    server_name ${serverName};
    charset utf-8;
${sslConfig}
    # 日志配置
    access_log /var/log/nginx/${serverName}-access.log;
    error_log /var/log/nginx/${serverName}-error.log;

    # 爬虫检测
    set $is_bot 0;
    # Google
    if ($http_user_agent ~* (googlebot|googlebot-image|googlebot-news|mediapartners-google|adsbot-google)) {
        set $is_bot 1;
    }
    # 百度
    if ($http_user_agent ~* (baiduspider|baiduspider-mobile)) {
        set $is_bot 1;
    }
    # 必应
    if ($http_user_agent ~* (bingbot|msnbot)) {
        set $is_bot 1;
    }
    # 360
    if ($http_user_agent ~* "360spider") {
        set $is_bot 1;
    }
    # 搜狗
    if ($http_user_agent ~* "(sogou spider|sogou-orion)") {
        set $is_bot 1;
    }
    # 其他爬虫
    if ($http_user_agent ~* "(spider|crawl|bot|slurp|yandex|duckduckbot|bytespider|yisouspider)") {
        set $is_bot 1;
    }

    # API 代理
    # "^~" keeps the regex locations below from taking over /api/ requests.
    location ^~ /api/ {
        proxy_pass http://localhost:8080;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection 'upgrade';
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_cache_bypass $http_upgrade;
    }

    # 静态资源缓存
    # Must precede the catch-all regex location: the first matching regex wins.
    location ~* \\.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
        root ${spaRoot};
        expires 1y;
        add_header Cache-Control "public, immutable";
    }

    # Internal target for crawler requests (reachable only through rewrite).
    location ~ ^/__seo_static__/(.+)$ {
        internal;
        root ${staticRoot};
        try_files /$1 /index.html =404;
    }

    # 根路径处理
    location = / {
        if ($is_bot = 1) {
            rewrite ^ /__seo_static__/index.html last;
        }
        root ${spaRoot};
        try_files $uri $uri/ /index.html;
    }

    # 静态页面处理
    location ~ ^/([^/]+)(/.*)?$ {
        # 将路径转换为文件名
        if ($is_bot = 1) {
            rewrite ^/([^/]+) /__seo_static__/$1.html last;
        }
        root ${spaRoot};
        try_files $uri $uri/ /index.html;
    }

    # SPA 路由回退
    location / {
        root ${spaRoot};
        try_files $uri $uri/ /index.html;
        add_header Cache-Control "no-cache";
    }

    # Gzip 压缩
    gzip on;
    gzip_vary on;
    gzip_min_length 1024;
    gzip_types text/plain text/css text/xml text/javascript application/x-javascript application/xml+rss application/javascript application/json;

    # 安全头
    add_header X-Frame-Options "SAMEORIGIN";
    add_header X-Content-Type-Options "nosniff";
    add_header X-XSS-Protection "1; mode=block";
}
`;
}
/**
 * Generates an Apache `.htaccess` file that serves pre-rendered static pages
 * to crawlers and the SPA entry point to everyone else.
 *
 * A RewriteCond only guards the single RewriteRule that follows it, so the
 * crawler detection is done once and stored in the IS_BOT environment
 * variable; every static-page rule then carries its own guards:
 *  - the request comes from a crawler,
 *  - the request is not already inside the static root (no re-rewriting),
 *  - the target static page exists (otherwise the SPA rule handles it).
 *
 * @param {object} [options]
 * @param {string} [options.staticRoot='/static-pages'] - URL path of the
 *   pre-rendered pages, relative to the document root.
 * @param {string} [options.spaRoot='/'] - URL path prefix of the SPA; expected
 *   to end with "/", since "index.html" is appended directly.
 * @returns {string} The `.htaccess` text, stamped with the generation time.
 */
export function generateHtaccessConfig(options = {}) {
  const { staticRoot = '/static-pages', spaRoot = '/' } = options;

  // staticRoot is embedded in a regular expression below; escape its metacharacters.
  const staticRootPattern = staticRoot.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Note: %{...} and $1 are Apache syntax and stay literal; only ${...}
  // sequences are JavaScript interpolations.
  return `# Apache .htaccess 配置 - 天远查 SPA + SEO 静态页面
# 自动生成时间: ${new Date().toLocaleString('zh-CN')}
# 启用重写引擎
RewriteEngine On
# 爬虫检测
RewriteCond %{HTTP_USER_AGENT} (googlebot|googlebot-image|googlebot-news|mediapartners-google|adsbot-google) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} (baiduspider|baiduspider-mobile) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} (bingbot|msnbot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} "360spider" [NC,OR]
RewriteCond %{HTTP_USER_AGENT} "(sogou spider|sogou-orion)" [NC,OR]
RewriteCond %{HTTP_USER_AGENT} (spider|crawl|bot|slurp|yandex|duckduckbot|bytespider|yisouspider) [NC]
RewriteRule ^ - [E=IS_BOT:1]
# 爬虫访问静态页面
RewriteCond %{ENV:IS_BOT} =1
RewriteCond %{REQUEST_URI} !^${staticRootPattern}/
RewriteCond %{DOCUMENT_ROOT}${staticRoot}/index.html -f
RewriteRule ^$ ${staticRoot}/index.html [L]
RewriteCond %{ENV:IS_BOT} =1
RewriteCond %{REQUEST_URI} !^${staticRootPattern}/
RewriteCond %{DOCUMENT_ROOT}${staticRoot}/$1.html -f
RewriteRule ^([^/]+)(/.*)?$ ${staticRoot}/$1.html [L]
# 正常用户访问 SPA
RewriteCond %{REQUEST_FILENAME} !-f
RewriteCond %{REQUEST_FILENAME} !-d
RewriteRule . ${spaRoot}index.html [L]
# Gzip 压缩
AddOutputFilterByType DEFLATE text/html text/plain text/xml text/css text/javascript application/javascript application/json
# 缓存控制
ExpiresActive On
ExpiresByType image/jpg "access plus 1 year"
ExpiresByType image/jpeg "access plus 1 year"
ExpiresByType image/gif "access plus 1 year"
ExpiresByType image/png "access plus 1 year"
ExpiresByType text/css "access plus 1 month"
ExpiresByType application/javascript "access plus 1 month"
# 安全头
Header set X-Frame-Options "SAMEORIGIN"
Header set X-Content-Type-Options "nosniff"
Header set X-XSS-Protection "1; mode=block"
`;
}
// Also expose the crawler User-Agent signature list as a named export
export { SEARCH_ENGINE_BOTS };