/**
 * Crawler detection middleware.
 * Identifies search-engine crawlers and tells them apart from regular user traffic.
 */

import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

/**
 * Lowercase User-Agent fragments that mark a request as coming from a crawler.
 *
 * `isCrawler` lowercases the incoming User-Agent and checks whether it contains
 * any of these fragments, so every entry must itself be lowercase. The generic
 * entries at the end ('spider', 'crawl', 'bot') also match many of the specific
 * names listed above them.
 */
const SEARCH_ENGINE_BOTS = [
  // Google
  'googlebot',
  'googlebot-image',
  'googlebot-news',
  'mediapartners-google',
  'adsbot-google',

  // Baidu
  'baiduspider',
  'baiduspider-mobile',

  // Bing
  'bingbot',
  'msnbot',

  // 360 Search
  '360spider',

  // Sogou
  'sogou spider',
  'sogou-orion',

  // Yahoo
  'slurp',
  'yahoo',

  // Yandex
  'yandex',

  // DuckDuckGo
  'duckduckbot',

  // Toutiao (ByteDance)
  'bytespider',

  // Shenma
  'yisouspider',

  // Generic crawlers and command-line HTTP clients
  'spider',
  'crawl',
  'bot',
  'curl',
  'wget',
];

/**
 * Report whether a User-Agent string belongs to a known crawler.
 *
 * The match is a case-insensitive substring test against every entry of
 * `SEARCH_ENGINE_BOTS`.
 *
 * @param {string} [userAgent] Raw `User-Agent` header value.
 * @returns {boolean} `true` when the value contains a known crawler fragment;
 *   `false` for a missing, empty, or non-string value.
 */
export function isCrawler(userAgent) {
  // Anything that is not a non-empty string cannot be matched; returning false
  // here also avoids calling toLowerCase() on a non-string value.
  if (typeof userAgent !== 'string' || userAgent === '') {
    return false;
  }

  const ua = userAgent.toLowerCase();

  return SEARCH_ENGINE_BOTS.some((bot) => ua.includes(bot));
}

/**
 * Create an Express middleware that answers crawler requests with pre-rendered HTML.
 *
 * For requests whose User-Agent is recognized by `isCrawler`, the request path is
 * flattened into a file name (leading slash dropped, remaining slashes and colons
 * replaced by '-', empty path -> 'index') and `<name>.html` is looked up in the
 * static pages directory. When the file exists it is sent as `text/html`; in every
 * other case the request is passed on with `next()`.
 *
 * @param {object} [options]
 * @param {string} [options.staticPagesPath='../static-pages'] Directory holding the
 *   pre-rendered pages; it is joined onto the directory of this module.
 * @param {boolean} [options.debug=false] Log detection and lookup results to the console.
 * @returns {(req: object, res: object, next: Function) => void} Express-style middleware.
 *
 * Note: `options.fallbackPath` is still accepted but is not used by this middleware.
 */
export function crawlerMiddleware(options = {}) {
  const { staticPagesPath = '../static-pages', debug = false } = options;

  // ES modules provide neither `require` nor `__dirname`, so the module directory
  // is derived from `import.meta.url`; `fs` and `path` come from file-level imports.
  // The directory does not depend on the request, so it is resolved once.
  const moduleDir = path.dirname(fileURLToPath(import.meta.url));
  const staticDir = path.join(moduleDir, staticPagesPath);

  return (req, res, next) => {
    const userAgent = req.headers['user-agent'];

    // Regular visitors go straight to the next handler.
    if (!isCrawler(userAgent)) {
      next();
      return;
    }

    if (debug) {
      console.log(`🕷️  检测到爬虫: ${userAgent}`);
      console.log(`📍 请求路径: ${req.path}`);
    }

    // Flatten the URL path into a file name: '/a/b:c' -> 'a-b-c', '/' -> 'index'.
    const filename =
      req.path
        .replace(/^\//, '')
        .replace(/\/+/g, '-')
        .replace(/:/g, '-') || 'index';

    const filepath = path.join(staticDir, `${filename}.html`);

    // The path comes from the client. Forward slashes are flattened above, but
    // other separators (e.g. backslashes on Windows) are not, so refuse anything
    // that resolves outside the static directory.
    const relative = path.relative(staticDir, filepath);
    if (relative.startsWith(`..${path.sep}`) || path.isAbsolute(relative)) {
      next();
      return;
    }

    // Read directly instead of existsSync + readFileSync: a file removed between
    // the two calls would otherwise surface as an unexpected error.
    let html;
    try {
      html = fs.readFileSync(filepath, 'utf-8');
    } catch (err) {
      if (err.code === 'ENOENT' || err.code === 'ENOTDIR') {
        if (debug) {
          console.log(`⚠️  静态页面不存在: ${filepath}`);
        }
        // No pre-rendered page for this path: let the rest of the app handle it.
        next();
      } else {
        next(err);
      }
      return;
    }

    if (debug) {
      console.log(`📄 返回静态页面: ${filepath}`);
    }

    res.setHeader('Content-Type', 'text/html; charset=utf-8');
    res.send(html);
  };
}

/**
 * Generate an nginx configuration that serves the SPA to regular visitors and
 * pre-rendered static pages to crawlers.
 *
 * The returned text contains `server` blocks only, so it is meant to be included
 * from the `http` context of the main nginx configuration.
 *
 * Layout of the generated server block:
 *  - Crawlers are flagged in `$is_bot` by User-Agent.
 *  - Flagged requests are rewritten to the internal `/__prerendered/` location,
 *    which serves `<first path segment>.html` (or `index.html`) from `staticRoot`.
 *    `try_files` is not permitted inside `if`, so the `if` blocks only rewrite.
 *  - `/api/` uses the `^~` modifier and the static-asset regex is emitted before
 *    the catch-all regex; otherwise the catch-all regex would win for both.
 *
 * @param {object} [options]
 * @param {string} [options.serverName='www.tianyuancha.cn'] Value for `server_name`;
 *   also used in the log file names.
 * @param {string} [options.spaRoot='/var/www/tyc-webview-v2/dist'] Document root of the SPA build.
 * @param {string} [options.staticRoot='/var/www/tyc-webview-v2/static-pages'] Directory
 *   with the pre-rendered pages.
 * @param {boolean} [options.sslEnabled=false] Listen on 443 with TLS and add an
 *   HTTP -> HTTPS redirect server.
 * @param {string} [options.sslCertPath=''] Certificate path; written verbatim into
 *   `ssl_certificate`, so it must be provided when `sslEnabled` is true.
 * @param {string} [options.sslKeyPath=''] Private key path; written verbatim into
 *   `ssl_certificate_key`, so it must be provided when `sslEnabled` is true.
 * @returns {string} The nginx configuration text. It embeds the generation time,
 *   so two calls do not necessarily return identical text.
 */
export function generateNginxConfig(options = {}) {
  const {
    serverName = 'www.tianyuancha.cn',
    spaRoot = '/var/www/tyc-webview-v2/dist',
    staticRoot = '/var/www/tyc-webview-v2/static-pages',
    sslEnabled = false,
    sslCertPath = '',
    sslKeyPath = '',
  } = options;

  const listenPort = sslEnabled ? '443 ssl' : '80';

  const sslConfig = sslEnabled ? `
    ssl_certificate ${sslCertPath};
    ssl_certificate_key ${sslKeyPath};
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;
` : '';

  const httpRedirect = sslEnabled ? `
# HTTP 重定向到 HTTPS
server {
    listen 80;
    server_name ${serverName};
    return 301 https://$server_name$request_uri;
}
` : '';

  return `# Nginx 配置文件 - 天远查 SPA + SEO 静态页面
# 自动生成时间: ${new Date().toLocaleString('zh-CN')}

${httpRedirect}

server {
    listen ${listenPort};
    server_name ${serverName};
    charset utf-8;

${sslConfig}
    # 日志配置
    access_log /var/log/nginx/${serverName}-access.log;
    error_log /var/log/nginx/${serverName}-error.log;

    # 爬虫检测
    set $is_bot 0;

    # Google
    if ($http_user_agent ~* (googlebot|googlebot-image|googlebot-news|mediapartners-google|adsbot-google)) {
        set $is_bot 1;
    }

    # 百度
    if ($http_user_agent ~* (baiduspider|baiduspider-mobile)) {
        set $is_bot 1;
    }

    # 必应
    if ($http_user_agent ~* (bingbot|msnbot)) {
        set $is_bot 1;
    }

    # 360
    if ($http_user_agent ~* "360spider") {
        set $is_bot 1;
    }

    # 搜狗
    if ($http_user_agent ~* "(sogou spider|sogou-orion)") {
        set $is_bot 1;
    }

    # 其他爬虫
    if ($http_user_agent ~* "(spider|crawl|bot|slurp|yandex|duckduckbot|bytespider|yisouspider)") {
        set $is_bot 1;
    }

    # 根路径处理
    location = / {
        if ($is_bot = 1) {
            rewrite ^ /__prerendered/index.html last;
        }
        root ${spaRoot};
        try_files $uri $uri/ /index.html;
    }

    # Pre-rendered pages for crawlers (reachable through internal rewrites only)
    location ^~ /__prerendered/ {
        internal;
        root ${staticRoot};
        rewrite ^/__prerendered(/.*)$ $1 break;
        try_files $uri /index.html =404;
    }

    # 静态资源缓存
    location ~* \\.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
        root ${spaRoot};
        expires 1y;
        add_header Cache-Control "public, immutable";
    }

    # 静态页面处理
    location ~ ^/([^/]+)(/.*)?$ {
        if ($is_bot = 1) {
            rewrite ^/([^/]+) /__prerendered/$1.html last;
        }
        root ${spaRoot};
        try_files $uri $uri/ /index.html;
    }

    # SPA 路由回退
    location / {
        root ${spaRoot};
        try_files $uri $uri/ /index.html;
        add_header Cache-Control "no-cache";
    }

    # API 代理
    location ^~ /api/ {
        proxy_pass http://localhost:8080;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection 'upgrade';
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_cache_bypass $http_upgrade;
    }

    # Gzip 压缩
    gzip on;
    gzip_vary on;
    gzip_min_length 1024;
    gzip_types text/plain text/css text/xml text/javascript application/x-javascript application/xml+rss application/javascript application/json;

    # 安全头
    add_header X-Frame-Options "SAMEORIGIN";
    add_header X-Content-Type-Options "nosniff";
    add_header X-XSS-Protection "1; mode=block";
}
`;
}

/**
 * Generate an Apache `.htaccess` file that serves the SPA to regular visitors
 * and pre-rendered static pages to crawlers.
 *
 * A `RewriteCond` chain only applies to the single `RewriteRule` that follows it,
 * so the User-Agent chain is used once to set the `IS_BOT` environment variable
 * and each static-page rule then checks that variable. Each static-page rule also
 * requires an empty `REDIRECT_STATUS`, so a URL that has already been rewritten is
 * not rewritten again when the per-directory rules are re-run.
 *
 * @param {object} [options]
 * @param {string} [options.staticRoot='/static-pages'] URL path of the directory
 *   holding the pre-rendered pages (no trailing slash).
 * @param {string} [options.spaRoot='/'] URL path of the SPA (with trailing slash);
 *   `index.html` is appended directly to it.
 * @returns {string} The `.htaccess` text. It embeds the generation time, so two
 *   calls do not necessarily return identical text.
 */
export function generateHtaccessConfig(options = {}) {
  const { staticRoot = '/static-pages', spaRoot = '/' } = options;

  return `# Apache .htaccess 配置 - 天远查 SPA + SEO 静态页面
# 自动生成时间: ${new Date().toLocaleString('zh-CN')}

# 启用重写引擎
RewriteEngine On

# 爬虫检测
RewriteCond %{HTTP_USER_AGENT} (googlebot|googlebot-image|googlebot-news|mediapartners-google|adsbot-google) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} (baiduspider|baiduspider-mobile) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} (bingbot|msnbot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} "360spider" [NC,OR]
RewriteCond %{HTTP_USER_AGENT} "(sogou spider|sogou-orion)" [NC,OR]
RewriteCond %{HTTP_USER_AGENT} (spider|crawl|bot|slurp|yandex|duckduckbot|bytespider|yisouspider) [NC]
RewriteRule ^ - [E=IS_BOT:1]

# 爬虫访问静态页面
RewriteCond %{ENV:IS_BOT} =1
RewriteCond %{ENV:REDIRECT_STATUS} ^$
RewriteRule ^$ ${staticRoot}/index.html [L]
RewriteCond %{ENV:IS_BOT} =1
RewriteCond %{ENV:REDIRECT_STATUS} ^$
RewriteRule ^([^/]+)/?$ ${staticRoot}/$1.html [L]
RewriteCond %{ENV:IS_BOT} =1
RewriteCond %{ENV:REDIRECT_STATUS} ^$
RewriteRule ^([^/]+)/(.+)$ ${staticRoot}/$1.html [L]

# 正常用户访问 SPA
RewriteCond %{REQUEST_FILENAME} !-f
RewriteCond %{REQUEST_FILENAME} !-d
RewriteRule . ${spaRoot}index.html [L]

# Gzip 压缩
<IfModule mod_deflate.c>
    AddOutputFilterByType DEFLATE text/html text/plain text/xml text/css text/javascript application/javascript application/json
</IfModule>

# 缓存控制
<IfModule mod_expires.c>
    ExpiresActive On
    ExpiresByType image/jpg "access plus 1 year"
    ExpiresByType image/jpeg "access plus 1 year"
    ExpiresByType image/gif "access plus 1 year"
    ExpiresByType image/png "access plus 1 year"
    ExpiresByType text/css "access plus 1 month"
    ExpiresByType application/javascript "access plus 1 month"
</IfModule>

# 安全头
<IfModule mod_headers.c>
    Header set X-Frame-Options "SAMEORIGIN"
    Header set X-Content-Type-Options "nosniff"
    Header set X-XSS-Protection "1; mode=block"
</IfModule>
`;
}

// 导出工具函数
|
||
export { SEARCH_ENGINE_BOTS };
|