Files
tyc-webview-v2/scripts/seo-static-generator/crawlerMiddleware.js
2026-02-25 11:45:21 +08:00

325 lines
8.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* 爬虫检测中间件
* 用于识别和区分搜索引擎爬虫和正常用户访问
*/
// Lowercase User-Agent substrings that identify search-engine crawlers and
// other automated clients. isCrawler() lowercases the incoming header and
// tests each entry with String.prototype.includes, so the generic entries at
// the end ('spider', 'crawl', 'bot') also match most of the vendor-specific
// names listed before them.
const SEARCH_ENGINE_BOTS = [
// Google
'googlebot',
'googlebot-image',
'googlebot-news',
'mediapartners-google',
'adsbot-google',
// Baidu
'baiduspider',
'baiduspider-mobile',
// Bing
'bingbot',
'msnbot',
// 360
'360spider',
// Sogou
'sogou spider',
'sogou-orion',
// Yahoo
'slurp',
'yahoo',
// Yandex
'yandex',
// DuckDuckGo
'duckduckbot',
// Toutiao
'bytespider',
// Shenma
'yisouspider',
// Generic crawler keywords and command-line HTTP clients
'spider',
'crawl',
'bot',
'curl',
'wget',
];
/**
 * Decide whether a User-Agent value belongs to a crawler or other automated client.
 *
 * The value is lowercased and each SEARCH_ENGINE_BOTS entry is tested as a
 * substring, so matching is case-insensitive and position-independent.
 *
 * @param {string} [userAgent] - Raw User-Agent header value.
 * @returns {boolean} True when any known signature occurs in the value;
 *   false for missing, empty, or non-string input.
 */
export function isCrawler(userAgent) {
  // The header may be absent, and callers may hand over arbitrary values;
  // only a non-empty string can be matched (anything else used to throw on
  // `.toLowerCase()` when it was truthy).
  if (typeof userAgent !== 'string' || userAgent === '') {
    return false;
  }

  const normalized = userAgent.toLowerCase();
  return SEARCH_ENGINE_BOTS.some((signature) => normalized.includes(signature));
}
/**
 * Map a request path to the base name (without extension) of its pre-rendered page:
 * one leading slash is dropped, every remaining run of slashes and every colon
 * becomes '-', and an empty result (the site root) maps to 'index'.
 */
function toStaticPageName(requestPath) {
  const name = requestPath
    .replace(/^\//, '')
    .replace(/\/+/g, '-')
    .replace(/:/g, '-');
  return name || 'index';
}

/**
 * Create an Express-style middleware that answers crawler requests with a
 * pre-rendered HTML file and lets every other request continue down the chain.
 *
 * For a crawler (as decided by isCrawler on the User-Agent header) the request
 * path is turned into `<name>.html` inside the static pages directory. When
 * that file can be read it is sent as `text/html; charset=utf-8`; when it is
 * missing the request falls through to `next()` exactly like a normal visit.
 *
 * @param {object} [options]
 * @param {string} [options.staticPagesPath='../static-pages'] - Directory of
 *   pre-rendered pages, joined onto this module's `__dirname`.
 * @param {boolean} [options.debug=false] - Log detection and lookup results.
 *   (`options.fallbackPath` is still accepted but is not used by this function.)
 * @returns {(req: object, res: object, next: Function) => void} The middleware.
 */
export function crawlerMiddleware(options = {}) {
  const { staticPagesPath = '../static-pages', debug = false } = options;

  return (req, res, next) => {
    const userAgent = req.headers['user-agent'];

    // Regular visitors never touch the file system.
    if (!isCrawler(userAgent)) {
      next();
      return;
    }

    if (debug) {
      console.log(`🕷️ 检测到爬虫: ${userAgent}`);
      console.log(`📍 请求路径: ${req.path}`);
    }

    // NOTE(review): `require` and `__dirname` are CommonJS globals. This file
    // uses `export`, so if it is loaded as a native ES module (rather than
    // transpiled) both are undefined and this branch throws — confirm the
    // build setup and switch to `import` + `import.meta.url` if needed.
    const path = require('path');
    const fs = require('fs');

    const staticDir = path.join(__dirname, staticPagesPath);
    const pageFile = `${toStaticPageName(req.path)}.html`;
    const filepath = path.join(staticDir, pageFile);

    // Forward slashes were flattened above, but a platform separator that is
    // still present (a backslash on Windows) could step outside staticDir.
    // A legitimate page name is always a single path component.
    const isSingleComponent = path.basename(pageFile) === pageFile;

    let html = null;
    if (isSingleComponent) {
      try {
        // One guarded read instead of existsSync + readFileSync: no window in
        // which the file can vanish between the check and the read.
        html = fs.readFileSync(filepath, 'utf-8');
      } catch (err) {
        const isMissing = ['ENOENT', 'EISDIR', 'ENOTDIR'].includes(err.code);
        if (!isMissing) {
          // Unexpected I/O failure (permissions, etc.): hand it to the error handler.
          next(err);
          return;
        }
      }
    }

    if (html === null) {
      if (debug) {
        console.log(`⚠️ 静态页面不存在: ${filepath}`);
      }
      // No pre-rendered page: serve the crawler whatever a normal user would get.
      next();
      return;
    }

    if (debug) {
      console.log(`📄 返回静态页面: ${filepath}`);
    }
    res.setHeader('Content-Type', 'text/html; charset=utf-8');
    res.send(html);
  };
}
/**
 * Generate an Nginx server configuration that serves pre-rendered pages to
 * crawlers and the SPA bundle to everyone else.
 *
 * Layout of the generated server block:
 *  - `$is_bot` is computed once at server level from the User-Agent.
 *  - `location = /` and `location /` send crawlers, via `rewrite ... last`, to an
 *    internal `/__seo_static__/` location rooted at `staticRoot`; `rewrite` is
 *    used because `try_files` is not permitted inside an `if` block.
 *  - The internal location serves `<first-path-segment>.html` and falls back to
 *    the SPA entry (named location) when that page does not exist.
 *  - `/api/` is a `^~` prefix location and assets are a regex location, so both
 *    are reached for crawlers and visitors alike.
 *
 * @param {object} [options]
 * @param {string} [options.serverName='www.tianyuancha.cn'] - `server_name`, also used in log file names.
 * @param {string} [options.spaRoot='/var/www/tyc-webview-v2/dist'] - Directory of the SPA build.
 * @param {string} [options.staticRoot='/var/www/tyc-webview-v2/static-pages'] - Directory of pre-rendered pages.
 * @param {boolean} [options.sslEnabled=false] - Listen on 443 with TLS and add an HTTP→HTTPS redirect server.
 * @param {string} [options.sslCertPath=''] - Certificate path; must be set by the caller when `sslEnabled` is true.
 * @param {string} [options.sslKeyPath=''] - Private key path; must be set by the caller when `sslEnabled` is true.
 * @returns {string} The configuration text (includes a generation timestamp).
 */
export function generateNginxConfig(options = {}) {
  const {
    serverName = 'www.tianyuancha.cn',
    spaRoot = '/var/www/tyc-webview-v2/dist',
    staticRoot = '/var/www/tyc-webview-v2/static-pages',
    sslEnabled = false,
    sslCertPath = '',
    sslKeyPath = '',
  } = options;

  const listenPort = sslEnabled ? '443 ssl' : '80';

  const sslConfig = sslEnabled ? `
    ssl_certificate ${sslCertPath};
    ssl_certificate_key ${sslKeyPath};
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;
` : '';

  const httpRedirect = sslEnabled ? `
# HTTP 重定向到 HTTPS
server {
    listen 80;
    server_name ${serverName};
    return 301 https://$server_name$request_uri;
}
` : '';

  // Nginx variables such as $uri or $1 are literal text here: a template
  // literal only interpolates `${...}`.
  return `# Nginx 配置文件 - 天远查 SPA + SEO 静态页面
# 自动生成时间: ${new Date().toLocaleString('zh-CN')}
${httpRedirect}
server {
    listen ${listenPort};
    server_name ${serverName};
    charset utf-8;
${sslConfig}
    # 日志配置
    access_log /var/log/nginx/${serverName}-access.log;
    error_log /var/log/nginx/${serverName}-error.log;

    # 爬虫检测
    set $is_bot 0;
    # Google
    if ($http_user_agent ~* (googlebot|googlebot-image|googlebot-news|mediapartners-google|adsbot-google)) {
        set $is_bot 1;
    }
    # 百度
    if ($http_user_agent ~* (baiduspider|baiduspider-mobile)) {
        set $is_bot 1;
    }
    # 必应
    if ($http_user_agent ~* (bingbot|msnbot)) {
        set $is_bot 1;
    }
    # 360
    if ($http_user_agent ~* "360spider") {
        set $is_bot 1;
    }
    # 搜狗
    if ($http_user_agent ~* "(sogou spider|sogou-orion)") {
        set $is_bot 1;
    }
    # 其他爬虫
    if ($http_user_agent ~* "(spider|crawl|bot|slurp|yandex|duckduckbot|bytespider|yisouspider)") {
        set $is_bot 1;
    }

    # 根路径处理
    location = / {
        # if 块内只做 rewrite, try_files 不允许出现在 if 中
        if ($is_bot = 1) {
            rewrite ^ /__seo_static__/index.html last;
        }
        root ${spaRoot};
        try_files $uri $uri/ /index.html;
    }

    # API 代理 (^~ 保证优先于正则 location)
    location ^~ /api/ {
        proxy_pass http://localhost:8080;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection 'upgrade';
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_cache_bypass $http_upgrade;
    }

    # 静态资源缓存
    location ~* \\.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
        root ${spaRoot};
        expires 1y;
        add_header Cache-Control "public, immutable";
    }

    # 爬虫静态页面 (仅内部跳转可达)
    location ^~ /__seo_static__/ {
        internal;
        root ${staticRoot};
        rewrite ^/__seo_static__(/.*)$ $1 break;
        try_files $uri @spa_fallback;
    }

    # 静态页面不存在时回退到 SPA 入口
    location @spa_fallback {
        root ${spaRoot};
        try_files /index.html =404;
        add_header Cache-Control "no-cache";
    }

    # 页面路由: 爬虫 -> 静态页面, 普通用户 -> SPA 路由回退
    location / {
        if ($is_bot = 1) {
            rewrite ^/([^/]+) /__seo_static__/$1.html last;
        }
        root ${spaRoot};
        try_files $uri $uri/ /index.html;
        add_header Cache-Control "no-cache";
    }

    # Gzip 压缩
    gzip on;
    gzip_vary on;
    gzip_min_length 1024;
    gzip_types text/plain text/css text/xml text/javascript application/x-javascript application/xml+rss application/javascript application/json;

    # 安全头
    add_header X-Frame-Options "SAMEORIGIN";
    add_header X-Content-Type-Options "nosniff";
    add_header X-XSS-Protection "1; mode=block";
}
`;
}
/**
 * Generate an Apache `.htaccess` that serves pre-rendered pages to crawlers
 * and the SPA entry point to everyone else.
 *
 * A `RewriteCond` chain only guards the single `RewriteRule` that follows it,
 * so crawler detection is done once and stored in the `IS_BOT` environment
 * variable; every static-page rule then checks that variable explicitly.
 * Each static-page rule also requires the target file to exist, so crawlers
 * fall through to the SPA rule when no pre-rendered page is available, and
 * requests already under `staticRoot` are not rewritten again.
 *
 * @param {object} [options]
 * @param {string} [options.staticRoot='/static-pages'] - URL path of the
 *   pre-rendered pages. The existence check assumes this path is served from
 *   `%{DOCUMENT_ROOT}` — confirm if the directory is mapped with an Alias.
 * @param {string} [options.spaRoot='/'] - URL path prefix (with trailing slash)
 *   of the SPA's `index.html`.
 * @returns {string} The `.htaccess` text (includes a generation timestamp).
 */
export function generateHtaccessConfig(options = {}) {
  const { staticRoot = '/static-pages', spaRoot = '/' } = options;

  // `%{...}` and `$1` are literal mod_rewrite syntax; only `${...}` is interpolated.
  return `# Apache .htaccess 配置 - 天远查 SPA + SEO 静态页面
# 自动生成时间: ${new Date().toLocaleString('zh-CN')}
# 启用重写引擎
RewriteEngine On
# 爬虫检测 (结果写入环境变量 IS_BOT)
RewriteCond %{HTTP_USER_AGENT} (googlebot|googlebot-image|googlebot-news|mediapartners-google|adsbot-google) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} (baiduspider|baiduspider-mobile) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} (bingbot|msnbot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} "360spider" [NC,OR]
RewriteCond %{HTTP_USER_AGENT} "(sogou spider|sogou-orion)" [NC,OR]
RewriteCond %{HTTP_USER_AGENT} (spider|crawl|bot|slurp|yandex|duckduckbot|bytespider|yisouspider) [NC]
RewriteRule ^ - [E=IS_BOT:1]
# 爬虫访问静态页面 (仅当静态文件存在时)
RewriteCond %{ENV:IS_BOT} =1
RewriteCond %{DOCUMENT_ROOT}${staticRoot}/index.html -f
RewriteRule ^$ ${staticRoot}/index.html [L]
RewriteCond %{ENV:IS_BOT} =1
RewriteCond %{REQUEST_URI} !^${staticRoot}/
RewriteCond %{DOCUMENT_ROOT}${staticRoot}/$1.html -f
RewriteRule ^([^/]+)(/.*)?$ ${staticRoot}/$1.html [L]
# 正常用户访问 SPA
RewriteCond %{REQUEST_FILENAME} !-f
RewriteCond %{REQUEST_FILENAME} !-d
RewriteRule . ${spaRoot}index.html [L]
# Gzip 压缩
<IfModule mod_deflate.c>
AddOutputFilterByType DEFLATE text/html text/plain text/xml text/css text/javascript application/javascript application/json
</IfModule>
# 缓存控制
<IfModule mod_expires.c>
ExpiresActive On
ExpiresByType image/jpg "access plus 1 year"
ExpiresByType image/jpeg "access plus 1 year"
ExpiresByType image/gif "access plus 1 year"
ExpiresByType image/png "access plus 1 year"
ExpiresByType text/css "access plus 1 month"
ExpiresByType application/javascript "access plus 1 month"
</IfModule>
# 安全头
<IfModule mod_headers.c>
Header set X-Frame-Options "SAMEORIGIN"
Header set X-Content-Type-Options "nosniff"
Header set X-XSS-Protection "1; mode=block"
</IfModule>
`;
}
// Export the crawler signature list alongside the functions above
export { SEARCH_ENGINE_BOTS };