325 lines
8.2 KiB
JavaScript
325 lines
8.2 KiB
JavaScript
|
|
/**
|
|||
|
|
* 爬虫检测中间件
|
|||
|
|
* 用于识别和区分搜索引擎爬虫和正常用户访问
|
|||
|
|
*/
|
|||
|
|
|
|||
|
|
/**
 * Lower-case User-Agent fragments used to recognise crawlers.
 *
 * isCrawler() tests each entry as a plain substring of the lower-cased
 * User-Agent, so the generic fragments in the last group also match agents
 * that are not listed by name.
 */
const SEARCH_ENGINE_BOTS = [
  // Google
  'googlebot', 'googlebot-image', 'googlebot-news', 'mediapartners-google', 'adsbot-google',
  // Baidu
  'baiduspider', 'baiduspider-mobile',
  // Bing
  'bingbot', 'msnbot',
  // 360 Search
  '360spider',
  // Sogou
  'sogou spider', 'sogou-orion',
  // Yahoo
  'slurp', 'yahoo',
  // Yandex
  'yandex',
  // DuckDuckGo
  'duckduckbot',
  // Toutiao (ByteDance)
  'bytespider',
  // Shenma
  'yisouspider',
  // Generic fragments and command-line HTTP clients
  'spider', 'crawl', 'bot', 'curl', 'wget',
];
|
|||
|
|
|
|||
|
|
/**
 * Report whether a User-Agent value looks like a crawler.
 *
 * The value is lower-cased and checked for any entry of SEARCH_ENGINE_BOTS
 * as a substring.
 *
 * @param {unknown} userAgent - Raw User-Agent header value.
 * @returns {boolean} true when a known fragment is found; false for a
 *   missing, empty or non-string value.
 */
export function isCrawler(userAgent) {
  // Truthy non-string values (an array, a number, ...) have no toLowerCase();
  // report "not a crawler" for them instead of throwing a TypeError.
  if (typeof userAgent !== 'string' || userAgent === '') {
    return false;
  }

  const ua = userAgent.toLowerCase();

  return SEARCH_ENGINE_BOTS.some((bot) => ua.includes(bot));
}
|
|||
|
|
|
|||
|
|
// Static imports are hoisted by the module loader, so declaring them next to
// their only consumer is valid ESM. This file uses `export`, which means
// `require` and `__dirname` do not exist when Node loads it as a module.
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

/**
 * Build an Express-style middleware that answers crawler requests with
 * pre-rendered HTML files.
 *
 * For a request whose User-Agent passes isCrawler(), the request path is
 * flattened into a file name (leading/trailing slashes removed, remaining
 * slashes and colons turned into '-', empty path mapped to 'index') and
 * `<name>.html` is read from the static directory. When the file is missing,
 * or the request does not come from a crawler, the request is handed to
 * `next()` untouched.
 *
 * @param {object} [options]
 * @param {string} [options.staticPagesPath='../static-pages'] - Directory of
 *   pre-rendered pages, joined onto the directory containing this module.
 * @param {boolean} [options.debug=false] - Log detection and lookup results
 *   with console.log.
 * @param {string} [options.fallbackPath] - Accepted for backward
 *   compatibility; the middleware does not use it.
 * @returns {(req: object, res: object, next: Function) => void} Middleware
 *   that reads `req.headers['user-agent']` and `req.path`, and responds via
 *   `res.setHeader` / `res.send`.
 */
export function crawlerMiddleware(options = {}) {
  const { staticPagesPath = '../static-pages', debug = false } = options;

  // Resolved once per middleware instance instead of on every request.
  const moduleDir = path.dirname(fileURLToPath(import.meta.url));
  const staticDir = path.join(moduleDir, staticPagesPath);

  // Error codes that mean "there is no such page" rather than a real failure.
  const notFoundCodes = new Set(['ENOENT', 'ENOTDIR', 'ENAMETOOLONG']);

  return (req, res, next) => {
    const userAgent = req.headers['user-agent'];

    if (!isCrawler(userAgent)) {
      next();
      return;
    }

    if (debug) {
      console.log(`🕷️ 检测到爬虫: ${userAgent}`);
      console.log(`📍 请求路径: ${req.path}`);
    }

    // '/a/b/' -> 'a-b'; stripping the trailing slash keeps '/about/' and
    // '/about' on the same file instead of looking for 'about-.html'.
    const filename =
      req.path
        .replace(/^\/+|\/+$/g, '')
        .replace(/\/+/g, '-')
        .replace(/:/g, '-') || 'index';

    const filepath = path.join(staticDir, `${filename}.html`);

    // Refuse anything that resolves outside the static directory. Forward
    // slashes are already flattened, but path.join treats '\' as a separator
    // on Windows.
    const relative = path.relative(staticDir, filepath);
    if (path.isAbsolute(relative) || relative.split(path.sep)[0] === '..') {
      next();
      return;
    }

    let html;
    try {
      // Read directly instead of existsSync + readFileSync so the file cannot
      // disappear between the check and the read.
      html = fs.readFileSync(filepath, 'utf-8');
    } catch (err) {
      if (!notFoundCodes.has(err.code)) {
        next(err);
        return;
      }
      if (debug) {
        console.log(`⚠️ 静态页面不存在: ${filepath}`);
      }
      next();
      return;
    }

    if (debug) {
      console.log(`📄 返回静态页面: ${filepath}`);
    }

    res.setHeader('Content-Type', 'text/html; charset=utf-8');
    res.send(html);
  };
}
|
|||
|
|
|
|||
|
|
/**
 * Generate an nginx configuration that serves the SPA to regular visitors and
 * pre-rendered static pages to crawlers.
 *
 * Crawlers are flagged at server level in `$is_bot`. Page locations hand
 * flagged requests to the named location `@seo_static` through
 * `error_page 418`, because nginx does not accept `try_files` inside an `if`
 * block. `@seo_static` tries `/<first path segment>.html`, then `/index.html`
 * under `staticRoot`, and ends with `=404` so the fallback is served in place
 * rather than through another internal redirect.
 *
 * Location order matters in the output: `/api/` uses `^~` and the asset regex
 * is emitted before the page regex, otherwise the page regex (which matches
 * every non-root URI) would capture API and asset requests.
 *
 * @param {object} [options]
 * @param {string} [options.serverName='www.tianyuancha.cn'] - Value for
 *   `server_name`; also used in the log file names.
 * @param {string} [options.spaRoot='/var/www/tyc-webview-v2/dist'] - `root`
 *   for regular visitors and static assets.
 * @param {string} [options.staticRoot='/var/www/tyc-webview-v2/static-pages']
 *   - `root` for crawler responses.
 * @param {boolean} [options.sslEnabled=false] - Emit a `443 ssl` server plus
 *   an HTTP-to-HTTPS redirect server.
 * @param {string} [options.sslCertPath=''] - Written to `ssl_certificate`
 *   when SSL is enabled.
 * @param {string} [options.sslKeyPath=''] - Written to `ssl_certificate_key`
 *   when SSL is enabled.
 * @returns {string} The configuration text. It embeds the current local time,
 *   so two calls do not necessarily return identical strings.
 */
export function generateNginxConfig(options = {}) {
  const {
    serverName = 'www.tianyuancha.cn',
    spaRoot = '/var/www/tyc-webview-v2/dist',
    staticRoot = '/var/www/tyc-webview-v2/static-pages',
    sslEnabled = false,
    sslCertPath = '',
    sslKeyPath = '',
  } = options;

  const listenPort = sslEnabled ? '443 ssl' : '80';

  const sslConfig = sslEnabled
    ? `
    ssl_certificate ${sslCertPath};
    ssl_certificate_key ${sslKeyPath};
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;
`
    : '';

  const httpRedirect = sslEnabled
    ? `
# HTTP 重定向到 HTTPS
server {
    listen 80;
    server_name ${serverName};
    return 301 https://$server_name$request_uri;
}
`
    : '';

  // Only `${...}` interpolates below; bare `$name` tokens are nginx variables.
  return `# Nginx 配置文件 - 天远查 SPA + SEO 静态页面
# 自动生成时间: ${new Date().toLocaleString('zh-CN')}
${httpRedirect}
server {
    listen ${listenPort};
    server_name ${serverName};
    charset utf-8;
${sslConfig}
    # 日志配置
    access_log /var/log/nginx/${serverName}-access.log;
    error_log /var/log/nginx/${serverName}-error.log;

    # 爬虫检测
    set $is_bot 0;

    # Google
    if ($http_user_agent ~* (googlebot|googlebot-image|googlebot-news|mediapartners-google|adsbot-google)) {
        set $is_bot 1;
    }

    # 百度
    if ($http_user_agent ~* (baiduspider|baiduspider-mobile)) {
        set $is_bot 1;
    }

    # 必应
    if ($http_user_agent ~* (bingbot|msnbot)) {
        set $is_bot 1;
    }

    # 360
    if ($http_user_agent ~* "360spider") {
        set $is_bot 1;
    }

    # 搜狗
    if ($http_user_agent ~* "(sogou spider|sogou-orion)") {
        set $is_bot 1;
    }

    # 其他爬虫
    if ($http_user_agent ~* "(spider|crawl|bot|slurp|yandex|duckduckbot|bytespider|yisouspider)") {
        set $is_bot 1;
    }

    # API 代理（^~ 使其优先于下方的正则 location）
    location ^~ /api/ {
        proxy_pass http://localhost:8080;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection 'upgrade';
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_cache_bypass $http_upgrade;
    }

    # 静态资源缓存（正则 location 按出现顺序匹配，需位于页面正则之前）
    location ~* \\.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
        root ${spaRoot};
        expires 1y;
        add_header Cache-Control "public, immutable";
    }

    # 根路径处理
    location = / {
        set $static_filename index;
        error_page 418 = @seo_static;
        if ($is_bot = 1) {
            return 418;
        }
        root ${spaRoot};
        try_files $uri $uri/ /index.html;
    }

    # 静态页面处理
    location ~ ^/([^/]+)(/.*)?$ {
        # 将路径转换为文件名
        set $static_filename $1;
        error_page 418 = @seo_static;
        if ($is_bot = 1) {
            return 418;
        }
        root ${spaRoot};
        try_files $uri $uri/ /index.html;
    }

    # 爬虫静态页面（try_files 不能写在 if 内，因此使用命名 location）
    location @seo_static {
        root ${staticRoot};
        try_files /$static_filename.html /index.html =404;
    }

    # SPA 路由回退
    location / {
        root ${spaRoot};
        try_files $uri $uri/ /index.html;
        add_header Cache-Control "no-cache";
    }

    # Gzip 压缩
    gzip on;
    gzip_vary on;
    gzip_min_length 1024;
    gzip_types text/plain text/css text/xml text/javascript application/x-javascript application/xml+rss application/javascript application/json;

    # 安全头
    add_header X-Frame-Options "SAMEORIGIN";
    add_header X-Content-Type-Options "nosniff";
    add_header X-XSS-Protection "1; mode=block";
}
`;
}
|
|||
|
|
|
|||
|
|
/**
 * Generate an Apache `.htaccess` that serves pre-rendered pages to crawlers
 * and the SPA entry point to everyone else.
 *
 * A `RewriteCond` chain only applies to the single `RewriteRule` that follows
 * it, so the User-Agent checks are evaluated once and recorded in the
 * `IS_BOT` environment variable; every static-page rule then tests that
 * variable. Each static-page rule also requires that the request is not
 * already an existing file and that the target page exists, so assets and
 * already-rewritten URLs are left alone and unknown pages fall through to the
 * SPA rule.
 *
 * @param {object} [options]
 * @param {string} [options.staticRoot='/static-pages'] - URL prefix of the
 *   pre-rendered pages. The existence check appends it to
 *   `%{DOCUMENT_ROOT}`, so it is assumed to live under the document root.
 * @param {string} [options.spaRoot='/'] - URL prefix of the SPA; a missing
 *   trailing slash is added before `index.html` is appended.
 * @returns {string} The `.htaccess` text. It embeds the current local time,
 *   so two calls do not necessarily return identical strings.
 */
export function generateHtaccessConfig(options = {}) {
  const { staticRoot = '/static-pages', spaRoot = '/' } = options;

  // '/app' would otherwise produce '/appindex.html'. An empty prefix is kept
  // as-is so the substitution stays relative, as before.
  const spaBase = spaRoot === '' || spaRoot.endsWith('/') ? spaRoot : `${spaRoot}/`;

  // Only `${...}` interpolates below; `%{...}` and `$1` are mod_rewrite syntax.
  return `# Apache .htaccess 配置 - 天远查 SPA + SEO 静态页面
# 自动生成时间: ${new Date().toLocaleString('zh-CN')}

# 启用重写引擎
RewriteEngine On

# 爬虫检测（RewriteCond 只作用于紧随其后的一条 RewriteRule，因此先写入环境变量）
RewriteCond %{HTTP_USER_AGENT} (googlebot|googlebot-image|googlebot-news|mediapartners-google|adsbot-google) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} (baiduspider|baiduspider-mobile) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} (bingbot|msnbot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} "360spider" [NC,OR]
RewriteCond %{HTTP_USER_AGENT} "(sogou spider|sogou-orion)" [NC,OR]
RewriteCond %{HTTP_USER_AGENT} (spider|crawl|bot|slurp|yandex|duckduckbot|bytespider|yisouspider) [NC]
RewriteRule ^ - [E=IS_BOT:1]

# 爬虫访问静态页面
RewriteCond %{ENV:IS_BOT} =1
RewriteCond %{DOCUMENT_ROOT}${staticRoot}/index.html -f
RewriteRule ^$ ${staticRoot}/index.html [L]

RewriteCond %{ENV:IS_BOT} =1
RewriteCond %{REQUEST_FILENAME} !-f
RewriteCond %{DOCUMENT_ROOT}${staticRoot}/$1.html -f
RewriteRule ^([^/]+)/?$ ${staticRoot}/$1.html [L]

RewriteCond %{ENV:IS_BOT} =1
RewriteCond %{REQUEST_FILENAME} !-f
RewriteCond %{DOCUMENT_ROOT}${staticRoot}/$1.html -f
RewriteRule ^([^/]+)/(.+)$ ${staticRoot}/$1.html [L]

# 正常用户访问 SPA
RewriteCond %{REQUEST_FILENAME} !-f
RewriteCond %{REQUEST_FILENAME} !-d
RewriteRule . ${spaBase}index.html [L]

# Gzip 压缩
<IfModule mod_deflate.c>
  AddOutputFilterByType DEFLATE text/html text/plain text/xml text/css text/javascript application/javascript application/json
</IfModule>

# 缓存控制
<IfModule mod_expires.c>
  ExpiresActive On
  ExpiresByType image/jpg "access plus 1 year"
  ExpiresByType image/jpeg "access plus 1 year"
  ExpiresByType image/gif "access plus 1 year"
  ExpiresByType image/png "access plus 1 year"
  ExpiresByType text/css "access plus 1 month"
  ExpiresByType application/javascript "access plus 1 month"
</IfModule>

# 安全头
<IfModule mod_headers.c>
  Header set X-Frame-Options "SAMEORIGIN"
  Header set X-Content-Type-Options "nosniff"
  Header set X-XSS-Protection "1; mode=block"
</IfModule>
`;
}
|
|||
|
|
|
|||
|
|
// Export the crawler User-Agent fragment list for use by other modules.
export { SEARCH_ENGINE_BOTS };
|