up seo
This commit is contained in:
324
scripts/seo-static-generator/crawlerMiddleware.js
Normal file
324
scripts/seo-static-generator/crawlerMiddleware.js
Normal file
@@ -0,0 +1,324 @@
|
||||
/**
|
||||
* 爬虫检测中间件
|
||||
* 用于识别和区分搜索引擎爬虫和正常用户访问
|
||||
*/

import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

/**
 * Lowercase User-Agent substrings used to recognise search-engine crawlers
 * and other automated clients.
 *
 * `isCrawler` does a plain substring test against each entry, so the generic
 * tokens at the end of the list ('spider', 'crawl', 'bot') already cover many
 * of the vendor-specific names; the specific names are kept for readability.
 */
const SEARCH_ENGINE_BOTS = [
  // Google
  'googlebot',
  'googlebot-image',
  'googlebot-news',
  'mediapartners-google',
  'adsbot-google',

  // Baidu
  'baiduspider',
  'baiduspider-mobile',

  // Bing
  'bingbot',
  'msnbot',

  // 360 Search
  '360spider',

  // Sogou
  'sogou spider',
  'sogou-orion',

  // Yahoo
  'slurp',
  'yahoo',

  // Yandex
  'yandex',

  // DuckDuckGo
  'duckduckbot',

  // Toutiao (ByteDance)
  'bytespider',

  // Shenma
  'yisouspider',

  // Generic tokens and command-line clients
  'spider',
  'crawl',
  'bot',
  'curl',
  'wget',
];
|
||||
|
||||
/**
 * Decide whether a User-Agent value belongs to a known crawler.
 *
 * The value is lowercased and tested for containing any entry of
 * SEARCH_ENGINE_BOTS as a substring.
 *
 * @param {string} [userAgent] - Raw `User-Agent` header value.
 * @returns {boolean} `true` when a crawler token is found; `false` for a
 *   missing, empty or non-string value.
 */
export function isCrawler(userAgent) {
  // Anything that is not a non-empty string (undefined, null, an array-valued
  // header, a number, ...) cannot be lowercased and is treated as "not a crawler".
  if (typeof userAgent !== 'string' || userAgent === '') {
    return false;
  }

  const ua = userAgent.toLowerCase();

  return SEARCH_ENGINE_BOTS.some((bot) => ua.includes(bot));
}
|
||||
|
||||
/**
 * Build an Express-style middleware that answers crawler requests with a
 * pre-rendered HTML file and lets every other request continue down the chain.
 *
 * The request path is flattened into a file name: the leading slash is
 * removed, every run of slashes and every colon becomes `-`, and an empty
 * result maps to `index`. The file `<name>.html` is then looked up inside the
 * static pages directory.
 *
 * @param {object} [options]
 * @param {string} [options.staticPagesPath='../static-pages'] - Directory with
 *   the pre-rendered pages, joined onto the directory of this module.
 * @param {string} [options.fallbackPath='/index.html'] - Accepted for backward
 *   compatibility; the middleware does not read it.
 * @param {boolean} [options.debug=false] - Log detection and lookup results.
 * @returns {(req: object, res: object, next: Function) => void} Middleware
 *   that calls `res.send` with the page, or `next()` when the client is not a
 *   crawler or no page exists, or `next(err)` on an unexpected read error.
 */
export function crawlerMiddleware(options = {}) {
  const { staticPagesPath = '../static-pages', debug = false } = options;

  // This file is an ES module, where `require` and `__dirname` do not exist;
  // derive the module directory from import.meta.url instead.
  const moduleDir = path.dirname(fileURLToPath(import.meta.url));

  // Resolved once: it does not depend on the request.
  const staticDir = path.join(moduleDir, staticPagesPath);

  // Read failures that simply mean "there is no page to serve for this path".
  const notServableCodes = new Set([
    'ENOENT',
    'ENOTDIR',
    'EISDIR',
    'ENAMETOOLONG',
    'ERR_INVALID_ARG_VALUE',
  ]);

  return (req, res, next) => {
    const userAgent = req.headers['user-agent'];

    if (!isCrawler(userAgent)) {
      next();
      return;
    }

    if (debug) {
      console.log(`🕷️ 检测到爬虫: ${userAgent}`);
      console.log(`📍 请求路径: ${req.path}`);
    }

    // Flatten the request path into a single file name.
    const filename =
      req.path
        .replace(/^\//, '')
        .replace(/\/+/g, '-')
        .replace(/:/g, '-') || 'index';

    const filepath = path.join(staticDir, `${filename}.html`);

    // The path comes from the client: never read outside the static directory
    // (relevant where `\` is also a path separator).
    const relative = path.relative(staticDir, filepath);
    const escapesStaticDir =
      relative === '..' ||
      relative.startsWith(`..${path.sep}`) ||
      path.isAbsolute(relative);

    if (escapesStaticDir) {
      next();
      return;
    }

    let html;
    try {
      // Single read instead of existsSync + readFileSync: no window in which
      // the file can disappear between the check and the read.
      html = fs.readFileSync(filepath, 'utf-8');
    } catch (err) {
      if (notServableCodes.has(err.code)) {
        if (debug) {
          console.log(`⚠️ 静态页面不存在: ${filepath}`);
        }
        next();
        return;
      }
      next(err);
      return;
    }

    if (debug) {
      console.log(`📄 返回静态页面: ${filepath}`);
    }

    res.setHeader('Content-Type', 'text/html; charset=utf-8');
    res.send(html);
  };
}
|
||||
|
||||
/**
 * Generate an nginx configuration that serves the SPA to regular visitors and
 * pre-rendered static pages to crawlers.
 *
 * The output consists of top-level `server` blocks, so the file has to be
 * included from nginx's `http` context.
 *
 * Request routing in the generated server:
 * - `/api/` is proxied; the `^~` modifier keeps regex locations from taking it.
 * - Asset extensions (js, css, images, fonts) are served from `spaRoot` with
 *   long-lived caching.
 * - Crawlers are rewritten to an internal location that serves
 *   `<first path segment>.html` from `staticRoot`, then `index.html` from
 *   `staticRoot`, then the SPA shell.
 * - Everything else gets the usual SPA `try_files` fallback.
 *
 * @param {object} [options]
 * @param {string} [options.serverName='www.tianyuancha.cn'] - `server_name`, also used in log file names.
 * @param {string} [options.spaRoot='/var/www/tyc-webview-v2/dist'] - Document root of the SPA build.
 * @param {string} [options.staticRoot='/var/www/tyc-webview-v2/static-pages'] - Directory with the pre-rendered pages.
 * @param {boolean} [options.sslEnabled=false] - Listen on 443 with TLS and add an HTTP-to-HTTPS redirect server.
 * @param {string} [options.sslCertPath=''] - Certificate path, emitted only when `sslEnabled` is true.
 * @param {string} [options.sslKeyPath=''] - Private key path, emitted only when `sslEnabled` is true.
 * @returns {string} The nginx configuration text.
 */
export function generateNginxConfig(options = {}) {
  const {
    serverName = 'www.tianyuancha.cn',
    spaRoot = '/var/www/tyc-webview-v2/dist',
    staticRoot = '/var/www/tyc-webview-v2/static-pages',
    sslEnabled = false,
    sslCertPath = '',
    sslKeyPath = '',
  } = options;

  const listenPort = sslEnabled ? '443 ssl' : '80';

  const sslConfig = sslEnabled
    ? `
    ssl_certificate ${sslCertPath};
    ssl_certificate_key ${sslKeyPath};
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers HIGH:!aNULL:!MD5;
`
    : '';

  const httpRedirect = sslEnabled
    ? `
# HTTP 重定向到 HTTPS
server {
    listen 80;
    server_name ${serverName};
    return 301 https://$server_name$request_uri;
}
`
    : '';

  // nginx inherits add_header from the enclosing level only when the current
  // level declares none, so every location that sets Cache-Control has to
  // repeat the security headers.
  const securityHeaders = (indent) =>
    [
      'add_header X-Frame-Options "SAMEORIGIN";',
      'add_header X-Content-Type-Options "nosniff";',
      'add_header X-XSS-Protection "1; mode=block";',
    ]
      .map((line) => `${indent}${line}`)
      .join('\n');

  return `# Nginx 配置文件 - 天远查 SPA + SEO 静态页面
# 自动生成时间: ${new Date().toLocaleString('zh-CN')}
${httpRedirect}
server {
    listen ${listenPort};
    server_name ${serverName};
    charset utf-8;
${sslConfig}
    # 日志配置
    access_log /var/log/nginx/${serverName}-access.log;
    error_log /var/log/nginx/${serverName}-error.log;

    # 爬虫检测
    set $is_bot 0;

    # Google
    if ($http_user_agent ~* "(googlebot|googlebot-image|googlebot-news|mediapartners-google|adsbot-google)") {
        set $is_bot 1;
    }

    # 百度
    if ($http_user_agent ~* "(baiduspider|baiduspider-mobile)") {
        set $is_bot 1;
    }

    # 必应
    if ($http_user_agent ~* "(bingbot|msnbot)") {
        set $is_bot 1;
    }

    # 360
    if ($http_user_agent ~* "360spider") {
        set $is_bot 1;
    }

    # 搜狗
    if ($http_user_agent ~* "(sogou spider|sogou-orion)") {
        set $is_bot 1;
    }

    # 其他爬虫
    if ($http_user_agent ~* "(spider|crawl|bot|slurp|yahoo|yandex|duckduckbot|bytespider|yisouspider|curl|wget)") {
        set $is_bot 1;
    }

    # API 代理
    location ^~ /api/ {
        proxy_pass http://localhost:8080;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection 'upgrade';
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_cache_bypass $http_upgrade;
    }

    # 静态资源缓存
    location ~* \\.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
        root ${spaRoot};
        expires 1y;
        add_header Cache-Control "public, immutable";
${securityHeaders('        ')}
    }

    # 静态页面处理
    location ~ ^/__seo_static__/(.+)$ {
        internal;
        root ${staticRoot};
        try_files /$1 /index.html @spa_fallback;
    }

    location @spa_fallback {
        root ${spaRoot};
        try_files /index.html =404;
        add_header Cache-Control "no-cache";
${securityHeaders('        ')}
    }

    # SPA 路由回退
    location / {
        if ($is_bot = 1) {
            rewrite ^/$ /__seo_static__/index.html last;
            rewrite ^/([^/]+) /__seo_static__/$1.html last;
        }
        root ${spaRoot};
        try_files $uri $uri/ /index.html;
        add_header Cache-Control "no-cache";
${securityHeaders('        ')}
    }

    # Gzip 压缩
    gzip on;
    gzip_vary on;
    gzip_min_length 1024;
    gzip_types text/plain text/css text/xml text/javascript application/x-javascript application/xml+rss application/javascript application/json;

    # 安全头
${securityHeaders('    ')}
}
`;
}
|
||||
|
||||
/**
 * Generate an Apache `.htaccess` that serves the SPA to regular visitors and
 * pre-rendered static pages to crawlers.
 *
 * A `RewriteCond` chain applies only to the single `RewriteRule` that follows
 * it, so crawler detection is done once and recorded in the `IS_BOT`
 * environment variable; each static-page rule then tests that variable.
 *
 * @param {object} [options]
 * @param {string} [options.staticRoot='/static-pages'] - URL path prefix of the pre-rendered pages.
 * @param {string} [options.spaRoot='/'] - URL path prefix (with trailing slash) of the SPA's `index.html`.
 * @returns {string} The `.htaccess` text.
 */
export function generateHtaccessConfig(options = {}) {
  const { staticRoot = '/static-pages', spaRoot = '/' } = options;

  // staticRoot is embedded in a RewriteCond regex below; escape metacharacters
  // so a value such as '/seo.pages' is matched literally.
  const staticRootPattern = staticRoot.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  return `# Apache .htaccess 配置 - 天远查 SPA + SEO 静态页面
# 自动生成时间: ${new Date().toLocaleString('zh-CN')}

# 启用重写引擎
RewriteEngine On

# 爬虫检测
RewriteCond %{HTTP_USER_AGENT} "(googlebot|googlebot-image|googlebot-news|mediapartners-google|adsbot-google)" [NC,OR]
RewriteCond %{HTTP_USER_AGENT} "(baiduspider|baiduspider-mobile)" [NC,OR]
RewriteCond %{HTTP_USER_AGENT} "(bingbot|msnbot)" [NC,OR]
RewriteCond %{HTTP_USER_AGENT} "360spider" [NC,OR]
RewriteCond %{HTTP_USER_AGENT} "(sogou spider|sogou-orion)" [NC,OR]
RewriteCond %{HTTP_USER_AGENT} "(spider|crawl|bot|slurp|yahoo|yandex|duckduckbot|bytespider|yisouspider|curl|wget)" [NC]
RewriteRule ^ - [E=IS_BOT:1]

# 爬虫访问静态页面
RewriteCond %{ENV:IS_BOT} =1
RewriteRule ^$ ${staticRoot}/index.html [L]

RewriteCond %{ENV:IS_BOT} =1
RewriteCond %{REQUEST_URI} !^${staticRootPattern}/
RewriteCond %{REQUEST_FILENAME} !-f
RewriteRule ^([^/]+)(/.*)?$ ${staticRoot}/$1.html [L]

# 正常用户访问 SPA
RewriteCond %{REQUEST_FILENAME} !-f
RewriteCond %{REQUEST_FILENAME} !-d
RewriteRule . ${spaRoot}index.html [L]

# Gzip 压缩
<IfModule mod_deflate.c>
AddOutputFilterByType DEFLATE text/html text/plain text/xml text/css text/javascript application/javascript application/json
</IfModule>

# 缓存控制
<IfModule mod_expires.c>
ExpiresActive On
ExpiresByType image/jpg "access plus 1 year"
ExpiresByType image/jpeg "access plus 1 year"
ExpiresByType image/gif "access plus 1 year"
ExpiresByType image/png "access plus 1 year"
ExpiresByType text/css "access plus 1 month"
ExpiresByType application/javascript "access plus 1 month"
</IfModule>

# 安全头
<IfModule mod_headers.c>
Header set X-Frame-Options "SAMEORIGIN"
Header set X-Content-Type-Options "nosniff"
Header set X-XSS-Protection "1; mode=block"
</IfModule>
`;
}
|
||||
|
||||
// 导出工具函数
|
||||
// Named export of the crawler User-Agent pattern list declared near the top of this file.
export { SEARCH_ENGINE_BOTS };
|
||||
Reference in New Issue
Block a user