283 lines
		
	
	
		
			6.9 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			283 lines
		
	
	
		
			6.9 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package health
 | |
| 
 | |
| import (
 | |
| 	"context"
 | |
| 	"fmt"
 | |
| 	"sync"
 | |
| 	"time"
 | |
| 
 | |
| 	"tyapi-server/internal/shared/interfaces"
 | |
| 
 | |
| 	"go.uber.org/zap"
 | |
| )
 | |
| 
 | |
| // HealthChecker 健康检查器实现
 | |
| type HealthChecker struct {
 | |
| 	services map[string]interfaces.Service
 | |
| 	cache    map[string]*interfaces.HealthStatus
 | |
| 	cacheTTL time.Duration
 | |
| 	mutex    sync.RWMutex
 | |
| 	logger   *zap.Logger
 | |
| }
 | |
| 
 | |
| // NewHealthChecker 创建健康检查器
 | |
| func NewHealthChecker(logger *zap.Logger) *HealthChecker {
 | |
| 	return &HealthChecker{
 | |
| 		services: make(map[string]interfaces.Service),
 | |
| 		cache:    make(map[string]*interfaces.HealthStatus),
 | |
| 		cacheTTL: 30 * time.Second, // 缓存30秒
 | |
| 		logger:   logger,
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // RegisterService 注册服务
 | |
| func (h *HealthChecker) RegisterService(service interfaces.Service) {
 | |
| 	h.mutex.Lock()
 | |
| 	defer h.mutex.Unlock()
 | |
| 
 | |
| 	h.services[service.Name()] = service
 | |
| 	h.logger.Info("服务已注册健康检查", zap.String("service", service.Name()))
 | |
| }
 | |
| 
 | |
| // CheckHealth 检查单个服务健康状态
 | |
| func (h *HealthChecker) CheckHealth(ctx context.Context, serviceName string) *interfaces.HealthStatus {
 | |
| 	h.mutex.RLock()
 | |
| 	service, exists := h.services[serviceName]
 | |
| 	if !exists {
 | |
| 		h.mutex.RUnlock()
 | |
| 		return &interfaces.HealthStatus{
 | |
| 			Status:       "DOWN",
 | |
| 			Message:      "服务未找到",
 | |
| 			Details:      map[string]interface{}{"error": "服务未注册"},
 | |
| 			CheckedAt:    time.Now().Unix(),
 | |
| 			ResponseTime: 0,
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// 检查缓存
 | |
| 	if cached, exists := h.cache[serviceName]; exists {
 | |
| 		if time.Since(time.Unix(cached.CheckedAt, 0)) < h.cacheTTL {
 | |
| 			h.mutex.RUnlock()
 | |
| 			return cached
 | |
| 		}
 | |
| 	}
 | |
| 	h.mutex.RUnlock()
 | |
| 
 | |
| 	// 执行健康检查
 | |
| 	start := time.Now()
 | |
| 	status := &interfaces.HealthStatus{
 | |
| 		CheckedAt: start.Unix(),
 | |
| 	}
 | |
| 
 | |
| 	// 设置超时上下文
 | |
| 	checkCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
 | |
| 	defer cancel()
 | |
| 
 | |
| 	err := service.HealthCheck(checkCtx)
 | |
| 	responseTime := time.Since(start).Milliseconds()
 | |
| 	status.ResponseTime = responseTime
 | |
| 
 | |
| 	if err != nil {
 | |
| 		status.Status = "DOWN"
 | |
| 		status.Message = "健康检查失败"
 | |
| 		status.Details = map[string]interface{}{
 | |
| 			"error":        err.Error(),
 | |
| 			"service_name": serviceName,
 | |
| 			"check_time":   start.Format(time.RFC3339),
 | |
| 		}
 | |
| 		h.logger.Warn("服务健康检查失败",
 | |
| 			zap.String("service", serviceName),
 | |
| 			zap.Error(err),
 | |
| 			zap.Int64("response_time_ms", responseTime))
 | |
| 	} else {
 | |
| 		status.Status = "UP"
 | |
| 		status.Message = "服务运行正常"
 | |
| 		status.Details = map[string]interface{}{
 | |
| 			"service_name": serviceName,
 | |
| 			"check_time":   start.Format(time.RFC3339),
 | |
| 		}
 | |
| 		h.logger.Debug("服务健康检查通过",
 | |
| 			zap.String("service", serviceName),
 | |
| 			zap.Int64("response_time_ms", responseTime))
 | |
| 	}
 | |
| 
 | |
| 	// 更新缓存
 | |
| 	h.mutex.Lock()
 | |
| 	h.cache[serviceName] = status
 | |
| 	h.mutex.Unlock()
 | |
| 
 | |
| 	return status
 | |
| }
 | |
| 
 | |
| // CheckAllHealth 检查所有服务的健康状态
 | |
| func (h *HealthChecker) CheckAllHealth(ctx context.Context) map[string]*interfaces.HealthStatus {
 | |
| 	h.mutex.RLock()
 | |
| 	serviceNames := make([]string, 0, len(h.services))
 | |
| 	for name := range h.services {
 | |
| 		serviceNames = append(serviceNames, name)
 | |
| 	}
 | |
| 	h.mutex.RUnlock()
 | |
| 
 | |
| 	results := make(map[string]*interfaces.HealthStatus)
 | |
| 	var wg sync.WaitGroup
 | |
| 	var mutex sync.Mutex
 | |
| 
 | |
| 	// 并发检查所有服务
 | |
| 	for _, serviceName := range serviceNames {
 | |
| 		wg.Add(1)
 | |
| 		go func(name string) {
 | |
| 			defer wg.Done()
 | |
| 			status := h.CheckHealth(ctx, name)
 | |
| 
 | |
| 			mutex.Lock()
 | |
| 			results[name] = status
 | |
| 			mutex.Unlock()
 | |
| 		}(serviceName)
 | |
| 	}
 | |
| 
 | |
| 	wg.Wait()
 | |
| 	return results
 | |
| }
 | |
| 
 | |
| // GetOverallStatus 获取整体健康状态
 | |
| func (h *HealthChecker) GetOverallStatus(ctx context.Context) *interfaces.HealthStatus {
 | |
| 	allStatus := h.CheckAllHealth(ctx)
 | |
| 
 | |
| 	overall := &interfaces.HealthStatus{
 | |
| 		CheckedAt:    time.Now().Unix(),
 | |
| 		ResponseTime: 0,
 | |
| 		Details:      make(map[string]interface{}),
 | |
| 	}
 | |
| 
 | |
| 	var totalResponseTime int64
 | |
| 	healthyCount := 0
 | |
| 	totalCount := len(allStatus)
 | |
| 
 | |
| 	for serviceName, status := range allStatus {
 | |
| 		overall.Details[serviceName] = map[string]interface{}{
 | |
| 			"status":        status.Status,
 | |
| 			"message":       status.Message,
 | |
| 			"response_time": status.ResponseTime,
 | |
| 		}
 | |
| 
 | |
| 		totalResponseTime += status.ResponseTime
 | |
| 		if status.Status == "UP" {
 | |
| 			healthyCount++
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if totalCount > 0 {
 | |
| 		overall.ResponseTime = totalResponseTime / int64(totalCount)
 | |
| 	}
 | |
| 
 | |
| 	// 确定整体状态
 | |
| 	if healthyCount == totalCount {
 | |
| 		overall.Status = "UP"
 | |
| 		overall.Message = "所有服务运行正常"
 | |
| 	} else if healthyCount == 0 {
 | |
| 		overall.Status = "DOWN"
 | |
| 		overall.Message = "所有服务均已下线"
 | |
| 	} else {
 | |
| 		overall.Status = "DEGRADED"
 | |
| 		overall.Message = fmt.Sprintf("%d/%d 个服务运行正常", healthyCount, totalCount)
 | |
| 	}
 | |
| 
 | |
| 	return overall
 | |
| }
 | |
| 
 | |
| // GetServiceNames 获取所有注册的服务名称
 | |
| func (h *HealthChecker) GetServiceNames() []string {
 | |
| 	h.mutex.RLock()
 | |
| 	defer h.mutex.RUnlock()
 | |
| 
 | |
| 	names := make([]string, 0, len(h.services))
 | |
| 	for name := range h.services {
 | |
| 		names = append(names, name)
 | |
| 	}
 | |
| 	return names
 | |
| }
 | |
| 
 | |
| // RemoveService 移除服务
 | |
| func (h *HealthChecker) RemoveService(serviceName string) {
 | |
| 	h.mutex.Lock()
 | |
| 	defer h.mutex.Unlock()
 | |
| 
 | |
| 	delete(h.services, serviceName)
 | |
| 	delete(h.cache, serviceName)
 | |
| 
 | |
| 	h.logger.Info("服务已从健康检查中移除", zap.String("service", serviceName))
 | |
| }
 | |
| 
 | |
| // ClearCache 清除缓存
 | |
| func (h *HealthChecker) ClearCache() {
 | |
| 	h.mutex.Lock()
 | |
| 	defer h.mutex.Unlock()
 | |
| 
 | |
| 	h.cache = make(map[string]*interfaces.HealthStatus)
 | |
| 	h.logger.Debug("健康检查缓存已清除")
 | |
| }
 | |
| 
 | |
| // GetCacheStats 获取缓存统计
 | |
| func (h *HealthChecker) GetCacheStats() map[string]interface{} {
 | |
| 	h.mutex.RLock()
 | |
| 	defer h.mutex.RUnlock()
 | |
| 
 | |
| 	stats := map[string]interface{}{
 | |
| 		"total_services":    len(h.services),
 | |
| 		"cached_results":    len(h.cache),
 | |
| 		"cache_ttl_seconds": h.cacheTTL.Seconds(),
 | |
| 	}
 | |
| 
 | |
| 	// 计算缓存命中率
 | |
| 	if len(h.services) > 0 {
 | |
| 		hitRate := float64(len(h.cache)) / float64(len(h.services)) * 100
 | |
| 		stats["cache_hit_rate"] = fmt.Sprintf("%.2f%%", hitRate)
 | |
| 	}
 | |
| 
 | |
| 	return stats
 | |
| }
 | |
| 
 | |
| // SetCacheTTL 设置缓存TTL
 | |
| func (h *HealthChecker) SetCacheTTL(ttl time.Duration) {
 | |
| 	h.mutex.Lock()
 | |
| 	defer h.mutex.Unlock()
 | |
| 
 | |
| 	h.cacheTTL = ttl
 | |
| 	h.logger.Info("健康检查缓存TTL已更新", zap.Duration("ttl", ttl))
 | |
| }
 | |
| 
 | |
| // StartPeriodicCheck 启动定期健康检查
 | |
| func (h *HealthChecker) StartPeriodicCheck(ctx context.Context, interval time.Duration) {
 | |
| 	ticker := time.NewTicker(interval)
 | |
| 	defer ticker.Stop()
 | |
| 
 | |
| 	h.logger.Info("已启动定期健康检查", zap.Duration("interval", interval))
 | |
| 
 | |
| 	for {
 | |
| 		select {
 | |
| 		case <-ctx.Done():
 | |
| 			h.logger.Info("已停止定期健康检查")
 | |
| 			return
 | |
| 		case <-ticker.C:
 | |
| 			h.performPeriodicCheck(ctx)
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // performPeriodicCheck 执行定期检查
 | |
| func (h *HealthChecker) performPeriodicCheck(ctx context.Context) {
 | |
| 	overall := h.GetOverallStatus(ctx)
 | |
| 
 | |
| 	h.logger.Info("定期健康检查已完成",
 | |
| 		zap.String("overall_status", overall.Status),
 | |
| 		zap.String("message", overall.Message),
 | |
| 		zap.Int64("response_time_ms", overall.ResponseTime))
 | |
| 
 | |
| 	// 如果有服务下线,记录警告
 | |
| 	if overall.Status != "UP" {
 | |
| 		h.logger.Warn("部分服务不健康",
 | |
| 			zap.String("status", overall.Status),
 | |
| 			zap.Any("details", overall.Details))
 | |
| 	}
 | |
| }
 |