package health import ( "context" "fmt" "sync" "time" "tyapi-server/internal/shared/interfaces" "go.uber.org/zap" ) // HealthChecker 健康检查器实现 type HealthChecker struct { services map[string]interfaces.Service cache map[string]*interfaces.HealthStatus cacheTTL time.Duration mutex sync.RWMutex logger *zap.Logger } // NewHealthChecker 创建健康检查器 func NewHealthChecker(logger *zap.Logger) *HealthChecker { return &HealthChecker{ services: make(map[string]interfaces.Service), cache: make(map[string]*interfaces.HealthStatus), cacheTTL: 30 * time.Second, // 缓存30秒 logger: logger, } } // RegisterService 注册服务 func (h *HealthChecker) RegisterService(service interfaces.Service) { h.mutex.Lock() defer h.mutex.Unlock() h.services[service.Name()] = service h.logger.Info("服务已注册健康检查", zap.String("service", service.Name())) } // CheckHealth 检查单个服务健康状态 func (h *HealthChecker) CheckHealth(ctx context.Context, serviceName string) *interfaces.HealthStatus { h.mutex.RLock() service, exists := h.services[serviceName] if !exists { h.mutex.RUnlock() return &interfaces.HealthStatus{ Status: "DOWN", Message: "服务未找到", Details: map[string]interface{}{"error": "服务未注册"}, CheckedAt: time.Now().Unix(), ResponseTime: 0, } } // 检查缓存 if cached, exists := h.cache[serviceName]; exists { if time.Since(time.Unix(cached.CheckedAt, 0)) < h.cacheTTL { h.mutex.RUnlock() return cached } } h.mutex.RUnlock() // 执行健康检查 start := time.Now() status := &interfaces.HealthStatus{ CheckedAt: start.Unix(), } // 设置超时上下文 checkCtx, cancel := context.WithTimeout(ctx, 10*time.Second) defer cancel() err := service.HealthCheck(checkCtx) responseTime := time.Since(start).Milliseconds() status.ResponseTime = responseTime if err != nil { status.Status = "DOWN" status.Message = "健康检查失败" status.Details = map[string]interface{}{ "error": err.Error(), "service_name": serviceName, "check_time": start.Format(time.RFC3339), } h.logger.Warn("服务健康检查失败", zap.String("service", serviceName), zap.Error(err), zap.Int64("response_time_ms", responseTime)) } else { status.Status = "UP" status.Message = "服务运行正常" status.Details = map[string]interface{}{ "service_name": serviceName, "check_time": start.Format(time.RFC3339), } h.logger.Debug("服务健康检查通过", zap.String("service", serviceName), zap.Int64("response_time_ms", responseTime)) } // 更新缓存 h.mutex.Lock() h.cache[serviceName] = status h.mutex.Unlock() return status } // CheckAllHealth 检查所有服务的健康状态 func (h *HealthChecker) CheckAllHealth(ctx context.Context) map[string]*interfaces.HealthStatus { h.mutex.RLock() serviceNames := make([]string, 0, len(h.services)) for name := range h.services { serviceNames = append(serviceNames, name) } h.mutex.RUnlock() results := make(map[string]*interfaces.HealthStatus) var wg sync.WaitGroup var mutex sync.Mutex // 并发检查所有服务 for _, serviceName := range serviceNames { wg.Add(1) go func(name string) { defer wg.Done() status := h.CheckHealth(ctx, name) mutex.Lock() results[name] = status mutex.Unlock() }(serviceName) } wg.Wait() return results } // GetOverallStatus 获取整体健康状态 func (h *HealthChecker) GetOverallStatus(ctx context.Context) *interfaces.HealthStatus { allStatus := h.CheckAllHealth(ctx) overall := &interfaces.HealthStatus{ CheckedAt: time.Now().Unix(), ResponseTime: 0, Details: make(map[string]interface{}), } var totalResponseTime int64 healthyCount := 0 totalCount := len(allStatus) for serviceName, status := range allStatus { overall.Details[serviceName] = map[string]interface{}{ "status": status.Status, "message": status.Message, "response_time": status.ResponseTime, } totalResponseTime += status.ResponseTime if status.Status == "UP" { healthyCount++ } } if totalCount > 0 { overall.ResponseTime = totalResponseTime / int64(totalCount) } // 确定整体状态 if healthyCount == totalCount { overall.Status = "UP" overall.Message = "所有服务运行正常" } else if healthyCount == 0 { overall.Status = "DOWN" overall.Message = "所有服务均已下线" } else { overall.Status = "DEGRADED" overall.Message = fmt.Sprintf("%d/%d 个服务运行正常", healthyCount, totalCount) } return overall } // GetServiceNames 获取所有注册的服务名称 func (h *HealthChecker) GetServiceNames() []string { h.mutex.RLock() defer h.mutex.RUnlock() names := make([]string, 0, len(h.services)) for name := range h.services { names = append(names, name) } return names } // RemoveService 移除服务 func (h *HealthChecker) RemoveService(serviceName string) { h.mutex.Lock() defer h.mutex.Unlock() delete(h.services, serviceName) delete(h.cache, serviceName) h.logger.Info("服务已从健康检查中移除", zap.String("service", serviceName)) } // ClearCache 清除缓存 func (h *HealthChecker) ClearCache() { h.mutex.Lock() defer h.mutex.Unlock() h.cache = make(map[string]*interfaces.HealthStatus) h.logger.Debug("健康检查缓存已清除") } // GetCacheStats 获取缓存统计 func (h *HealthChecker) GetCacheStats() map[string]interface{} { h.mutex.RLock() defer h.mutex.RUnlock() stats := map[string]interface{}{ "total_services": len(h.services), "cached_results": len(h.cache), "cache_ttl_seconds": h.cacheTTL.Seconds(), } // 计算缓存命中率 if len(h.services) > 0 { hitRate := float64(len(h.cache)) / float64(len(h.services)) * 100 stats["cache_hit_rate"] = fmt.Sprintf("%.2f%%", hitRate) } return stats } // SetCacheTTL 设置缓存TTL func (h *HealthChecker) SetCacheTTL(ttl time.Duration) { h.mutex.Lock() defer h.mutex.Unlock() h.cacheTTL = ttl h.logger.Info("健康检查缓存TTL已更新", zap.Duration("ttl", ttl)) } // StartPeriodicCheck 启动定期健康检查 func (h *HealthChecker) StartPeriodicCheck(ctx context.Context, interval time.Duration) { ticker := time.NewTicker(interval) defer ticker.Stop() h.logger.Info("已启动定期健康检查", zap.Duration("interval", interval)) for { select { case <-ctx.Done(): h.logger.Info("已停止定期健康检查") return case <-ticker.C: h.performPeriodicCheck(ctx) } } } // performPeriodicCheck 执行定期检查 func (h *HealthChecker) performPeriodicCheck(ctx context.Context) { overall := h.GetOverallStatus(ctx) h.logger.Info("定期健康检查已完成", zap.String("overall_status", overall.Status), zap.String("message", overall.Message), zap.Int64("response_time_ms", overall.ResponseTime)) // 如果有服务下线,记录警告 if overall.Status != "UP" { h.logger.Warn("部分服务不健康", zap.String("status", overall.Status), zap.Any("details", overall.Details)) } }