| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | package health | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import ( | 
					
						
							|  |  |  | 	"context" | 
					
						
							|  |  |  | 	"fmt" | 
					
						
							|  |  |  | 	"sync" | 
					
						
							|  |  |  | 	"time" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	"tyapi-server/internal/shared/interfaces" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	"go.uber.org/zap" | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // HealthChecker 健康检查器实现 | 
					
						
							|  |  |  | type HealthChecker struct { | 
					
						
							|  |  |  | 	services map[string]interfaces.Service | 
					
						
							|  |  |  | 	cache    map[string]*interfaces.HealthStatus | 
					
						
							|  |  |  | 	cacheTTL time.Duration | 
					
						
							|  |  |  | 	mutex    sync.RWMutex | 
					
						
							|  |  |  | 	logger   *zap.Logger | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // NewHealthChecker 创建健康检查器 | 
					
						
							|  |  |  | func NewHealthChecker(logger *zap.Logger) *HealthChecker { | 
					
						
							|  |  |  | 	return &HealthChecker{ | 
					
						
							|  |  |  | 		services: make(map[string]interfaces.Service), | 
					
						
							|  |  |  | 		cache:    make(map[string]*interfaces.HealthStatus), | 
					
						
							|  |  |  | 		cacheTTL: 30 * time.Second, // 缓存30秒 | 
					
						
							|  |  |  | 		logger:   logger, | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // RegisterService 注册服务 | 
					
						
							|  |  |  | func (h *HealthChecker) RegisterService(service interfaces.Service) { | 
					
						
							|  |  |  | 	h.mutex.Lock() | 
					
						
							|  |  |  | 	defer h.mutex.Unlock() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	h.services[service.Name()] = service | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 	h.logger.Info("服务已注册健康检查", zap.String("service", service.Name())) | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // CheckHealth 检查单个服务健康状态 | 
					
						
							|  |  |  | func (h *HealthChecker) CheckHealth(ctx context.Context, serviceName string) *interfaces.HealthStatus { | 
					
						
							|  |  |  | 	h.mutex.RLock() | 
					
						
							|  |  |  | 	service, exists := h.services[serviceName] | 
					
						
							|  |  |  | 	if !exists { | 
					
						
							|  |  |  | 		h.mutex.RUnlock() | 
					
						
							|  |  |  | 		return &interfaces.HealthStatus{ | 
					
						
							|  |  |  | 			Status:       "DOWN", | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 			Message:      "服务未找到", | 
					
						
							|  |  |  | 			Details:      map[string]interface{}{"error": "服务未注册"}, | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | 			CheckedAt:    time.Now().Unix(), | 
					
						
							|  |  |  | 			ResponseTime: 0, | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// 检查缓存 | 
					
						
							|  |  |  | 	if cached, exists := h.cache[serviceName]; exists { | 
					
						
							|  |  |  | 		if time.Since(time.Unix(cached.CheckedAt, 0)) < h.cacheTTL { | 
					
						
							|  |  |  | 			h.mutex.RUnlock() | 
					
						
							|  |  |  | 			return cached | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	h.mutex.RUnlock() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// 执行健康检查 | 
					
						
							|  |  |  | 	start := time.Now() | 
					
						
							|  |  |  | 	status := &interfaces.HealthStatus{ | 
					
						
							|  |  |  | 		CheckedAt: start.Unix(), | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// 设置超时上下文 | 
					
						
							|  |  |  | 	checkCtx, cancel := context.WithTimeout(ctx, 10*time.Second) | 
					
						
							|  |  |  | 	defer cancel() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	err := service.HealthCheck(checkCtx) | 
					
						
							|  |  |  | 	responseTime := time.Since(start).Milliseconds() | 
					
						
							|  |  |  | 	status.ResponseTime = responseTime | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if err != nil { | 
					
						
							|  |  |  | 		status.Status = "DOWN" | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 		status.Message = "健康检查失败" | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | 		status.Details = map[string]interface{}{ | 
					
						
							|  |  |  | 			"error":        err.Error(), | 
					
						
							|  |  |  | 			"service_name": serviceName, | 
					
						
							|  |  |  | 			"check_time":   start.Format(time.RFC3339), | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 		h.logger.Warn("服务健康检查失败", | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | 			zap.String("service", serviceName), | 
					
						
							|  |  |  | 			zap.Error(err), | 
					
						
							|  |  |  | 			zap.Int64("response_time_ms", responseTime)) | 
					
						
							|  |  |  | 	} else { | 
					
						
							|  |  |  | 		status.Status = "UP" | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 		status.Message = "服务运行正常" | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | 		status.Details = map[string]interface{}{ | 
					
						
							|  |  |  | 			"service_name": serviceName, | 
					
						
							|  |  |  | 			"check_time":   start.Format(time.RFC3339), | 
					
						
							|  |  |  | 		} | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 		h.logger.Debug("服务健康检查通过", | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | 			zap.String("service", serviceName), | 
					
						
							|  |  |  | 			zap.Int64("response_time_ms", responseTime)) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// 更新缓存 | 
					
						
							|  |  |  | 	h.mutex.Lock() | 
					
						
							|  |  |  | 	h.cache[serviceName] = status | 
					
						
							|  |  |  | 	h.mutex.Unlock() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return status | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // CheckAllHealth 检查所有服务的健康状态 | 
					
						
							|  |  |  | func (h *HealthChecker) CheckAllHealth(ctx context.Context) map[string]*interfaces.HealthStatus { | 
					
						
							|  |  |  | 	h.mutex.RLock() | 
					
						
							|  |  |  | 	serviceNames := make([]string, 0, len(h.services)) | 
					
						
							|  |  |  | 	for name := range h.services { | 
					
						
							|  |  |  | 		serviceNames = append(serviceNames, name) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	h.mutex.RUnlock() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	results := make(map[string]*interfaces.HealthStatus) | 
					
						
							|  |  |  | 	var wg sync.WaitGroup | 
					
						
							|  |  |  | 	var mutex sync.Mutex | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// 并发检查所有服务 | 
					
						
							|  |  |  | 	for _, serviceName := range serviceNames { | 
					
						
							|  |  |  | 		wg.Add(1) | 
					
						
							|  |  |  | 		go func(name string) { | 
					
						
							|  |  |  | 			defer wg.Done() | 
					
						
							|  |  |  | 			status := h.CheckHealth(ctx, name) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 			mutex.Lock() | 
					
						
							|  |  |  | 			results[name] = status | 
					
						
							|  |  |  | 			mutex.Unlock() | 
					
						
							|  |  |  | 		}(serviceName) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	wg.Wait() | 
					
						
							|  |  |  | 	return results | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // GetOverallStatus 获取整体健康状态 | 
					
						
							|  |  |  | func (h *HealthChecker) GetOverallStatus(ctx context.Context) *interfaces.HealthStatus { | 
					
						
							|  |  |  | 	allStatus := h.CheckAllHealth(ctx) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	overall := &interfaces.HealthStatus{ | 
					
						
							|  |  |  | 		CheckedAt:    time.Now().Unix(), | 
					
						
							|  |  |  | 		ResponseTime: 0, | 
					
						
							|  |  |  | 		Details:      make(map[string]interface{}), | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	var totalResponseTime int64 | 
					
						
							|  |  |  | 	healthyCount := 0 | 
					
						
							|  |  |  | 	totalCount := len(allStatus) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	for serviceName, status := range allStatus { | 
					
						
							|  |  |  | 		overall.Details[serviceName] = map[string]interface{}{ | 
					
						
							|  |  |  | 			"status":        status.Status, | 
					
						
							|  |  |  | 			"message":       status.Message, | 
					
						
							|  |  |  | 			"response_time": status.ResponseTime, | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 		totalResponseTime += status.ResponseTime | 
					
						
							|  |  |  | 		if status.Status == "UP" { | 
					
						
							|  |  |  | 			healthyCount++ | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	if totalCount > 0 { | 
					
						
							|  |  |  | 		overall.ResponseTime = totalResponseTime / int64(totalCount) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// 确定整体状态 | 
					
						
							|  |  |  | 	if healthyCount == totalCount { | 
					
						
							|  |  |  | 		overall.Status = "UP" | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 		overall.Message = "所有服务运行正常" | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | 	} else if healthyCount == 0 { | 
					
						
							|  |  |  | 		overall.Status = "DOWN" | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 		overall.Message = "所有服务均已下线" | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | 	} else { | 
					
						
							|  |  |  | 		overall.Status = "DEGRADED" | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 		overall.Message = fmt.Sprintf("%d/%d 个服务运行正常", healthyCount, totalCount) | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return overall | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // GetServiceNames 获取所有注册的服务名称 | 
					
						
							|  |  |  | func (h *HealthChecker) GetServiceNames() []string { | 
					
						
							|  |  |  | 	h.mutex.RLock() | 
					
						
							|  |  |  | 	defer h.mutex.RUnlock() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	names := make([]string, 0, len(h.services)) | 
					
						
							|  |  |  | 	for name := range h.services { | 
					
						
							|  |  |  | 		names = append(names, name) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 	return names | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // RemoveService 移除服务 | 
					
						
							|  |  |  | func (h *HealthChecker) RemoveService(serviceName string) { | 
					
						
							|  |  |  | 	h.mutex.Lock() | 
					
						
							|  |  |  | 	defer h.mutex.Unlock() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	delete(h.services, serviceName) | 
					
						
							|  |  |  | 	delete(h.cache, serviceName) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 	h.logger.Info("服务已从健康检查中移除", zap.String("service", serviceName)) | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // ClearCache 清除缓存 | 
					
						
							|  |  |  | func (h *HealthChecker) ClearCache() { | 
					
						
							|  |  |  | 	h.mutex.Lock() | 
					
						
							|  |  |  | 	defer h.mutex.Unlock() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	h.cache = make(map[string]*interfaces.HealthStatus) | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 	h.logger.Debug("健康检查缓存已清除") | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // GetCacheStats 获取缓存统计 | 
					
						
							|  |  |  | func (h *HealthChecker) GetCacheStats() map[string]interface{} { | 
					
						
							|  |  |  | 	h.mutex.RLock() | 
					
						
							|  |  |  | 	defer h.mutex.RUnlock() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	stats := map[string]interface{}{ | 
					
						
							|  |  |  | 		"total_services":    len(h.services), | 
					
						
							|  |  |  | 		"cached_results":    len(h.cache), | 
					
						
							|  |  |  | 		"cache_ttl_seconds": h.cacheTTL.Seconds(), | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// 计算缓存命中率 | 
					
						
							|  |  |  | 	if len(h.services) > 0 { | 
					
						
							|  |  |  | 		hitRate := float64(len(h.cache)) / float64(len(h.services)) * 100 | 
					
						
							|  |  |  | 		stats["cache_hit_rate"] = fmt.Sprintf("%.2f%%", hitRate) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return stats | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // SetCacheTTL 设置缓存TTL | 
					
						
							|  |  |  | func (h *HealthChecker) SetCacheTTL(ttl time.Duration) { | 
					
						
							|  |  |  | 	h.mutex.Lock() | 
					
						
							|  |  |  | 	defer h.mutex.Unlock() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	h.cacheTTL = ttl | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 	h.logger.Info("健康检查缓存TTL已更新", zap.Duration("ttl", ttl)) | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // StartPeriodicCheck 启动定期健康检查 | 
					
						
							|  |  |  | func (h *HealthChecker) StartPeriodicCheck(ctx context.Context, interval time.Duration) { | 
					
						
							|  |  |  | 	ticker := time.NewTicker(interval) | 
					
						
							|  |  |  | 	defer ticker.Stop() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 	h.logger.Info("已启动定期健康检查", zap.Duration("interval", interval)) | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	for { | 
					
						
							|  |  |  | 		select { | 
					
						
							|  |  |  | 		case <-ctx.Done(): | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 			h.logger.Info("已停止定期健康检查") | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | 			return | 
					
						
							|  |  |  | 		case <-ticker.C: | 
					
						
							|  |  |  | 			h.performPeriodicCheck(ctx) | 
					
						
							|  |  |  | 		} | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // performPeriodicCheck 执行定期检查 | 
					
						
							|  |  |  | func (h *HealthChecker) performPeriodicCheck(ctx context.Context) { | 
					
						
							|  |  |  | 	overall := h.GetOverallStatus(ctx) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 	h.logger.Info("定期健康检查已完成", | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | 		zap.String("overall_status", overall.Status), | 
					
						
							|  |  |  | 		zap.String("message", overall.Message), | 
					
						
							|  |  |  | 		zap.Int64("response_time_ms", overall.ResponseTime)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	// 如果有服务下线,记录警告 | 
					
						
							|  |  |  | 	if overall.Status != "UP" { | 
					
						
							| 
									
										
										
										
											2025-07-02 16:17:59 +08:00
										 |  |  | 		h.logger.Warn("部分服务不健康", | 
					
						
							| 
									
										
										
										
											2025-06-30 19:21:56 +08:00
										 |  |  | 			zap.String("status", overall.Status), | 
					
						
							|  |  |  | 			zap.Any("details", overall.Details)) | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | } |