Files
tyapi-server/internal/shared/health/health_checker.go

283 lines
7.0 KiB
Go

package health
import (
"context"
"fmt"
"sync"
"time"
"tyapi-server/internal/shared/interfaces"
"go.uber.org/zap"
)
// HealthChecker is the health checker implementation: it keeps the set of
// registered services and the most recent health status recorded for each.
//
// services, cache and cacheTTL are only accessed while holding mutex.
type HealthChecker struct {
// services maps a service name (as returned by Service.Name) to the service.
services map[string]interfaces.Service
// cache maps a service name to the last status stored by CheckHealth.
cache map[string]*interfaces.HealthStatus
// cacheTTL is how long a cached status is reused before a new check runs.
cacheTTL time.Duration
// mutex guards services, cache and cacheTTL.
mutex sync.RWMutex
// logger receives registration, check-result and cache events.
logger *zap.Logger
}
// NewHealthChecker creates a health checker with no registered services, an
// empty result cache, a 30-second cache TTL, and the supplied logger.
func NewHealthChecker(logger *zap.Logger) *HealthChecker {
	// Results are cached for 30 seconds unless changed via SetCacheTTL.
	const defaultCacheTTL = 30 * time.Second

	checker := new(HealthChecker)
	checker.services = make(map[string]interfaces.Service)
	checker.cache = make(map[string]*interfaces.HealthStatus)
	checker.cacheTTL = defaultCacheTTL
	checker.logger = logger
	return checker
}
// RegisterService adds service to the set of health-checked services, keyed
// by service.Name(). Registering under a name that is already in use replaces
// the earlier service.
//
// Any cached status for that name is discarded, so a result produced by a
// previously registered service is not reported for the new one.
func (h *HealthChecker) RegisterService(service interfaces.Service) {
	// Resolve the name once, before taking the lock, so service code does not
	// run inside the critical section.
	name := service.Name()

	h.mutex.Lock()
	defer h.mutex.Unlock()

	h.services[name] = service
	delete(h.cache, name)
	h.logger.Info("Registered service for health check", zap.String("service", name))
}
// CheckHealth returns the health status of the service registered under
// serviceName.
//
//   - If no such service is registered, a "DOWN" status with message
//     "Service not found" is returned and nothing is cached.
//   - If a cached status younger than the cache TTL exists, it is returned
//     without running a new check.
//   - Otherwise service.HealthCheck is called with a context derived from ctx
//     that times out after 10 seconds; the resulting "UP" or "DOWN" status is
//     returned.
//
// A fresh result is cached only if the service is still registered when the
// check finishes, so a concurrent RemoveService does not leave a stale cache
// entry behind.
//
// The returned pointer may be the one held in the cache and handed to other
// callers, so it should be treated as read-only.
func (h *HealthChecker) CheckHealth(ctx context.Context, serviceName string) *interfaces.HealthStatus {
	// Snapshot everything needed under the read lock, then release it before
	// doing any further work.
	h.mutex.RLock()
	service, registered := h.services[serviceName]
	cached := h.cache[serviceName]
	ttl := h.cacheTTL
	h.mutex.RUnlock()

	if !registered {
		return &interfaces.HealthStatus{
			Status:       "DOWN",
			Message:      "Service not found",
			Details:      map[string]interface{}{"error": "service not registered"},
			CheckedAt:    time.Now().Unix(),
			ResponseTime: 0,
		}
	}

	// Serve from the cache while the entry is still fresh. CheckedAt has
	// one-second resolution, so the age is measured from the start of that
	// second.
	if cached != nil && time.Since(time.Unix(cached.CheckedAt, 0)) < ttl {
		return cached
	}

	// Run the health check with a bounded timeout.
	start := time.Now()
	checkCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	err := service.HealthCheck(checkCtx)
	responseTime := time.Since(start).Milliseconds()

	details := map[string]interface{}{
		"service_name": serviceName,
		"check_time":   start.Format(time.RFC3339),
	}
	status := &interfaces.HealthStatus{
		CheckedAt:    start.Unix(),
		ResponseTime: responseTime,
		Details:      details,
	}

	if err != nil {
		details["error"] = err.Error()
		status.Status = "DOWN"
		status.Message = "Health check failed"
		h.logger.Warn("Service health check failed",
			zap.String("service", serviceName),
			zap.Error(err),
			zap.Int64("response_time_ms", responseTime))
	} else {
		status.Status = "UP"
		status.Message = "Service is healthy"
		h.logger.Debug("Service health check passed",
			zap.String("service", serviceName),
			zap.Int64("response_time_ms", responseTime))
	}

	// Update the cache, but only if the service was not removed while the
	// check was running without the lock held.
	h.mutex.Lock()
	if _, stillRegistered := h.services[serviceName]; stillRegistered {
		h.cache[serviceName] = status
	}
	h.mutex.Unlock()

	return status
}
// CheckAllHealth runs CheckHealth for every registered service concurrently
// and returns the statuses keyed by service name.
//
// The list of names is snapshotted under the read lock before any check
// starts, and the call blocks until every check has returned.
func (h *HealthChecker) CheckAllHealth(ctx context.Context) map[string]*interfaces.HealthStatus {
	h.mutex.RLock()
	names := make([]string, 0, len(h.services))
	for svc := range h.services {
		names = append(names, svc)
	}
	h.mutex.RUnlock()

	// One goroutine per service; each writes only its own slot, so the slice
	// needs no additional locking.
	statuses := make([]*interfaces.HealthStatus, len(names))
	var pending sync.WaitGroup
	pending.Add(len(names))
	for i := range names {
		go func(slot int) {
			defer pending.Done()
			statuses[slot] = h.CheckHealth(ctx, names[slot])
		}(i)
	}
	pending.Wait()

	out := make(map[string]*interfaces.HealthStatus, len(names))
	for i, svc := range names {
		out[svc] = statuses[i]
	}
	return out
}
// GetOverallStatus checks every registered service and folds the results into
// a single status.
//
// Details holds one entry per service with its "status", "message" and
// "response_time". ResponseTime is the integer average of the per-service
// response times (0 when there are no services). Status is "UP" when every
// service is "UP" (which includes the case of no registered services),
// "DOWN" when none is, and "DEGRADED" otherwise.
func (h *HealthChecker) GetOverallStatus(ctx context.Context) *interfaces.HealthStatus {
	perService := h.CheckAllHealth(ctx)
	total := len(perService)

	summary := &interfaces.HealthStatus{
		CheckedAt:    time.Now().Unix(),
		ResponseTime: 0,
		Details:      make(map[string]interface{}),
	}

	var (
		sumMs int64
		up    int
	)
	for name, st := range perService {
		entry := make(map[string]interface{}, 3)
		entry["status"] = st.Status
		entry["message"] = st.Message
		entry["response_time"] = st.ResponseTime
		summary.Details[name] = entry

		sumMs += st.ResponseTime
		if st.Status == "UP" {
			up++
		}
	}

	if total > 0 {
		summary.ResponseTime = sumMs / int64(total)
	}

	// Derive the aggregate status from the healthy count.
	switch {
	case up == total:
		summary.Status = "UP"
		summary.Message = "All services are healthy"
	case up == 0:
		summary.Status = "DOWN"
		summary.Message = "All services are down"
	default:
		summary.Status = "DEGRADED"
		summary.Message = fmt.Sprintf("%d of %d services are healthy", up, total)
	}
	return summary
}
// GetServiceNames returns the names of all registered services in a newly
// allocated slice. The order follows map iteration and is therefore not
// stable between calls.
func (h *HealthChecker) GetServiceNames() []string {
	h.mutex.RLock()
	defer h.mutex.RUnlock()

	result := make([]string, len(h.services))
	next := 0
	for svc := range h.services {
		result[next] = svc
		next++
	}
	return result
}
// RemoveService unregisters the service named serviceName and discards its
// cached status. Removing a name that is not registered has no effect on the
// maps; the removal is logged either way.
func (h *HealthChecker) RemoveService(serviceName string) {
	h.mutex.Lock()
	defer h.mutex.Unlock()

	// Both deletions happen under the same write lock.
	delete(h.cache, serviceName)
	delete(h.services, serviceName)

	h.logger.Info(
		"Removed service from health check",
		zap.String("service", serviceName),
	)
}
// ClearCache discards every cached health status by swapping in a new, empty
// cache map. Registered services are left untouched.
func (h *HealthChecker) ClearCache() {
	empty := make(map[string]*interfaces.HealthStatus)

	h.mutex.Lock()
	defer h.mutex.Unlock()

	h.cache = empty
	h.logger.Debug("Health check cache cleared")
}
// GetCacheStats reports cache statistics:
//
//   - "total_services": number of registered services
//   - "cached_results": number of entries in the cache
//   - "cache_ttl_seconds": the cache TTL in seconds
//   - "cache_hit_rate": cached entries divided by registered services, as a
//     percentage string with two decimals; only present when at least one
//     service is registered
//
// Note that "cache_hit_rate" is computed from map sizes, not from counted
// cache hits and misses.
func (h *HealthChecker) GetCacheStats() map[string]interface{} {
	h.mutex.RLock()
	defer h.mutex.RUnlock()

	registered, cachedEntries := len(h.services), len(h.cache)

	result := make(map[string]interface{})
	result["total_services"] = registered
	result["cached_results"] = cachedEntries
	result["cache_ttl_seconds"] = h.cacheTTL.Seconds()

	// Without registered services the ratio is undefined, so it is omitted.
	if registered == 0 {
		return result
	}

	pct := float64(cachedEntries) / float64(registered) * 100
	result["cache_hit_rate"] = fmt.Sprintf("%.2f%%", pct)
	return result
}
// SetCacheTTL replaces the cache TTL used by subsequent CheckHealth calls and
// logs the new value. Existing cache entries are kept.
func (h *HealthChecker) SetCacheTTL(ttl time.Duration) {
	ttlField := zap.Duration("ttl", ttl)

	h.mutex.Lock()
	defer h.mutex.Unlock()

	h.cacheTTL = ttl
	h.logger.Info("Updated health check cache TTL", ttlField)
}
// StartPeriodicCheck runs a health check of all services every interval until
// ctx is cancelled. It blocks for that whole time, so callers normally run it
// in its own goroutine.
//
// A non-positive interval is rejected: time.NewTicker panics for such values,
// so the problem is logged and the function returns without starting a loop.
func (h *HealthChecker) StartPeriodicCheck(ctx context.Context, interval time.Duration) {
	if interval <= 0 {
		h.logger.Error("Invalid periodic health check interval; periodic check not started",
			zap.Duration("interval", interval))
		return
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	h.logger.Info("Started periodic health check", zap.Duration("interval", interval))

	for {
		select {
		case <-ctx.Done():
			h.logger.Info("Stopped periodic health check")
			return
		case <-ticker.C:
			// When a tick and cancellation are both ready, select may pick
			// either one. Skip the check here so it never runs with an
			// already-cancelled context; the next iteration takes the
			// ctx.Done() branch.
			if ctx.Err() != nil {
				continue
			}
			h.performPeriodicCheck(ctx)
		}
	}
}
// performPeriodicCheck runs one overall health check and logs the outcome.
// An additional warning carrying the per-service details is logged whenever
// the overall status is anything other than "UP".
func (h *HealthChecker) performPeriodicCheck(ctx context.Context) {
	summary := h.GetOverallStatus(ctx)

	h.logger.Info(
		"Periodic health check completed",
		zap.String("overall_status", summary.Status),
		zap.String("message", summary.Message),
		zap.Int64("response_time_ms", summary.ResponseTime),
	)

	if summary.Status == "UP" {
		return
	}

	// At least one service is not healthy: surface the details.
	h.logger.Warn(
		"Some services are not healthy",
		zap.String("status", summary.Status),
		zap.Any("details", summary.Details),
	)
}