283 lines
7.0 KiB
Go
283 lines
7.0 KiB
Go
package health
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sync"
|
|
"time"
|
|
|
|
"tyapi-server/internal/shared/interfaces"
|
|
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// HealthChecker 健康检查器实现
|
|
type HealthChecker struct {
|
|
services map[string]interfaces.Service
|
|
cache map[string]*interfaces.HealthStatus
|
|
cacheTTL time.Duration
|
|
mutex sync.RWMutex
|
|
logger *zap.Logger
|
|
}
|
|
|
|
// NewHealthChecker 创建健康检查器
|
|
func NewHealthChecker(logger *zap.Logger) *HealthChecker {
|
|
return &HealthChecker{
|
|
services: make(map[string]interfaces.Service),
|
|
cache: make(map[string]*interfaces.HealthStatus),
|
|
cacheTTL: 30 * time.Second, // 缓存30秒
|
|
logger: logger,
|
|
}
|
|
}
|
|
|
|
// RegisterService 注册服务
|
|
func (h *HealthChecker) RegisterService(service interfaces.Service) {
|
|
h.mutex.Lock()
|
|
defer h.mutex.Unlock()
|
|
|
|
h.services[service.Name()] = service
|
|
h.logger.Info("Registered service for health check", zap.String("service", service.Name()))
|
|
}
|
|
|
|
// CheckHealth 检查单个服务健康状态
|
|
func (h *HealthChecker) CheckHealth(ctx context.Context, serviceName string) *interfaces.HealthStatus {
|
|
h.mutex.RLock()
|
|
service, exists := h.services[serviceName]
|
|
if !exists {
|
|
h.mutex.RUnlock()
|
|
return &interfaces.HealthStatus{
|
|
Status: "DOWN",
|
|
Message: "Service not found",
|
|
Details: map[string]interface{}{"error": "service not registered"},
|
|
CheckedAt: time.Now().Unix(),
|
|
ResponseTime: 0,
|
|
}
|
|
}
|
|
|
|
// 检查缓存
|
|
if cached, exists := h.cache[serviceName]; exists {
|
|
if time.Since(time.Unix(cached.CheckedAt, 0)) < h.cacheTTL {
|
|
h.mutex.RUnlock()
|
|
return cached
|
|
}
|
|
}
|
|
h.mutex.RUnlock()
|
|
|
|
// 执行健康检查
|
|
start := time.Now()
|
|
status := &interfaces.HealthStatus{
|
|
CheckedAt: start.Unix(),
|
|
}
|
|
|
|
// 设置超时上下文
|
|
checkCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
|
defer cancel()
|
|
|
|
err := service.HealthCheck(checkCtx)
|
|
responseTime := time.Since(start).Milliseconds()
|
|
status.ResponseTime = responseTime
|
|
|
|
if err != nil {
|
|
status.Status = "DOWN"
|
|
status.Message = "Health check failed"
|
|
status.Details = map[string]interface{}{
|
|
"error": err.Error(),
|
|
"service_name": serviceName,
|
|
"check_time": start.Format(time.RFC3339),
|
|
}
|
|
h.logger.Warn("Service health check failed",
|
|
zap.String("service", serviceName),
|
|
zap.Error(err),
|
|
zap.Int64("response_time_ms", responseTime))
|
|
} else {
|
|
status.Status = "UP"
|
|
status.Message = "Service is healthy"
|
|
status.Details = map[string]interface{}{
|
|
"service_name": serviceName,
|
|
"check_time": start.Format(time.RFC3339),
|
|
}
|
|
h.logger.Debug("Service health check passed",
|
|
zap.String("service", serviceName),
|
|
zap.Int64("response_time_ms", responseTime))
|
|
}
|
|
|
|
// 更新缓存
|
|
h.mutex.Lock()
|
|
h.cache[serviceName] = status
|
|
h.mutex.Unlock()
|
|
|
|
return status
|
|
}
|
|
|
|
// CheckAllHealth 检查所有服务的健康状态
|
|
func (h *HealthChecker) CheckAllHealth(ctx context.Context) map[string]*interfaces.HealthStatus {
|
|
h.mutex.RLock()
|
|
serviceNames := make([]string, 0, len(h.services))
|
|
for name := range h.services {
|
|
serviceNames = append(serviceNames, name)
|
|
}
|
|
h.mutex.RUnlock()
|
|
|
|
results := make(map[string]*interfaces.HealthStatus)
|
|
var wg sync.WaitGroup
|
|
var mutex sync.Mutex
|
|
|
|
// 并发检查所有服务
|
|
for _, serviceName := range serviceNames {
|
|
wg.Add(1)
|
|
go func(name string) {
|
|
defer wg.Done()
|
|
status := h.CheckHealth(ctx, name)
|
|
|
|
mutex.Lock()
|
|
results[name] = status
|
|
mutex.Unlock()
|
|
}(serviceName)
|
|
}
|
|
|
|
wg.Wait()
|
|
return results
|
|
}
|
|
|
|
// GetOverallStatus 获取整体健康状态
|
|
func (h *HealthChecker) GetOverallStatus(ctx context.Context) *interfaces.HealthStatus {
|
|
allStatus := h.CheckAllHealth(ctx)
|
|
|
|
overall := &interfaces.HealthStatus{
|
|
CheckedAt: time.Now().Unix(),
|
|
ResponseTime: 0,
|
|
Details: make(map[string]interface{}),
|
|
}
|
|
|
|
var totalResponseTime int64
|
|
healthyCount := 0
|
|
totalCount := len(allStatus)
|
|
|
|
for serviceName, status := range allStatus {
|
|
overall.Details[serviceName] = map[string]interface{}{
|
|
"status": status.Status,
|
|
"message": status.Message,
|
|
"response_time": status.ResponseTime,
|
|
}
|
|
|
|
totalResponseTime += status.ResponseTime
|
|
if status.Status == "UP" {
|
|
healthyCount++
|
|
}
|
|
}
|
|
|
|
if totalCount > 0 {
|
|
overall.ResponseTime = totalResponseTime / int64(totalCount)
|
|
}
|
|
|
|
// 确定整体状态
|
|
if healthyCount == totalCount {
|
|
overall.Status = "UP"
|
|
overall.Message = "All services are healthy"
|
|
} else if healthyCount == 0 {
|
|
overall.Status = "DOWN"
|
|
overall.Message = "All services are down"
|
|
} else {
|
|
overall.Status = "DEGRADED"
|
|
overall.Message = fmt.Sprintf("%d of %d services are healthy", healthyCount, totalCount)
|
|
}
|
|
|
|
return overall
|
|
}
|
|
|
|
// GetServiceNames 获取所有注册的服务名称
|
|
func (h *HealthChecker) GetServiceNames() []string {
|
|
h.mutex.RLock()
|
|
defer h.mutex.RUnlock()
|
|
|
|
names := make([]string, 0, len(h.services))
|
|
for name := range h.services {
|
|
names = append(names, name)
|
|
}
|
|
return names
|
|
}
|
|
|
|
// RemoveService 移除服务
|
|
func (h *HealthChecker) RemoveService(serviceName string) {
|
|
h.mutex.Lock()
|
|
defer h.mutex.Unlock()
|
|
|
|
delete(h.services, serviceName)
|
|
delete(h.cache, serviceName)
|
|
|
|
h.logger.Info("Removed service from health check", zap.String("service", serviceName))
|
|
}
|
|
|
|
// ClearCache 清除缓存
|
|
func (h *HealthChecker) ClearCache() {
|
|
h.mutex.Lock()
|
|
defer h.mutex.Unlock()
|
|
|
|
h.cache = make(map[string]*interfaces.HealthStatus)
|
|
h.logger.Debug("Health check cache cleared")
|
|
}
|
|
|
|
// GetCacheStats 获取缓存统计
|
|
func (h *HealthChecker) GetCacheStats() map[string]interface{} {
|
|
h.mutex.RLock()
|
|
defer h.mutex.RUnlock()
|
|
|
|
stats := map[string]interface{}{
|
|
"total_services": len(h.services),
|
|
"cached_results": len(h.cache),
|
|
"cache_ttl_seconds": h.cacheTTL.Seconds(),
|
|
}
|
|
|
|
// 计算缓存命中率
|
|
if len(h.services) > 0 {
|
|
hitRate := float64(len(h.cache)) / float64(len(h.services)) * 100
|
|
stats["cache_hit_rate"] = fmt.Sprintf("%.2f%%", hitRate)
|
|
}
|
|
|
|
return stats
|
|
}
|
|
|
|
// SetCacheTTL 设置缓存TTL
|
|
func (h *HealthChecker) SetCacheTTL(ttl time.Duration) {
|
|
h.mutex.Lock()
|
|
defer h.mutex.Unlock()
|
|
|
|
h.cacheTTL = ttl
|
|
h.logger.Info("Updated health check cache TTL", zap.Duration("ttl", ttl))
|
|
}
|
|
|
|
// StartPeriodicCheck 启动定期健康检查
|
|
func (h *HealthChecker) StartPeriodicCheck(ctx context.Context, interval time.Duration) {
|
|
ticker := time.NewTicker(interval)
|
|
defer ticker.Stop()
|
|
|
|
h.logger.Info("Started periodic health check", zap.Duration("interval", interval))
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
h.logger.Info("Stopped periodic health check")
|
|
return
|
|
case <-ticker.C:
|
|
h.performPeriodicCheck(ctx)
|
|
}
|
|
}
|
|
}
|
|
|
|
// performPeriodicCheck 执行定期检查
|
|
func (h *HealthChecker) performPeriodicCheck(ctx context.Context) {
|
|
overall := h.GetOverallStatus(ctx)
|
|
|
|
h.logger.Info("Periodic health check completed",
|
|
zap.String("overall_status", overall.Status),
|
|
zap.String("message", overall.Message),
|
|
zap.Int64("response_time_ms", overall.ResponseTime))
|
|
|
|
// 如果有服务下线,记录警告
|
|
if overall.Status != "UP" {
|
|
h.logger.Warn("Some services are not healthy",
|
|
zap.String("status", overall.Status),
|
|
zap.Any("details", overall.Details))
|
|
}
|
|
}
|