Initial commit: Basic project structure and dependencies
This commit is contained in:
282
internal/shared/health/health_checker.go
Normal file
282
internal/shared/health/health_checker.go
Normal file
@@ -0,0 +1,282 @@
|
||||
package health
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"tyapi-server/internal/shared/interfaces"
|
||||
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
// HealthChecker 健康检查器实现
|
||||
type HealthChecker struct {
|
||||
services map[string]interfaces.Service
|
||||
cache map[string]*interfaces.HealthStatus
|
||||
cacheTTL time.Duration
|
||||
mutex sync.RWMutex
|
||||
logger *zap.Logger
|
||||
}
|
||||
|
||||
// NewHealthChecker 创建健康检查器
|
||||
func NewHealthChecker(logger *zap.Logger) *HealthChecker {
|
||||
return &HealthChecker{
|
||||
services: make(map[string]interfaces.Service),
|
||||
cache: make(map[string]*interfaces.HealthStatus),
|
||||
cacheTTL: 30 * time.Second, // 缓存30秒
|
||||
logger: logger,
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterService 注册服务
|
||||
func (h *HealthChecker) RegisterService(service interfaces.Service) {
|
||||
h.mutex.Lock()
|
||||
defer h.mutex.Unlock()
|
||||
|
||||
h.services[service.Name()] = service
|
||||
h.logger.Info("Registered service for health check", zap.String("service", service.Name()))
|
||||
}
|
||||
|
||||
// CheckHealth 检查单个服务健康状态
|
||||
func (h *HealthChecker) CheckHealth(ctx context.Context, serviceName string) *interfaces.HealthStatus {
|
||||
h.mutex.RLock()
|
||||
service, exists := h.services[serviceName]
|
||||
if !exists {
|
||||
h.mutex.RUnlock()
|
||||
return &interfaces.HealthStatus{
|
||||
Status: "DOWN",
|
||||
Message: "Service not found",
|
||||
Details: map[string]interface{}{"error": "service not registered"},
|
||||
CheckedAt: time.Now().Unix(),
|
||||
ResponseTime: 0,
|
||||
}
|
||||
}
|
||||
|
||||
// 检查缓存
|
||||
if cached, exists := h.cache[serviceName]; exists {
|
||||
if time.Since(time.Unix(cached.CheckedAt, 0)) < h.cacheTTL {
|
||||
h.mutex.RUnlock()
|
||||
return cached
|
||||
}
|
||||
}
|
||||
h.mutex.RUnlock()
|
||||
|
||||
// 执行健康检查
|
||||
start := time.Now()
|
||||
status := &interfaces.HealthStatus{
|
||||
CheckedAt: start.Unix(),
|
||||
}
|
||||
|
||||
// 设置超时上下文
|
||||
checkCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
err := service.HealthCheck(checkCtx)
|
||||
responseTime := time.Since(start).Milliseconds()
|
||||
status.ResponseTime = responseTime
|
||||
|
||||
if err != nil {
|
||||
status.Status = "DOWN"
|
||||
status.Message = "Health check failed"
|
||||
status.Details = map[string]interface{}{
|
||||
"error": err.Error(),
|
||||
"service_name": serviceName,
|
||||
"check_time": start.Format(time.RFC3339),
|
||||
}
|
||||
h.logger.Warn("Service health check failed",
|
||||
zap.String("service", serviceName),
|
||||
zap.Error(err),
|
||||
zap.Int64("response_time_ms", responseTime))
|
||||
} else {
|
||||
status.Status = "UP"
|
||||
status.Message = "Service is healthy"
|
||||
status.Details = map[string]interface{}{
|
||||
"service_name": serviceName,
|
||||
"check_time": start.Format(time.RFC3339),
|
||||
}
|
||||
h.logger.Debug("Service health check passed",
|
||||
zap.String("service", serviceName),
|
||||
zap.Int64("response_time_ms", responseTime))
|
||||
}
|
||||
|
||||
// 更新缓存
|
||||
h.mutex.Lock()
|
||||
h.cache[serviceName] = status
|
||||
h.mutex.Unlock()
|
||||
|
||||
return status
|
||||
}
|
||||
|
||||
// CheckAllHealth 检查所有服务的健康状态
|
||||
func (h *HealthChecker) CheckAllHealth(ctx context.Context) map[string]*interfaces.HealthStatus {
|
||||
h.mutex.RLock()
|
||||
serviceNames := make([]string, 0, len(h.services))
|
||||
for name := range h.services {
|
||||
serviceNames = append(serviceNames, name)
|
||||
}
|
||||
h.mutex.RUnlock()
|
||||
|
||||
results := make(map[string]*interfaces.HealthStatus)
|
||||
var wg sync.WaitGroup
|
||||
var mutex sync.Mutex
|
||||
|
||||
// 并发检查所有服务
|
||||
for _, serviceName := range serviceNames {
|
||||
wg.Add(1)
|
||||
go func(name string) {
|
||||
defer wg.Done()
|
||||
status := h.CheckHealth(ctx, name)
|
||||
|
||||
mutex.Lock()
|
||||
results[name] = status
|
||||
mutex.Unlock()
|
||||
}(serviceName)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
return results
|
||||
}
|
||||
|
||||
// GetOverallStatus 获取整体健康状态
|
||||
func (h *HealthChecker) GetOverallStatus(ctx context.Context) *interfaces.HealthStatus {
|
||||
allStatus := h.CheckAllHealth(ctx)
|
||||
|
||||
overall := &interfaces.HealthStatus{
|
||||
CheckedAt: time.Now().Unix(),
|
||||
ResponseTime: 0,
|
||||
Details: make(map[string]interface{}),
|
||||
}
|
||||
|
||||
var totalResponseTime int64
|
||||
healthyCount := 0
|
||||
totalCount := len(allStatus)
|
||||
|
||||
for serviceName, status := range allStatus {
|
||||
overall.Details[serviceName] = map[string]interface{}{
|
||||
"status": status.Status,
|
||||
"message": status.Message,
|
||||
"response_time": status.ResponseTime,
|
||||
}
|
||||
|
||||
totalResponseTime += status.ResponseTime
|
||||
if status.Status == "UP" {
|
||||
healthyCount++
|
||||
}
|
||||
}
|
||||
|
||||
if totalCount > 0 {
|
||||
overall.ResponseTime = totalResponseTime / int64(totalCount)
|
||||
}
|
||||
|
||||
// 确定整体状态
|
||||
if healthyCount == totalCount {
|
||||
overall.Status = "UP"
|
||||
overall.Message = "All services are healthy"
|
||||
} else if healthyCount == 0 {
|
||||
overall.Status = "DOWN"
|
||||
overall.Message = "All services are down"
|
||||
} else {
|
||||
overall.Status = "DEGRADED"
|
||||
overall.Message = fmt.Sprintf("%d of %d services are healthy", healthyCount, totalCount)
|
||||
}
|
||||
|
||||
return overall
|
||||
}
|
||||
|
||||
// GetServiceNames 获取所有注册的服务名称
|
||||
func (h *HealthChecker) GetServiceNames() []string {
|
||||
h.mutex.RLock()
|
||||
defer h.mutex.RUnlock()
|
||||
|
||||
names := make([]string, 0, len(h.services))
|
||||
for name := range h.services {
|
||||
names = append(names, name)
|
||||
}
|
||||
return names
|
||||
}
|
||||
|
||||
// RemoveService 移除服务
|
||||
func (h *HealthChecker) RemoveService(serviceName string) {
|
||||
h.mutex.Lock()
|
||||
defer h.mutex.Unlock()
|
||||
|
||||
delete(h.services, serviceName)
|
||||
delete(h.cache, serviceName)
|
||||
|
||||
h.logger.Info("Removed service from health check", zap.String("service", serviceName))
|
||||
}
|
||||
|
||||
// ClearCache 清除缓存
|
||||
func (h *HealthChecker) ClearCache() {
|
||||
h.mutex.Lock()
|
||||
defer h.mutex.Unlock()
|
||||
|
||||
h.cache = make(map[string]*interfaces.HealthStatus)
|
||||
h.logger.Debug("Health check cache cleared")
|
||||
}
|
||||
|
||||
// GetCacheStats 获取缓存统计
|
||||
func (h *HealthChecker) GetCacheStats() map[string]interface{} {
|
||||
h.mutex.RLock()
|
||||
defer h.mutex.RUnlock()
|
||||
|
||||
stats := map[string]interface{}{
|
||||
"total_services": len(h.services),
|
||||
"cached_results": len(h.cache),
|
||||
"cache_ttl_seconds": h.cacheTTL.Seconds(),
|
||||
}
|
||||
|
||||
// 计算缓存命中率
|
||||
if len(h.services) > 0 {
|
||||
hitRate := float64(len(h.cache)) / float64(len(h.services)) * 100
|
||||
stats["cache_hit_rate"] = fmt.Sprintf("%.2f%%", hitRate)
|
||||
}
|
||||
|
||||
return stats
|
||||
}
|
||||
|
||||
// SetCacheTTL 设置缓存TTL
|
||||
func (h *HealthChecker) SetCacheTTL(ttl time.Duration) {
|
||||
h.mutex.Lock()
|
||||
defer h.mutex.Unlock()
|
||||
|
||||
h.cacheTTL = ttl
|
||||
h.logger.Info("Updated health check cache TTL", zap.Duration("ttl", ttl))
|
||||
}
|
||||
|
||||
// StartPeriodicCheck 启动定期健康检查
|
||||
func (h *HealthChecker) StartPeriodicCheck(ctx context.Context, interval time.Duration) {
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
h.logger.Info("Started periodic health check", zap.Duration("interval", interval))
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
h.logger.Info("Stopped periodic health check")
|
||||
return
|
||||
case <-ticker.C:
|
||||
h.performPeriodicCheck(ctx)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// performPeriodicCheck 执行定期检查
|
||||
func (h *HealthChecker) performPeriodicCheck(ctx context.Context) {
|
||||
overall := h.GetOverallStatus(ctx)
|
||||
|
||||
h.logger.Info("Periodic health check completed",
|
||||
zap.String("overall_status", overall.Status),
|
||||
zap.String("message", overall.Message),
|
||||
zap.Int64("response_time_ms", overall.ResponseTime))
|
||||
|
||||
// 如果有服务下线,记录警告
|
||||
if overall.Status != "UP" {
|
||||
h.logger.Warn("Some services are not healthy",
|
||||
zap.String("status", overall.Status),
|
||||
zap.Any("details", overall.Details))
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user