package tracing import ( "context" "fmt" "sync" "time" "github.com/gin-gonic/gin" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/codes" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" "go.opentelemetry.io/otel/sdk/resource" sdktrace "go.opentelemetry.io/otel/sdk/trace" "go.opentelemetry.io/otel/trace" "go.uber.org/zap" ) // TracerConfig 追踪器配置 type TracerConfig struct { ServiceName string ServiceVersion string Environment string Endpoint string SampleRate float64 Enabled bool } // DefaultTracerConfig 默认追踪器配置 func DefaultTracerConfig() TracerConfig { return TracerConfig{ ServiceName: "tyapi-server", ServiceVersion: "1.0.0", Environment: "development", Endpoint: "http://localhost:4317", SampleRate: 0.1, Enabled: true, } } // Tracer 链路追踪器 type Tracer struct { config TracerConfig logger *zap.Logger provider *sdktrace.TracerProvider tracer trace.Tracer mutex sync.RWMutex initialized bool shutdown func(context.Context) error } // NewTracer 创建链路追踪器 func NewTracer(config TracerConfig, logger *zap.Logger) *Tracer { return &Tracer{ config: config, logger: logger, } } // Initialize 初始化追踪器 func (t *Tracer) Initialize(ctx context.Context) error { t.mutex.Lock() defer t.mutex.Unlock() if t.initialized { return nil } if !t.config.Enabled { t.logger.Info("Tracing is disabled") return nil } // 创建资源 res, err := resource.New(ctx, resource.WithAttributes( attribute.String("service.name", t.config.ServiceName), attribute.String("service.version", t.config.ServiceVersion), attribute.String("environment", t.config.Environment), ), ) if err != nil { return fmt.Errorf("failed to create resource: %w", err) } // 创建采样器 sampler := sdktrace.TraceIDRatioBased(t.config.SampleRate) // 创建导出器 var spanProcessor sdktrace.SpanProcessor if t.config.Endpoint != "" { // 使用OTLP gRPC导出器(支持Jaeger、Tempo等) exporter, err := otlptracegrpc.New(ctx, otlptracegrpc.WithEndpoint(t.config.Endpoint), otlptracegrpc.WithInsecure(), // 开发环境使用,生产环境应配置TLS otlptracegrpc.WithTimeout(time.Second*10), otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{ Enabled: true, InitialInterval: time.Millisecond * 100, MaxInterval: time.Second * 5, MaxElapsedTime: time.Second * 30, }), ) if err != nil { t.logger.Warn("Failed to create OTLP exporter, using noop exporter", zap.Error(err), zap.String("endpoint", t.config.Endpoint)) spanProcessor = sdktrace.NewSimpleSpanProcessor(&noopExporter{}) } else { // 在生产环境中使用批处理器以提高性能 spanProcessor = sdktrace.NewBatchSpanProcessor(exporter, sdktrace.WithBatchTimeout(time.Second*5), sdktrace.WithMaxExportBatchSize(512), sdktrace.WithMaxQueueSize(2048), sdktrace.WithExportTimeout(time.Second*30), ) t.logger.Info("OTLP exporter initialized successfully", zap.String("endpoint", t.config.Endpoint)) } } else { // 如果没有配置端点,使用空导出器 spanProcessor = sdktrace.NewSimpleSpanProcessor(&noopExporter{}) t.logger.Info("Using noop exporter (no endpoint configured)") } // 创建TracerProvider provider := sdktrace.NewTracerProvider( sdktrace.WithResource(res), sdktrace.WithSampler(sampler), sdktrace.WithSpanProcessor(spanProcessor), ) // 设置全局TracerProvider otel.SetTracerProvider(provider) // 创建Tracer tracer := provider.Tracer(t.config.ServiceName) t.provider = provider t.tracer = tracer t.shutdown = func(ctx context.Context) error { return provider.Shutdown(ctx) } t.initialized = true t.logger.Info("Tracing initialized successfully", zap.String("service", t.config.ServiceName), zap.Float64("sample_rate", t.config.SampleRate)) return nil } // StartSpan 开始一个新的span func (t *Tracer) StartSpan(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) { if !t.initialized || !t.config.Enabled { return ctx, trace.SpanFromContext(ctx) } return t.tracer.Start(ctx, name, opts...) } // StartHTTPSpan 开始一个HTTP span func (t *Tracer) StartHTTPSpan(ctx context.Context, method, path string) (context.Context, trace.Span) { spanName := fmt.Sprintf("%s %s", method, path) // 检查是否已有错误标记,如果有则使用"error"作为操作名 // 这样可以匹配Jaeger采样配置中的错误操作策略 if ctx.Value("otel_error_request") != nil { spanName = "error" } ctx, span := t.StartSpan(ctx, spanName, trace.WithSpanKind(trace.SpanKindServer), trace.WithAttributes( attribute.String("http.method", method), attribute.String("http.route", path), ), ) // 保存原始操作名,以便在错误发生时可以更新 if ctx.Value("otel_error_request") == nil { ctx = context.WithValue(ctx, "otel_original_operation", spanName) } return ctx, span } // StartDBSpan 开始一个数据库span func (t *Tracer) StartDBSpan(ctx context.Context, operation, table string) (context.Context, trace.Span) { spanName := fmt.Sprintf("db.%s.%s", operation, table) return t.StartSpan(ctx, spanName, trace.WithSpanKind(trace.SpanKindClient), trace.WithAttributes( attribute.String("db.operation", operation), attribute.String("db.table", table), attribute.String("db.system", "postgresql"), ), ) } // StartCacheSpan 开始一个缓存span func (t *Tracer) StartCacheSpan(ctx context.Context, operation, key string) (context.Context, trace.Span) { spanName := fmt.Sprintf("cache.%s", operation) return t.StartSpan(ctx, spanName, trace.WithSpanKind(trace.SpanKindClient), trace.WithAttributes( attribute.String("cache.operation", operation), attribute.String("cache.system", "redis"), ), ) } // StartExternalAPISpan 开始一个外部API调用span func (t *Tracer) StartExternalAPISpan(ctx context.Context, service, operation string) (context.Context, trace.Span) { spanName := fmt.Sprintf("api.%s.%s", service, operation) return t.StartSpan(ctx, spanName, trace.WithSpanKind(trace.SpanKindClient), trace.WithAttributes( attribute.String("api.service", service), attribute.String("api.operation", operation), ), ) } // AddSpanAttributes 添加span属性 func (t *Tracer) AddSpanAttributes(span trace.Span, attrs ...attribute.KeyValue) { if span.IsRecording() { span.SetAttributes(attrs...) } } // SetSpanError 设置span错误 func (t *Tracer) SetSpanError(span trace.Span, err error) { if span.IsRecording() { span.SetStatus(codes.Error, err.Error()) span.RecordError(err) // 将span操作名更新为"error",以匹配Jaeger采样配置 // 注意:这是一种变通方法,因为OpenTelemetry不支持直接更改span名称 // 我们通过添加特殊属性来标识这是一个错误span span.SetAttributes( attribute.String("error.operation", "true"), attribute.String("operation.type", "error"), ) // 记录错误日志,包含trace ID便于关联 if t.logger != nil { ctx := trace.ContextWithSpan(context.Background(), span) t.logger.Error("操作发生错误", zap.Error(err), zap.String("trace_id", t.GetTraceID(ctx)), zap.String("span_id", t.GetSpanID(ctx)), ) } } } // SetSpanSuccess 设置span成功 func (t *Tracer) SetSpanSuccess(span trace.Span) { if span.IsRecording() { span.SetStatus(codes.Ok, "success") } } // SetHTTPStatus 根据HTTP状态码设置span状态 func (t *Tracer) SetHTTPStatus(span trace.Span, statusCode int) { if !span.IsRecording() { return } // 添加HTTP状态码属性 span.SetAttributes(attribute.Int("http.status_code", statusCode)) // 对于4xx和5xx错误,标记为错误并应用错误采样策略 if statusCode >= 400 { errorMsg := fmt.Sprintf("HTTP %d", statusCode) span.SetStatus(codes.Error, errorMsg) // 添加错误操作标记,以匹配Jaeger采样配置 span.SetAttributes( attribute.String("error.operation", "true"), attribute.String("operation.type", "error"), ) // 记录HTTP错误 if t.logger != nil { ctx := trace.ContextWithSpan(context.Background(), span) t.logger.Warn("HTTP请求错误", zap.Int("status_code", statusCode), zap.String("trace_id", t.GetTraceID(ctx)), zap.String("span_id", t.GetSpanID(ctx)), ) } } else { span.SetStatus(codes.Ok, "success") } } // GetTraceID 获取当前上下文的trace ID func (t *Tracer) GetTraceID(ctx context.Context) string { span := trace.SpanFromContext(ctx) if span.SpanContext().IsValid() { return span.SpanContext().TraceID().String() } return "" } // GetSpanID 获取当前上下文的span ID func (t *Tracer) GetSpanID(ctx context.Context) string { span := trace.SpanFromContext(ctx) if span.SpanContext().IsValid() { return span.SpanContext().SpanID().String() } return "" } // IsTracing 检查是否正在追踪 func (t *Tracer) IsTracing(ctx context.Context) bool { span := trace.SpanFromContext(ctx) return span.SpanContext().IsValid() && span.IsRecording() } // Shutdown 关闭追踪器 func (t *Tracer) Shutdown(ctx context.Context) error { t.mutex.Lock() defer t.mutex.Unlock() if !t.initialized || t.shutdown == nil { return nil } err := t.shutdown(ctx) if err != nil { t.logger.Error("Failed to shutdown tracer", zap.Error(err)) return err } t.initialized = false t.logger.Info("Tracer shutdown successfully") return nil } // GetStats 获取追踪统计信息 func (t *Tracer) GetStats() map[string]interface{} { t.mutex.RLock() defer t.mutex.RUnlock() return map[string]interface{}{ "initialized": t.initialized, "enabled": t.config.Enabled, "service_name": t.config.ServiceName, "service_version": t.config.ServiceVersion, "environment": t.config.Environment, "sample_rate": t.config.SampleRate, "endpoint": t.config.Endpoint, } } // 实现Service接口 // Name 返回服务名称 func (t *Tracer) Name() string { return "tracer" } // HealthCheck 健康检查 func (t *Tracer) HealthCheck(ctx context.Context) error { if !t.config.Enabled { return nil } if !t.initialized { return fmt.Errorf("tracer not initialized") } return nil } // noopExporter 简单的无操作导出器(用于演示) type noopExporter struct{} func (e *noopExporter) ExportSpans(ctx context.Context, spans []sdktrace.ReadOnlySpan) error { // 在实际应用中,这里应该将spans发送到Jaeger或其他追踪系统 return nil } func (e *noopExporter) Shutdown(ctx context.Context) error { return nil } // TraceMiddleware 追踪中间件工厂 func (t *Tracer) TraceMiddleware() gin.HandlerFunc { return func(c *gin.Context) { if !t.initialized || !t.config.Enabled { c.Next() return } // 开始HTTP span ctx, span := t.StartHTTPSpan(c.Request.Context(), c.Request.Method, c.FullPath()) defer span.End() // 将trace ID添加到响应头 traceID := t.GetTraceID(ctx) if traceID != "" { c.Header("X-Trace-ID", traceID) } // 将span上下文存储到gin上下文 c.Request = c.Request.WithContext(ctx) // 处理请求 c.Next() // 设置HTTP状态码 t.SetHTTPStatus(span, c.Writer.Status()) // 添加响应信息 t.AddSpanAttributes(span, attribute.Int("http.status_code", c.Writer.Status()), attribute.Int("http.response_size", c.Writer.Size()), ) // 添加错误信息 if len(c.Errors) > 0 { errMsg := c.Errors.String() t.SetSpanError(span, fmt.Errorf(errMsg)) } } } // GinTraceMiddleware 兼容旧的方法名,保持向后兼容 func (t *Tracer) GinTraceMiddleware() gin.HandlerFunc { return t.TraceMiddleware() } // WithTracing 添加追踪到上下文的辅助函数 func WithTracing(ctx context.Context, tracer *Tracer, name string) (context.Context, trace.Span) { return tracer.StartSpan(ctx, name) } // TraceFunction 追踪函数执行的辅助函数 func (t *Tracer) TraceFunction(ctx context.Context, name string, fn func(context.Context) error) error { ctx, span := t.StartSpan(ctx, name) defer span.End() err := fn(ctx) if err != nil { t.SetSpanError(span, err) } else { t.SetSpanSuccess(span) } return err } // TraceFunctionWithResult 追踪带返回值的函数执行 func TraceFunctionWithResult[T any](ctx context.Context, tracer *Tracer, name string, fn func(context.Context) (T, error)) (T, error) { ctx, span := tracer.StartSpan(ctx, name) defer span.End() result, err := fn(ctx) if err != nil { tracer.SetSpanError(span, err) } else { tracer.SetSpanSuccess(span) } return result, err }