475 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			475 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package tracing
 | ||
| 
 | ||
import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/gin-gonic/gin"
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
	"go.opentelemetry.io/otel/sdk/resource"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	"go.opentelemetry.io/otel/trace"
	"go.uber.org/zap"
)
 | ||
| 
 | ||
// TracerConfig holds the settings used to build a Tracer.
type TracerConfig struct {
	// Identity of the service; reported as resource attributes.
	ServiceName    string
	ServiceVersion string
	Environment    string

	// Endpoint is the OTLP collector address. An empty value makes the
	// tracer fall back to an exporter that discards spans.
	Endpoint string

	// SampleRate is the ratio handed to the trace-ID-ratio sampler.
	SampleRate float64

	// Enabled switches tracing on or off as a whole.
	Enabled bool
}
 | ||
| 
 | ||
| // DefaultTracerConfig 默认追踪器配置
 | ||
| func DefaultTracerConfig() TracerConfig {
 | ||
| 	return TracerConfig{
 | ||
| 		ServiceName:    "tyapi-server",
 | ||
| 		ServiceVersion: "1.0.0",
 | ||
| 		Environment:    "development",
 | ||
| 		Endpoint:       "http://localhost:4317",
 | ||
| 		SampleRate:     0.1,
 | ||
| 		Enabled:        true,
 | ||
| 	}
 | ||
| }
 | ||
| 
 | ||
| // Tracer 链路追踪器
 | ||
| type Tracer struct {
 | ||
| 	config      TracerConfig
 | ||
| 	logger      *zap.Logger
 | ||
| 	provider    *sdktrace.TracerProvider
 | ||
| 	tracer      trace.Tracer
 | ||
| 	mutex       sync.RWMutex
 | ||
| 	initialized bool
 | ||
| 	shutdown    func(context.Context) error
 | ||
| }
 | ||
| 
 | ||
| // NewTracer 创建链路追踪器
 | ||
| func NewTracer(config TracerConfig, logger *zap.Logger) *Tracer {
 | ||
| 	return &Tracer{
 | ||
| 		config: config,
 | ||
| 		logger: logger,
 | ||
| 	}
 | ||
| }
 | ||
| 
 | ||
| // Initialize 初始化追踪器
 | ||
| func (t *Tracer) Initialize(ctx context.Context) error {
 | ||
| 	t.mutex.Lock()
 | ||
| 	defer t.mutex.Unlock()
 | ||
| 
 | ||
| 	if t.initialized {
 | ||
| 		return nil
 | ||
| 	}
 | ||
| 
 | ||
| 	if !t.config.Enabled {
 | ||
| 		t.logger.Info("Tracing is disabled")
 | ||
| 		return nil
 | ||
| 	}
 | ||
| 
 | ||
| 	// 创建资源
 | ||
| 	res, err := resource.New(ctx,
 | ||
| 		resource.WithAttributes(
 | ||
| 			attribute.String("service.name", t.config.ServiceName),
 | ||
| 			attribute.String("service.version", t.config.ServiceVersion),
 | ||
| 			attribute.String("environment", t.config.Environment),
 | ||
| 		),
 | ||
| 	)
 | ||
| 	if err != nil {
 | ||
| 		return fmt.Errorf("failed to create resource: %w", err)
 | ||
| 	}
 | ||
| 
 | ||
| 	// 创建采样器
 | ||
| 	sampler := sdktrace.TraceIDRatioBased(t.config.SampleRate)
 | ||
| 
 | ||
| 	// 创建导出器
 | ||
| 	var spanProcessor sdktrace.SpanProcessor
 | ||
| 	if t.config.Endpoint != "" {
 | ||
| 		// 使用OTLP gRPC导出器(支持Jaeger、Tempo等)
 | ||
| 		exporter, err := otlptracegrpc.New(ctx,
 | ||
| 			otlptracegrpc.WithEndpoint(t.config.Endpoint),
 | ||
| 			otlptracegrpc.WithInsecure(), // 开发环境使用,生产环境应配置TLS
 | ||
| 			otlptracegrpc.WithTimeout(time.Second*10),
 | ||
| 			otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{
 | ||
| 				Enabled:         true,
 | ||
| 				InitialInterval: time.Millisecond * 100,
 | ||
| 				MaxInterval:     time.Second * 5,
 | ||
| 				MaxElapsedTime:  time.Second * 30,
 | ||
| 			}),
 | ||
| 		)
 | ||
| 		if err != nil {
 | ||
| 			t.logger.Warn("Failed to create OTLP exporter, using noop exporter",
 | ||
| 				zap.Error(err),
 | ||
| 				zap.String("endpoint", t.config.Endpoint))
 | ||
| 			spanProcessor = sdktrace.NewSimpleSpanProcessor(&noopExporter{})
 | ||
| 		} else {
 | ||
| 			// 在生产环境中使用批处理器以提高性能
 | ||
| 			spanProcessor = sdktrace.NewBatchSpanProcessor(exporter,
 | ||
| 				sdktrace.WithBatchTimeout(time.Second*5),
 | ||
| 				sdktrace.WithMaxExportBatchSize(512),
 | ||
| 				sdktrace.WithMaxQueueSize(2048),
 | ||
| 				sdktrace.WithExportTimeout(time.Second*30),
 | ||
| 			)
 | ||
| 			t.logger.Info("OTLP exporter initialized successfully",
 | ||
| 				zap.String("endpoint", t.config.Endpoint))
 | ||
| 		}
 | ||
| 	} else {
 | ||
| 		// 如果没有配置端点,使用空导出器
 | ||
| 		spanProcessor = sdktrace.NewSimpleSpanProcessor(&noopExporter{})
 | ||
| 		t.logger.Info("Using noop exporter (no endpoint configured)")
 | ||
| 	}
 | ||
| 
 | ||
| 	// 创建TracerProvider
 | ||
| 	provider := sdktrace.NewTracerProvider(
 | ||
| 		sdktrace.WithResource(res),
 | ||
| 		sdktrace.WithSampler(sampler),
 | ||
| 		sdktrace.WithSpanProcessor(spanProcessor),
 | ||
| 	)
 | ||
| 
 | ||
| 	// 设置全局TracerProvider
 | ||
| 	otel.SetTracerProvider(provider)
 | ||
| 
 | ||
| 	// 创建Tracer
 | ||
| 	tracer := provider.Tracer(t.config.ServiceName)
 | ||
| 
 | ||
| 	t.provider = provider
 | ||
| 	t.tracer = tracer
 | ||
| 	t.shutdown = func(ctx context.Context) error {
 | ||
| 		return provider.Shutdown(ctx)
 | ||
| 	}
 | ||
| 	t.initialized = true
 | ||
| 
 | ||
| 	t.logger.Info("Tracing initialized successfully",
 | ||
| 		zap.String("service", t.config.ServiceName),
 | ||
| 		zap.Float64("sample_rate", t.config.SampleRate))
 | ||
| 
 | ||
| 	return nil
 | ||
| }
 | ||
| 
 | ||
| // StartSpan 开始一个新的span
 | ||
| func (t *Tracer) StartSpan(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) {
 | ||
| 	if !t.initialized || !t.config.Enabled {
 | ||
| 		return ctx, trace.SpanFromContext(ctx)
 | ||
| 	}
 | ||
| 
 | ||
| 	return t.tracer.Start(ctx, name, opts...)
 | ||
| }
 | ||
| 
 | ||
| // StartHTTPSpan 开始一个HTTP span
 | ||
| func (t *Tracer) StartHTTPSpan(ctx context.Context, method, path string) (context.Context, trace.Span) {
 | ||
| 	spanName := fmt.Sprintf("%s %s", method, path)
 | ||
| 
 | ||
| 	// 检查是否已有错误标记,如果有则使用"error"作为操作名
 | ||
| 	// 这样可以匹配Jaeger采样配置中的错误操作策略
 | ||
| 	if ctx.Value("otel_error_request") != nil {
 | ||
| 		spanName = "error"
 | ||
| 	}
 | ||
| 
 | ||
| 	ctx, span := t.StartSpan(ctx, spanName,
 | ||
| 		trace.WithSpanKind(trace.SpanKindServer),
 | ||
| 		trace.WithAttributes(
 | ||
| 			attribute.String("http.method", method),
 | ||
| 			attribute.String("http.route", path),
 | ||
| 		),
 | ||
| 	)
 | ||
| 
 | ||
| 	// 保存原始操作名,以便在错误发生时可以更新
 | ||
| 	if ctx.Value("otel_error_request") == nil {
 | ||
| 		ctx = context.WithValue(ctx, "otel_original_operation", spanName)
 | ||
| 	}
 | ||
| 
 | ||
| 	return ctx, span
 | ||
| }
 | ||
| 
 | ||
| // StartDBSpan 开始一个数据库span
 | ||
| func (t *Tracer) StartDBSpan(ctx context.Context, operation, table string) (context.Context, trace.Span) {
 | ||
| 	spanName := fmt.Sprintf("db.%s.%s", operation, table)
 | ||
| 
 | ||
| 	return t.StartSpan(ctx, spanName,
 | ||
| 		trace.WithSpanKind(trace.SpanKindClient),
 | ||
| 		trace.WithAttributes(
 | ||
| 			attribute.String("db.operation", operation),
 | ||
| 			attribute.String("db.table", table),
 | ||
| 			attribute.String("db.system", "postgresql"),
 | ||
| 		),
 | ||
| 	)
 | ||
| }
 | ||
| 
 | ||
| // StartCacheSpan 开始一个缓存span
 | ||
| func (t *Tracer) StartCacheSpan(ctx context.Context, operation, key string) (context.Context, trace.Span) {
 | ||
| 	spanName := fmt.Sprintf("cache.%s", operation)
 | ||
| 
 | ||
| 	return t.StartSpan(ctx, spanName,
 | ||
| 		trace.WithSpanKind(trace.SpanKindClient),
 | ||
| 		trace.WithAttributes(
 | ||
| 			attribute.String("cache.operation", operation),
 | ||
| 			attribute.String("cache.system", "redis"),
 | ||
| 		),
 | ||
| 	)
 | ||
| }
 | ||
| 
 | ||
| // StartExternalAPISpan 开始一个外部API调用span
 | ||
| func (t *Tracer) StartExternalAPISpan(ctx context.Context, service, operation string) (context.Context, trace.Span) {
 | ||
| 	spanName := fmt.Sprintf("api.%s.%s", service, operation)
 | ||
| 
 | ||
| 	return t.StartSpan(ctx, spanName,
 | ||
| 		trace.WithSpanKind(trace.SpanKindClient),
 | ||
| 		trace.WithAttributes(
 | ||
| 			attribute.String("api.service", service),
 | ||
| 			attribute.String("api.operation", operation),
 | ||
| 		),
 | ||
| 	)
 | ||
| }
 | ||
| 
 | ||
| // AddSpanAttributes 添加span属性
 | ||
| func (t *Tracer) AddSpanAttributes(span trace.Span, attrs ...attribute.KeyValue) {
 | ||
| 	if span.IsRecording() {
 | ||
| 		span.SetAttributes(attrs...)
 | ||
| 	}
 | ||
| }
 | ||
| 
 | ||
| // SetSpanError 设置span错误
 | ||
| func (t *Tracer) SetSpanError(span trace.Span, err error) {
 | ||
| 	if span.IsRecording() {
 | ||
| 		span.SetStatus(codes.Error, err.Error())
 | ||
| 		span.RecordError(err)
 | ||
| 
 | ||
| 		// 将span操作名更新为"error",以匹配Jaeger采样配置
 | ||
| 		// 注意:这是一种变通方法,因为OpenTelemetry不支持直接更改span名称
 | ||
| 		// 我们通过添加特殊属性来标识这是一个错误span
 | ||
| 		span.SetAttributes(
 | ||
| 			attribute.String("error.operation", "true"),
 | ||
| 			attribute.String("operation.type", "error"),
 | ||
| 		)
 | ||
| 
 | ||
| 		// 记录错误日志,包含trace ID便于关联
 | ||
| 		if t.logger != nil {
 | ||
| 			ctx := trace.ContextWithSpan(context.Background(), span)
 | ||
| 			t.logger.Error("操作发生错误",
 | ||
| 				zap.Error(err),
 | ||
| 				zap.String("trace_id", t.GetTraceID(ctx)),
 | ||
| 				zap.String("span_id", t.GetSpanID(ctx)),
 | ||
| 			)
 | ||
| 		}
 | ||
| 	}
 | ||
| }
 | ||
| 
 | ||
| // SetSpanSuccess 设置span成功
 | ||
| func (t *Tracer) SetSpanSuccess(span trace.Span) {
 | ||
| 	if span.IsRecording() {
 | ||
| 		span.SetStatus(codes.Ok, "success")
 | ||
| 	}
 | ||
| }
 | ||
| 
 | ||
| // SetHTTPStatus 根据HTTP状态码设置span状态
 | ||
| func (t *Tracer) SetHTTPStatus(span trace.Span, statusCode int) {
 | ||
| 	if !span.IsRecording() {
 | ||
| 		return
 | ||
| 	}
 | ||
| 
 | ||
| 	// 添加HTTP状态码属性
 | ||
| 	span.SetAttributes(attribute.Int("http.status_code", statusCode))
 | ||
| 
 | ||
| 	// 对于4xx和5xx错误,标记为错误并应用错误采样策略
 | ||
| 	if statusCode >= 400 {
 | ||
| 		errorMsg := fmt.Sprintf("HTTP %d", statusCode)
 | ||
| 		span.SetStatus(codes.Error, errorMsg)
 | ||
| 
 | ||
| 		// 添加错误操作标记,以匹配Jaeger采样配置
 | ||
| 		span.SetAttributes(
 | ||
| 			attribute.String("error.operation", "true"),
 | ||
| 			attribute.String("operation.type", "error"),
 | ||
| 		)
 | ||
| 
 | ||
| 		// 记录HTTP错误
 | ||
| 		if t.logger != nil {
 | ||
| 			ctx := trace.ContextWithSpan(context.Background(), span)
 | ||
| 			t.logger.Warn("HTTP请求错误",
 | ||
| 				zap.Int("status_code", statusCode),
 | ||
| 				zap.String("trace_id", t.GetTraceID(ctx)),
 | ||
| 				zap.String("span_id", t.GetSpanID(ctx)),
 | ||
| 			)
 | ||
| 		}
 | ||
| 	} else {
 | ||
| 		span.SetStatus(codes.Ok, "success")
 | ||
| 	}
 | ||
| }
 | ||
| 
 | ||
| // GetTraceID 获取当前上下文的trace ID
 | ||
| func (t *Tracer) GetTraceID(ctx context.Context) string {
 | ||
| 	span := trace.SpanFromContext(ctx)
 | ||
| 	if span.SpanContext().IsValid() {
 | ||
| 		return span.SpanContext().TraceID().String()
 | ||
| 	}
 | ||
| 	return ""
 | ||
| }
 | ||
| 
 | ||
| // GetSpanID 获取当前上下文的span ID
 | ||
| func (t *Tracer) GetSpanID(ctx context.Context) string {
 | ||
| 	span := trace.SpanFromContext(ctx)
 | ||
| 	if span.SpanContext().IsValid() {
 | ||
| 		return span.SpanContext().SpanID().String()
 | ||
| 	}
 | ||
| 	return ""
 | ||
| }
 | ||
| 
 | ||
| // IsTracing 检查是否正在追踪
 | ||
| func (t *Tracer) IsTracing(ctx context.Context) bool {
 | ||
| 	span := trace.SpanFromContext(ctx)
 | ||
| 	return span.SpanContext().IsValid() && span.IsRecording()
 | ||
| }
 | ||
| 
 | ||
| // Shutdown 关闭追踪器
 | ||
| func (t *Tracer) Shutdown(ctx context.Context) error {
 | ||
| 	t.mutex.Lock()
 | ||
| 	defer t.mutex.Unlock()
 | ||
| 
 | ||
| 	if !t.initialized || t.shutdown == nil {
 | ||
| 		return nil
 | ||
| 	}
 | ||
| 
 | ||
| 	err := t.shutdown(ctx)
 | ||
| 	if err != nil {
 | ||
| 		t.logger.Error("Failed to shutdown tracer", zap.Error(err))
 | ||
| 		return err
 | ||
| 	}
 | ||
| 
 | ||
| 	t.initialized = false
 | ||
| 	t.logger.Info("Tracer shutdown successfully")
 | ||
| 	return nil
 | ||
| }
 | ||
| 
 | ||
| // GetStats 获取追踪统计信息
 | ||
| func (t *Tracer) GetStats() map[string]interface{} {
 | ||
| 	t.mutex.RLock()
 | ||
| 	defer t.mutex.RUnlock()
 | ||
| 
 | ||
| 	return map[string]interface{}{
 | ||
| 		"initialized":     t.initialized,
 | ||
| 		"enabled":         t.config.Enabled,
 | ||
| 		"service_name":    t.config.ServiceName,
 | ||
| 		"service_version": t.config.ServiceVersion,
 | ||
| 		"environment":     t.config.Environment,
 | ||
| 		"sample_rate":     t.config.SampleRate,
 | ||
| 		"endpoint":        t.config.Endpoint,
 | ||
| 	}
 | ||
| }
 | ||
| 
 | ||
// Service interface implementation.
 | ||
| 
 | ||
| // Name 返回服务名称
 | ||
| func (t *Tracer) Name() string {
 | ||
| 	return "tracer"
 | ||
| }
 | ||
| 
 | ||
| // HealthCheck 健康检查
 | ||
| func (t *Tracer) HealthCheck(ctx context.Context) error {
 | ||
| 	if !t.config.Enabled {
 | ||
| 		return nil
 | ||
| 	}
 | ||
| 
 | ||
| 	if !t.initialized {
 | ||
| 		return fmt.Errorf("tracer not initialized")
 | ||
| 	}
 | ||
| 
 | ||
| 	return nil
 | ||
| }
 | ||
| 
 | ||
| // noopExporter 简单的无操作导出器(用于演示)
 | ||
| type noopExporter struct{}
 | ||
| 
 | ||
| func (e *noopExporter) ExportSpans(ctx context.Context, spans []sdktrace.ReadOnlySpan) error {
 | ||
| 	// 在实际应用中,这里应该将spans发送到Jaeger或其他追踪系统
 | ||
| 	return nil
 | ||
| }
 | ||
| 
 | ||
| func (e *noopExporter) Shutdown(ctx context.Context) error {
 | ||
| 	return nil
 | ||
| }
 | ||
| 
 | ||
| // TraceMiddleware 追踪中间件工厂
 | ||
| func (t *Tracer) TraceMiddleware() gin.HandlerFunc {
 | ||
| 	return func(c *gin.Context) {
 | ||
| 		if !t.initialized || !t.config.Enabled {
 | ||
| 			c.Next()
 | ||
| 			return
 | ||
| 		}
 | ||
| 
 | ||
| 		// 开始HTTP span
 | ||
| 		ctx, span := t.StartHTTPSpan(c.Request.Context(), c.Request.Method, c.FullPath())
 | ||
| 		defer span.End()
 | ||
| 
 | ||
| 		// 将trace ID添加到响应头
 | ||
| 		traceID := t.GetTraceID(ctx)
 | ||
| 		if traceID != "" {
 | ||
| 			c.Header("X-Trace-ID", traceID)
 | ||
| 		}
 | ||
| 
 | ||
| 		// 将span上下文存储到gin上下文
 | ||
| 		c.Request = c.Request.WithContext(ctx)
 | ||
| 
 | ||
| 		// 处理请求
 | ||
| 		c.Next()
 | ||
| 
 | ||
| 		// 设置HTTP状态码
 | ||
| 		t.SetHTTPStatus(span, c.Writer.Status())
 | ||
| 
 | ||
| 		// 添加响应信息
 | ||
| 		t.AddSpanAttributes(span,
 | ||
| 			attribute.Int("http.status_code", c.Writer.Status()),
 | ||
| 			attribute.Int("http.response_size", c.Writer.Size()),
 | ||
| 		)
 | ||
| 
 | ||
| 		// 添加错误信息
 | ||
| 		if len(c.Errors) > 0 {
 | ||
| 			errMsg := c.Errors.String()
 | ||
| 			t.SetSpanError(span, fmt.Errorf(errMsg))
 | ||
| 		}
 | ||
| 	}
 | ||
| }
 | ||
| 
 | ||
| // GinTraceMiddleware 兼容旧的方法名,保持向后兼容
 | ||
| func (t *Tracer) GinTraceMiddleware() gin.HandlerFunc {
 | ||
| 	return t.TraceMiddleware()
 | ||
| }
 | ||
| 
 | ||
| // WithTracing 添加追踪到上下文的辅助函数
 | ||
| func WithTracing(ctx context.Context, tracer *Tracer, name string) (context.Context, trace.Span) {
 | ||
| 	return tracer.StartSpan(ctx, name)
 | ||
| }
 | ||
| 
 | ||
| // TraceFunction 追踪函数执行的辅助函数
 | ||
| func (t *Tracer) TraceFunction(ctx context.Context, name string, fn func(context.Context) error) error {
 | ||
| 	ctx, span := t.StartSpan(ctx, name)
 | ||
| 	defer span.End()
 | ||
| 
 | ||
| 	err := fn(ctx)
 | ||
| 	if err != nil {
 | ||
| 		t.SetSpanError(span, err)
 | ||
| 	} else {
 | ||
| 		t.SetSpanSuccess(span)
 | ||
| 	}
 | ||
| 
 | ||
| 	return err
 | ||
| }
 | ||
| 
 | ||
| // TraceFunctionWithResult 追踪带返回值的函数执行
 | ||
| func TraceFunctionWithResult[T any](ctx context.Context, tracer *Tracer, name string, fn func(context.Context) (T, error)) (T, error) {
 | ||
| 	ctx, span := tracer.StartSpan(ctx, name)
 | ||
| 	defer span.End()
 | ||
| 
 | ||
| 	result, err := fn(ctx)
 | ||
| 	if err != nil {
 | ||
| 		tracer.SetSpanError(span, err)
 | ||
| 	} else {
 | ||
| 		tracer.SetSpanSuccess(span)
 | ||
| 	}
 | ||
| 
 | ||
| 	return result, err
 | ||
| }
 |