475 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
		
		
			
		
	
	
			475 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
|  | package tracing | |||
|  | 
 | |||
import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/gin-gonic/gin"
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
	"go.opentelemetry.io/otel/sdk/resource"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	"go.opentelemetry.io/otel/trace"
	"go.uber.org/zap"
)
|  | 
 | |||
// TracerConfig holds the settings a Tracer is built from.
type TracerConfig struct {
	// ServiceName is reported as the "service.name" resource attribute and is
	// also used as the name of the tracer obtained from the provider.
	ServiceName string

	// ServiceVersion is reported as the "service.version" resource attribute.
	ServiceVersion string

	// Environment is reported as the "environment" resource attribute.
	Environment string

	// Endpoint is the address handed to the OTLP gRPC exporter. When it is
	// empty, spans are passed to an exporter that discards them.
	Endpoint string

	// SampleRate is the ratio given to the trace-ID-ratio sampler.
	SampleRate float64

	// Enabled switches tracing on. When false, Initialize creates nothing.
	Enabled bool
}
|  | 
 | |||
|  | // DefaultTracerConfig 默认追踪器配置 | |||
|  | func DefaultTracerConfig() TracerConfig { | |||
|  | 	return TracerConfig{ | |||
|  | 		ServiceName:    "tyapi-server", | |||
|  | 		ServiceVersion: "1.0.0", | |||
|  | 		Environment:    "development", | |||
|  | 		Endpoint:       "http://localhost:4317", | |||
|  | 		SampleRate:     0.1, | |||
|  | 		Enabled:        true, | |||
|  | 	} | |||
|  | } | |||
|  | 
 | |||
|  | // Tracer 链路追踪器 | |||
|  | type Tracer struct { | |||
|  | 	config      TracerConfig | |||
|  | 	logger      *zap.Logger | |||
|  | 	provider    *sdktrace.TracerProvider | |||
|  | 	tracer      trace.Tracer | |||
|  | 	mutex       sync.RWMutex | |||
|  | 	initialized bool | |||
|  | 	shutdown    func(context.Context) error | |||
|  | } | |||
|  | 
 | |||
|  | // NewTracer 创建链路追踪器 | |||
|  | func NewTracer(config TracerConfig, logger *zap.Logger) *Tracer { | |||
|  | 	return &Tracer{ | |||
|  | 		config: config, | |||
|  | 		logger: logger, | |||
|  | 	} | |||
|  | } | |||
|  | 
 | |||
|  | // Initialize 初始化追踪器 | |||
|  | func (t *Tracer) Initialize(ctx context.Context) error { | |||
|  | 	t.mutex.Lock() | |||
|  | 	defer t.mutex.Unlock() | |||
|  | 
 | |||
|  | 	if t.initialized { | |||
|  | 		return nil | |||
|  | 	} | |||
|  | 
 | |||
|  | 	if !t.config.Enabled { | |||
|  | 		t.logger.Info("Tracing is disabled") | |||
|  | 		return nil | |||
|  | 	} | |||
|  | 
 | |||
|  | 	// 创建资源 | |||
|  | 	res, err := resource.New(ctx, | |||
|  | 		resource.WithAttributes( | |||
|  | 			attribute.String("service.name", t.config.ServiceName), | |||
|  | 			attribute.String("service.version", t.config.ServiceVersion), | |||
|  | 			attribute.String("environment", t.config.Environment), | |||
|  | 		), | |||
|  | 	) | |||
|  | 	if err != nil { | |||
|  | 		return fmt.Errorf("failed to create resource: %w", err) | |||
|  | 	} | |||
|  | 
 | |||
|  | 	// 创建采样器 | |||
|  | 	sampler := sdktrace.TraceIDRatioBased(t.config.SampleRate) | |||
|  | 
 | |||
|  | 	// 创建导出器 | |||
|  | 	var spanProcessor sdktrace.SpanProcessor | |||
|  | 	if t.config.Endpoint != "" { | |||
|  | 		// 使用OTLP gRPC导出器(支持Jaeger、Tempo等) | |||
|  | 		exporter, err := otlptracegrpc.New(ctx, | |||
|  | 			otlptracegrpc.WithEndpoint(t.config.Endpoint), | |||
|  | 			otlptracegrpc.WithInsecure(), // 开发环境使用,生产环境应配置TLS | |||
|  | 			otlptracegrpc.WithTimeout(time.Second*10), | |||
|  | 			otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{ | |||
|  | 				Enabled:         true, | |||
|  | 				InitialInterval: time.Millisecond * 100, | |||
|  | 				MaxInterval:     time.Second * 5, | |||
|  | 				MaxElapsedTime:  time.Second * 30, | |||
|  | 			}), | |||
|  | 		) | |||
|  | 		if err != nil { | |||
|  | 			t.logger.Warn("Failed to create OTLP exporter, using noop exporter", | |||
|  | 				zap.Error(err), | |||
|  | 				zap.String("endpoint", t.config.Endpoint)) | |||
|  | 			spanProcessor = sdktrace.NewSimpleSpanProcessor(&noopExporter{}) | |||
|  | 		} else { | |||
|  | 			// 在生产环境中使用批处理器以提高性能 | |||
|  | 			spanProcessor = sdktrace.NewBatchSpanProcessor(exporter, | |||
|  | 				sdktrace.WithBatchTimeout(time.Second*5), | |||
|  | 				sdktrace.WithMaxExportBatchSize(512), | |||
|  | 				sdktrace.WithMaxQueueSize(2048), | |||
|  | 				sdktrace.WithExportTimeout(time.Second*30), | |||
|  | 			) | |||
|  | 			t.logger.Info("OTLP exporter initialized successfully", | |||
|  | 				zap.String("endpoint", t.config.Endpoint)) | |||
|  | 		} | |||
|  | 	} else { | |||
|  | 		// 如果没有配置端点,使用空导出器 | |||
|  | 		spanProcessor = sdktrace.NewSimpleSpanProcessor(&noopExporter{}) | |||
|  | 		t.logger.Info("Using noop exporter (no endpoint configured)") | |||
|  | 	} | |||
|  | 
 | |||
|  | 	// 创建TracerProvider | |||
|  | 	provider := sdktrace.NewTracerProvider( | |||
|  | 		sdktrace.WithResource(res), | |||
|  | 		sdktrace.WithSampler(sampler), | |||
|  | 		sdktrace.WithSpanProcessor(spanProcessor), | |||
|  | 	) | |||
|  | 
 | |||
|  | 	// 设置全局TracerProvider | |||
|  | 	otel.SetTracerProvider(provider) | |||
|  | 
 | |||
|  | 	// 创建Tracer | |||
|  | 	tracer := provider.Tracer(t.config.ServiceName) | |||
|  | 
 | |||
|  | 	t.provider = provider | |||
|  | 	t.tracer = tracer | |||
|  | 	t.shutdown = func(ctx context.Context) error { | |||
|  | 		return provider.Shutdown(ctx) | |||
|  | 	} | |||
|  | 	t.initialized = true | |||
|  | 
 | |||
|  | 	t.logger.Info("Tracing initialized successfully", | |||
|  | 		zap.String("service", t.config.ServiceName), | |||
|  | 		zap.Float64("sample_rate", t.config.SampleRate)) | |||
|  | 
 | |||
|  | 	return nil | |||
|  | } | |||
|  | 
 | |||
|  | // StartSpan 开始一个新的span | |||
|  | func (t *Tracer) StartSpan(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) { | |||
|  | 	if !t.initialized || !t.config.Enabled { | |||
|  | 		return ctx, trace.SpanFromContext(ctx) | |||
|  | 	} | |||
|  | 
 | |||
|  | 	return t.tracer.Start(ctx, name, opts...) | |||
|  | } | |||
|  | 
 | |||
|  | // StartHTTPSpan 开始一个HTTP span | |||
|  | func (t *Tracer) StartHTTPSpan(ctx context.Context, method, path string) (context.Context, trace.Span) { | |||
|  | 	spanName := fmt.Sprintf("%s %s", method, path) | |||
|  | 
 | |||
|  | 	// 检查是否已有错误标记,如果有则使用"error"作为操作名 | |||
|  | 	// 这样可以匹配Jaeger采样配置中的错误操作策略 | |||
|  | 	if ctx.Value("otel_error_request") != nil { | |||
|  | 		spanName = "error" | |||
|  | 	} | |||
|  | 
 | |||
|  | 	ctx, span := t.StartSpan(ctx, spanName, | |||
|  | 		trace.WithSpanKind(trace.SpanKindServer), | |||
|  | 		trace.WithAttributes( | |||
|  | 			attribute.String("http.method", method), | |||
|  | 			attribute.String("http.route", path), | |||
|  | 		), | |||
|  | 	) | |||
|  | 
 | |||
|  | 	// 保存原始操作名,以便在错误发生时可以更新 | |||
|  | 	if ctx.Value("otel_error_request") == nil { | |||
|  | 		ctx = context.WithValue(ctx, "otel_original_operation", spanName) | |||
|  | 	} | |||
|  | 
 | |||
|  | 	return ctx, span | |||
|  | } | |||
|  | 
 | |||
|  | // StartDBSpan 开始一个数据库span | |||
|  | func (t *Tracer) StartDBSpan(ctx context.Context, operation, table string) (context.Context, trace.Span) { | |||
|  | 	spanName := fmt.Sprintf("db.%s.%s", operation, table) | |||
|  | 
 | |||
|  | 	return t.StartSpan(ctx, spanName, | |||
|  | 		trace.WithSpanKind(trace.SpanKindClient), | |||
|  | 		trace.WithAttributes( | |||
|  | 			attribute.String("db.operation", operation), | |||
|  | 			attribute.String("db.table", table), | |||
|  | 			attribute.String("db.system", "postgresql"), | |||
|  | 		), | |||
|  | 	) | |||
|  | } | |||
|  | 
 | |||
|  | // StartCacheSpan 开始一个缓存span | |||
|  | func (t *Tracer) StartCacheSpan(ctx context.Context, operation, key string) (context.Context, trace.Span) { | |||
|  | 	spanName := fmt.Sprintf("cache.%s", operation) | |||
|  | 
 | |||
|  | 	return t.StartSpan(ctx, spanName, | |||
|  | 		trace.WithSpanKind(trace.SpanKindClient), | |||
|  | 		trace.WithAttributes( | |||
|  | 			attribute.String("cache.operation", operation), | |||
|  | 			attribute.String("cache.system", "redis"), | |||
|  | 		), | |||
|  | 	) | |||
|  | } | |||
|  | 
 | |||
|  | // StartExternalAPISpan 开始一个外部API调用span | |||
|  | func (t *Tracer) StartExternalAPISpan(ctx context.Context, service, operation string) (context.Context, trace.Span) { | |||
|  | 	spanName := fmt.Sprintf("api.%s.%s", service, operation) | |||
|  | 
 | |||
|  | 	return t.StartSpan(ctx, spanName, | |||
|  | 		trace.WithSpanKind(trace.SpanKindClient), | |||
|  | 		trace.WithAttributes( | |||
|  | 			attribute.String("api.service", service), | |||
|  | 			attribute.String("api.operation", operation), | |||
|  | 		), | |||
|  | 	) | |||
|  | } | |||
|  | 
 | |||
|  | // AddSpanAttributes 添加span属性 | |||
|  | func (t *Tracer) AddSpanAttributes(span trace.Span, attrs ...attribute.KeyValue) { | |||
|  | 	if span.IsRecording() { | |||
|  | 		span.SetAttributes(attrs...) | |||
|  | 	} | |||
|  | } | |||
|  | 
 | |||
|  | // SetSpanError 设置span错误 | |||
|  | func (t *Tracer) SetSpanError(span trace.Span, err error) { | |||
|  | 	if span.IsRecording() { | |||
|  | 		span.SetStatus(codes.Error, err.Error()) | |||
|  | 		span.RecordError(err) | |||
|  | 
 | |||
|  | 		// 将span操作名更新为"error",以匹配Jaeger采样配置 | |||
|  | 		// 注意:这是一种变通方法,因为OpenTelemetry不支持直接更改span名称 | |||
|  | 		// 我们通过添加特殊属性来标识这是一个错误span | |||
|  | 		span.SetAttributes( | |||
|  | 			attribute.String("error.operation", "true"), | |||
|  | 			attribute.String("operation.type", "error"), | |||
|  | 		) | |||
|  | 
 | |||
|  | 		// 记录错误日志,包含trace ID便于关联 | |||
|  | 		if t.logger != nil { | |||
|  | 			ctx := trace.ContextWithSpan(context.Background(), span) | |||
|  | 			t.logger.Error("操作发生错误", | |||
|  | 				zap.Error(err), | |||
|  | 				zap.String("trace_id", t.GetTraceID(ctx)), | |||
|  | 				zap.String("span_id", t.GetSpanID(ctx)), | |||
|  | 			) | |||
|  | 		} | |||
|  | 	} | |||
|  | } | |||
|  | 
 | |||
|  | // SetSpanSuccess 设置span成功 | |||
|  | func (t *Tracer) SetSpanSuccess(span trace.Span) { | |||
|  | 	if span.IsRecording() { | |||
|  | 		span.SetStatus(codes.Ok, "success") | |||
|  | 	} | |||
|  | } | |||
|  | 
 | |||
|  | // SetHTTPStatus 根据HTTP状态码设置span状态 | |||
|  | func (t *Tracer) SetHTTPStatus(span trace.Span, statusCode int) { | |||
|  | 	if !span.IsRecording() { | |||
|  | 		return | |||
|  | 	} | |||
|  | 
 | |||
|  | 	// 添加HTTP状态码属性 | |||
|  | 	span.SetAttributes(attribute.Int("http.status_code", statusCode)) | |||
|  | 
 | |||
|  | 	// 对于4xx和5xx错误,标记为错误并应用错误采样策略 | |||
|  | 	if statusCode >= 400 { | |||
|  | 		errorMsg := fmt.Sprintf("HTTP %d", statusCode) | |||
|  | 		span.SetStatus(codes.Error, errorMsg) | |||
|  | 
 | |||
|  | 		// 添加错误操作标记,以匹配Jaeger采样配置 | |||
|  | 		span.SetAttributes( | |||
|  | 			attribute.String("error.operation", "true"), | |||
|  | 			attribute.String("operation.type", "error"), | |||
|  | 		) | |||
|  | 
 | |||
|  | 		// 记录HTTP错误 | |||
|  | 		if t.logger != nil { | |||
|  | 			ctx := trace.ContextWithSpan(context.Background(), span) | |||
|  | 			t.logger.Warn("HTTP请求错误", | |||
|  | 				zap.Int("status_code", statusCode), | |||
|  | 				zap.String("trace_id", t.GetTraceID(ctx)), | |||
|  | 				zap.String("span_id", t.GetSpanID(ctx)), | |||
|  | 			) | |||
|  | 		} | |||
|  | 	} else { | |||
|  | 		span.SetStatus(codes.Ok, "success") | |||
|  | 	} | |||
|  | } | |||
|  | 
 | |||
|  | // GetTraceID 获取当前上下文的trace ID | |||
|  | func (t *Tracer) GetTraceID(ctx context.Context) string { | |||
|  | 	span := trace.SpanFromContext(ctx) | |||
|  | 	if span.SpanContext().IsValid() { | |||
|  | 		return span.SpanContext().TraceID().String() | |||
|  | 	} | |||
|  | 	return "" | |||
|  | } | |||
|  | 
 | |||
|  | // GetSpanID 获取当前上下文的span ID | |||
|  | func (t *Tracer) GetSpanID(ctx context.Context) string { | |||
|  | 	span := trace.SpanFromContext(ctx) | |||
|  | 	if span.SpanContext().IsValid() { | |||
|  | 		return span.SpanContext().SpanID().String() | |||
|  | 	} | |||
|  | 	return "" | |||
|  | } | |||
|  | 
 | |||
|  | // IsTracing 检查是否正在追踪 | |||
|  | func (t *Tracer) IsTracing(ctx context.Context) bool { | |||
|  | 	span := trace.SpanFromContext(ctx) | |||
|  | 	return span.SpanContext().IsValid() && span.IsRecording() | |||
|  | } | |||
|  | 
 | |||
|  | // Shutdown 关闭追踪器 | |||
|  | func (t *Tracer) Shutdown(ctx context.Context) error { | |||
|  | 	t.mutex.Lock() | |||
|  | 	defer t.mutex.Unlock() | |||
|  | 
 | |||
|  | 	if !t.initialized || t.shutdown == nil { | |||
|  | 		return nil | |||
|  | 	} | |||
|  | 
 | |||
|  | 	err := t.shutdown(ctx) | |||
|  | 	if err != nil { | |||
|  | 		t.logger.Error("Failed to shutdown tracer", zap.Error(err)) | |||
|  | 		return err | |||
|  | 	} | |||
|  | 
 | |||
|  | 	t.initialized = false | |||
|  | 	t.logger.Info("Tracer shutdown successfully") | |||
|  | 	return nil | |||
|  | } | |||
|  | 
 | |||
|  | // GetStats 获取追踪统计信息 | |||
|  | func (t *Tracer) GetStats() map[string]interface{} { | |||
|  | 	t.mutex.RLock() | |||
|  | 	defer t.mutex.RUnlock() | |||
|  | 
 | |||
|  | 	return map[string]interface{}{ | |||
|  | 		"initialized":     t.initialized, | |||
|  | 		"enabled":         t.config.Enabled, | |||
|  | 		"service_name":    t.config.ServiceName, | |||
|  | 		"service_version": t.config.ServiceVersion, | |||
|  | 		"environment":     t.config.Environment, | |||
|  | 		"sample_rate":     t.config.SampleRate, | |||
|  | 		"endpoint":        t.config.Endpoint, | |||
|  | 	} | |||
|  | } | |||
|  | 
 | |||
// Implementation of the Service interface.
|  | 
 | |||
|  | // Name 返回服务名称 | |||
|  | func (t *Tracer) Name() string { | |||
|  | 	return "tracer" | |||
|  | } | |||
|  | 
 | |||
|  | // HealthCheck 健康检查 | |||
|  | func (t *Tracer) HealthCheck(ctx context.Context) error { | |||
|  | 	if !t.config.Enabled { | |||
|  | 		return nil | |||
|  | 	} | |||
|  | 
 | |||
|  | 	if !t.initialized { | |||
|  | 		return fmt.Errorf("tracer not initialized") | |||
|  | 	} | |||
|  | 
 | |||
|  | 	return nil | |||
|  | } | |||
|  | 
 | |||
|  | // noopExporter 简单的无操作导出器(用于演示) | |||
|  | type noopExporter struct{} | |||
|  | 
 | |||
|  | func (e *noopExporter) ExportSpans(ctx context.Context, spans []sdktrace.ReadOnlySpan) error { | |||
|  | 	// 在实际应用中,这里应该将spans发送到Jaeger或其他追踪系统 | |||
|  | 	return nil | |||
|  | } | |||
|  | 
 | |||
|  | func (e *noopExporter) Shutdown(ctx context.Context) error { | |||
|  | 	return nil | |||
|  | } | |||
|  | 
 | |||
|  | // TraceMiddleware 追踪中间件工厂 | |||
|  | func (t *Tracer) TraceMiddleware() gin.HandlerFunc { | |||
|  | 	return func(c *gin.Context) { | |||
|  | 		if !t.initialized || !t.config.Enabled { | |||
|  | 			c.Next() | |||
|  | 			return | |||
|  | 		} | |||
|  | 
 | |||
|  | 		// 开始HTTP span | |||
|  | 		ctx, span := t.StartHTTPSpan(c.Request.Context(), c.Request.Method, c.FullPath()) | |||
|  | 		defer span.End() | |||
|  | 
 | |||
|  | 		// 将trace ID添加到响应头 | |||
|  | 		traceID := t.GetTraceID(ctx) | |||
|  | 		if traceID != "" { | |||
|  | 			c.Header("X-Trace-ID", traceID) | |||
|  | 		} | |||
|  | 
 | |||
|  | 		// 将span上下文存储到gin上下文 | |||
|  | 		c.Request = c.Request.WithContext(ctx) | |||
|  | 
 | |||
|  | 		// 处理请求 | |||
|  | 		c.Next() | |||
|  | 
 | |||
|  | 		// 设置HTTP状态码 | |||
|  | 		t.SetHTTPStatus(span, c.Writer.Status()) | |||
|  | 
 | |||
|  | 		// 添加响应信息 | |||
|  | 		t.AddSpanAttributes(span, | |||
|  | 			attribute.Int("http.status_code", c.Writer.Status()), | |||
|  | 			attribute.Int("http.response_size", c.Writer.Size()), | |||
|  | 		) | |||
|  | 
 | |||
|  | 		// 添加错误信息 | |||
|  | 		if len(c.Errors) > 0 { | |||
|  | 			errMsg := c.Errors.String() | |||
|  | 			t.SetSpanError(span, fmt.Errorf(errMsg)) | |||
|  | 		} | |||
|  | 	} | |||
|  | } | |||
|  | 
 | |||
|  | // GinTraceMiddleware 兼容旧的方法名,保持向后兼容 | |||
|  | func (t *Tracer) GinTraceMiddleware() gin.HandlerFunc { | |||
|  | 	return t.TraceMiddleware() | |||
|  | } | |||
|  | 
 | |||
|  | // WithTracing 添加追踪到上下文的辅助函数 | |||
|  | func WithTracing(ctx context.Context, tracer *Tracer, name string) (context.Context, trace.Span) { | |||
|  | 	return tracer.StartSpan(ctx, name) | |||
|  | } | |||
|  | 
 | |||
|  | // TraceFunction 追踪函数执行的辅助函数 | |||
|  | func (t *Tracer) TraceFunction(ctx context.Context, name string, fn func(context.Context) error) error { | |||
|  | 	ctx, span := t.StartSpan(ctx, name) | |||
|  | 	defer span.End() | |||
|  | 
 | |||
|  | 	err := fn(ctx) | |||
|  | 	if err != nil { | |||
|  | 		t.SetSpanError(span, err) | |||
|  | 	} else { | |||
|  | 		t.SetSpanSuccess(span) | |||
|  | 	} | |||
|  | 
 | |||
|  | 	return err | |||
|  | } | |||
|  | 
 | |||
|  | // TraceFunctionWithResult 追踪带返回值的函数执行 | |||
|  | func TraceFunctionWithResult[T any](ctx context.Context, tracer *Tracer, name string, fn func(context.Context) (T, error)) (T, error) { | |||
|  | 	ctx, span := tracer.StartSpan(ctx, name) | |||
|  | 	defer span.End() | |||
|  | 
 | |||
|  | 	result, err := fn(ctx) | |||
|  | 	if err != nil { | |||
|  | 		tracer.SetSpanError(span, err) | |||
|  | 	} else { | |||
|  | 		tracer.SetSpanSuccess(span) | |||
|  | 	} | |||
|  | 
 | |||
|  | 	return result, err | |||
|  | } |