Files
tyapi-server/internal/shared/tracing/tracer.go

475 lines
12 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package tracing
import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/gin-gonic/gin"
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
	"go.opentelemetry.io/otel/sdk/resource"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	"go.opentelemetry.io/otel/trace"
	"go.uber.org/zap"
)
// TracerConfig holds the settings a Tracer is built from.
type TracerConfig struct {
	// ServiceName, ServiceVersion and Environment are attached to the trace
	// resource by Initialize; Endpoint is the OTLP gRPC collector address
	// (an empty value selects the no-op exporter).
	ServiceName, ServiceVersion, Environment, Endpoint string
	// SampleRate is handed to the trace-ID ratio sampler.
	SampleRate float64
	// Enabled turns tracing on; when false Initialize does nothing.
	Enabled bool
}
// DefaultTracerConfig returns the configuration used for local development:
// tracing enabled, 10% sampling, and a collector on the standard OTLP gRPC
// port of the local machine.
//
// The endpoint is a bare host:port. It is passed to the OTLP gRPC exporter's
// WithEndpoint option, which does not accept a URL scheme, so a value such as
// "http://localhost:4317" would produce an undialable target.
func DefaultTracerConfig() TracerConfig {
	return TracerConfig{
		ServiceName:    "tyapi-server",
		ServiceVersion: "1.0.0",
		Environment:    "development",
		Endpoint:       "localhost:4317",
		SampleRate:     0.1,
		Enabled:        true,
	}
}
// Tracer wraps an OpenTelemetry SDK tracer provider together with the
// configuration and logger it was created with.
type Tracer struct {
	// Static inputs supplied to NewTracer.
	config TracerConfig
	logger *zap.Logger

	// OpenTelemetry handles populated by Initialize.
	provider *sdktrace.TracerProvider
	tracer   trace.Tracer
	shutdown func(context.Context) error

	// mutex is held by Initialize, Shutdown and GetStats; initialized is set
	// once Initialize completes and cleared by a successful Shutdown.
	mutex       sync.RWMutex
	initialized bool
}
// NewTracer builds a Tracer from config and logger. It does not start any
// tracing machinery; call Initialize before creating spans.
//
// A nil logger is replaced with a no-op zap logger: Initialize and Shutdown
// log unconditionally and would otherwise dereference a nil pointer.
func NewTracer(config TracerConfig, logger *zap.Logger) *Tracer {
	if logger == nil {
		logger = zap.NewNop()
	}
	return &Tracer{
		config: config,
		logger: logger,
	}
}
// Initialize builds the OpenTelemetry resource, sampler, exporter and tracer
// provider, installs the provider globally and marks the tracer as ready.
//
// It is a no-op when the tracer is already initialized or when tracing is
// disabled in the configuration. The only error returned is a failure to
// build the resource; a failure to create the OTLP exporter is logged and the
// tracer falls back to an exporter that discards spans.
func (t *Tracer) Initialize(ctx context.Context) error {
	t.mutex.Lock()
	defer t.mutex.Unlock()

	if t.initialized {
		return nil
	}
	if !t.config.Enabled {
		t.logger.Info("Tracing is disabled")
		return nil
	}

	// Resource attributes describing this service.
	res, err := resource.New(ctx,
		resource.WithAttributes(
			attribute.String("service.name", t.config.ServiceName),
			attribute.String("service.version", t.config.ServiceVersion),
			attribute.String("environment", t.config.Environment),
		),
	)
	if err != nil {
		return fmt.Errorf("failed to create resource: %w", err)
	}

	sampler := sdktrace.TraceIDRatioBased(t.config.SampleRate)

	// WithEndpoint takes a bare host:port, so any URL scheme in the
	// configured value is removed first. An "https://" scheme selects the
	// exporter's default TLS transport; everything else stays plaintext.
	hostPort, secure := splitOTLPEndpoint(t.config.Endpoint)

	var spanProcessor sdktrace.SpanProcessor
	if hostPort != "" {
		// OTLP over gRPC is understood by Jaeger, Tempo and similar backends.
		exporterOpts := []otlptracegrpc.Option{
			otlptracegrpc.WithEndpoint(hostPort),
			otlptracegrpc.WithTimeout(time.Second * 10),
			otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{
				Enabled:         true,
				InitialInterval: time.Millisecond * 100,
				MaxInterval:     time.Second * 5,
				MaxElapsedTime:  time.Second * 30,
			}),
		}
		if !secure {
			// Plaintext transport; production deployments should configure TLS.
			exporterOpts = append(exporterOpts, otlptracegrpc.WithInsecure())
		}

		exporter, expErr := otlptracegrpc.New(ctx, exporterOpts...)
		if expErr != nil {
			t.logger.Warn("Failed to create OTLP exporter, using noop exporter",
				zap.Error(expErr),
				zap.String("endpoint", t.config.Endpoint))
			spanProcessor = sdktrace.NewSimpleSpanProcessor(&noopExporter{})
		} else {
			// Batch exports to keep per-span overhead low.
			spanProcessor = sdktrace.NewBatchSpanProcessor(exporter,
				sdktrace.WithBatchTimeout(time.Second*5),
				sdktrace.WithMaxExportBatchSize(512),
				sdktrace.WithMaxQueueSize(2048),
				sdktrace.WithExportTimeout(time.Second*30),
			)
			t.logger.Info("OTLP exporter initialized successfully",
				zap.String("endpoint", t.config.Endpoint))
		}
	} else {
		// No endpoint configured: spans are created but discarded.
		spanProcessor = sdktrace.NewSimpleSpanProcessor(&noopExporter{})
		t.logger.Info("Using noop exporter (no endpoint configured)")
	}

	provider := sdktrace.NewTracerProvider(
		sdktrace.WithResource(res),
		sdktrace.WithSampler(sampler),
		sdktrace.WithSpanProcessor(spanProcessor),
	)

	// Make the provider the process-wide default as well.
	otel.SetTracerProvider(provider)

	t.provider = provider
	t.tracer = provider.Tracer(t.config.ServiceName)
	t.shutdown = func(ctx context.Context) error {
		return provider.Shutdown(ctx)
	}
	t.initialized = true

	t.logger.Info("Tracing initialized successfully",
		zap.String("service", t.config.ServiceName),
		zap.Float64("sample_rate", t.config.SampleRate))
	return nil
}

// splitOTLPEndpoint converts a configured collector address into the bare
// host[:port] form and reports whether an "https://" scheme requested TLS.
// Surrounding whitespace and one trailing slash are dropped.
func splitOTLPEndpoint(raw string) (hostPort string, secure bool) {
	hostPort = strings.TrimSpace(raw)
	switch {
	case strings.HasPrefix(hostPort, "https://"):
		hostPort, secure = strings.TrimPrefix(hostPort, "https://"), true
	case strings.HasPrefix(hostPort, "http://"):
		hostPort = strings.TrimPrefix(hostPort, "http://")
	}
	return strings.TrimSuffix(hostPort, "/"), secure
}
// StartSpan starts a span named name as a child of whatever span ctx carries
// and returns the derived context together with the new span.
//
// When the tracer is not initialized or tracing is disabled, ctx is returned
// unchanged together with a detached non-recording span that only carries
// the caller's span context. Handing back the caller's own span would make
// the usual `defer span.End()` at call sites end the parent span early.
//
// NOTE(review): t.initialized is read here without t.mutex while Initialize
// and Shutdown write it under the lock; confirm whether that is acceptable.
func (t *Tracer) StartSpan(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) {
	if !t.initialized || !t.config.Enabled || t.tracer == nil {
		parent := trace.SpanContextFromContext(ctx)
		detached := trace.ContextWithSpanContext(context.Background(), parent)
		return ctx, trace.SpanFromContext(detached)
	}
	return t.tracer.Start(ctx, name, opts...)
}
// StartHTTPSpan starts a server-kind span for an incoming HTTP request and
// tags it with the method and route.
//
// The span is named "<method> <path>" unless ctx already carries a value
// under the "otel_error_request" key, in which case it is named "error"
// (per the original author, to line up with an error-operation sampling
// policy configured in Jaeger). For non-error requests the chosen name is
// also stored in the returned context under "otel_original_operation".
func (t *Tracer) StartHTTPSpan(ctx context.Context, method, path string) (context.Context, trace.Span) {
	operation := method + " " + path
	if flagged := ctx.Value("otel_error_request"); flagged != nil {
		operation = "error"
	}

	startOpts := []trace.SpanStartOption{
		trace.WithSpanKind(trace.SpanKindServer),
		trace.WithAttributes(
			attribute.String("http.method", method),
			attribute.String("http.route", path),
		),
	}
	spanCtx, span := t.StartSpan(ctx, operation, startOpts...)

	// An error-flagged request keeps the context as returned by StartSpan.
	if spanCtx.Value("otel_error_request") != nil {
		return spanCtx, span
	}
	// Remember the original operation name so it can be looked up later.
	return context.WithValue(spanCtx, "otel_original_operation", operation), span
}
// StartDBSpan starts a client-kind span named "db.<operation>.<table>" and
// tags it with the operation, the table and a fixed "postgresql" db.system.
func (t *Tracer) StartDBSpan(ctx context.Context, operation, table string) (context.Context, trace.Span) {
	attrs := trace.WithAttributes(
		attribute.String("db.operation", operation),
		attribute.String("db.table", table),
		attribute.String("db.system", "postgresql"),
	)
	kind := trace.WithSpanKind(trace.SpanKindClient)
	return t.StartSpan(ctx, "db."+operation+"."+table, kind, attrs)
}
// StartCacheSpan starts a client-kind span named "cache.<operation>" and tags
// it with the operation and a fixed "redis" cache.system.
//
// The key parameter is accepted but not recorded on the span.
func (t *Tracer) StartCacheSpan(ctx context.Context, operation, key string) (context.Context, trace.Span) {
	attrs := trace.WithAttributes(
		attribute.String("cache.operation", operation),
		attribute.String("cache.system", "redis"),
	)
	kind := trace.WithSpanKind(trace.SpanKindClient)
	return t.StartSpan(ctx, "cache."+operation, kind, attrs)
}
// StartExternalAPISpan starts a client-kind span named
// "api.<service>.<operation>" for a call to an external API and tags it with
// the service and operation names.
func (t *Tracer) StartExternalAPISpan(ctx context.Context, service, operation string) (context.Context, trace.Span) {
	attrs := trace.WithAttributes(
		attribute.String("api.service", service),
		attribute.String("api.operation", operation),
	)
	kind := trace.WithSpanKind(trace.SpanKindClient)
	return t.StartSpan(ctx, "api."+service+"."+operation, kind, attrs)
}
// AddSpanAttributes sets attrs on span, skipping spans that are not recording.
func (t *Tracer) AddSpanAttributes(span trace.Span, attrs ...attribute.KeyValue) {
	if !span.IsRecording() {
		return
	}
	span.SetAttributes(attrs...)
}
// SetSpanError marks span as failed with err: it sets the Error status,
// records err as a span event, tags the span as an error operation and logs
// the error together with the span's trace and span IDs.
//
// It does nothing when span or err is nil or when span is not recording, so
// callers may pass the result of an operation without checking it first.
func (t *Tracer) SetSpanError(span trace.Span, err error) {
	if span == nil || err == nil || !span.IsRecording() {
		return
	}

	span.SetStatus(codes.Error, err.Error())
	span.RecordError(err)

	// The span name is left untouched; error spans are tagged with these
	// attributes instead (per the original author, so that they can be
	// matched by the error-operation sampling policy configured in Jaeger).
	span.SetAttributes(
		attribute.String("error.operation", "true"),
		attribute.String("operation.type", "error"),
	)

	// Log with the trace and span IDs so the entry can be correlated.
	if t.logger != nil {
		ctx := trace.ContextWithSpan(context.Background(), span)
		t.logger.Error("操作发生错误",
			zap.Error(err),
			zap.String("trace_id", t.GetTraceID(ctx)),
			zap.String("span_id", t.GetSpanID(ctx)),
		)
	}
}
// SetSpanSuccess sets the Ok status on span, skipping spans that are not
// recording.
func (t *Tracer) SetSpanSuccess(span trace.Span) {
	if !span.IsRecording() {
		return
	}
	span.SetStatus(codes.Ok, "success")
}
// SetHTTPStatus records statusCode on span and derives the span status from
// it: codes below 400 yield Ok, while 4xx and 5xx yield Error, add the
// error-operation tags and emit a warning log carrying the trace and span
// IDs. Spans that are not recording are left alone.
func (t *Tracer) SetHTTPStatus(span trace.Span, statusCode int) {
	if !span.IsRecording() {
		return
	}

	span.SetAttributes(attribute.Int("http.status_code", statusCode))

	if statusCode < 400 {
		span.SetStatus(codes.Ok, "success")
		return
	}

	// 4xx/5xx: flag the span as an error and tag it the same way
	// SetSpanError does.
	span.SetStatus(codes.Error, fmt.Sprintf("HTTP %d", statusCode))
	span.SetAttributes(
		attribute.String("error.operation", "true"),
		attribute.String("operation.type", "error"),
	)

	if t.logger == nil {
		return
	}
	spanCtx := trace.ContextWithSpan(context.Background(), span)
	t.logger.Warn("HTTP请求错误",
		zap.Int("status_code", statusCode),
		zap.String("trace_id", t.GetTraceID(spanCtx)),
		zap.String("span_id", t.GetSpanID(spanCtx)),
	)
}
// GetTraceID returns the trace ID of the span carried by ctx as a string, or
// "" when ctx holds no valid span context.
func (t *Tracer) GetTraceID(ctx context.Context) string {
	sc := trace.SpanContextFromContext(ctx)
	if !sc.IsValid() {
		return ""
	}
	return sc.TraceID().String()
}
// GetSpanID returns the span ID of the span carried by ctx as a string, or ""
// when ctx holds no valid span context.
func (t *Tracer) GetSpanID(ctx context.Context) string {
	sc := trace.SpanContextFromContext(ctx)
	if !sc.IsValid() {
		return ""
	}
	return sc.SpanID().String()
}
// IsTracing reports whether ctx carries a span that has a valid span context
// and is currently recording.
func (t *Tracer) IsTracing(ctx context.Context) bool {
	current := trace.SpanFromContext(ctx)
	if !current.SpanContext().IsValid() {
		return false
	}
	return current.IsRecording()
}
// Shutdown stops the tracer provider created by Initialize. It is a no-op
// when the tracer was never initialized. On failure the error is logged and
// returned and the tracer stays marked as initialized; on success the
// initialized flag is cleared.
func (t *Tracer) Shutdown(ctx context.Context) error {
	t.mutex.Lock()
	defer t.mutex.Unlock()

	if t.shutdown == nil || !t.initialized {
		return nil
	}

	if err := t.shutdown(ctx); err != nil {
		t.logger.Error("Failed to shutdown tracer", zap.Error(err))
		return err
	}

	t.initialized = false
	t.logger.Info("Tracer shutdown successfully")
	return nil
}
// GetStats returns a snapshot of the tracer's state: the initialized flag
// plus the configured settings, keyed by name. It takes the read lock while
// building the snapshot.
func (t *Tracer) GetStats() map[string]interface{} {
	t.mutex.RLock()
	defer t.mutex.RUnlock()

	cfg := t.config
	stats := make(map[string]interface{}, 7)
	stats["initialized"] = t.initialized
	stats["enabled"] = cfg.Enabled
	stats["service_name"] = cfg.ServiceName
	stats["service_version"] = cfg.ServiceVersion
	stats["environment"] = cfg.Environment
	stats["sample_rate"] = cfg.SampleRate
	stats["endpoint"] = cfg.Endpoint
	return stats
}
// Service interface implementation.

// Name returns the fixed service name "tracer".
func (t *Tracer) Name() string {
	const serviceName = "tracer"
	return serviceName
}
// HealthCheck reports whether the tracer is usable. It returns nil when
// tracing is disabled or the tracer has been initialized, and an error when
// tracing is enabled but Initialize has not completed (or Shutdown has run).
// The ctx parameter is not used.
//
// The state is read under the read lock, matching GetStats, because
// Initialize and Shutdown modify it under the write lock.
func (t *Tracer) HealthCheck(ctx context.Context) error {
	t.mutex.RLock()
	defer t.mutex.RUnlock()

	if !t.config.Enabled {
		return nil
	}
	if !t.initialized {
		return fmt.Errorf("tracer not initialized")
	}
	return nil
}
// noopExporter is a span exporter that discards everything it is given. It
// is used when no endpoint is configured or the OTLP exporter cannot be
// created.
type noopExporter struct{}

// ExportSpans drops the spans and reports success.
func (*noopExporter) ExportSpans(_ context.Context, _ []sdktrace.ReadOnlySpan) error {
	// A real exporter would ship the spans to Jaeger or another backend here.
	return nil
}

// Shutdown has nothing to release and always succeeds.
func (*noopExporter) Shutdown(_ context.Context) error {
	return nil
}
// TraceMiddleware returns a gin middleware that wraps each request in a
// server span.
//
// For every request it starts an HTTP span from the method and matched route,
// exposes the trace ID in the "X-Trace-ID" response header, replaces the
// request context with the span context, runs the remaining handlers and
// then records the status code, response size and any errors that handlers
// attached to the gin context. When the tracer is not initialized or tracing
// is disabled the request is passed through untouched.
func (t *Tracer) TraceMiddleware() gin.HandlerFunc {
	return func(c *gin.Context) {
		if !t.initialized || !t.config.Enabled {
			c.Next()
			return
		}

		ctx, span := t.StartHTTPSpan(c.Request.Context(), c.Request.Method, c.FullPath())
		defer span.End()

		// Expose the trace ID so clients can quote it in bug reports.
		if traceID := t.GetTraceID(ctx); traceID != "" {
			c.Header("X-Trace-ID", traceID)
		}

		// Make the span visible to downstream handlers.
		c.Request = c.Request.WithContext(ctx)

		c.Next()

		status := c.Writer.Status()
		hasErrors := len(c.Errors) > 0

		if hasErrors && status < 400 {
			// SetHTTPStatus would mark the span Ok here, and the SDK ignores
			// an Error status set after Ok. Record only the status code so
			// the handler errors below can still fail the span.
			t.AddSpanAttributes(span, attribute.Int("http.status_code", status))
		} else {
			t.SetHTTPStatus(span, status)
		}

		t.AddSpanAttributes(span, attribute.Int("http.response_size", c.Writer.Size()))

		if hasErrors {
			// Use an explicit "%s" verb: the message comes from handlers and
			// may contain '%' characters that must not be read as a format.
			t.SetSpanError(span, fmt.Errorf("%s", c.Errors.String()))
		}
	}
}
// GinTraceMiddleware is the former name of TraceMiddleware, kept so existing
// callers continue to compile. It returns exactly what TraceMiddleware does.
func (t *Tracer) GinTraceMiddleware() gin.HandlerFunc {
	handler := t.TraceMiddleware()
	return handler
}
// WithTracing is a free-function shorthand for tracer.StartSpan(ctx, name):
// it starts a span with no extra options and returns the derived context and
// the span.
func WithTracing(ctx context.Context, tracer *Tracer, name string) (context.Context, trace.Span) {
	spanCtx, span := tracer.StartSpan(ctx, name)
	return spanCtx, span
}
// TraceFunction runs fn inside a span called name. fn receives the span's
// context; its error, if any, is recorded on the span via SetSpanError,
// otherwise the span is marked successful. The span is ended before
// TraceFunction returns, and fn's error is returned unchanged.
func (t *Tracer) TraceFunction(ctx context.Context, name string, fn func(context.Context) error) error {
	spanCtx, span := t.StartSpan(ctx, name)
	defer span.End()

	if err := fn(spanCtx); err != nil {
		t.SetSpanError(span, err)
		return err
	}

	t.SetSpanSuccess(span)
	return nil
}
// TraceFunctionWithResult is the value-returning counterpart of
// Tracer.TraceFunction: it runs fn inside a span called name, records the
// outcome on the span, ends the span and returns fn's result and error
// unchanged.
func TraceFunctionWithResult[T any](ctx context.Context, tracer *Tracer, name string, fn func(context.Context) (T, error)) (T, error) {
	spanCtx, span := tracer.StartSpan(ctx, name)
	defer span.End()

	value, err := fn(spanCtx)
	switch {
	case err != nil:
		tracer.SetSpanError(span, err)
	default:
		tracer.SetSpanSuccess(span)
	}
	return value, err
}