Files
tyapi-server/internal/shared/tracing/tracer.go

475 lines
12 KiB
Go
Raw Normal View History

2025-07-02 16:17:59 +08:00
package tracing
import (
"context"
"fmt"
"sync"
"time"
"github.com/gin-gonic/gin"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
"go.opentelemetry.io/otel/trace"
"go.uber.org/zap"
)
// TracerConfig carries the settings a Tracer is built from.
type TracerConfig struct {
	// ServiceName is reported as the "service.name" resource attribute and
	// is also the name passed to the provider when obtaining the tracer.
	ServiceName string
	// ServiceVersion is reported as the "service.version" resource attribute.
	ServiceVersion string
	// Environment is reported as the "environment" resource attribute.
	Environment string
	// Endpoint is the OTLP gRPC collector address. When empty, spans are
	// handed to a no-op exporter.
	Endpoint string
	// SampleRate is the ratio passed to the trace-ID-ratio sampler.
	SampleRate float64
	// Enabled switches tracing on; when false, Initialize does nothing.
	Enabled bool
}
// DefaultTracerConfig returns the configuration used when nothing else is
// supplied: tracing enabled for a local development collector with a 10%
// sample rate.
//
// NOTE(review): the default Endpoint carries an "http://" scheme, while the
// OTLP gRPC exporter's WithEndpoint option documents a host:port target.
// Confirm that the consumer of this value tolerates or strips the scheme.
func DefaultTracerConfig() TracerConfig {
	var cfg TracerConfig

	cfg.ServiceName = "tyapi-server"
	cfg.ServiceVersion = "1.0.0"
	cfg.Environment = "development"

	cfg.Endpoint = "http://localhost:4317"
	cfg.SampleRate = 0.1
	cfg.Enabled = true

	return cfg
}
// Tracer wraps an OpenTelemetry SDK tracer provider together with the
// configuration and logger it was created with.
type Tracer struct {
// config is the configuration supplied to NewTracer.
config TracerConfig
// logger receives lifecycle messages and the error/warning logs emitted
// when spans are marked as failed.
logger *zap.Logger
// provider is the SDK tracer provider created by Initialize.
provider *sdktrace.TracerProvider
// tracer is obtained from provider and is what StartSpan delegates to.
tracer trace.Tracer
// mutex is taken by Initialize, Shutdown and GetStats.
mutex sync.RWMutex
// initialized is set at the end of a successful Initialize and cleared by
// a successful Shutdown.
initialized bool
// shutdown stops the provider; it is assigned by Initialize.
shutdown func(context.Context) error
}
// NewTracer builds a Tracer from the given configuration and logger.
//
// The returned Tracer is inert until Initialize is called. A nil logger is
// replaced with a no-op zap logger: Initialize and Shutdown log
// unconditionally, so storing nil would make them panic on first use.
func NewTracer(config TracerConfig, logger *zap.Logger) *Tracer {
	if logger == nil {
		logger = zap.NewNop()
	}
	return &Tracer{
		config: config,
		logger: logger,
	}
}
// Initialize builds the OpenTelemetry tracer provider described by the
// configuration and installs it as the global provider via
// otel.SetTracerProvider.
//
// It returns nil without doing anything when the tracer is already
// initialized or when tracing is disabled. The only error it returns is a
// failure to build the resource; a failure to create the OTLP exporter is
// logged as a warning and replaced by a no-op exporter.
func (t *Tracer) Initialize(ctx context.Context) error {
	t.mutex.Lock()
	defer t.mutex.Unlock()

	if t.initialized {
		return nil
	}
	if !t.config.Enabled {
		t.logger.Info("Tracing is disabled")
		return nil
	}

	// Resource attributes identify this service on every exported span.
	res, err := resource.New(ctx,
		resource.WithAttributes(
			attribute.String("service.name", t.config.ServiceName),
			attribute.String("service.version", t.config.ServiceVersion),
			attribute.String("environment", t.config.Environment),
		),
	)
	if err != nil {
		return fmt.Errorf("failed to create resource: %w", err)
	}

	provider := sdktrace.NewTracerProvider(
		sdktrace.WithResource(res),
		sdktrace.WithSampler(sdktrace.TraceIDRatioBased(t.config.SampleRate)),
		sdktrace.WithSpanProcessor(t.buildSpanProcessor(ctx)),
	)

	// Install globally so code that uses the otel package-level API shares
	// the same provider.
	otel.SetTracerProvider(provider)

	t.provider = provider
	t.tracer = provider.Tracer(t.config.ServiceName)
	t.shutdown = func(ctx context.Context) error {
		return provider.Shutdown(ctx)
	}
	t.initialized = true

	t.logger.Info("Tracing initialized successfully",
		zap.String("service", t.config.ServiceName),
		zap.Float64("sample_rate", t.config.SampleRate))
	return nil
}

// buildSpanProcessor returns a batching processor backed by an OTLP gRPC
// exporter when an endpoint is configured and the exporter can be created,
// and a simple processor backed by the no-op exporter otherwise.
func (t *Tracer) buildSpanProcessor(ctx context.Context) sdktrace.SpanProcessor {
	if t.config.Endpoint == "" {
		t.logger.Info("Using noop exporter (no endpoint configured)")
		return sdktrace.NewSimpleSpanProcessor(&noopExporter{})
	}

	// WithEndpoint takes a bare host:port target, so a URL-style value such
	// as "http://localhost:4317" must have its scheme removed first.
	target, secure := splitOTLPEndpoint(t.config.Endpoint)

	exporterOpts := []otlptracegrpc.Option{
		otlptracegrpc.WithEndpoint(target),
		otlptracegrpc.WithTimeout(time.Second * 10),
		otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{
			Enabled:         true,
			InitialInterval: time.Millisecond * 100,
			MaxInterval:     time.Second * 5,
			MaxElapsedTime:  time.Second * 30,
		}),
	}
	if !secure {
		// Plaintext transport unless the endpoint explicitly asked for
		// https; production deployments should configure TLS.
		exporterOpts = append(exporterOpts, otlptracegrpc.WithInsecure())
	}

	exporter, err := otlptracegrpc.New(ctx, exporterOpts...)
	if err != nil {
		t.logger.Warn("Failed to create OTLP exporter, using noop exporter",
			zap.Error(err),
			zap.String("endpoint", t.config.Endpoint))
		return sdktrace.NewSimpleSpanProcessor(&noopExporter{})
	}

	// Batch exports to keep span export off the request path.
	processor := sdktrace.NewBatchSpanProcessor(exporter,
		sdktrace.WithBatchTimeout(time.Second*5),
		sdktrace.WithMaxExportBatchSize(512),
		sdktrace.WithMaxQueueSize(2048),
		sdktrace.WithExportTimeout(time.Second*30),
	)
	t.logger.Info("OTLP exporter initialized successfully",
		zap.String("endpoint", t.config.Endpoint))
	return processor
}

// splitOTLPEndpoint strips an optional "http://" or "https://" scheme and any
// trailing slashes from endpoint, returning the remaining target and whether
// the scheme was https. Values without a scheme are returned as-is with
// secure == false. The prefix checks are written out by hand so that this
// code needs no import beyond those the file already has.
func splitOTLPEndpoint(endpoint string) (target string, secure bool) {
	const (
		plainScheme  = "http://"
		secureScheme = "https://"
	)

	target = endpoint
	switch {
	case len(target) >= len(secureScheme) && target[:len(secureScheme)] == secureScheme:
		target, secure = target[len(secureScheme):], true
	case len(target) >= len(plainScheme) && target[:len(plainScheme)] == plainScheme:
		target = target[len(plainScheme):]
	}

	for len(target) > 0 && target[len(target)-1] == '/' {
		target = target[:len(target)-1]
	}
	return target, secure
}
// StartSpan starts a span named name as a child of whatever span ctx carries
// and returns the derived context together with the new span.
//
// When the tracer is not initialized or tracing is disabled, ctx is returned
// unchanged together with a non-recording placeholder span that carries the
// same span context as ctx. Callers in this file always defer span.End() on
// the result; handing back the caller's own active span (as was done before)
// would let that deferred End, and any status or attribute calls, land on a
// span this function never started.
func (t *Tracer) StartSpan(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) {
	if !t.initialized || !t.config.Enabled {
		// ContextWithSpanContext wraps the span context in a non-recording
		// span, so End/SetStatus/SetAttributes on the placeholder are no-ops.
		placeholder := trace.SpanFromContext(
			trace.ContextWithSpanContext(ctx, trace.SpanContextFromContext(ctx)),
		)
		return ctx, placeholder
	}
	return t.tracer.Start(ctx, name, opts...)
}
// StartHTTPSpan starts a server-kind span for an incoming HTTP request.
//
// The span is normally named "<method> <path>". If ctx already carries a
// value under the "otel_error_request" key, the span is named "error"
// instead, which is intended to match an error-operation sampling strategy
// configured in Jaeger. When that key is absent, the chosen span name is also
// stored in the returned context under "otel_original_operation".
//
// NOTE(review): both context keys are plain strings; they are kept as-is
// because code outside this block presumably looks them up by the same
// strings — confirm before switching to an unexported key type.
func (t *Tracer) StartHTTPSpan(ctx context.Context, method, path string) (context.Context, trace.Span) {
	operation := "error"
	if ctx.Value("otel_error_request") == nil {
		operation = fmt.Sprintf("%s %s", method, path)
	}

	httpAttrs := trace.WithAttributes(
		attribute.String("http.method", method),
		attribute.String("http.route", path),
	)
	spanCtx, span := t.StartSpan(ctx, operation, trace.WithSpanKind(trace.SpanKindServer), httpAttrs)

	// Remember the operation name so it is available later in the request.
	if spanCtx.Value("otel_error_request") == nil {
		spanCtx = context.WithValue(spanCtx, "otel_original_operation", operation)
	}
	return spanCtx, span
}
// StartDBSpan starts a client-kind span named "db.<operation>.<table>" and
// tags it with the operation, the table and a fixed "postgresql" db.system.
func (t *Tracer) StartDBSpan(ctx context.Context, operation, table string) (context.Context, trace.Span) {
	dbAttrs := []attribute.KeyValue{
		attribute.String("db.operation", operation),
		attribute.String("db.table", table),
		attribute.String("db.system", "postgresql"),
	}
	return t.StartSpan(
		ctx,
		"db."+operation+"."+table,
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(dbAttrs...),
	)
}
// StartCacheSpan starts a client-kind span named "cache.<operation>" and
// tags it with the operation and a fixed "redis" cache.system.
//
// NOTE(review): key is accepted but not recorded on the span; presumably
// omitted to keep cache keys out of trace data — confirm.
func (t *Tracer) StartCacheSpan(ctx context.Context, operation, key string) (context.Context, trace.Span) {
	cacheAttrs := []attribute.KeyValue{
		attribute.String("cache.operation", operation),
		attribute.String("cache.system", "redis"),
	}
	return t.StartSpan(
		ctx,
		"cache."+operation,
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(cacheAttrs...),
	)
}
// StartExternalAPISpan starts a client-kind span named
// "api.<service>.<operation>" for a call to an external API and tags it with
// the service and operation names.
func (t *Tracer) StartExternalAPISpan(ctx context.Context, service, operation string) (context.Context, trace.Span) {
	apiAttrs := []attribute.KeyValue{
		attribute.String("api.service", service),
		attribute.String("api.operation", operation),
	}
	return t.StartSpan(
		ctx,
		"api."+service+"."+operation,
		trace.WithSpanKind(trace.SpanKindClient),
		trace.WithAttributes(apiAttrs...),
	)
}
// AddSpanAttributes sets attrs on span, skipping the call entirely when the
// span is not recording.
func (t *Tracer) AddSpanAttributes(span trace.Span, attrs ...attribute.KeyValue) {
	if !span.IsRecording() {
		return
	}
	span.SetAttributes(attrs...)
}
// SetSpanError marks span as failed with err: it sets the Error status with
// the error text, records the error as a span event, adds the
// "error.operation" / "operation.type" marker attributes, and logs the error
// together with the span's trace and span IDs.
//
// It does nothing when err is nil (there is no failure to record, and
// err.Error() would panic) or when the span is not recording.
func (t *Tracer) SetSpanError(span trace.Span, err error) {
	if err == nil || !span.IsRecording() {
		return
	}

	span.SetStatus(codes.Error, err.Error())
	span.RecordError(err)

	// The span name is left unchanged; the failure is flagged through these
	// attributes, which are intended to match the error-operation sampling
	// strategy configured in Jaeger.
	span.SetAttributes(
		attribute.String("error.operation", "true"),
		attribute.String("operation.type", "error"),
	)

	if t.logger == nil {
		return
	}

	// Include the IDs so the log line can be correlated with the trace; they
	// stay empty when the span context is not valid.
	var traceID, spanID string
	if sc := span.SpanContext(); sc.IsValid() {
		traceID = sc.TraceID().String()
		spanID = sc.SpanID().String()
	}
	t.logger.Error("操作发生错误",
		zap.Error(err),
		zap.String("trace_id", traceID),
		zap.String("span_id", spanID),
	)
}
// SetSpanSuccess sets the Ok status on span; it is a no-op for spans that
// are not recording.
func (t *Tracer) SetSpanSuccess(span trace.Span) {
	if !span.IsRecording() {
		return
	}
	span.SetStatus(codes.Ok, "success")
}
// SetHTTPStatus records statusCode on span as "http.status_code" and derives
// the span status from it.
//
// Codes below 400 set the Ok status. Codes of 400 and above (both 4xx and
// 5xx) set the Error status with the description "HTTP <code>", add the
// "error.operation" / "operation.type" marker attributes intended to match
// the Jaeger error sampling strategy, and emit a warning log carrying the
// trace and span IDs. Nothing happens when the span is not recording.
func (t *Tracer) SetHTTPStatus(span trace.Span, statusCode int) {
	if !span.IsRecording() {
		return
	}

	span.SetAttributes(attribute.Int("http.status_code", statusCode))

	if statusCode < 400 {
		span.SetStatus(codes.Ok, "success")
		return
	}

	span.SetStatus(codes.Error, fmt.Sprintf("HTTP %d", statusCode))
	span.SetAttributes(
		attribute.String("error.operation", "true"),
		attribute.String("operation.type", "error"),
	)

	if t.logger == nil {
		return
	}
	// Wrap the span in a context so the ID helpers can be reused.
	spanCtx := trace.ContextWithSpan(context.Background(), span)
	t.logger.Warn("HTTP请求错误",
		zap.Int("status_code", statusCode),
		zap.String("trace_id", t.GetTraceID(spanCtx)),
		zap.String("span_id", t.GetSpanID(spanCtx)),
	)
}
// GetTraceID returns the trace ID of the span carried by ctx as a string, or
// the empty string when ctx holds no valid span context.
func (t *Tracer) GetTraceID(ctx context.Context) string {
	sc := trace.SpanFromContext(ctx).SpanContext()
	if !sc.IsValid() {
		return ""
	}
	return sc.TraceID().String()
}
// GetSpanID returns the span ID of the span carried by ctx as a string, or
// the empty string when ctx holds no valid span context.
func (t *Tracer) GetSpanID(ctx context.Context) string {
	sc := trace.SpanFromContext(ctx).SpanContext()
	if !sc.IsValid() {
		return ""
	}
	return sc.SpanID().String()
}
// IsTracing reports whether ctx carries a span that has a valid span context
// and is currently recording.
func (t *Tracer) IsTracing(ctx context.Context) bool {
	current := trace.SpanFromContext(ctx)
	if !current.SpanContext().IsValid() {
		return false
	}
	return current.IsRecording()
}
// Shutdown stops the tracer provider created by Initialize.
//
// It returns nil immediately when the tracer was never initialized. If the
// provider fails to shut down, the error is logged and returned and the
// tracer stays marked as initialized; otherwise the initialized flag is
// cleared.
func (t *Tracer) Shutdown(ctx context.Context) error {
	t.mutex.Lock()
	defer t.mutex.Unlock()

	stop := t.shutdown
	if t.initialized && stop != nil {
		if err := stop(ctx); err != nil {
			t.logger.Error("Failed to shutdown tracer", zap.Error(err))
			return err
		}
		t.initialized = false
		t.logger.Info("Tracer shutdown successfully")
	}
	return nil
}
// GetStats returns a snapshot of the tracer's state: whether it is
// initialized, plus the configuration values it was created with. The read
// is performed under the tracer's read lock.
func (t *Tracer) GetStats() map[string]interface{} {
	t.mutex.RLock()
	defer t.mutex.RUnlock()

	cfg := t.config
	stats := make(map[string]interface{}, 7)
	stats["initialized"] = t.initialized
	stats["enabled"] = cfg.Enabled
	stats["service_name"] = cfg.ServiceName
	stats["service_version"] = cfg.ServiceVersion
	stats["environment"] = cfg.Environment
	stats["sample_rate"] = cfg.SampleRate
	stats["endpoint"] = cfg.Endpoint
	return stats
}
// 实现Service接口
// Name returns the fixed service name of this component, "tracer".
func (t *Tracer) Name() string {
	const serviceName = "tracer"
	return serviceName
}
// HealthCheck reports whether the tracer is in a usable state. A disabled
// tracer is always healthy; an enabled tracer is healthy only once it has
// been initialized. ctx is not consulted.
func (t *Tracer) HealthCheck(ctx context.Context) error {
	switch {
	case !t.config.Enabled:
		return nil
	case !t.initialized:
		return fmt.Errorf("tracer not initialized")
	default:
		return nil
	}
}
// noopExporter is a span exporter that discards everything it is given. It
// stands in for a real exporter when no endpoint is configured or the OTLP
// exporter cannot be created.
type noopExporter struct{}

// ExportSpans drops the spans and reports success.
func (*noopExporter) ExportSpans(_ context.Context, _ []sdktrace.ReadOnlySpan) error {
	return nil
}

// Shutdown has nothing to release and always succeeds.
func (*noopExporter) Shutdown(_ context.Context) error {
	return nil
}
// TraceMiddleware returns a gin middleware that wraps each request in a
// server span.
//
// When the tracer is not initialized or tracing is disabled, the request is
// passed straight through. Otherwise the middleware starts an HTTP span named
// after the method and matched route, exposes the trace ID in the
// "X-Trace-ID" response header, stores the span context on the request, runs
// the remaining handlers, and finally records the response status, response
// size and any errors the handlers attached to the gin context.
func (t *Tracer) TraceMiddleware() gin.HandlerFunc {
	return func(c *gin.Context) {
		if !t.initialized || !t.config.Enabled {
			c.Next()
			return
		}

		ctx, span := t.StartHTTPSpan(c.Request.Context(), c.Request.Method, c.FullPath())
		defer span.End()

		if traceID := t.GetTraceID(ctx); traceID != "" {
			c.Header("X-Trace-ID", traceID)
		}

		// Downstream handlers pick the span up from the request context.
		c.Request = c.Request.WithContext(ctx)

		c.Next()

		status := c.Writer.Status()
		handlerFailed := len(c.Errors) > 0

		// A span whose status is Ok ignores later attempts to set Error, so
		// do not let a sub-400 status mark the span Ok when the handlers
		// reported errors; SetSpanError below sets the status in that case.
		if status >= 400 || !handlerFailed {
			t.SetHTTPStatus(span, status)
		}

		t.AddSpanAttributes(span,
			attribute.Int("http.status_code", status),
			attribute.Int("http.response_size", c.Writer.Size()),
		)

		if handlerFailed {
			// Pass the message as an argument, never as the format string:
			// error text may contain '%' and would otherwise be mangled.
			t.SetSpanError(span, fmt.Errorf("%s", c.Errors.String()))
		}
	}
}
// GinTraceMiddleware is the older name for TraceMiddleware, kept so existing
// callers continue to compile. It returns exactly what TraceMiddleware does.
func (t *Tracer) GinTraceMiddleware() gin.HandlerFunc {
	handler := t.TraceMiddleware()
	return handler
}
// WithTracing is a free-function form of tracer.StartSpan: it starts a span
// called name on tracer and returns the derived context and the span. The
// caller is responsible for ending the span.
func WithTracing(ctx context.Context, tracer *Tracer, name string) (context.Context, trace.Span) {
	spanCtx, span := tracer.StartSpan(ctx, name)
	return spanCtx, span
}
// TraceFunction runs fn inside a span called name. fn receives the context
// that carries the new span. The span is marked as failed when fn returns an
// error and as successful otherwise, and is ended before TraceFunction
// returns. The error from fn is returned unchanged.
func (t *Tracer) TraceFunction(ctx context.Context, name string, fn func(context.Context) error) error {
	spanCtx, span := t.StartSpan(ctx, name)
	defer span.End()

	if err := fn(spanCtx); err != nil {
		t.SetSpanError(span, err)
		return err
	}

	t.SetSpanSuccess(span)
	return nil
}
// TraceFunctionWithResult is the value-returning counterpart of
// Tracer.TraceFunction: it runs fn inside a span called name, marks the span
// as failed or successful according to the error fn returns, ends the span,
// and hands back fn's result and error unchanged.
func TraceFunctionWithResult[T any](ctx context.Context, tracer *Tracer, name string, fn func(context.Context) (T, error)) (T, error) {
	spanCtx, span := tracer.StartSpan(ctx, name)
	defer span.End()

	value, err := fn(spanCtx)
	if err == nil {
		tracer.SetSpanSuccess(span)
	} else {
		tracer.SetSpanError(span, err)
	}
	return value, err
}