475 lines
12 KiB
Go
475 lines
12 KiB
Go
|
|
package tracing
|
|||
|
|
|
|||
|
|
import (
|
|||
|
|
"context"
|
|||
|
|
"fmt"
|
|||
|
|
"sync"
|
|||
|
|
"time"
|
|||
|
|
|
|||
|
|
"github.com/gin-gonic/gin"
|
|||
|
|
"go.opentelemetry.io/otel"
|
|||
|
|
"go.opentelemetry.io/otel/attribute"
|
|||
|
|
"go.opentelemetry.io/otel/codes"
|
|||
|
|
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
|
|||
|
|
"go.opentelemetry.io/otel/sdk/resource"
|
|||
|
|
sdktrace "go.opentelemetry.io/otel/sdk/trace"
|
|||
|
|
"go.opentelemetry.io/otel/trace"
|
|||
|
|
"go.uber.org/zap"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// TracerConfig holds the settings used to construct and initialize a Tracer.
type TracerConfig struct {
	// ServiceName is reported as the "service.name" resource attribute and is
	// also used as the name of the tracer obtained from the provider.
	ServiceName string
	// ServiceVersion is reported as the "service.version" resource attribute.
	ServiceVersion string
	// Environment is reported as the "environment" resource attribute.
	Environment string
	// Endpoint is the OTLP/gRPC collector address handed to
	// otlptracegrpc.WithEndpoint, i.e. "host:port" without a URL scheme.
	// When it is empty, Initialize falls back to a no-op exporter.
	Endpoint string
	// SampleRate is the ratio given to the trace-ID-ratio sampler.
	SampleRate float64
	// Enabled turns tracing on. When false, Initialize does nothing.
	Enabled bool
}

// DefaultTracerConfig returns the default tracer configuration: a development
// setup that exports to a collector on localhost and samples 10% of traces.
func DefaultTracerConfig() TracerConfig {
	return TracerConfig{
		ServiceName:    "tyapi-server",
		ServiceVersion: "1.0.0",
		Environment:    "development",
		// otlptracegrpc.WithEndpoint takes a bare host:port; a value with an
		// "http://" prefix is not a usable gRPC target.
		Endpoint:   "localhost:4317",
		SampleRate: 0.1,
		Enabled:    true,
	}
}
|
|||
|
|
|
|||
|
|
// Tracer 链路追踪器
|
|||
|
|
type Tracer struct {
|
|||
|
|
config TracerConfig
|
|||
|
|
logger *zap.Logger
|
|||
|
|
provider *sdktrace.TracerProvider
|
|||
|
|
tracer trace.Tracer
|
|||
|
|
mutex sync.RWMutex
|
|||
|
|
initialized bool
|
|||
|
|
shutdown func(context.Context) error
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// NewTracer 创建链路追踪器
|
|||
|
|
func NewTracer(config TracerConfig, logger *zap.Logger) *Tracer {
|
|||
|
|
return &Tracer{
|
|||
|
|
config: config,
|
|||
|
|
logger: logger,
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Initialize 初始化追踪器
|
|||
|
|
func (t *Tracer) Initialize(ctx context.Context) error {
|
|||
|
|
t.mutex.Lock()
|
|||
|
|
defer t.mutex.Unlock()
|
|||
|
|
|
|||
|
|
if t.initialized {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if !t.config.Enabled {
|
|||
|
|
t.logger.Info("Tracing is disabled")
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 创建资源
|
|||
|
|
res, err := resource.New(ctx,
|
|||
|
|
resource.WithAttributes(
|
|||
|
|
attribute.String("service.name", t.config.ServiceName),
|
|||
|
|
attribute.String("service.version", t.config.ServiceVersion),
|
|||
|
|
attribute.String("environment", t.config.Environment),
|
|||
|
|
),
|
|||
|
|
)
|
|||
|
|
if err != nil {
|
|||
|
|
return fmt.Errorf("failed to create resource: %w", err)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 创建采样器
|
|||
|
|
sampler := sdktrace.TraceIDRatioBased(t.config.SampleRate)
|
|||
|
|
|
|||
|
|
// 创建导出器
|
|||
|
|
var spanProcessor sdktrace.SpanProcessor
|
|||
|
|
if t.config.Endpoint != "" {
|
|||
|
|
// 使用OTLP gRPC导出器(支持Jaeger、Tempo等)
|
|||
|
|
exporter, err := otlptracegrpc.New(ctx,
|
|||
|
|
otlptracegrpc.WithEndpoint(t.config.Endpoint),
|
|||
|
|
otlptracegrpc.WithInsecure(), // 开发环境使用,生产环境应配置TLS
|
|||
|
|
otlptracegrpc.WithTimeout(time.Second*10),
|
|||
|
|
otlptracegrpc.WithRetry(otlptracegrpc.RetryConfig{
|
|||
|
|
Enabled: true,
|
|||
|
|
InitialInterval: time.Millisecond * 100,
|
|||
|
|
MaxInterval: time.Second * 5,
|
|||
|
|
MaxElapsedTime: time.Second * 30,
|
|||
|
|
}),
|
|||
|
|
)
|
|||
|
|
if err != nil {
|
|||
|
|
t.logger.Warn("Failed to create OTLP exporter, using noop exporter",
|
|||
|
|
zap.Error(err),
|
|||
|
|
zap.String("endpoint", t.config.Endpoint))
|
|||
|
|
spanProcessor = sdktrace.NewSimpleSpanProcessor(&noopExporter{})
|
|||
|
|
} else {
|
|||
|
|
// 在生产环境中使用批处理器以提高性能
|
|||
|
|
spanProcessor = sdktrace.NewBatchSpanProcessor(exporter,
|
|||
|
|
sdktrace.WithBatchTimeout(time.Second*5),
|
|||
|
|
sdktrace.WithMaxExportBatchSize(512),
|
|||
|
|
sdktrace.WithMaxQueueSize(2048),
|
|||
|
|
sdktrace.WithExportTimeout(time.Second*30),
|
|||
|
|
)
|
|||
|
|
t.logger.Info("OTLP exporter initialized successfully",
|
|||
|
|
zap.String("endpoint", t.config.Endpoint))
|
|||
|
|
}
|
|||
|
|
} else {
|
|||
|
|
// 如果没有配置端点,使用空导出器
|
|||
|
|
spanProcessor = sdktrace.NewSimpleSpanProcessor(&noopExporter{})
|
|||
|
|
t.logger.Info("Using noop exporter (no endpoint configured)")
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 创建TracerProvider
|
|||
|
|
provider := sdktrace.NewTracerProvider(
|
|||
|
|
sdktrace.WithResource(res),
|
|||
|
|
sdktrace.WithSampler(sampler),
|
|||
|
|
sdktrace.WithSpanProcessor(spanProcessor),
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// 设置全局TracerProvider
|
|||
|
|
otel.SetTracerProvider(provider)
|
|||
|
|
|
|||
|
|
// 创建Tracer
|
|||
|
|
tracer := provider.Tracer(t.config.ServiceName)
|
|||
|
|
|
|||
|
|
t.provider = provider
|
|||
|
|
t.tracer = tracer
|
|||
|
|
t.shutdown = func(ctx context.Context) error {
|
|||
|
|
return provider.Shutdown(ctx)
|
|||
|
|
}
|
|||
|
|
t.initialized = true
|
|||
|
|
|
|||
|
|
t.logger.Info("Tracing initialized successfully",
|
|||
|
|
zap.String("service", t.config.ServiceName),
|
|||
|
|
zap.Float64("sample_rate", t.config.SampleRate))
|
|||
|
|
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// StartSpan 开始一个新的span
|
|||
|
|
func (t *Tracer) StartSpan(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) {
|
|||
|
|
if !t.initialized || !t.config.Enabled {
|
|||
|
|
return ctx, trace.SpanFromContext(ctx)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return t.tracer.Start(ctx, name, opts...)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// StartHTTPSpan 开始一个HTTP span
|
|||
|
|
func (t *Tracer) StartHTTPSpan(ctx context.Context, method, path string) (context.Context, trace.Span) {
|
|||
|
|
spanName := fmt.Sprintf("%s %s", method, path)
|
|||
|
|
|
|||
|
|
// 检查是否已有错误标记,如果有则使用"error"作为操作名
|
|||
|
|
// 这样可以匹配Jaeger采样配置中的错误操作策略
|
|||
|
|
if ctx.Value("otel_error_request") != nil {
|
|||
|
|
spanName = "error"
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
ctx, span := t.StartSpan(ctx, spanName,
|
|||
|
|
trace.WithSpanKind(trace.SpanKindServer),
|
|||
|
|
trace.WithAttributes(
|
|||
|
|
attribute.String("http.method", method),
|
|||
|
|
attribute.String("http.route", path),
|
|||
|
|
),
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// 保存原始操作名,以便在错误发生时可以更新
|
|||
|
|
if ctx.Value("otel_error_request") == nil {
|
|||
|
|
ctx = context.WithValue(ctx, "otel_original_operation", spanName)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return ctx, span
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// StartDBSpan 开始一个数据库span
|
|||
|
|
func (t *Tracer) StartDBSpan(ctx context.Context, operation, table string) (context.Context, trace.Span) {
|
|||
|
|
spanName := fmt.Sprintf("db.%s.%s", operation, table)
|
|||
|
|
|
|||
|
|
return t.StartSpan(ctx, spanName,
|
|||
|
|
trace.WithSpanKind(trace.SpanKindClient),
|
|||
|
|
trace.WithAttributes(
|
|||
|
|
attribute.String("db.operation", operation),
|
|||
|
|
attribute.String("db.table", table),
|
|||
|
|
attribute.String("db.system", "postgresql"),
|
|||
|
|
),
|
|||
|
|
)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// StartCacheSpan 开始一个缓存span
|
|||
|
|
func (t *Tracer) StartCacheSpan(ctx context.Context, operation, key string) (context.Context, trace.Span) {
|
|||
|
|
spanName := fmt.Sprintf("cache.%s", operation)
|
|||
|
|
|
|||
|
|
return t.StartSpan(ctx, spanName,
|
|||
|
|
trace.WithSpanKind(trace.SpanKindClient),
|
|||
|
|
trace.WithAttributes(
|
|||
|
|
attribute.String("cache.operation", operation),
|
|||
|
|
attribute.String("cache.system", "redis"),
|
|||
|
|
),
|
|||
|
|
)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// StartExternalAPISpan 开始一个外部API调用span
|
|||
|
|
func (t *Tracer) StartExternalAPISpan(ctx context.Context, service, operation string) (context.Context, trace.Span) {
|
|||
|
|
spanName := fmt.Sprintf("api.%s.%s", service, operation)
|
|||
|
|
|
|||
|
|
return t.StartSpan(ctx, spanName,
|
|||
|
|
trace.WithSpanKind(trace.SpanKindClient),
|
|||
|
|
trace.WithAttributes(
|
|||
|
|
attribute.String("api.service", service),
|
|||
|
|
attribute.String("api.operation", operation),
|
|||
|
|
),
|
|||
|
|
)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// AddSpanAttributes 添加span属性
|
|||
|
|
func (t *Tracer) AddSpanAttributes(span trace.Span, attrs ...attribute.KeyValue) {
|
|||
|
|
if span.IsRecording() {
|
|||
|
|
span.SetAttributes(attrs...)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// SetSpanError 设置span错误
|
|||
|
|
func (t *Tracer) SetSpanError(span trace.Span, err error) {
|
|||
|
|
if span.IsRecording() {
|
|||
|
|
span.SetStatus(codes.Error, err.Error())
|
|||
|
|
span.RecordError(err)
|
|||
|
|
|
|||
|
|
// 将span操作名更新为"error",以匹配Jaeger采样配置
|
|||
|
|
// 注意:这是一种变通方法,因为OpenTelemetry不支持直接更改span名称
|
|||
|
|
// 我们通过添加特殊属性来标识这是一个错误span
|
|||
|
|
span.SetAttributes(
|
|||
|
|
attribute.String("error.operation", "true"),
|
|||
|
|
attribute.String("operation.type", "error"),
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// 记录错误日志,包含trace ID便于关联
|
|||
|
|
if t.logger != nil {
|
|||
|
|
ctx := trace.ContextWithSpan(context.Background(), span)
|
|||
|
|
t.logger.Error("操作发生错误",
|
|||
|
|
zap.Error(err),
|
|||
|
|
zap.String("trace_id", t.GetTraceID(ctx)),
|
|||
|
|
zap.String("span_id", t.GetSpanID(ctx)),
|
|||
|
|
)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// SetSpanSuccess 设置span成功
|
|||
|
|
func (t *Tracer) SetSpanSuccess(span trace.Span) {
|
|||
|
|
if span.IsRecording() {
|
|||
|
|
span.SetStatus(codes.Ok, "success")
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// SetHTTPStatus 根据HTTP状态码设置span状态
|
|||
|
|
func (t *Tracer) SetHTTPStatus(span trace.Span, statusCode int) {
|
|||
|
|
if !span.IsRecording() {
|
|||
|
|
return
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 添加HTTP状态码属性
|
|||
|
|
span.SetAttributes(attribute.Int("http.status_code", statusCode))
|
|||
|
|
|
|||
|
|
// 对于4xx和5xx错误,标记为错误并应用错误采样策略
|
|||
|
|
if statusCode >= 400 {
|
|||
|
|
errorMsg := fmt.Sprintf("HTTP %d", statusCode)
|
|||
|
|
span.SetStatus(codes.Error, errorMsg)
|
|||
|
|
|
|||
|
|
// 添加错误操作标记,以匹配Jaeger采样配置
|
|||
|
|
span.SetAttributes(
|
|||
|
|
attribute.String("error.operation", "true"),
|
|||
|
|
attribute.String("operation.type", "error"),
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// 记录HTTP错误
|
|||
|
|
if t.logger != nil {
|
|||
|
|
ctx := trace.ContextWithSpan(context.Background(), span)
|
|||
|
|
t.logger.Warn("HTTP请求错误",
|
|||
|
|
zap.Int("status_code", statusCode),
|
|||
|
|
zap.String("trace_id", t.GetTraceID(ctx)),
|
|||
|
|
zap.String("span_id", t.GetSpanID(ctx)),
|
|||
|
|
)
|
|||
|
|
}
|
|||
|
|
} else {
|
|||
|
|
span.SetStatus(codes.Ok, "success")
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// GetTraceID 获取当前上下文的trace ID
|
|||
|
|
func (t *Tracer) GetTraceID(ctx context.Context) string {
|
|||
|
|
span := trace.SpanFromContext(ctx)
|
|||
|
|
if span.SpanContext().IsValid() {
|
|||
|
|
return span.SpanContext().TraceID().String()
|
|||
|
|
}
|
|||
|
|
return ""
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// GetSpanID 获取当前上下文的span ID
|
|||
|
|
func (t *Tracer) GetSpanID(ctx context.Context) string {
|
|||
|
|
span := trace.SpanFromContext(ctx)
|
|||
|
|
if span.SpanContext().IsValid() {
|
|||
|
|
return span.SpanContext().SpanID().String()
|
|||
|
|
}
|
|||
|
|
return ""
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// IsTracing 检查是否正在追踪
|
|||
|
|
func (t *Tracer) IsTracing(ctx context.Context) bool {
|
|||
|
|
span := trace.SpanFromContext(ctx)
|
|||
|
|
return span.SpanContext().IsValid() && span.IsRecording()
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Shutdown 关闭追踪器
|
|||
|
|
func (t *Tracer) Shutdown(ctx context.Context) error {
|
|||
|
|
t.mutex.Lock()
|
|||
|
|
defer t.mutex.Unlock()
|
|||
|
|
|
|||
|
|
if !t.initialized || t.shutdown == nil {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
err := t.shutdown(ctx)
|
|||
|
|
if err != nil {
|
|||
|
|
t.logger.Error("Failed to shutdown tracer", zap.Error(err))
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
t.initialized = false
|
|||
|
|
t.logger.Info("Tracer shutdown successfully")
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// GetStats 获取追踪统计信息
|
|||
|
|
func (t *Tracer) GetStats() map[string]interface{} {
|
|||
|
|
t.mutex.RLock()
|
|||
|
|
defer t.mutex.RUnlock()
|
|||
|
|
|
|||
|
|
return map[string]interface{}{
|
|||
|
|
"initialized": t.initialized,
|
|||
|
|
"enabled": t.config.Enabled,
|
|||
|
|
"service_name": t.config.ServiceName,
|
|||
|
|
"service_version": t.config.ServiceVersion,
|
|||
|
|
"environment": t.config.Environment,
|
|||
|
|
"sample_rate": t.config.SampleRate,
|
|||
|
|
"endpoint": t.config.Endpoint,
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Service interface implementation.
|
|||
|
|
|
|||
|
|
// Name 返回服务名称
|
|||
|
|
func (t *Tracer) Name() string {
|
|||
|
|
return "tracer"
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// HealthCheck 健康检查
|
|||
|
|
func (t *Tracer) HealthCheck(ctx context.Context) error {
|
|||
|
|
if !t.config.Enabled {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if !t.initialized {
|
|||
|
|
return fmt.Errorf("tracer not initialized")
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// noopExporter 简单的无操作导出器(用于演示)
|
|||
|
|
type noopExporter struct{}
|
|||
|
|
|
|||
|
|
func (e *noopExporter) ExportSpans(ctx context.Context, spans []sdktrace.ReadOnlySpan) error {
|
|||
|
|
// 在实际应用中,这里应该将spans发送到Jaeger或其他追踪系统
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func (e *noopExporter) Shutdown(ctx context.Context) error {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// TraceMiddleware 追踪中间件工厂
|
|||
|
|
func (t *Tracer) TraceMiddleware() gin.HandlerFunc {
|
|||
|
|
return func(c *gin.Context) {
|
|||
|
|
if !t.initialized || !t.config.Enabled {
|
|||
|
|
c.Next()
|
|||
|
|
return
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 开始HTTP span
|
|||
|
|
ctx, span := t.StartHTTPSpan(c.Request.Context(), c.Request.Method, c.FullPath())
|
|||
|
|
defer span.End()
|
|||
|
|
|
|||
|
|
// 将trace ID添加到响应头
|
|||
|
|
traceID := t.GetTraceID(ctx)
|
|||
|
|
if traceID != "" {
|
|||
|
|
c.Header("X-Trace-ID", traceID)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// 将span上下文存储到gin上下文
|
|||
|
|
c.Request = c.Request.WithContext(ctx)
|
|||
|
|
|
|||
|
|
// 处理请求
|
|||
|
|
c.Next()
|
|||
|
|
|
|||
|
|
// 设置HTTP状态码
|
|||
|
|
t.SetHTTPStatus(span, c.Writer.Status())
|
|||
|
|
|
|||
|
|
// 添加响应信息
|
|||
|
|
t.AddSpanAttributes(span,
|
|||
|
|
attribute.Int("http.status_code", c.Writer.Status()),
|
|||
|
|
attribute.Int("http.response_size", c.Writer.Size()),
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// 添加错误信息
|
|||
|
|
if len(c.Errors) > 0 {
|
|||
|
|
errMsg := c.Errors.String()
|
|||
|
|
t.SetSpanError(span, fmt.Errorf(errMsg))
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// GinTraceMiddleware 兼容旧的方法名,保持向后兼容
|
|||
|
|
func (t *Tracer) GinTraceMiddleware() gin.HandlerFunc {
|
|||
|
|
return t.TraceMiddleware()
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// WithTracing 添加追踪到上下文的辅助函数
|
|||
|
|
func WithTracing(ctx context.Context, tracer *Tracer, name string) (context.Context, trace.Span) {
|
|||
|
|
return tracer.StartSpan(ctx, name)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// TraceFunction 追踪函数执行的辅助函数
|
|||
|
|
func (t *Tracer) TraceFunction(ctx context.Context, name string, fn func(context.Context) error) error {
|
|||
|
|
ctx, span := t.StartSpan(ctx, name)
|
|||
|
|
defer span.End()
|
|||
|
|
|
|||
|
|
err := fn(ctx)
|
|||
|
|
if err != nil {
|
|||
|
|
t.SetSpanError(span, err)
|
|||
|
|
} else {
|
|||
|
|
t.SetSpanSuccess(span)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return err
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// TraceFunctionWithResult 追踪带返回值的函数执行
|
|||
|
|
func TraceFunctionWithResult[T any](ctx context.Context, tracer *Tracer, name string, fn func(context.Context) (T, error)) (T, error) {
|
|||
|
|
ctx, span := tracer.StartSpan(ctx, name)
|
|||
|
|
defer span.End()
|
|||
|
|
|
|||
|
|
result, err := fn(ctx)
|
|||
|
|
if err != nil {
|
|||
|
|
tracer.SetSpanError(span, err)
|
|||
|
|
} else {
|
|||
|
|
tracer.SetSpanSuccess(span)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return result, err
|
|||
|
|
}
|