tianyuan-api-server/go-zero错误分级与链路追踪设计.md
2025-07-13 20:37:12 +08:00

21 KiB
Raw Permalink Blame History

go-zero 错误分级与链路追踪设计

1. 错误分级体系

1.1 错误等级定义

// shared/errcode/levels.go
package errcode

// ErrorLevel ranks how severe an error is. Values are ordered, so levels can
// be compared with <, >= and friends.
type ErrorLevel int

// Severity levels, from least to most severe.
const (
    // LevelDebug marks development/debugging information.
    LevelDebug ErrorLevel = iota
    // LevelInfo marks ordinary business information.
    LevelInfo
    // LevelWarn marks something worth attention that does not affect the business flow.
    LevelWarn
    // LevelError marks a business error that has to be handled.
    LevelError
    // LevelFatal marks a system-level error that affects the service.
    LevelFatal
    // LevelPanic marks a severe error that leaves the service unavailable.
    LevelPanic
)

// LevelNames maps every defined ErrorLevel to its upper-case display name.
var LevelNames = map[ErrorLevel]string{
    LevelDebug: "DEBUG",
    LevelInfo:  "INFO",
    LevelWarn:  "WARN",
    LevelError: "ERROR",
    LevelFatal: "FATAL",
    LevelPanic: "PANIC",
}

// String returns the display name registered in LevelNames, or "UNKNOWN" for
// a level that has no entry there.
func (l ErrorLevel) String() string {
    name, known := LevelNames[l]
    if !known {
        return "UNKNOWN"
    }
    return name
}

1.2 错误分类体系

// shared/errcode/types.go
package errcode

import (
    "fmt"
    "time"
)

// ErrorType classifies an error by where it originated.
type ErrorType string

// System-level error types.
const (
    ErrorTypeSystem   ErrorType = "SYSTEM"   // generic system error
    ErrorTypeNetwork  ErrorType = "NETWORK"  // network error
    ErrorTypeDatabase ErrorType = "DATABASE" // database error
    ErrorTypeRedis    ErrorType = "REDIS"    // Redis error
    ErrorTypeMQ       ErrorType = "MQ"       // message-queue error
    ErrorTypeRPC      ErrorType = "RPC"      // RPC call error
)

// Business-level error types.
const (
    ErrorTypeBusiness   ErrorType = "BUSINESS"   // business-logic error
    ErrorTypeValidation ErrorType = "VALIDATION" // parameter validation error
    ErrorTypeAuth       ErrorType = "AUTH"       // authentication/authorization error
    ErrorTypePermission ErrorType = "PERMISSION" // permission error
)

// Client-side error types.
const (
    ErrorTypeParam    ErrorType = "PARAM"    // parameter error
    ErrorTypeRequest  ErrorType = "REQUEST"  // request error
    ErrorTypeResponse ErrorType = "RESPONSE" // response error
)

// AppError is the unified error value shared across services. Everything
// except Cause is serialized to JSON under the tag shown on each field.
type AppError struct {
    Code      string      `json:"code"`       // error code
    Message   string      `json:"message"`    // human-readable message
    Level     ErrorLevel  `json:"level"`      // severity
    Type      ErrorType   `json:"type"`       // category
    TraceId   string      `json:"trace_id"`   // trace ID of the request
    SpanId    string      `json:"span_id"`    // span ID of the request
    Service   string      `json:"service"`    // name of the service that produced the error
    Method    string      `json:"method"`     // name of the method that produced the error
    Timestamp time.Time   `json:"timestamp"`  // creation time
    Details   interface{} `json:"details"`    // extra structured detail
    Stack     string      `json:"stack"`      // stack trace (only for error level and above)
    Cause     error       `json:"-"`          // underlying error; never serialized
}

// Error implements the error interface, rendering the error as
// "[LEVEL][TYPE][CODE] service: message".
func (e *AppError) Error() string {
    const layout = "[%s][%s][%s] %s: %s"
    levelName := e.Level.String()
    return fmt.Sprintf(layout, levelName, e.Type, e.Code, e.Service, e.Message)
}

// Unwrap exposes the underlying cause so errors.Is / errors.As can walk the chain.
func (e *AppError) Unwrap() error {
    return e.Cause
}

1.3 错误构造器

// shared/errcode/builder.go
package errcode

import (
    "context"
    "runtime"
    "time"

    "github.com/zeromicro/go-zero/core/trace"
)

// ErrorBuilder creates AppError values pre-filled with the service and method
// that produced them.
type ErrorBuilder struct {
    service string
    method  string
    // ctx is the request context that trace/span IDs are read from. It is nil
    // until the builder is bound with WithContext.
    ctx context.Context
}

// NewErrorBuilder returns a builder for the given service and method names.
// Errors built from it carry no trace information until the builder is bound
// to a request context via WithContext.
func NewErrorBuilder(service, method string) *ErrorBuilder {
    return &ErrorBuilder{
        service: service,
        method:  method,
    }
}

// WithContext returns a copy of the builder bound to ctx. Errors created from
// the returned builder have TraceId and SpanId filled from ctx. The receiver
// is left untouched, so a shared builder can be bound per request.
func (b *ErrorBuilder) WithContext(ctx context.Context) *ErrorBuilder {
    bound := *b
    bound.ctx = ctx
    return &bound
}

// Debug builds a debug-level system error.
func (b *ErrorBuilder) Debug(code, message string) *AppError {
    return b.buildError(LevelDebug, ErrorTypeSystem, code, message, nil, nil)
}

// Info builds an info-level system error.
func (b *ErrorBuilder) Info(code, message string) *AppError {
    return b.buildError(LevelInfo, ErrorTypeSystem, code, message, nil, nil)
}

// Warn builds a warn-level error of the given type.
func (b *ErrorBuilder) Warn(errorType ErrorType, code, message string) *AppError {
    return b.buildError(LevelWarn, errorType, code, message, nil, nil)
}

// Error builds an error-level error of the given type wrapping cause.
func (b *ErrorBuilder) Error(errorType ErrorType, code, message string, cause error) *AppError {
    return b.buildError(LevelError, errorType, code, message, cause, nil)
}

// Fatal builds a fatal-level error of the given type wrapping cause.
func (b *ErrorBuilder) Fatal(errorType ErrorType, code, message string, cause error) *AppError {
    return b.buildError(LevelFatal, errorType, code, message, cause, nil)
}

// Panic builds a panic-level error of the given type wrapping cause. It only
// constructs the value; it does not call the built-in panic.
func (b *ErrorBuilder) Panic(errorType ErrorType, code, message string, cause error) *AppError {
    return b.buildError(LevelPanic, errorType, code, message, cause, nil)
}

// BusinessError builds an error-level business error (common shortcut).
func (b *ErrorBuilder) BusinessError(code, message string) *AppError {
    return b.buildError(LevelError, ErrorTypeBusiness, code, message, nil, nil)
}

// ValidationError builds a warn-level validation error carrying details
// (common shortcut).
func (b *ErrorBuilder) ValidationError(code, message string, details interface{}) *AppError {
    return b.buildError(LevelWarn, ErrorTypeValidation, code, message, nil, details)
}

// PermissionError builds a warn-level permission error (common shortcut).
func (b *ErrorBuilder) PermissionError(code, message string) *AppError {
    return b.buildError(LevelWarn, ErrorTypePermission, code, message, nil, nil)
}

// SystemError builds a fatal-level system error wrapping cause (common shortcut).
func (b *ErrorBuilder) SystemError(code, message string, cause error) *AppError {
    return b.buildError(LevelFatal, ErrorTypeSystem, code, message, cause, nil)
}

// buildError assembles the AppError shared by all public constructors: it
// stamps the builder's service/method and the current time, attaches trace
// identifiers when a context is bound, and captures a stack trace for
// LevelError and above.
func (b *ErrorBuilder) buildError(level ErrorLevel, errorType ErrorType, code, message string, cause error, details interface{}) *AppError {
    appErr := &AppError{
        Code:      code,
        Message:   message,
        Level:     level,
        Type:      errorType,
        Service:   b.service,
        Method:    b.method,
        Timestamp: time.Now(),
        Details:   details,
        Cause:     cause,
    }

    // Trace identifiers can only come from a request context. The builder has
    // one only after WithContext; without it both fields stay empty.
    if b.ctx != nil {
        appErr.TraceId = trace.TraceIDFromContext(b.ctx)
        appErr.SpanId = trace.SpanIDFromContext(b.ctx)
    }

    // Stack traces are only worth their cost for error level and above.
    if level >= LevelError {
        appErr.Stack = getStackTrace()
    }

    return appErr
}

// getStackTrace returns the formatted stack trace of the calling goroutine.
//
// runtime.Stack silently truncates its output to the buffer it is given, so
// the buffer is doubled until the trace fits. Growth stops at maxStackSize so
// a pathological stack cannot bloat the error payload; a trace larger than
// that is returned truncated.
func getStackTrace() string {
    const (
        initialStackSize = 4 << 10
        maxStackSize     = 64 << 10
    )

    for size := initialStackSize; ; size *= 2 {
        buf := make([]byte, size)
        n := runtime.Stack(buf, false)
        // n < size means the whole trace fit with room to spare.
        if n < size || size >= maxStackSize {
            return string(buf[:n])
        }
    }
}

2. 链路追踪集成

2.1 链路追踪配置

# etc/client-api.yaml
Name: client-api
Host: 0.0.0.0
Port: 8080

# Tracing configuration
Telemetry:
    Name: client-api
    Endpoint: http://jaeger:14268/api/traces
    Sampler: 1.0
    Batcher: jaeger

# Logging configuration
Log:
    ServiceName: client-api
    Mode: file
    Level: info
    Path: logs
    # MaxSize / MaxBackups are only honoured by go-zero when rotating by size.
    Rotation: size
    MaxSize: 100
    # go-zero's LogConf has no MaxAge key; retention in days is KeepDays.
    KeepDays: 7
    MaxBackups: 5
    Compress: true

2.2 链路追踪中间件

// shared/middleware/trace_middleware.go
package middleware

import (
    "context"
    "net/http"
    "github.com/zeromicro/go-zero/core/trace"
    "github.com/zeromicro/go-zero/rest/httpx"
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/codes"
    "tianyuan/shared/errcode"
)

// TraceMiddleware returns an HTTP middleware that wraps every request in an
// OpenTelemetry span named after the request path.
//
// The span records the HTTP method, URL, user agent and service name up
// front, and the response status code once the downstream handler returns.
// The span context is injected into the request so downstream handlers can
// read the trace and span IDs.
func TraceMiddleware(serviceName string) func(http.Handler) http.Handler {
    return func(next http.Handler) http.Handler {
        return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
            tracer := otel.Tracer(serviceName)

            // Start the span; the deferred End also runs if the handler panics.
            ctx, span := tracer.Start(r.Context(), r.URL.Path)
            defer span.End()

            span.SetAttributes(
                attribute.String("http.method", r.Method),
                attribute.String("http.url", r.URL.String()),
                attribute.String("http.user_agent", r.UserAgent()),
                attribute.String("service.name", serviceName),
            )

            // Make the span visible to downstream handlers.
            r = r.WithContext(ctx)

            // Wrap the writer so the status code can be observed afterwards.
            wrapper := &responseWrapper{
                ResponseWriter: w,
                statusCode:     http.StatusOK,
            }

            next.ServeHTTP(wrapper, r)

            span.SetAttributes(
                attribute.Int("http.status_code", wrapper.statusCode),
            )

            // On the server side only 5xx means this service failed. A 4xx is
            // a client mistake and, per the OpenTelemetry HTTP conventions,
            // must not mark the server span as an error; otherwise bad
            // requests pollute error dashboards.
            if wrapper.statusCode >= http.StatusInternalServerError {
                span.SetStatus(codes.Error, http.StatusText(wrapper.statusCode))
            }
        })
    }
}

// responseWrapper decorates an http.ResponseWriter so middleware can read the
// status code that was sent to the client after the handler returns.
//
// Callers initialise statusCode to http.StatusOK, which is what net/http
// sends when a handler writes a body without calling WriteHeader.
type responseWrapper struct {
    http.ResponseWriter
    statusCode  int
    wroteHeader bool // true once the final (non-1xx) status has been recorded
}

// WriteHeader records the first final status code and forwards the call.
//
// net/http only sends the first final status; later calls are ignored by the
// underlying writer. Recording every call would therefore report a status the
// client never saw. Informational 1xx codes (other than 101) may precede the
// final status and are forwarded without being recorded.
func (w *responseWrapper) WriteHeader(statusCode int) {
    informational := statusCode >= 100 && statusCode < 200 &&
        statusCode != http.StatusSwitchingProtocols
    if !w.wroteHeader && !informational {
        w.statusCode = statusCode
        w.wroteHeader = true
    }
    w.ResponseWriter.WriteHeader(statusCode)
}

// Write forwards the body bytes. A Write before any WriteHeader implicitly
// sends 200 OK, so the status is pinned here to keep a later WriteHeader call
// from overwriting it.
func (w *responseWrapper) Write(b []byte) (int, error) {
    if !w.wroteHeader {
        w.statusCode = http.StatusOK
        w.wroteHeader = true
    }
    return w.ResponseWriter.Write(b)
}

2.3 RPC 链路追踪拦截器

// shared/interceptor/trace_interceptor.go
package interceptor

import (
    "context"
    "google.golang.org/grpc"
    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/codes"
    "tianyuan/shared/errcode"
)

// TraceClientInterceptor returns a gRPC unary client interceptor that runs
// every outgoing call inside an OpenTelemetry span named after the RPC method.
//
// The span carries the rpc.method, rpc.service and rpc.system attributes. When
// the call fails, the span status is set to Error and the gRPC status code is
// recorded as rpc.grpc.status_code. The call's error is returned unchanged.
//
// NOTE(review): `codes` below must resolve to go.opentelemetry.io/otel/codes.
// This file also imports google.golang.org/grpc/codes under the same name;
// one of the two imports needs an alias (or removal) for the file to compile.
func TraceClientInterceptor(serviceName string) grpc.UnaryClientInterceptor {
    return func(ctx context.Context, method string, req, reply interface{},
        cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {

        spanCtx, span := otel.Tracer(serviceName).Start(ctx, method)
        defer span.End()

        span.SetAttributes(
            attribute.String("rpc.method", method),
            attribute.String("rpc.service", serviceName),
            attribute.String("rpc.system", "grpc"),
        )

        // Invoke the RPC with the span-carrying context.
        callErr := invoker(spanCtx, method, req, reply, cc, opts...)
        if callErr == nil {
            return nil
        }

        // Failure: flag the span and record the gRPC status code.
        span.SetStatus(codes.Error, callErr.Error())
        grpcCode := status.Code(callErr).String()
        span.SetAttributes(
            attribute.String("rpc.grpc.status_code", grpcCode),
        )

        return callErr
    }
}

// TraceServerInterceptor returns a gRPC unary server interceptor that runs
// every incoming call inside an OpenTelemetry span named after the full
// method name.
//
// The span carries the rpc.method, rpc.service and rpc.system attributes. When
// the handler fails, the span status is set to Error; if the error is an
// *errcode.AppError its type, code and level are recorded as well. The
// handler's response and error are returned unchanged.
//
// NOTE(review): `codes` below must resolve to go.opentelemetry.io/otel/codes.
// This file also imports google.golang.org/grpc/codes under the same name;
// one of the two imports needs an alias (or removal) for the file to compile.
func TraceServerInterceptor(serviceName string) grpc.UnaryServerInterceptor {
    return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo,
        handler grpc.UnaryHandler) (interface{}, error) {

        spanCtx, span := otel.Tracer(serviceName).Start(ctx, info.FullMethod)
        defer span.End()

        span.SetAttributes(
            attribute.String("rpc.method", info.FullMethod),
            attribute.String("rpc.service", serviceName),
            attribute.String("rpc.system", "grpc"),
        )

        // Run the real handler with the span-carrying context.
        resp, handlerErr := handler(spanCtx, req)
        if handlerErr == nil {
            return resp, nil
        }

        span.SetStatus(codes.Error, handlerErr.Error())

        // Application errors carry structured metadata worth indexing.
        // Only a direct *AppError is recognised; wrapped ones are not unwrapped.
        appErr, isAppErr := handlerErr.(*errcode.AppError)
        if isAppErr {
            span.SetAttributes(
                attribute.String("error.type", string(appErr.Type)),
                attribute.String("error.code", appErr.Code),
                attribute.String("error.level", appErr.Level.String()),
            )
        }

        return resp, handlerErr
    }
}

3. 日志集成

3.1 结构化日志

// shared/logger/logger.go
package logger

import (
    "context"
    "github.com/zeromicro/go-zero/core/logx"
    "github.com/zeromicro/go-zero/core/trace"
    "tianyuan/shared/errcode"
)

// LogFields holds caller-supplied key/value pairs to attach to a log entry.
type LogFields map[string]interface{}

// StructuredLogger writes logx entries tagged with a fixed service and method
// name plus the trace identifiers found in the request context.
type StructuredLogger struct {
    service string
    method  string
}

// NewStructuredLogger returns a logger that tags every entry with the given
// service and method names.
func NewStructuredLogger(service, method string) *StructuredLogger {
    return &StructuredLogger{
        service: service,
        method:  method,
    }
}

// LogError logs err together with fields.
//
// A nil err is ignored. An *errcode.AppError is logged with its code, type,
// level and details at a severity derived from its Level; any other error is
// logged at error severity with its message in the "error" field.
func (l *StructuredLogger) LogError(ctx context.Context, err error, fields LogFields) {
    if err == nil {
        // Nothing to report; calling err.Error() below would panic.
        return
    }

    logFields := l.buildBaseFields(ctx, fields)

    appErr, ok := err.(*errcode.AppError)
    if !ok {
        // Plain error.
        logFields = append(logFields, logx.Field("error", err.Error()))
        logx.WithContext(ctx).WithFields(logFields...).Error(err.Error())
        return
    }
    if appErr == nil {
        // Typed nil pointer stored in a non-nil interface: nothing to log.
        return
    }

    // Application error: attach its structured metadata.
    logFields = append(logFields,
        logx.Field("error_code", appErr.Code),
        logx.Field("error_type", appErr.Type),
        logx.Field("error_level", appErr.Level.String()),
        logx.Field("error_details", appErr.Details),
    )
    entry := logx.WithContext(ctx).WithFields(logFields...)

    // Pick the logx method that matches the error level.
    switch appErr.Level {
    case errcode.LevelDebug, errcode.LevelInfo:
        entry.Info(appErr.Message)
    case errcode.LevelWarn:
        // NOTE(review): warnings go through logx's Slow channel, as in the
        // original design; confirm that is the intended destination.
        entry.Slow(appErr.Message)
    case errcode.LevelError:
        entry.Error(appErr.Message)
    case errcode.LevelFatal, errcode.LevelPanic:
        entry.Severe(appErr.Message)
    default:
        // An unrecognised level must not make the error vanish silently.
        entry.Error(appErr.Message)
    }
}

// LogInfo logs message with fields at info severity.
func (l *StructuredLogger) LogInfo(ctx context.Context, message string, fields LogFields) {
    logFields := l.buildBaseFields(ctx, fields)
    logx.WithContext(ctx).WithFields(logFields...).Info(message)
}

// LogWarn logs message with fields through logx's Slow channel.
func (l *StructuredLogger) LogWarn(ctx context.Context, message string, fields LogFields) {
    logFields := l.buildBaseFields(ctx, fields)
    logx.WithContext(ctx).WithFields(logFields...).Slow(message)
}

// buildBaseFields returns the logx fields common to every entry: service,
// method, trace_id and span_id (the latter two only when present in ctx),
// followed by the caller's custom fields.
//
// logx.LogField is a single key/value pair and Logger.WithFields is variadic,
// so the result is a slice rather than a map. A custom field replaces a base
// field with the same key, so no key is emitted twice.
func (l *StructuredLogger) buildBaseFields(ctx context.Context, fields LogFields) []logx.LogField {
    baseFields := make([]logx.LogField, 0, len(fields)+4)

    // addBase appends a base field unless the caller overrides that key.
    addBase := func(key string, value interface{}) {
        if _, overridden := fields[key]; !overridden {
            baseFields = append(baseFields, logx.Field(key, value))
        }
    }

    addBase("service", l.service)
    addBase("method", l.method)

    // Trace identifiers, when the context carries them.
    if traceId := trace.TraceIDFromContext(ctx); traceId != "" {
        addBase("trace_id", traceId)
    }
    if spanId := trace.SpanIDFromContext(ctx); spanId != "" {
        addBase("span_id", spanId)
    }

    // Caller-supplied fields.
    for k, v := range fields {
        baseFields = append(baseFields, logx.Field(k, v))
    }

    return baseFields
}

3.2 日志中间件

// shared/middleware/log_middleware.go
package middleware

import (
    "context"
    "net/http"
    "time"
    "github.com/zeromicro/go-zero/rest/httpx"
    "tianyuan/shared/logger"
)

// LogMiddleware returns an HTTP middleware that writes one structured log
// entry when a request starts and one when it finishes.
//
// The start entry records the method, path, query string, user agent and
// remote address. The end entry records the status code and the duration in
// milliseconds; it is logged as a warning ("request_error") for status codes
// of 400 and above, and as info ("request_success") otherwise.
func LogMiddleware(serviceName string) func(http.Handler) http.Handler {
    return func(next http.Handler) http.Handler {
        return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
            start := time.Now()

            // Named reqLog, not logger: a local called logger would shadow the
            // logger package and make logger.LogFields below fail to compile.
            reqLog := logger.NewStructuredLogger(serviceName, r.URL.Path)

            // Request start.
            reqLog.LogInfo(r.Context(), "request_start", logger.LogFields{
                "method":     r.Method,
                "path":       r.URL.Path,
                "query":      r.URL.RawQuery,
                "user_agent": r.UserAgent(),
                "remote_ip":  httpx.GetRemoteAddr(r),
            })

            // Wrap the writer so the status code can be observed afterwards.
            wrapper := &responseWrapper{
                ResponseWriter: w,
                statusCode:     http.StatusOK,
            }

            next.ServeHTTP(wrapper, r)

            // Request end.
            duration := time.Since(start)
            fields := logger.LogFields{
                "status_code": wrapper.statusCode,
                "duration_ms": duration.Milliseconds(),
            }

            if wrapper.statusCode >= 400 {
                reqLog.LogWarn(r.Context(), "request_error", fields)
            } else {
                reqLog.LogInfo(r.Context(), "request_success", fields)
            }
        })
    }
}

4. 使用示例

4.1 在 Handler 中使用

// client/internal/handler/product/getproductlisthandler.go
// GetProductList handles the product-list HTTP endpoint: it validates and
// parses the request, delegates to the logic layer, logs the outcome and
// writes either an error or a success response.
func (h *GetProductListHandler) GetProductList(w http.ResponseWriter, r *http.Request) {
    ctx := r.Context()

    errBuilder := errcode.NewErrorBuilder("client-api", "GetProductList")
    // Named reqLog, not logger: a local called logger would shadow the logger
    // package and make logger.LogFields below fail to compile.
    reqLog := logger.NewStructuredLogger("client-api", "GetProductList")

    var req types.GetProductListReq

    // Parameter validation.
    if err := validator.ValidateAndParse(r, &req); err != nil {
        appErr := errBuilder.ValidationError("PARAM_INVALID", "参数校验失败", err)
        reqLog.LogError(ctx, appErr, logger.LogFields{
            "request": req,
        })
        response.ErrorResponse(w, appErr)
        return
    }

    // Delegate to the logic layer.
    resp, err := h.logic.GetProductList(ctx, &req)
    if err != nil {
        reqLog.LogError(ctx, err, logger.LogFields{
            "request": req,
        })
        response.ErrorResponse(w, err)
        return
    }

    // Success.
    reqLog.LogInfo(ctx, "get_product_list_success", logger.LogFields{
        "request":      req,
        "result_count": len(resp.List),
    })

    response.SuccessResponse(w, resp)
}

4.2 在 RPC Logic 中使用

// domains/product/rpc/internal/logic/getproductlistlogic.go
// GetProductList validates the request, queries the product model and returns
// the resulting list. Validation failures are returned as business errors and
// query failures as system errors; both are logged before being returned.
func (l *GetProductListLogic) GetProductList(ctx context.Context, req *product.GetProductListReq) (*product.GetProductListResp, error) {
    errBuilder := errcode.NewErrorBuilder("product-rpc", "GetProductList")
    // Locals are named reqLog / reqValidator rather than logger / validator:
    // reusing the package names would shadow the packages, and
    // logger.LogFields below would no longer compile.
    reqLog := logger.NewStructuredLogger("product-rpc", "GetProductList")

    // Business validation.
    reqValidator := validator.NewProductValidator(ctx, l.svcCtx)
    if err := reqValidator.ValidateGetProductListRequest(req); err != nil {
        appErr := errBuilder.BusinessError("VALIDATION_FAILED", err.Error())
        reqLog.LogError(ctx, appErr, logger.LogFields{
            "request": req,
        })
        return nil, appErr
    }

    // Database query.
    products, err := l.svcCtx.ProductModel.FindList(ctx, req)
    if err != nil {
        appErr := errBuilder.SystemError("DB_QUERY_FAILED", "查询产品列表失败", err)
        reqLog.LogError(ctx, appErr, logger.LogFields{
            "request":  req,
            "db_error": err.Error(),
        })
        return nil, appErr
    }

    reqLog.LogInfo(ctx, "get_product_list_success", logger.LogFields{
        "request":      req,
        "result_count": len(products),
    })

    // NOTE(review): Total is the number of rows returned by this query, not
    // necessarily the total number of matches; confirm if FindList paginates.
    return &product.GetProductListResp{
        List:  products,
        Total: int64(len(products)),
    }, nil
}

5. 监控和告警

5.1 错误监控配置

// shared/monitor/error_monitor.go
package monitor

import (
    "context"
    "github.com/zeromicro/go-zero/core/metric"
    "tianyuan/shared/errcode"
)

// ErrorCounter counts errors, labelled by service, error type, level and code.
var ErrorCounter = metric.NewCounterVec(&metric.CounterVecOpts{
    Namespace: "tianyuan",
    Subsystem: "error",
    Name:      "total",
    Help:      "Total number of errors",
    Labels:    []string{"service", "type", "level", "code"},
})

// ErrorRateHistogram is a histogram labelled by service and method.
// NOTE(review): nothing in this file observes values into it yet.
var ErrorRateHistogram = metric.NewHistogramVec(&metric.HistogramVecOpts{
    Namespace: "tianyuan",
    Subsystem: "error",
    Name:      "rate",
    Help:      "Error rate histogram",
    Labels:    []string{"service", "method"},
})

// RecordError increments ErrorCounter for appErr, using its service, type,
// level name and code as label values (in the order the counter declares
// them). A nil appErr is ignored so callers can pass the result of a failed
// type assertion without crashing the metrics path.
func RecordError(appErr *errcode.AppError) {
    if appErr == nil {
        return
    }

    ErrorCounter.Inc(
        appErr.Service,
        string(appErr.Type),
        appErr.Level.String(),
        appErr.Code,
    )
}

5.2 告警规则

# prometheus告警规则
groups:
    - name: tianyuan-errors
      rules:
          # Error-rate alert
          - alert: HighErrorRate
            expr: rate(tianyuan_error_total[5m]) > 0.1
            for: 2m
            labels:
                severity: warning
            annotations:
                summary: "High error rate detected"
                description: "Error rate is {{ $value }} for service {{ $labels.service }}"

          # Fatal-error alert. PANIC is more severe than FATAL and must page too.
          - alert: FatalError
            expr: increase(tianyuan_error_total{level=~"FATAL|PANIC"}[1m]) > 0
            for: 0m
            labels:
                severity: critical
            annotations:
                summary: "Fatal error detected"
                description: "Fatal error in service {{ $labels.service }}: {{ $labels.code }}"

6. 最佳实践总结

  1. 错误分级原则

    • DEBUG/INFO:开发调试信息
    • WARN:需要关注但不影响业务
    • ERROR:业务错误,需要处理
    • FATAL/PANIC:系统级错误,需要立即处理
  2. 链路追踪要点

    • 每个请求都有唯一的 TraceId
    • 跨服务调用保持链路连续性
    • 关键操作添加自定义 Span
  3. 日志记录规范

    • 结构化日志,便于查询分析
    • 包含链路追踪信息
    • 敏感信息脱敏处理
  4. 监控告警策略

    • 错误率监控
    • 关键错误实时告警
    • 链路追踪性能监控