# go-zero 错误分级与链路追踪设计

## 1. 错误分级体系

### 1.1 错误等级定义

```go
// shared/errcode/levels.go
package errcode

// ErrorLevel is the severity of an AppError, ordered from least to most severe.
type ErrorLevel int

const (
	LevelDebug ErrorLevel = iota // debug: development/diagnostic information
	LevelInfo                    // info: ordinary business information
	LevelWarn                    // warn: needs attention but does not affect the business flow
	LevelError                   // error: business error that must be handled
	LevelFatal                   // fatal: system-level error that impacts the service
	LevelPanic                   // panic: severe error, service unavailable
)

// LevelNames maps every defined ErrorLevel to its upper-case display name.
var LevelNames = map[ErrorLevel]string{
	LevelDebug: "DEBUG",
	LevelInfo:  "INFO",
	LevelWarn:  "WARN",
	LevelError: "ERROR",
	LevelFatal: "FATAL",
	LevelPanic: "PANIC",
}

// String returns the display name of the level, or "UNKNOWN" for a value
// that has no entry in LevelNames.
func (l ErrorLevel) String() string {
	if name, ok := LevelNames[l]; ok {
		return name
	}
	return "UNKNOWN"
}
```

### 1.2 错误分类体系

```go
// shared/errcode/types.go
package errcode

import (
	"fmt"
	"time"
)

// ErrorType is the category of an AppError.
type ErrorType string

const (
	// System-level errors.
	ErrorTypeSystem   ErrorType = "SYSTEM"   // system error
	ErrorTypeNetwork  ErrorType = "NETWORK"  // network error
	ErrorTypeDatabase ErrorType = "DATABASE" // database error
	ErrorTypeRedis    ErrorType = "REDIS"    // Redis error
	ErrorTypeMQ       ErrorType = "MQ"       // message-queue error
	ErrorTypeRPC      ErrorType = "RPC"      // RPC call error

	// Business-level errors.
	ErrorTypeBusiness   ErrorType = "BUSINESS"   // business-logic error
	ErrorTypeValidation ErrorType = "VALIDATION" // validation error
	ErrorTypeAuth       ErrorType = "AUTH"       // authentication/authorization error
	ErrorTypePermission ErrorType = "PERMISSION" // permission error

	// Client-side errors.
	ErrorTypeParam    ErrorType = "PARAM"    // parameter error
	ErrorTypeRequest  ErrorType = "REQUEST"  // request error
	ErrorTypeResponse ErrorType = "RESPONSE" // response error
)

// AppError is the unified error structure shared by all services.
//
// NOTE(review): Stack is serialized under the "stack" key; confirm that
// responses sent to external clients do not expose it.
type AppError struct {
	Code      string      `json:"code"`      // error code
	Message   string      `json:"message"`   // error message
	Level     ErrorLevel  `json:"level"`     // severity
	Type      ErrorType   `json:"type"`      // category
	TraceId   string      `json:"trace_id"`  // trace ID
	SpanId    string      `json:"span_id"`   // span ID
	Service   string      `json:"service"`   // service name
	Method    string      `json:"method"`    // method name
	Timestamp time.Time   `json:"timestamp"` // creation time
	Details   interface{} `json:"details"`   // additional details
	Stack     string      `json:"stack"`     // stack trace (LevelError and above only)
	Cause     error       `json:"-"`         // underlying error (never serialized)
}

// Error implements the error interface. The output has the form
// "[LEVEL][TYPE][CODE] service: message".
func (e *AppError) Error() string {
	return fmt.Sprintf("[%s][%s][%s] %s: %s",
		e.Level.String(), e.Type, e.Code, e.Service, e.Message)
}

// Unwrap returns the underlying cause so that errors.Is / errors.As can
// walk the chain.
func (e *AppError) Unwrap() error {
	return e.Cause
}
```

### 1.3 错误构造器

```go
// shared/errcode/builder.go
package errcode

import (
	"context"
	"runtime"
	"time"

	"github.com/zeromicro/go-zero/core/trace"
)

// ErrorBuilder creates AppError values pre-filled with a service name, a
// method name and, when a context is attached via WithContext, the trace
// and span IDs of the current request.
type ErrorBuilder struct {
	ctx     context.Context
	service string
	method  string
}

// NewErrorBuilder returns a builder for the given service and method.
// Errors built from it carry no trace information until WithContext is used.
func NewErrorBuilder(service, method string) *ErrorBuilder {
	return &ErrorBuilder{
		ctx:     context.Background(),
		service: service,
		method:  method,
	}
}

// WithContext returns a copy of the builder that reads trace and span IDs
// from ctx. The receiver is left untouched, so a shared builder can be
// specialized per request. A nil ctx is treated as context.Background().
func (b *ErrorBuilder) WithContext(ctx context.Context) *ErrorBuilder {
	if ctx == nil {
		ctx = context.Background()
	}
	return &ErrorBuilder{
		ctx:     ctx,
		service: b.service,
		method:  b.method,
	}
}

// Debug builds a LevelDebug error of type ErrorTypeSystem.
func (b *ErrorBuilder) Debug(code, message string) *AppError {
	return b.buildError(LevelDebug, ErrorTypeSystem, code, message, nil, nil)
}

// Info builds a LevelInfo error of type ErrorTypeSystem.
func (b *ErrorBuilder) Info(code, message string) *AppError {
	return b.buildError(LevelInfo, ErrorTypeSystem, code, message, nil, nil)
}

// Warn builds a LevelWarn error of the given type.
func (b *ErrorBuilder) Warn(errorType ErrorType, code, message string) *AppError {
	return b.buildError(LevelWarn, errorType, code, message, nil, nil)
}

// Error builds a LevelError error of the given type wrapping cause.
func (b *ErrorBuilder) Error(errorType ErrorType, code, message string, cause error) *AppError {
	return b.buildError(LevelError, errorType, code, message, cause, nil)
}

// Fatal builds a LevelFatal error of the given type wrapping cause.
func (b *ErrorBuilder) Fatal(errorType ErrorType, code, message string, cause error) *AppError {
	return b.buildError(LevelFatal, errorType, code, message, cause, nil)
}

// Panic builds a LevelPanic error of the given type wrapping cause.
// It only constructs the value; it does not call panic().
func (b *ErrorBuilder) Panic(errorType ErrorType, code, message string, cause error) *AppError {
	return b.buildError(LevelPanic, errorType, code, message, cause, nil)
}

// BusinessError builds a LevelError error of type ErrorTypeBusiness (common case).
func (b *ErrorBuilder) BusinessError(code, message string) *AppError {
	return b.buildError(LevelError, ErrorTypeBusiness, code, message, nil, nil)
}

// ValidationError builds a LevelWarn error of type ErrorTypeValidation
// carrying details (common case).
func (b *ErrorBuilder) ValidationError(code, message string, details interface{}) *AppError {
	return b.buildError(LevelWarn, ErrorTypeValidation, code, message, nil, details)
}

// PermissionError builds a LevelWarn error of type ErrorTypePermission (common case).
func (b *ErrorBuilder) PermissionError(code, message string) *AppError {
	return b.buildError(LevelWarn, ErrorTypePermission, code, message, nil, nil)
}

// SystemError builds a LevelFatal error of type ErrorTypeSystem wrapping
// cause (common case).
func (b *ErrorBuilder) SystemError(code, message string, cause error) *AppError {
	return b.buildError(LevelFatal, ErrorTypeSystem, code, message, cause, nil)
}

// buildError assembles the AppError: it stamps the service, method and
// current time, copies trace/span IDs from the builder's context when
// present, and captures a stack trace for LevelError and above.
func (b *ErrorBuilder) buildError(level ErrorLevel, errorType ErrorType, code, message string, cause error, details interface{}) *AppError {
	appErr := &AppError{
		Code:      code,
		Message:   message,
		Level:     level,
		Type:      errorType,
		Service:   b.service,
		Method:    b.method,
		Timestamp: time.Now(),
		Details:   details,
		Cause:     cause,
	}

	// A zero-value ErrorBuilder has no context; fall back to an empty one
	// so the trace lookups below simply find nothing.
	ctx := b.ctx
	if ctx == nil {
		ctx = context.Background()
	}

	// Attach tracing information when the context carries a span.
	if traceId := trace.TraceIDFromContext(ctx); traceId != "" {
		appErr.TraceId = traceId
	}
	if spanId := trace.SpanIDFromContext(ctx); spanId != "" {
		appErr.SpanId = spanId
	}

	// Record the stack only for LevelError and above.
	if level >= LevelError {
		appErr.Stack = getStackTrace()
	}

	return appErr
}

// getStackTrace returns the stack trace of the calling goroutine.
// The buffer is doubled until the whole trace fits, so deep stacks are
// not cut off at a fixed size.
func getStackTrace() string {
	buf := make([]byte, 4096)
	for {
		n := runtime.Stack(buf, false)
		if n < len(buf) {
			return string(buf[:n])
		}
		buf = make([]byte, 2*len(buf))
	}
}
```

## 2. 链路追踪集成

### 2.1 链路追踪配置

```yaml
# etc/client-api.yaml
Name: client-api
Host: 0.0.0.0
Port: 8080

# Tracing configuration
Telemetry:
  Name: client-api
  Endpoint: http://jaeger:14268/api/traces
  Sampler: 1.0
  Batcher: jaeger

# Logging configuration
# NOTE(review): log retention in go-zero's LogConf is `KeepDays`; MaxSize and
# MaxBackups are documented as effective only with `Rotation: size` - confirm
# against the go-zero version in use.
Log:
  ServiceName: client-api
  Mode: file
  Level: info
  Path: logs
  MaxSize: 100
  KeepDays: 7
  MaxBackups: 5
  Compress: true
```

### 2.2 链路追踪中间件

```go
// shared/middleware/trace_middleware.go
package middleware

import (
	"net/http"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
)

// TraceMiddleware returns an HTTP middleware that wraps every request in a
// span named after the URL path, records request/response attributes on it
// and marks the span as failed for status codes >= 400.
func TraceMiddleware(serviceName string) func(http.Handler) http.Handler {
	return func(next http.Handler) http.Handler {
		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			tracer := otel.Tracer(serviceName)

			// Start the span.
			ctx, span := tracer.Start(r.Context(), r.URL.Path)
			defer span.End()

			// Request attributes.
			span.SetAttributes(
				attribute.String("http.method", r.Method),
				attribute.String("http.url", r.URL.String()),
				attribute.String("http.user_agent", r.UserAgent()),
				attribute.String("service.name", serviceName),
			)

			// Make the span visible to downstream handlers.
			r = r.WithContext(ctx)

			// Wrap the writer so the status code can be observed afterwards.
			wrapper := &responseWrapper{
				ResponseWriter: w,
				statusCode:     http.StatusOK,
			}

			// Run the next handler.
			next.ServeHTTP(wrapper, r)

			// Response attributes.
			span.SetAttributes(
				attribute.Int("http.status_code", wrapper.statusCode),
			)

			// Flag error status codes on the span.
			if wrapper.statusCode >= 400 {
				span.SetStatus(codes.Error, http.StatusText(wrapper.statusCode))
			}
		})
	}
}

// responseWrapper records the status code written by the wrapped handler.
// statusCode keeps its initial value when WriteHeader is never called.
type responseWrapper struct {
	http.ResponseWriter
	statusCode int
}

// WriteHeader remembers the status code and forwards it to the underlying writer.
func (w *responseWrapper) WriteHeader(statusCode int) {
	w.statusCode = statusCode
	w.ResponseWriter.WriteHeader(statusCode)
}
```

### 2.3 RPC 链路追踪拦截器

```go
// shared/interceptor/trace_interceptor.go
package interceptor

import (
	"context"
	"errors"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
	"google.golang.org/grpc"
	"google.golang.org/grpc/status"

	"tianyuan/shared/errcode"
)

// TraceClientInterceptor returns a unary client interceptor that wraps each
// outgoing call in a span and records the gRPC status code on failure.
func TraceClientInterceptor(serviceName string) grpc.UnaryClientInterceptor {
	return func(ctx context.Context, method string, req, reply interface{}, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
		tracer := otel.Tracer(serviceName)

		ctx, span := tracer.Start(ctx, method)
		defer span.End()

		// Span attributes.
		span.SetAttributes(
			attribute.String("rpc.method", method),
			attribute.String("rpc.service", serviceName),
			attribute.String("rpc.system", "grpc"),
		)

		// Perform the RPC.
		err := invoker(ctx, method, req, reply, cc, opts...)

		// Record the failure on the span.
		if err != nil {
			span.SetStatus(codes.Error, err.Error())
			span.SetAttributes(
				attribute.String("rpc.grpc.status_code", status.Code(err).String()),
			)
		}

		return err
	}
}

// TraceServerInterceptor returns a unary server interceptor that wraps each
// incoming call in a span and, when the handler fails with an AppError
// (directly or wrapped), records its type, code and level on the span.
func TraceServerInterceptor(serviceName string) grpc.UnaryServerInterceptor {
	return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) {
		tracer := otel.Tracer(serviceName)

		ctx, span := tracer.Start(ctx, info.FullMethod)
		defer span.End()

		// Span attributes.
		span.SetAttributes(
			attribute.String("rpc.method", info.FullMethod),
			attribute.String("rpc.service", serviceName),
			attribute.String("rpc.system", "grpc"),
		)

		// Invoke the handler.
		resp, err := handler(ctx, req)

		// Record the failure on the span.
		if err != nil {
			span.SetStatus(codes.Error, err.Error())

			// errors.As also matches an AppError wrapped by another error.
			var appErr *errcode.AppError
			if errors.As(err, &appErr) {
				span.SetAttributes(
					attribute.String("error.type", string(appErr.Type)),
					attribute.String("error.code", appErr.Code),
					attribute.String("error.level", appErr.Level.String()),
				)
			}
		}

		return resp, err
	}
}
```

## 3.
日志集成

### 3.1 结构化日志

```go
// shared/logger/logger.go
package logger

import (
	"context"
	"errors"
	"sort"

	"github.com/zeromicro/go-zero/core/logx"
	"github.com/zeromicro/go-zero/core/trace"

	"tianyuan/shared/errcode"
)

// LogFields is a set of structured key/value pairs attached to a log entry.
type LogFields map[string]interface{}

// StructuredLogger writes logx entries tagged with a service and method name.
type StructuredLogger struct {
	service string
	method  string
}

// NewStructuredLogger returns a logger for the given service and method.
func NewStructuredLogger(service, method string) *StructuredLogger {
	return &StructuredLogger{
		service: service,
		method:  method,
	}
}

// LogError logs err together with fields. When err is, or wraps, an
// *errcode.AppError, its code, type, level and details are added and the
// entry is written at the log level matching the error level; any other
// error is written at error level. A nil err is ignored.
func (l *StructuredLogger) LogError(ctx context.Context, err error, fields LogFields) {
	if err == nil {
		return
	}

	merged := l.buildBaseFields(ctx, fields)
	merged["error"] = err.Error()

	// Plain error: nothing more to extract.
	var appErr *errcode.AppError
	if !errors.As(err, &appErr) {
		logx.WithContext(ctx).WithFields(toLogxFields(merged)...).Error(err.Error())
		return
	}

	// Application error: add its structured attributes.
	merged["error_code"] = appErr.Code
	merged["error_type"] = appErr.Type
	merged["error_level"] = appErr.Level.String()
	merged["error_details"] = appErr.Details

	entry := logx.WithContext(ctx).WithFields(toLogxFields(merged)...)

	// Choose the log method from the error level.
	switch appErr.Level {
	case errcode.LevelDebug, errcode.LevelInfo:
		entry.Info(appErr.Message)
	case errcode.LevelWarn:
		entry.Slow(appErr.Message)
	default:
		// LevelError, LevelFatal, LevelPanic and any unknown level. The
		// logx.Logger interface exposes no Severe method, so the severity
		// is carried by the "error_level" field instead.
		entry.Error(appErr.Message)
	}
}

// LogInfo writes an info-level business log entry.
func (l *StructuredLogger) LogInfo(ctx context.Context, message string, fields LogFields) {
	merged := l.buildBaseFields(ctx, fields)
	logx.WithContext(ctx).WithFields(toLogxFields(merged)...).Info(message)
}

// LogWarn writes a warning entry through logx's Slow level.
func (l *StructuredLogger) LogWarn(ctx context.Context, message string, fields LogFields) {
	merged := l.buildBaseFields(ctx, fields)
	logx.WithContext(ctx).WithFields(toLogxFields(merged)...).Slow(message)
}

// buildBaseFields returns a new field set holding the service and method
// names, the trace/span IDs found in ctx, and every entry of fields.
// An entry in fields replaces a base entry with the same key.
func (l *StructuredLogger) buildBaseFields(ctx context.Context, fields LogFields) LogFields {
	base := make(LogFields, len(fields)+4)
	base["service"] = l.service
	base["method"] = l.method

	// Tracing information.
	if traceId := trace.TraceIDFromContext(ctx); traceId != "" {
		base["trace_id"] = traceId
	}
	if spanId := trace.SpanIDFromContext(ctx); spanId != "" {
		base["span_id"] = spanId
	}

	// Caller-supplied fields.
	for k, v := range fields {
		base[k] = v
	}

	return base
}

// toLogxFields converts a field set into the slice form taken by
// logx's WithFields. Keys are sorted so the field order is stable.
func toLogxFields(fields LogFields) []logx.LogField {
	keys := make([]string, 0, len(fields))
	for k := range fields {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	out := make([]logx.LogField, 0, len(keys))
	for _, k := range keys {
		out = append(out, logx.Field(k, fields[k]))
	}
	return out
}
```

### 3.2 日志中间件

```go
// shared/middleware/log_middleware.go
package middleware

import (
	"net/http"
	"time"

	"github.com/zeromicro/go-zero/rest/httpx"

	"tianyuan/shared/logger"
)

// LogMiddleware returns an HTTP middleware that logs the start of every
// request and, once the handler returns, its status code and duration.
// Responses with a status code >= 400 are logged as warnings.
func LogMiddleware(serviceName string) func(http.Handler) http.Handler {
	return func(next http.Handler) http.Handler {
		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			start := time.Now()
			// Named log, not logger, so the logger package stays reachable.
			log := logger.NewStructuredLogger(serviceName, r.URL.Path)

			// Request start. The "method" key here replaces the logger's
			// own "method" base field for this entry.
			log.LogInfo(r.Context(), "request_start", logger.LogFields{
				"method":     r.Method,
				"path":       r.URL.Path,
				"query":      r.URL.RawQuery,
				"user_agent": r.UserAgent(),
				"remote_ip":  httpx.GetRemoteAddr(r),
			})

			// Wrap the writer so the status code can be observed afterwards.
			wrapper := &responseWrapper{
				ResponseWriter: w,
				statusCode:     http.StatusOK,
			}

			// Run the request.
			next.ServeHTTP(wrapper, r)

			// Request end.
			duration := time.Since(start)
			fields := logger.LogFields{
				"status_code": wrapper.statusCode,
				"duration_ms": duration.Milliseconds(),
			}

			if wrapper.statusCode >= 400 {
				log.LogWarn(r.Context(), "request_error", fields)
			} else {
				log.LogInfo(r.Context(), "request_success", fields)
			}
		})
	}
}
```

## 4.
使用示例

### 4.1 在 Handler 中使用

```go
// client/internal/handler/product/getproductlisthandler.go
func (h *GetProductListHandler) GetProductList(w http.ResponseWriter, r *http.Request) {
	// Error builder and logger for this handler. The logger variable is
	// named log so the logger package (logger.LogFields) stays reachable.
	errBuilder := errcode.NewErrorBuilder("client-api", "GetProductList")
	log := logger.NewStructuredLogger("client-api", "GetProductList")

	var req types.GetProductListReq

	// Parse and validate the request.
	if err := validator.ValidateAndParse(r, &req); err != nil {
		appErr := errBuilder.ValidationError("PARAM_INVALID", "参数校验失败", err)
		log.LogError(r.Context(), appErr, logger.LogFields{
			"request": req,
		})
		response.ErrorResponse(w, appErr)
		return
	}

	// Call the logic layer.
	resp, err := h.logic.GetProductList(r.Context(), &req)
	if err != nil {
		log.LogError(r.Context(), err, logger.LogFields{
			"request": req,
		})
		response.ErrorResponse(w, err)
		return
	}

	// Log the successful call.
	log.LogInfo(r.Context(), "get_product_list_success", logger.LogFields{
		"request":      req,
		"result_count": len(resp.List),
	})

	response.SuccessResponse(w, resp)
}
```

### 4.2 在 RPC Logic 中使用

```go
// domains/product/rpc/internal/logic/getproductlistlogic.go
func (l *GetProductListLogic) GetProductList(ctx context.Context, req *product.GetProductListReq) (*product.GetProductListResp, error) {
	errBuilder := errcode.NewErrorBuilder("product-rpc", "GetProductList")
	// Local names differ from the logger/validator packages so the
	// packages stay reachable below.
	log := logger.NewStructuredLogger("product-rpc", "GetProductList")

	// Business validation.
	productValidator := validator.NewProductValidator(ctx, l.svcCtx)
	if err := productValidator.ValidateGetProductListRequest(req); err != nil {
		appErr := errBuilder.BusinessError("VALIDATION_FAILED", err.Error())
		log.LogError(ctx, appErr, logger.LogFields{
			"request": req,
		})
		return nil, appErr
	}

	// Query the database.
	products, err := l.svcCtx.ProductModel.FindList(ctx, req)
	if err != nil {
		appErr := errBuilder.SystemError("DB_QUERY_FAILED", "查询产品列表失败", err)
		log.LogError(ctx, appErr, logger.LogFields{
			"request":  req,
			"db_error": err.Error(),
		})
		return nil, appErr
	}

	log.LogInfo(ctx, "get_product_list_success", logger.LogFields{
		"request":      req,
		"result_count": len(products),
	})

	// NOTE(review): Total is the number of rows returned by this query,
	// not a separate overall count - confirm that is what callers expect.
	return &product.GetProductListResp{
		List:  products,
		Total: int64(len(products)),
	}, nil
}
```

## 5. 监控和告警

### 5.1 错误监控配置

```go
// shared/monitor/error_monitor.go
package monitor

import (
	"github.com/zeromicro/go-zero/core/metric"

	"tianyuan/shared/errcode"
)

var (
	// ErrorCounter counts errors by service, type, level and code.
	ErrorCounter = metric.NewCounterVec(&metric.CounterVecOpts{
		Namespace: "tianyuan",
		Subsystem: "error",
		Name:      "total",
		Help:      "Total number of errors",
		Labels:    []string{"service", "type", "level", "code"},
	})

	// ErrorRateHistogram is an error-rate histogram labelled by service
	// and method.
	ErrorRateHistogram = metric.NewHistogramVec(&metric.HistogramVecOpts{
		Namespace: "tianyuan",
		Subsystem: "error",
		Name:      "rate",
		Help:      "Error rate histogram",
		Labels:    []string{"service", "method"},
	})
)

// RecordError increments ErrorCounter with the labels taken from appErr.
// A nil appErr is ignored.
func RecordError(appErr *errcode.AppError) {
	if appErr == nil {
		return
	}
	ErrorCounter.Inc(
		appErr.Service,
		string(appErr.Type),
		appErr.Level.String(),
		appErr.Code,
	)
}
```

### 5.2 告警规则

```yaml
# Prometheus alerting rules
groups:
  - name: tianyuan-errors
    rules:
      # Error-rate alert: errors per second, aggregated per service
      - alert: HighErrorRate
        expr: sum by (service) (rate(tianyuan_error_total[5m])) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value }} errors/s for service {{ $labels.service }}"

      # Fatal-error alert: fires on any FATAL or PANIC level error
      - alert: FatalError
        expr: increase(tianyuan_error_total{level=~"FATAL|PANIC"}[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Fatal error detected"
          description: "Fatal error in service {{ $labels.service }}: {{ $labels.code }}"
```

## 6. 最佳实践总结

1. **错误分级原则**:
   - DEBUG/INFO:开发调试信息
   - WARN:需要关注但不影响业务
   - ERROR:业务错误,需要处理
   - FATAL/PANIC:系统级错误,需要立即处理

2. **链路追踪要点**:
   - 每个请求都有唯一的 TraceId
   - 跨服务调用保持链路连续性
   - 关键操作添加自定义 Span

3. **日志记录规范**:
   - 结构化日志,便于查询分析
   - 包含链路追踪信息
   - 敏感信息脱敏处理

4. **监控告警策略**:
   - 错误率监控
   - 关键错误实时告警
   - 链路追踪性能监控