21 KiB
21 KiB
go-zero 错误分级与链路追踪设计
1. 错误分级体系
1.1 错误等级定义
// shared/errcode/levels.go
package errcode
// ErrorLevel is the severity assigned to an AppError. Levels are ordered, so
// they can be compared with < and >=.
type ErrorLevel int

// Severity levels, from least to most severe.
const (
	// LevelDebug marks development and debugging information.
	LevelDebug ErrorLevel = iota
	// LevelInfo marks general business information.
	LevelInfo
	// LevelWarn marks conditions that need attention but do not affect the business flow.
	LevelWarn
	// LevelError marks business errors that must be handled.
	LevelError
	// LevelFatal marks system-level errors that affect the service.
	LevelFatal
	// LevelPanic marks severe errors that make the service unavailable.
	LevelPanic
)

// LevelNames maps every defined ErrorLevel to its upper-case display name.
var LevelNames = map[ErrorLevel]string{
	LevelDebug: "DEBUG",
	LevelInfo:  "INFO",
	LevelWarn:  "WARN",
	LevelError: "ERROR",
	LevelFatal: "FATAL",
	LevelPanic: "PANIC",
}

// String returns the display name registered in LevelNames, or "UNKNOWN" when
// the level has no entry there.
func (l ErrorLevel) String() string {
	label, known := LevelNames[l]
	if !known {
		return "UNKNOWN"
	}
	return label
}
1.2 错误分类体系
// shared/errcode/types.go
package errcode
import (
"fmt"
"time"
)
// ErrorType classifies an AppError by where it originated.
type ErrorType string

// System-level error types.
const (
	ErrorTypeSystem   ErrorType = "SYSTEM"   // generic system error
	ErrorTypeNetwork  ErrorType = "NETWORK"  // network error
	ErrorTypeDatabase ErrorType = "DATABASE" // database error
	ErrorTypeRedis    ErrorType = "REDIS"    // Redis error
	ErrorTypeMQ       ErrorType = "MQ"       // message queue error
	ErrorTypeRPC      ErrorType = "RPC"      // RPC call error
)

// Business-level error types.
const (
	ErrorTypeBusiness   ErrorType = "BUSINESS"   // business logic error
	ErrorTypeValidation ErrorType = "VALIDATION" // parameter validation error
	ErrorTypeAuth       ErrorType = "AUTH"       // authentication/authorization error
	ErrorTypePermission ErrorType = "PERMISSION" // permission error
)

// Client-side error types.
const (
	ErrorTypeParam    ErrorType = "PARAM"    // parameter error
	ErrorTypeRequest  ErrorType = "REQUEST"  // request error
	ErrorTypeResponse ErrorType = "RESPONSE" // response error
)
// AppError is the unified error value shared by all services. Every field
// except Cause is serialized to JSON under the tag shown.
type AppError struct {
	// Code is the error code.
	Code string `json:"code"`
	// Message is the error message.
	Message string `json:"message"`
	// Level is the severity of the error.
	Level ErrorLevel `json:"level"`
	// Type is the category of the error.
	Type ErrorType `json:"type"`
	// TraceId is the distributed-tracing trace ID.
	TraceId string `json:"trace_id"`
	// SpanId is the distributed-tracing span ID.
	SpanId string `json:"span_id"`
	// Service is the name of the service that produced the error.
	Service string `json:"service"`
	// Method is the name of the method that produced the error.
	Method string `json:"method"`
	// Timestamp is when the error was created.
	Timestamp time.Time `json:"timestamp"`
	// Details carries optional additional information.
	Details interface{} `json:"details"`
	// Stack is the stack trace; only populated for LevelError and above.
	Stack string `json:"stack"`
	// Cause is the underlying error; it is never serialized.
	Cause error `json:"-"`
}
// Error implements the error interface. The result has the form
// "[LEVEL][TYPE][CODE] service: message".
func (e *AppError) Error() string {
	levelName := e.Level.String()
	return fmt.Sprintf("[%s][%s][%s] %s: %s", levelName, e.Type, e.Code, e.Service, e.Message)
}
// Unwrap returns the underlying cause (nil when none was recorded), which
// lets errors.Is and errors.As look through an AppError.
func (e *AppError) Unwrap() error {
return e.Cause
}
1.3 错误构造器
// shared/errcode/builder.go
package errcode
import (
	"context"
	"runtime"
	"time"

	"github.com/zeromicro/go-zero/core/trace"
)
// ErrorBuilder creates AppError values that are pre-populated with the
// service and method they originate from and, once bound with WithContext,
// with the trace and span IDs of the current request.
type ErrorBuilder struct {
	service string
	method  string
	traceID string // copied into AppError.TraceId; empty until WithContext is used
	spanID  string // copied into AppError.SpanId; empty until WithContext is used
}

// NewErrorBuilder returns a builder for the given service and method. Errors
// built from it carry no trace information until WithContext is called.
func NewErrorBuilder(service, method string) *ErrorBuilder {
	return &ErrorBuilder{
		service: service,
		method:  method,
	}
}

// WithContext returns a copy of the builder bound to the trace and span IDs
// found in ctx. The receiver is left untouched, so a shared builder can be
// bound per request. The IDs are captured as strings instead of keeping ctx
// itself alive inside the struct.
func (b *ErrorBuilder) WithContext(ctx context.Context) *ErrorBuilder {
	if ctx == nil {
		return b
	}
	bound := *b
	bound.traceID = trace.TraceIDFromContext(ctx)
	bound.spanID = trace.SpanIDFromContext(ctx)
	return &bound
}

// Debug builds a LevelDebug error of type ErrorTypeSystem.
func (b *ErrorBuilder) Debug(code, message string) *AppError {
	return b.buildError(LevelDebug, ErrorTypeSystem, code, message, nil, nil)
}

// Info builds a LevelInfo error of type ErrorTypeSystem.
func (b *ErrorBuilder) Info(code, message string) *AppError {
	return b.buildError(LevelInfo, ErrorTypeSystem, code, message, nil, nil)
}

// Warn builds a LevelWarn error of the given type.
func (b *ErrorBuilder) Warn(errorType ErrorType, code, message string) *AppError {
	return b.buildError(LevelWarn, errorType, code, message, nil, nil)
}

// Error builds a LevelError error of the given type wrapping cause.
func (b *ErrorBuilder) Error(errorType ErrorType, code, message string, cause error) *AppError {
	return b.buildError(LevelError, errorType, code, message, cause, nil)
}

// Fatal builds a LevelFatal error of the given type wrapping cause.
func (b *ErrorBuilder) Fatal(errorType ErrorType, code, message string, cause error) *AppError {
	return b.buildError(LevelFatal, errorType, code, message, cause, nil)
}

// Panic builds a LevelPanic error of the given type wrapping cause. It only
// constructs the value; it does not panic.
func (b *ErrorBuilder) Panic(errorType ErrorType, code, message string, cause error) *AppError {
	return b.buildError(LevelPanic, errorType, code, message, cause, nil)
}

// BusinessError builds a LevelError error of type ErrorTypeBusiness.
func (b *ErrorBuilder) BusinessError(code, message string) *AppError {
	return b.buildError(LevelError, ErrorTypeBusiness, code, message, nil, nil)
}

// ValidationError builds a LevelWarn error of type ErrorTypeValidation and
// attaches details (for example the per-field validation failures).
func (b *ErrorBuilder) ValidationError(code, message string, details interface{}) *AppError {
	return b.buildError(LevelWarn, ErrorTypeValidation, code, message, nil, details)
}

// PermissionError builds a LevelWarn error of type ErrorTypePermission.
func (b *ErrorBuilder) PermissionError(code, message string) *AppError {
	return b.buildError(LevelWarn, ErrorTypePermission, code, message, nil, nil)
}

// SystemError builds a LevelFatal error of type ErrorTypeSystem wrapping cause.
func (b *ErrorBuilder) SystemError(code, message string, cause error) *AppError {
	return b.buildError(LevelFatal, ErrorTypeSystem, code, message, cause, nil)
}

// buildError assembles the AppError shared by all public helpers. It stamps
// the creation time, copies the builder's service, method and trace
// information, and records a stack trace for LevelError and above.
func (b *ErrorBuilder) buildError(level ErrorLevel, errorType ErrorType, code, message string, cause error, details interface{}) *AppError {
	appErr := &AppError{
		Code:      code,
		Message:   message,
		Level:     level,
		Type:      errorType,
		TraceId:   b.traceID,
		SpanId:    b.spanID,
		Service:   b.service,
		Method:    b.method,
		Timestamp: time.Now(),
		Details:   details,
		Cause:     cause,
	}

	// Stack traces are only worth their cost for real failures.
	if level >= LevelError {
		appErr.Stack = getStackTrace()
	}

	return appErr
}
// getStackTrace returns the formatted stack trace of the calling goroutine.
//
// runtime.Stack silently truncates its output when the buffer is too small,
// so the buffer is doubled until the whole trace fits.
func getStackTrace() string {
	buf := make([]byte, 4096)
	for {
		n := runtime.Stack(buf, false)
		if n < len(buf) {
			return string(buf[:n])
		}
		// The trace filled the buffer completely and may have been cut off.
		buf = make([]byte, 2*len(buf))
	}
}
2. 链路追踪集成
2.1 链路追踪配置
# etc/client-api.yaml
Name: client-api
Host: 0.0.0.0
Port: 8080

# Distributed tracing configuration
Telemetry:
  Name: client-api
  Endpoint: http://jaeger:14268/api/traces
  Sampler: 1.0
  Batcher: jaeger

# Logging configuration
Log:
  ServiceName: client-api
  Mode: file
  Level: info
  Path: logs
  # MaxSize (MB) and MaxBackups only take effect with size-based rotation.
  Rotation: size
  MaxSize: 100
  # Retention in days; the go-zero key is KeepDays (MaxAge is not a LogConf field).
  KeepDays: 7
  MaxBackups: 5
  Compress: true
2.2 链路追踪中间件
// shared/middleware/trace_middleware.go
package middleware
import (
"context"
"net/http"
"github.com/zeromicro/go-zero/core/trace"
"github.com/zeromicro/go-zero/rest/httpx"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"tianyuan/shared/errcode"
)
// TraceMiddleware returns an HTTP middleware that wraps every request in a
// span named after the request path. The span records the HTTP method, URL,
// user agent and service name up front, the response status code afterwards,
// and is marked as an error for status codes of 400 and above.
func TraceMiddleware(serviceName string) func(http.Handler) http.Handler {
	wrap := func(next http.Handler) http.Handler {
		traced := func(w http.ResponseWriter, r *http.Request) {
			spanCtx, span := otel.Tracer(serviceName).Start(r.Context(), r.URL.Path)
			defer span.End()

			// Request-side attributes.
			span.SetAttributes(
				attribute.String("http.method", r.Method),
				attribute.String("http.url", r.URL.String()),
				attribute.String("http.user_agent", r.UserAgent()),
				attribute.String("service.name", serviceName),
			)

			// Downstream handlers see the span through the request context,
			// and the wrapper captures the status code they write.
			recorder := &responseWrapper{
				ResponseWriter: w,
				statusCode:     http.StatusOK,
			}
			next.ServeHTTP(recorder, r.WithContext(spanCtx))

			// Response-side attributes.
			span.SetAttributes(attribute.Int("http.status_code", recorder.statusCode))
			if recorder.statusCode < 400 {
				return
			}
			span.SetStatus(codes.Error, http.StatusText(recorder.statusCode))
		}
		return http.HandlerFunc(traced)
	}
	return wrap
}
// responseWrapper decorates an http.ResponseWriter so that middleware can
// read the status code that was actually sent to the client.
type responseWrapper struct {
	http.ResponseWriter
	statusCode  int  // status sent to the client; callers initialise it to 200
	wroteHeader bool // true once a final (non-informational) status is fixed
}

// WriteHeader records statusCode and forwards the call. net/http only honours
// the first final WriteHeader call, so later calls are forwarded (letting the
// server report them as superfluous) but no longer change the recorded status.
func (w *responseWrapper) WriteHeader(statusCode int) {
	if !w.wroteHeader {
		w.statusCode = statusCode
		// 1xx informational responses (other than 101) may still be followed
		// by a final status, so they do not lock the recorded value.
		informational := statusCode >= 100 && statusCode < 200 &&
			statusCode != http.StatusSwitchingProtocols
		if !informational {
			w.wroteHeader = true
		}
	}
	w.ResponseWriter.WriteHeader(statusCode)
}

// Write forwards the body bytes. Writing a body before any final WriteHeader
// call implicitly sends 200 OK, which is recorded here.
func (w *responseWrapper) Write(p []byte) (int, error) {
	if !w.wroteHeader {
		w.statusCode = http.StatusOK
		w.wroteHeader = true
	}
	return w.ResponseWriter.Write(p)
}

// Unwrap exposes the wrapped writer so that http.ResponseController can reach
// optional interfaces such as http.Flusher on the underlying writer.
func (w *responseWrapper) Unwrap() http.ResponseWriter {
	return w.ResponseWriter
}
2.3 RPC 链路追踪拦截器
// shared/interceptor/trace_interceptor.go
package interceptor
import (
"context"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"tianyuan/shared/errcode"
)
// TraceClientInterceptor returns a unary gRPC client interceptor that wraps
// each outgoing call in a span named after the RPC method. Failed calls mark
// the span as an error and record the gRPC status code.
func TraceClientInterceptor(serviceName string) grpc.UnaryClientInterceptor {
	return func(ctx context.Context, method string, req, reply interface{},
		cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
		spanCtx, span := otel.Tracer(serviceName).Start(ctx, method)
		defer span.End()

		// Attributes describing the call.
		span.SetAttributes(
			attribute.String("rpc.method", method),
			attribute.String("rpc.service", serviceName),
			attribute.String("rpc.system", "grpc"),
		)

		// Perform the actual RPC with the span attached to the context.
		callErr := invoker(spanCtx, method, req, reply, cc, opts...)
		if callErr == nil {
			return nil
		}

		span.SetStatus(codes.Error, callErr.Error())
		span.SetAttributes(
			attribute.String("rpc.grpc.status_code", status.Code(callErr).String()),
		)
		return callErr
	}
}
// TraceServerInterceptor returns a unary gRPC server interceptor that wraps
// each incoming call in a span named after the full method name. When the
// handler fails, the span is marked as an error; if the error is an
// *errcode.AppError its type, code and level are recorded as attributes.
func TraceServerInterceptor(serviceName string) grpc.UnaryServerInterceptor {
	return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo,
		handler grpc.UnaryHandler) (interface{}, error) {
		spanCtx, span := otel.Tracer(serviceName).Start(ctx, info.FullMethod)
		defer span.End()

		// Attributes describing the call.
		span.SetAttributes(
			attribute.String("rpc.method", info.FullMethod),
			attribute.String("rpc.service", serviceName),
			attribute.String("rpc.system", "grpc"),
		)

		// Run the real handler with the span attached to the context.
		resp, handlerErr := handler(spanCtx, req)
		if handlerErr == nil {
			return resp, nil
		}

		span.SetStatus(codes.Error, handlerErr.Error())

		// Only errors that are directly an *errcode.AppError (not wrapped
		// ones) carry the structured details recorded below.
		appErr, isAppErr := handlerErr.(*errcode.AppError)
		if isAppErr {
			span.SetAttributes(
				attribute.String("error.type", string(appErr.Type)),
				attribute.String("error.code", appErr.Code),
				attribute.String("error.level", appErr.Level.String()),
			)
		}
		return resp, handlerErr
	}
}
3. 日志集成
3.1 结构化日志
// shared/logger/logger.go
package logger
import (
"context"
"github.com/zeromicro/go-zero/core/logx"
"github.com/zeromicro/go-zero/core/trace"
"tianyuan/shared/errcode"
)
// LogFields is a set of additional key/value pairs attached to a log entry.
type LogFields map[string]interface{}
// StructuredLogger writes log entries tagged with the service and method
// they belong to.
type StructuredLogger struct {
	service string // service name attached to every entry
	method  string // method name attached to every entry
}

// NewStructuredLogger returns a logger bound to the given service and method.
func NewStructuredLogger(service, method string) *StructuredLogger {
	l := new(StructuredLogger)
	l.service = service
	l.method = method
	return l
}
// LogError logs err with the logger's base fields plus fields. A nil err is
// ignored.
//
// If err is an *errcode.AppError, its code, type, level and details are added
// as fields and the entry is written at the severity matching its level:
// Debug/Info -> Info, Warn -> Slow, everything else -> Error. Any other error
// is written at Error severity with its text in the "error" field.
func (l *StructuredLogger) LogError(ctx context.Context, err error, fields LogFields) {
	if err == nil {
		return
	}

	logFields := l.buildBaseFields(ctx, fields)

	appErr, ok := err.(*errcode.AppError)
	if !ok {
		// Plain error: no structured details available.
		logFields = append(logFields, logx.Field("error", err.Error()))
		logx.WithContext(ctx).WithFields(logFields...).Error(err.Error())
		return
	}

	// Appended last so they take precedence over caller-supplied keys.
	logFields = append(logFields,
		logx.Field("error_code", appErr.Code),
		logx.Field("error_type", appErr.Type),
		logx.Field("error_level", appErr.Level.String()),
		logx.Field("error_details", appErr.Details),
	)
	entry := logx.WithContext(ctx).WithFields(logFields...)

	switch appErr.Level {
	case errcode.LevelDebug, errcode.LevelInfo:
		entry.Info(appErr.Message)
	case errcode.LevelWarn:
		// The logx.Logger interface offers no warn method; Slow is the
		// closest channel and matches LogWarn.
		entry.Slow(appErr.Message)
	default:
		// Error, Fatal and Panic. logx.Logger has no Severe method, so the
		// "error_level" field carries the distinction.
		entry.Error(appErr.Message)
	}
}

// LogInfo writes message at Info severity with the base fields plus fields.
func (l *StructuredLogger) LogInfo(ctx context.Context, message string, fields LogFields) {
	logx.WithContext(ctx).WithFields(l.buildBaseFields(ctx, fields)...).Info(message)
}

// LogWarn writes message through the Slow channel with the base fields plus
// fields.
func (l *StructuredLogger) LogWarn(ctx context.Context, message string, fields LogFields) {
	logx.WithContext(ctx).WithFields(l.buildBaseFields(ctx, fields)...).Slow(message)
}

// buildBaseFields returns the fields common to every entry: service, method,
// the trace and span IDs found in ctx (when present) and the caller-supplied
// fields. Caller-supplied keys override base keys of the same name.
func (l *StructuredLogger) buildBaseFields(ctx context.Context, fields LogFields) []logx.LogField {
	merged := make(map[string]interface{}, len(fields)+4)
	merged["service"] = l.service
	merged["method"] = l.method

	// Trace information, when the context carries a span.
	if traceId := trace.TraceIDFromContext(ctx); traceId != "" {
		merged["trace_id"] = traceId
	}
	if spanId := trace.SpanIDFromContext(ctx); spanId != "" {
		merged["span_id"] = spanId
	}

	// Caller-supplied fields win over the base ones.
	for k, v := range fields {
		merged[k] = v
	}

	// logx.LogField is a key/value pair, so the merged map is flattened.
	out := make([]logx.LogField, 0, len(merged))
	for k, v := range merged {
		out = append(out, logx.Field(k, v))
	}
	return out
}
3.2 日志中间件
// shared/middleware/log_middleware.go
package middleware
import (
"context"
"net/http"
"time"
"github.com/zeromicro/go-zero/rest/httpx"
"tianyuan/shared/logger"
)
// LogMiddleware returns an HTTP middleware that logs the start and the end
// of every request. The end entry carries the status code and the duration
// in milliseconds, and is written as a warning for status codes of 400 and
// above.
func LogMiddleware(serviceName string) func(http.Handler) http.Handler {
	return func(next http.Handler) http.Handler {
		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			start := time.Now()
			// Named reqLog so the logger package is not shadowed; otherwise
			// logger.LogFields below would no longer resolve to the type.
			reqLog := logger.NewStructuredLogger(serviceName, r.URL.Path)

			// Request start.
			reqLog.LogInfo(r.Context(), "request_start", logger.LogFields{
				"method":     r.Method,
				"path":       r.URL.Path,
				"query":      r.URL.RawQuery,
				"user_agent": r.UserAgent(),
				"remote_ip":  httpx.GetRemoteAddr(r),
			})

			// Capture the status code written by downstream handlers.
			wrapper := &responseWrapper{
				ResponseWriter: w,
				statusCode:     http.StatusOK,
			}

			next.ServeHTTP(wrapper, r)

			// Request end.
			fields := logger.LogFields{
				"status_code": wrapper.statusCode,
				"duration_ms": time.Since(start).Milliseconds(),
			}
			if wrapper.statusCode >= 400 {
				reqLog.LogWarn(r.Context(), "request_error", fields)
				return
			}
			reqLog.LogInfo(r.Context(), "request_success", fields)
		})
	}
}
4. 使用示例
4.1 在 Handler 中使用
// client/internal/handler/product/getproductlisthandler.go
// GetProductList handles the product list HTTP request: it validates and
// parses the request, delegates to the logic layer, and logs the outcome
// before writing either an error or a success response.
func (h *GetProductListHandler) GetProductList(w http.ResponseWriter, r *http.Request) {
	errBuilder := errcode.NewErrorBuilder("client-api", "GetProductList")
	// Named log so the logger package is not shadowed; otherwise
	// logger.LogFields below would no longer resolve to the type.
	log := logger.NewStructuredLogger("client-api", "GetProductList")

	var req types.GetProductListReq

	// Parameter validation.
	if err := validator.ValidateAndParse(r, &req); err != nil {
		appErr := errBuilder.ValidationError("PARAM_INVALID", "参数校验失败", err)
		log.LogError(r.Context(), appErr, logger.LogFields{
			"request": req,
		})
		response.ErrorResponse(w, appErr)
		return
	}

	// Delegate to the logic layer.
	resp, err := h.logic.GetProductList(r.Context(), &req)
	if err != nil {
		log.LogError(r.Context(), err, logger.LogFields{
			"request": req,
		})
		response.ErrorResponse(w, err)
		return
	}

	// Success.
	log.LogInfo(r.Context(), "get_product_list_success", logger.LogFields{
		"request":      req,
		"result_count": len(resp.List),
	})
	response.SuccessResponse(w, resp)
}
4.2 在 RPC Logic 中使用
// domains/product/rpc/internal/logic/getproductlistlogic.go
// GetProductList validates the request, loads the matching products from the
// product model and returns them. Validation failures are returned as
// business errors and query failures as system errors; both are logged
// before being returned.
func (l *GetProductListLogic) GetProductList(ctx context.Context, req *product.GetProductListReq) (*product.GetProductListResp, error) {
	errBuilder := errcode.NewErrorBuilder("product-rpc", "GetProductList")
	// Named log so the logger package is not shadowed; otherwise
	// logger.LogFields below would no longer resolve to the type.
	log := logger.NewStructuredLogger("product-rpc", "GetProductList")

	// Business validation. The local is named so that it does not shadow the
	// validator package.
	productValidator := validator.NewProductValidator(ctx, l.svcCtx)
	if err := productValidator.ValidateGetProductListRequest(req); err != nil {
		appErr := errBuilder.BusinessError("VALIDATION_FAILED", err.Error())
		log.LogError(ctx, appErr, logger.LogFields{
			"request": req,
		})
		return nil, appErr
	}

	// Database query.
	products, err := l.svcCtx.ProductModel.FindList(ctx, req)
	if err != nil {
		appErr := errBuilder.SystemError("DB_QUERY_FAILED", "查询产品列表失败", err)
		log.LogError(ctx, appErr, logger.LogFields{
			"request":  req,
			"db_error": err.Error(),
		})
		return nil, appErr
	}

	log.LogInfo(ctx, "get_product_list_success", logger.LogFields{
		"request":      req,
		"result_count": len(products),
	})

	// NOTE(review): Total is the number of rows returned by this query;
	// confirm that is intended if FindList paginates.
	return &product.GetProductListResp{
		List:  products,
		Total: int64(len(products)),
	}, nil
}
5. 监控和告警
5.1 错误监控配置
// shared/monitor/error_monitor.go
package monitor
import (
"context"
"github.com/zeromicro/go-zero/core/metric"
"tianyuan/shared/errcode"
)
// ErrorCounter counts errors, labelled by service, error type, level and
// error code.
var ErrorCounter = metric.NewCounterVec(&metric.CounterVecOpts{
	Namespace: "tianyuan",
	Subsystem: "error",
	Name:      "total",
	Help:      "Total number of errors",
	Labels:    []string{"service", "type", "level", "code"},
})

// ErrorRateHistogram is a histogram labelled by service and method.
// NOTE(review): no buckets are configured and nothing shown here observes
// into it; confirm how it is meant to be used.
var ErrorRateHistogram = metric.NewHistogramVec(&metric.HistogramVecOpts{
	Namespace: "tianyuan",
	Subsystem: "error",
	Name:      "rate",
	Help:      "Error rate histogram",
	Labels:    []string{"service", "method"},
})
// RecordError increments ErrorCounter for appErr, labelled with its service,
// type, level and code. A nil appErr is ignored so callers can pass the
// result of a failed type assertion without crashing.
func RecordError(appErr *errcode.AppError) {
	if appErr == nil {
		return
	}
	ErrorCounter.Inc(
		appErr.Service,
		string(appErr.Type),
		appErr.Level.String(),
		appErr.Code,
	)
}
5.2 告警规则
# Prometheus alerting rules
groups:
  - name: tianyuan-errors
    rules:
      # Error-rate alert
      - alert: HighErrorRate
        expr: rate(tianyuan_error_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value }} for service {{ $labels.service }}"

      # Fatal-error alert; PANIC is more severe than FATAL and must page as well.
      - alert: FatalError
        expr: increase(tianyuan_error_total{level=~"FATAL|PANIC"}[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Fatal error detected"
          description: "Fatal error in service {{ $labels.service }}: {{ $labels.code }}"
6. 最佳实践总结
- 错误分级原则:
  - DEBUG/INFO:开发调试信息与一般业务信息
  - WARN:需要关注但不影响业务
  - ERROR:业务错误,需要处理
  - FATAL/PANIC:系统级错误,需要立即处理
- 链路追踪要点:
  - 每个请求都有唯一的 TraceId
  - 跨服务调用保持链路连续性
  - 关键操作添加自定义 Span
- 日志记录规范:
  - 结构化日志,便于查询分析
  - 包含链路追踪信息
  - 敏感信息脱敏处理
- 监控告警策略:
  - 错误率监控
  - 关键错误实时告警
  - 链路追踪性能监控