tianyuan-api-server/go-zero错误分级与链路追踪设计.md
2025-07-13 20:37:12 +08:00

740 lines
21 KiB
Markdown
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# go-zero 错误分级与链路追踪设计
## 1. 错误分级体系
### 1.1 错误等级定义
```go
// shared/errcode/levels.go
package errcode
// ErrorLevel is the severity attached to an error. Values are declared in
// ascending order of severity, from LevelDebug up to LevelPanic.
type ErrorLevel int

const (
	// LevelDebug marks debugging information used during development.
	LevelDebug ErrorLevel = iota
	// LevelInfo marks ordinary business information.
	LevelInfo
	// LevelWarn marks conditions that need attention but do not affect the business flow.
	LevelWarn
	// LevelError marks business errors that must be handled.
	LevelError
	// LevelFatal marks system-level errors that affect the service.
	LevelFatal
	// LevelPanic marks severe errors that leave the service unavailable.
	LevelPanic
)

// LevelNames maps every declared ErrorLevel to its upper-case display name.
var LevelNames = map[ErrorLevel]string{
	LevelDebug: "DEBUG",
	LevelInfo:  "INFO",
	LevelWarn:  "WARN",
	LevelError: "ERROR",
	LevelFatal: "FATAL",
	LevelPanic: "PANIC",
}

// String returns the name registered for l in LevelNames, or "UNKNOWN" when
// l has no entry there.
func (l ErrorLevel) String() string {
	name, found := LevelNames[l]
	if !found {
		return "UNKNOWN"
	}
	return name
}
```
### 1.2 错误分类体系
```go
// shared/errcode/types.go
package errcode
import (
"fmt"
"time"
)
// ErrorType classifies an error by where it originates.
type ErrorType string

// System-level error types.
const (
	ErrorTypeSystem   ErrorType = "SYSTEM"   // generic system error
	ErrorTypeNetwork  ErrorType = "NETWORK"  // network error
	ErrorTypeDatabase ErrorType = "DATABASE" // database error
	ErrorTypeRedis    ErrorType = "REDIS"    // Redis error
	ErrorTypeMQ       ErrorType = "MQ"       // message queue error
	ErrorTypeRPC      ErrorType = "RPC"      // RPC call error
)

// Business-level error types.
const (
	ErrorTypeBusiness   ErrorType = "BUSINESS"   // business logic error
	ErrorTypeValidation ErrorType = "VALIDATION" // parameter validation error
	ErrorTypeAuth       ErrorType = "AUTH"       // authentication/authorization error
	ErrorTypePermission ErrorType = "PERMISSION" // permission error
)

// Client-side error types.
const (
	ErrorTypeParam    ErrorType = "PARAM"    // parameter error
	ErrorTypeRequest  ErrorType = "REQUEST"  // request error
	ErrorTypeResponse ErrorType = "RESPONSE" // response error
)
// AppError is the unified error value shared across services. Every field
// except Cause is serialized to JSON under the tag shown.
type AppError struct {
	Code      string      `json:"code"`      // error code
	Message   string      `json:"message"`   // error message
	Level     ErrorLevel  `json:"level"`     // severity level
	Type      ErrorType   `json:"type"`      // error category
	TraceId   string      `json:"trace_id"`  // trace ID
	SpanId    string      `json:"span_id"`   // span ID
	Service   string      `json:"service"`   // service name
	Method    string      `json:"method"`    // method name
	Timestamp time.Time   `json:"timestamp"` // timestamp of the error
	Details   interface{} `json:"details"`   // additional details
	Stack     string      `json:"stack"`     // stack trace (intended for LevelError and above)
	Cause     error       `json:"-"`         // underlying error, excluded from JSON
}

// Error implements the error interface. The text has the form
// "[LEVEL][TYPE][CODE] service: message".
func (e *AppError) Error() string {
	const layout = "[%s][%s][%s] %s: %s"
	levelName := e.Level.String()
	return fmt.Sprintf(layout, levelName, e.Type, e.Code, e.Service, e.Message)
}

// Unwrap returns the underlying Cause so that errors.Is and errors.As can
// look through an AppError.
func (e *AppError) Unwrap() error {
	return e.Cause
}
```
### 1.3 错误构造器
```go
// shared/errcode/builder.go
package errcode
import (
	"context"
	"runtime"
	"time"

	"github.com/zeromicro/go-zero/core/trace"
)
// ErrorBuilder creates AppError values pre-filled with the service and method
// they originate from.
//
// A builder returned by NewErrorBuilder has no context bound, so the errors it
// builds have empty TraceId and SpanId. Call WithContext with the request
// context to obtain a builder whose errors carry the IDs read from it.
type ErrorBuilder struct {
	service string
	method  string
	// ctx is the request context trace/span IDs are read from; nil means
	// no trace information is attached.
	ctx context.Context
}

// NewErrorBuilder returns a builder for the given service and method names.
func NewErrorBuilder(service, method string) *ErrorBuilder {
	return &ErrorBuilder{
		service: service,
		method:  method,
	}
}

// WithContext returns a copy of the builder bound to ctx. The receiver is not
// modified, so a shared builder can be specialised per request.
func (b *ErrorBuilder) WithContext(ctx context.Context) *ErrorBuilder {
	bound := *b
	bound.ctx = ctx
	return &bound
}

// Debug builds a LevelDebug error of type ErrorTypeSystem.
func (b *ErrorBuilder) Debug(code, message string) *AppError {
	return b.buildError(LevelDebug, ErrorTypeSystem, code, message, nil, nil)
}

// Info builds a LevelInfo error of type ErrorTypeSystem.
func (b *ErrorBuilder) Info(code, message string) *AppError {
	return b.buildError(LevelInfo, ErrorTypeSystem, code, message, nil, nil)
}

// Warn builds a LevelWarn error of the given type.
func (b *ErrorBuilder) Warn(errorType ErrorType, code, message string) *AppError {
	return b.buildError(LevelWarn, errorType, code, message, nil, nil)
}

// Error builds a LevelError error of the given type wrapping cause.
func (b *ErrorBuilder) Error(errorType ErrorType, code, message string, cause error) *AppError {
	return b.buildError(LevelError, errorType, code, message, cause, nil)
}

// Fatal builds a LevelFatal error of the given type wrapping cause.
func (b *ErrorBuilder) Fatal(errorType ErrorType, code, message string, cause error) *AppError {
	return b.buildError(LevelFatal, errorType, code, message, cause, nil)
}

// Panic builds a LevelPanic error of the given type wrapping cause. It only
// constructs the value; it does not call panic.
func (b *ErrorBuilder) Panic(errorType ErrorType, code, message string, cause error) *AppError {
	return b.buildError(LevelPanic, errorType, code, message, cause, nil)
}

// BusinessError builds a LevelError error of type ErrorTypeBusiness.
func (b *ErrorBuilder) BusinessError(code, message string) *AppError {
	return b.buildError(LevelError, ErrorTypeBusiness, code, message, nil, nil)
}

// ValidationError builds a LevelWarn error of type ErrorTypeValidation and
// stores details in AppError.Details.
func (b *ErrorBuilder) ValidationError(code, message string, details interface{}) *AppError {
	return b.buildError(LevelWarn, ErrorTypeValidation, code, message, nil, details)
}

// PermissionError builds a LevelWarn error of type ErrorTypePermission.
func (b *ErrorBuilder) PermissionError(code, message string) *AppError {
	return b.buildError(LevelWarn, ErrorTypePermission, code, message, nil, nil)
}

// SystemError builds a LevelFatal error of type ErrorTypeSystem wrapping cause.
func (b *ErrorBuilder) SystemError(code, message string, cause error) *AppError {
	return b.buildError(LevelFatal, ErrorTypeSystem, code, message, cause, nil)
}

// buildError assembles the AppError shared by all public constructors. It
// stamps the current time, copies trace/span IDs from the bound context (when
// one is set) and captures a stack trace for LevelError and above.
func (b *ErrorBuilder) buildError(level ErrorLevel, errorType ErrorType, code, message string, cause error, details interface{}) *AppError {
	appErr := &AppError{
		Code:      code,
		Message:   message,
		Level:     level,
		Type:      errorType,
		Service:   b.service,
		Method:    b.method,
		Timestamp: time.Now(),
		Details:   details,
		Cause:     cause,
	}

	// The original referenced an undefined ctx here; the IDs now come from
	// the context bound via WithContext. Without one the fields stay empty.
	if b.ctx != nil {
		appErr.TraceId = trace.TraceIDFromContext(b.ctx)
		appErr.SpanId = trace.SpanIDFromContext(b.ctx)
	}

	// Stack traces are only recorded for LevelError and above.
	if level >= LevelError {
		appErr.Stack = getStackTrace()
	}
	return appErr
}
// getStackTrace returns the formatted stack trace of the calling goroutine.
//
// The buffer starts at 4 KiB and is doubled whenever runtime.Stack fills it
// completely, so deep call stacks are not silently cut off at 4096 bytes.
// Growth stops at maxStackBytes to bound memory use; a trace longer than that
// is truncated.
func getStackTrace() string {
	const (
		initialStackBytes = 4 << 10
		maxStackBytes     = 1 << 20
	)
	buf := make([]byte, initialStackBytes)
	for {
		n := runtime.Stack(buf, false)
		// n == len(buf) means the trace may not have fit; retry with more room.
		if n < len(buf) || len(buf) >= maxStackBytes {
			return string(buf[:n])
		}
		buf = make([]byte, 2*len(buf))
	}
}
```
## 2. 链路追踪集成
### 2.1 链路追踪配置
```yaml
# etc/client-api.yaml
Name: client-api
Host: 0.0.0.0
Port: 8080

# Distributed tracing configuration
Telemetry:
  Name: client-api
  Endpoint: http://jaeger:14268/api/traces
  Sampler: 1.0
  Batcher: jaeger

# Logging configuration
Log:
  ServiceName: client-api
  Mode: file
  Level: info
  Path: logs
  MaxSize: 100
  MaxAge: 7
  MaxBackups: 5
  Compress: true
```
### 2.2 链路追踪中间件
```go
// shared/middleware/trace_middleware.go
package middleware
import (
"context"
"net/http"
"github.com/zeromicro/go-zero/core/trace"
"github.com/zeromicro/go-zero/rest/httpx"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"tianyuan/shared/errcode"
)
// TraceMiddleware returns HTTP middleware that wraps every request in a span
// created from the tracer named serviceName. The span is named after the URL
// path, carries the request method, URL, user agent and service name, and is
// annotated with the response status code once the next handler returns.
// Status codes of 400 and above mark the span as an error.
func TraceMiddleware(serviceName string) func(http.Handler) http.Handler {
	wrap := func(next http.Handler) http.Handler {
		handle := func(w http.ResponseWriter, r *http.Request) {
			spanCtx, span := otel.Tracer(serviceName).Start(r.Context(), r.URL.Path)
			defer span.End()

			// Request-side attributes.
			span.SetAttributes(
				attribute.String("http.method", r.Method),
				attribute.String("http.url", r.URL.String()),
				attribute.String("http.user_agent", r.UserAgent()),
				attribute.String("service.name", serviceName),
			)

			// Downstream handlers see the span through the request context;
			// the recorder captures the status code they write.
			recorder := &responseWrapper{
				ResponseWriter: w,
				statusCode:     http.StatusOK,
			}
			next.ServeHTTP(recorder, r.WithContext(spanCtx))

			// Response-side attributes.
			status := recorder.statusCode
			span.SetAttributes(attribute.Int("http.status_code", status))
			if status < 400 {
				return
			}
			span.SetStatus(codes.Error, http.StatusText(status))
		}
		return http.HandlerFunc(handle)
	}
	return wrap
}
// responseWrapper decorates an http.ResponseWriter so that middleware can
// read the status code of the response after the handler has run.
//
// statusCode holds the status that was actually sent: the first final status
// passed to WriteHeader, or 200 when the handler writes a body without
// calling WriteHeader. Later WriteHeader calls are ignored by net/http, so
// they must not overwrite the recorded value.
type responseWrapper struct {
	http.ResponseWriter
	statusCode  int
	wroteHeader bool // true once a final (non-informational) status is fixed
}

// WriteHeader records statusCode unless a final status has already been
// recorded, then forwards the call to the wrapped writer unchanged.
func (w *responseWrapper) WriteHeader(statusCode int) {
	if !w.wroteHeader {
		w.statusCode = statusCode
		// 1xx responses other than 101 are informational and may be followed
		// by the real status, so they do not lock the recorded value.
		informational := statusCode >= 100 && statusCode < 200 &&
			statusCode != http.StatusSwitchingProtocols
		w.wroteHeader = !informational
	}
	w.ResponseWriter.WriteHeader(statusCode)
}

// Write forwards b to the wrapped writer. When no final status has been
// recorded yet, the body write implies 200 OK, which is recorded and locked.
func (w *responseWrapper) Write(b []byte) (int, error) {
	if !w.wroteHeader {
		w.statusCode = http.StatusOK
		w.wroteHeader = true
	}
	return w.ResponseWriter.Write(b)
}
```
### 2.3 RPC 链路追踪拦截器
```go
// shared/interceptor/trace_interceptor.go
package interceptor
import (
"context"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"tianyuan/shared/errcode"
)
// TraceClientInterceptor returns a unary gRPC client interceptor that wraps
// each outgoing call in a span named after the RPC method, created from the
// tracer named serviceName. When the call fails, the span is marked as an
// error and annotated with the gRPC status code derived from the error.
func TraceClientInterceptor(serviceName string) grpc.UnaryClientInterceptor {
	return func(ctx context.Context, method string, req, reply interface{},
		cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
		spanCtx, span := otel.Tracer(serviceName).Start(ctx, method)
		defer span.End()

		span.SetAttributes(
			attribute.String("rpc.method", method),
			attribute.String("rpc.service", serviceName),
			attribute.String("rpc.system", "grpc"),
		)

		// Invoke the RPC with the span-carrying context.
		callErr := invoker(spanCtx, method, req, reply, cc, opts...)
		if callErr == nil {
			return nil
		}

		span.SetStatus(codes.Error, callErr.Error())
		grpcCode := status.Code(callErr).String()
		span.SetAttributes(attribute.String("rpc.grpc.status_code", grpcCode))
		return callErr
	}
}
// TraceServerInterceptor returns a unary gRPC server interceptor that wraps
// each incoming call in a span named after info.FullMethod, created from the
// tracer named serviceName. When the handler fails, the span is marked as an
// error; if the returned error is itself an *errcode.AppError (a direct type
// assertion, so wrapped errors do not match), its type, code and level are
// added as span attributes.
func TraceServerInterceptor(serviceName string) grpc.UnaryServerInterceptor {
	return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo,
		handler grpc.UnaryHandler) (interface{}, error) {
		spanCtx, span := otel.Tracer(serviceName).Start(ctx, info.FullMethod)
		defer span.End()

		span.SetAttributes(
			attribute.String("rpc.method", info.FullMethod),
			attribute.String("rpc.service", serviceName),
			attribute.String("rpc.system", "grpc"),
		)

		// Run the handler with the span-carrying context.
		resp, handleErr := handler(spanCtx, req)
		if handleErr == nil {
			return resp, nil
		}

		span.SetStatus(codes.Error, handleErr.Error())
		appErr, isAppErr := handleErr.(*errcode.AppError)
		if isAppErr {
			span.SetAttributes(
				attribute.String("error.type", string(appErr.Type)),
				attribute.String("error.code", appErr.Code),
				attribute.String("error.level", appErr.Level.String()),
			)
		}
		return resp, handleErr
	}
}
```
## 3. 日志集成
### 3.1 结构化日志
```go
// shared/logger/logger.go
package logger
import (
"context"
"github.com/zeromicro/go-zero/core/logx"
"github.com/zeromicro/go-zero/core/trace"
"tianyuan/shared/errcode"
)
// LogFields is a set of caller-supplied key/value pairs attached to a log entry.
type LogFields map[string]interface{}

// StructuredLogger writes log entries through logx, tagging each one with the
// service and method it was created for.
type StructuredLogger struct {
	service string
	method  string
}

// NewStructuredLogger returns a logger for the given service and method names.
func NewStructuredLogger(service, method string) *StructuredLogger {
	return &StructuredLogger{
		service: service,
		method:  method,
	}
}

// LogError logs err together with fields. A nil err is ignored.
//
// When err is an *errcode.AppError, its code, type, level and details are
// added as fields and the logx method is chosen from the level: Debug and
// Info use Info, Warn uses Slow, and Error, Fatal and Panic use Error. The
// logx.Logger interface has no Severe method, so Fatal and Panic are told
// apart by the "error_level" field. Any other error is logged with Error and
// an "error" field holding its text.
func (l *StructuredLogger) LogError(ctx context.Context, err error, fields LogFields) {
	if err == nil {
		return
	}
	merged := l.buildBaseFields(ctx, fields)

	appErr, ok := err.(*errcode.AppError)
	if !ok {
		// Plain error.
		merged["error"] = err.Error()
		l.entry(ctx, merged).Error(err.Error())
		return
	}

	// Application error: expose its structured parts as fields.
	merged["error_code"] = appErr.Code
	merged["error_type"] = appErr.Type
	merged["error_level"] = appErr.Level.String()
	merged["error_details"] = appErr.Details

	entry := l.entry(ctx, merged)
	switch appErr.Level {
	case errcode.LevelDebug, errcode.LevelInfo:
		entry.Info(appErr.Message)
	case errcode.LevelWarn:
		entry.Slow(appErr.Message)
	case errcode.LevelError, errcode.LevelFatal, errcode.LevelPanic:
		entry.Error(appErr.Message)
	}
}

// LogInfo logs message with fields through the logx Info method.
func (l *StructuredLogger) LogInfo(ctx context.Context, message string, fields LogFields) {
	l.entry(ctx, l.buildBaseFields(ctx, fields)).Info(message)
}

// LogWarn logs message with fields through the logx Slow method.
func (l *StructuredLogger) LogWarn(ctx context.Context, message string, fields LogFields) {
	l.entry(ctx, l.buildBaseFields(ctx, fields)).Slow(message)
}

// buildBaseFields returns a fresh map holding the service and method names,
// the trace and span IDs found in ctx (when non-empty) and every entry of
// fields. Entries in fields override base entries with the same key.
func (l *StructuredLogger) buildBaseFields(ctx context.Context, fields LogFields) LogFields {
	merged := make(LogFields, len(fields)+4)
	merged["service"] = l.service
	merged["method"] = l.method

	// Trace information.
	if traceID := trace.TraceIDFromContext(ctx); traceID != "" {
		merged["trace_id"] = traceID
	}
	if spanID := trace.SpanIDFromContext(ctx); spanID != "" {
		merged["span_id"] = spanID
	}

	// Caller-supplied fields.
	for key, value := range fields {
		merged[key] = value
	}
	return merged
}

// entry converts merged into logx fields and returns a context-bound logger
// carrying them. logx.LogField is a key/value struct and WithFields is
// variadic, so the map is expanded into a slice here; the field order follows
// map iteration and is therefore not deterministic.
func (l *StructuredLogger) entry(ctx context.Context, merged LogFields) logx.Logger {
	logFields := make([]logx.LogField, 0, len(merged))
	for key, value := range merged {
		logFields = append(logFields, logx.Field(key, value))
	}
	return logx.WithContext(ctx).WithFields(logFields...)
}
```
### 3.2 日志中间件
```go
// shared/middleware/log_middleware.go
package middleware
import (
"context"
"net/http"
"time"
"github.com/zeromicro/go-zero/rest/httpx"
"tianyuan/shared/logger"
)
// LogMiddleware returns HTTP middleware that logs the start and the end of
// every request through a StructuredLogger created for serviceName and the
// request path. The end entry carries the status code and the duration in
// milliseconds; status codes of 400 and above are logged as warnings.
func LogMiddleware(serviceName string) func(http.Handler) http.Handler {
	return func(next http.Handler) http.Handler {
		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			start := time.Now()
			// Named reqLog rather than logger: a local called logger would
			// shadow the logger package and make logger.LogFields below fail
			// to compile.
			reqLog := logger.NewStructuredLogger(serviceName, r.URL.Path)

			// Request start.
			reqLog.LogInfo(r.Context(), "request_start", logger.LogFields{
				"method":     r.Method,
				"path":       r.URL.Path,
				"query":      r.URL.RawQuery,
				"user_agent": r.UserAgent(),
				"remote_ip":  httpx.GetRemoteAddr(r),
			})

			// Capture the status code written by the next handler.
			wrapper := &responseWrapper{
				ResponseWriter: w,
				statusCode:     http.StatusOK,
			}
			next.ServeHTTP(wrapper, r)

			// Request end.
			fields := logger.LogFields{
				"status_code": wrapper.statusCode,
				"duration_ms": time.Since(start).Milliseconds(),
			}
			if wrapper.statusCode >= 400 {
				reqLog.LogWarn(r.Context(), "request_error", fields)
				return
			}
			reqLog.LogInfo(r.Context(), "request_success", fields)
		})
	}
}
```
## 4. 使用示例
### 4.1 在 Handler 中使用
```go
// client/internal/handler/product/getproductlisthandler.go
// GetProductList handles the product list request: it parses and validates
// the request, delegates to the logic layer, logs the outcome and writes
// either the error or the success response.
func (h *GetProductListHandler) GetProductList(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	errBuilder := errcode.NewErrorBuilder("client-api", "GetProductList")
	// Named reqLog rather than logger: a local called logger would shadow the
	// logger package and make logger.LogFields below fail to compile.
	reqLog := logger.NewStructuredLogger("client-api", "GetProductList")

	// Parse and validate the request parameters.
	var req types.GetProductListReq
	if err := validator.ValidateAndParse(r, &req); err != nil {
		appErr := errBuilder.ValidationError("PARAM_INVALID", "参数校验失败", err)
		reqLog.LogError(ctx, appErr, logger.LogFields{
			"request": req,
		})
		response.ErrorResponse(w, appErr)
		return
	}

	// Delegate to the logic layer.
	resp, err := h.logic.GetProductList(ctx, &req)
	if err != nil {
		reqLog.LogError(ctx, err, logger.LogFields{
			"request": req,
		})
		response.ErrorResponse(w, err)
		return
	}

	// Success.
	reqLog.LogInfo(ctx, "get_product_list_success", logger.LogFields{
		"request":      req,
		"result_count": len(resp.List),
	})
	response.SuccessResponse(w, resp)
}
```
### 4.2 在 RPC Logic 中使用
```go
// domains/product/rpc/internal/logic/getproductlistlogic.go
// GetProductList validates req, loads the matching products through
// ProductModel.FindList and returns them. Validation failures are returned as
// business errors and query failures as system errors; both are logged
// before being returned. Total is set to the number of products returned.
func (l *GetProductListLogic) GetProductList(ctx context.Context, req *product.GetProductListReq) (*product.GetProductListResp, error) {
	errBuilder := errcode.NewErrorBuilder("product-rpc", "GetProductList")
	// Locals are named reqLog and productValidator rather than logger and
	// validator: shadowing the logger package makes logger.LogFields below
	// fail to compile, and shadowing validator hides the package needlessly.
	reqLog := logger.NewStructuredLogger("product-rpc", "GetProductList")

	// Business validation.
	productValidator := validator.NewProductValidator(ctx, l.svcCtx)
	if err := productValidator.ValidateGetProductListRequest(req); err != nil {
		appErr := errBuilder.BusinessError("VALIDATION_FAILED", err.Error())
		reqLog.LogError(ctx, appErr, logger.LogFields{
			"request": req,
		})
		return nil, appErr
	}

	// Database query.
	products, err := l.svcCtx.ProductModel.FindList(ctx, req)
	if err != nil {
		appErr := errBuilder.SystemError("DB_QUERY_FAILED", "查询产品列表失败", err)
		reqLog.LogError(ctx, appErr, logger.LogFields{
			"request":  req,
			"db_error": err.Error(),
		})
		return nil, appErr
	}

	reqLog.LogInfo(ctx, "get_product_list_success", logger.LogFields{
		"request":      req,
		"result_count": len(products),
	})
	return &product.GetProductListResp{
		List:  products,
		Total: int64(len(products)),
	}, nil
}
```
## 5. 监控和告警
### 5.1 错误监控配置
```go
// shared/monitor/error_monitor.go
package monitor
import (
"context"
"github.com/zeromicro/go-zero/core/metric"
"tianyuan/shared/errcode"
)
// ErrorCounter counts errors, labelled by service, error type, level and code.
var ErrorCounter = metric.NewCounterVec(&metric.CounterVecOpts{
	Namespace: "tianyuan",
	Subsystem: "error",
	Name:      "total",
	Help:      "Total number of errors",
	Labels:    []string{"service", "type", "level", "code"},
})

// ErrorRateHistogram is a histogram for error rates, labelled by service and
// method.
var ErrorRateHistogram = metric.NewHistogramVec(&metric.HistogramVecOpts{
	Namespace: "tianyuan",
	Subsystem: "error",
	Name:      "rate",
	Help:      "Error rate histogram",
	Labels:    []string{"service", "method"},
})
// RecordError increments ErrorCounter for appErr, using its service, type,
// level name and code as label values. A nil appErr is ignored so that
// callers holding the result of a failed type assertion cannot trigger a nil
// pointer dereference here.
func RecordError(appErr *errcode.AppError) {
	if appErr == nil {
		return
	}
	ErrorCounter.Inc(
		appErr.Service,
		string(appErr.Type),
		appErr.Level.String(),
		appErr.Code,
	)
}
```
### 5.2 告警规则
```yaml
# Prometheus alerting rules
groups:
  - name: tianyuan-errors
    rules:
      # Error rate alert
      - alert: HighErrorRate
        expr: rate(tianyuan_error_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value }} for service {{ $labels.service }}"
      # Fatal error alert
      - alert: FatalError
        expr: increase(tianyuan_error_total{level="FATAL"}[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Fatal error detected"
          description: "Fatal error in service {{ $labels.service }}: {{ $labels.code }}"
```
## 6. 最佳实践总结
1. **错误分级原则**
- DEBUG/INFO:开发调试信息与一般业务信息
- WARN:需要关注但不影响业务
- ERROR:业务错误,需要处理
- FATAL/PANIC:系统级错误,需要立即处理
2. **链路追踪要点**
- 每个请求都有唯一的 TraceId
- 跨服务调用保持链路连续性
- 关键操作添加自定义 Span
3. **日志记录规范**
- 结构化日志,便于查询分析
- 包含链路追踪信息
- 敏感信息脱敏处理
4. **监控告警策略**
- 错误率监控
- 关键错误实时告警
- 链路追踪性能监控