# go-zero 错误分级与链路追踪设计
|
||
|
||
## 1. 错误分级体系
|
||
|
||
### 1.1 错误等级定义
|
||
|
||
```go
|
||
// shared/errcode/levels.go
|
||
package errcode
|
||
|
||
// ErrorLevel ranks an error by severity; a larger value is more severe.
type ErrorLevel int

// Severity levels in ascending order, numbered from zero.
const (
	// LevelDebug marks development-time diagnostic information.
	LevelDebug ErrorLevel = iota
	// LevelInfo marks ordinary business information.
	LevelInfo
	// LevelWarn marks conditions worth attention that do not affect the business flow.
	LevelWarn
	// LevelError marks business errors that have to be handled.
	LevelError
	// LevelFatal marks system-level errors that affect the service.
	LevelFatal
	// LevelPanic marks severe errors that leave the service unavailable.
	LevelPanic
)

// LevelNames maps each defined ErrorLevel to its upper-case display name.
var LevelNames = map[ErrorLevel]string{
	LevelPanic: "PANIC",
	LevelFatal: "FATAL",
	LevelError: "ERROR",
	LevelWarn:  "WARN",
	LevelInfo:  "INFO",
	LevelDebug: "DEBUG",
}

// String returns the name registered for l in LevelNames, or "UNKNOWN" when
// l has no entry there.
func (l ErrorLevel) String() string {
	name, known := LevelNames[l]
	if !known {
		return "UNKNOWN"
	}
	return name
}
|
||
```
|
||
|
||
### 1.2 错误分类体系
|
||
|
||
```go
|
||
// shared/errcode/types.go
|
||
package errcode
|
||
|
||
import (
|
||
"fmt"
|
||
"time"
|
||
)
|
||
|
||
// ErrorType classifies an error by the subsystem or concern it belongs to.
type ErrorType string

// System-level error types.
const (
	ErrorTypeSystem   ErrorType = "SYSTEM"   // generic system error
	ErrorTypeNetwork  ErrorType = "NETWORK"  // network error
	ErrorTypeDatabase ErrorType = "DATABASE" // database error
	ErrorTypeRedis    ErrorType = "REDIS"    // Redis error
	ErrorTypeMQ       ErrorType = "MQ"       // message-queue error
	ErrorTypeRPC      ErrorType = "RPC"      // RPC call error
)

// Business-level error types.
const (
	ErrorTypeBusiness   ErrorType = "BUSINESS"   // business-logic error
	ErrorTypeValidation ErrorType = "VALIDATION" // parameter validation error
	ErrorTypeAuth       ErrorType = "AUTH"       // authentication/authorization error
	ErrorTypePermission ErrorType = "PERMISSION" // permission error
)

// Client-side error types.
const (
	ErrorTypeParam    ErrorType = "PARAM"    // parameter error
	ErrorTypeRequest  ErrorType = "REQUEST"  // request error
	ErrorTypeResponse ErrorType = "RESPONSE" // response error
)
|
||
|
||
// 统一错误结构
|
||
type AppError struct {
|
||
Code string `json:"code"` // 错误码
|
||
Message string `json:"message"` // 错误消息
|
||
Level ErrorLevel `json:"level"` // 错误等级
|
||
Type ErrorType `json:"type"` // 错误类型
|
||
TraceId string `json:"trace_id"` // 链路追踪ID
|
||
SpanId string `json:"span_id"` // 跨度ID
|
||
Service string `json:"service"` // 服务名称
|
||
Method string `json:"method"` // 方法名称
|
||
Timestamp time.Time `json:"timestamp"` // 时间戳
|
||
Details interface{} `json:"details"` // 详细信息
|
||
Stack string `json:"stack"` // 堆栈信息(仅错误级别以上)
|
||
Cause error `json:"-"` // 原始错误(不序列化)
|
||
}
|
||
|
||
// 实现error接口
|
||
func (e *AppError) Error() string {
|
||
return fmt.Sprintf("[%s][%s][%s] %s: %s",
|
||
e.Level.String(), e.Type, e.Code, e.Service, e.Message)
|
||
}
|
||
|
||
// 获取原始错误
|
||
func (e *AppError) Unwrap() error {
|
||
return e.Cause
|
||
}
|
||
```
|
||
|
||
### 1.3 错误构造器
|
||
|
||
```go
|
||
// shared/errcode/builder.go
|
||
package errcode
|
||
|
||
import (
	"context"
	"runtime"
	"time"

	"github.com/zeromicro/go-zero/core/trace"
)
|
||
|
||
type ErrorBuilder struct {
|
||
service string
|
||
method string
|
||
}
|
||
|
||
func NewErrorBuilder(service, method string) *ErrorBuilder {
|
||
return &ErrorBuilder{
|
||
service: service,
|
||
method: method,
|
||
}
|
||
}
|
||
|
||
// Debug级别错误
|
||
func (b *ErrorBuilder) Debug(code, message string) *AppError {
|
||
return b.buildError(LevelDebug, ErrorTypeSystem, code, message, nil, nil)
|
||
}
|
||
|
||
// Info级别错误
|
||
func (b *ErrorBuilder) Info(code, message string) *AppError {
|
||
return b.buildError(LevelInfo, ErrorTypeSystem, code, message, nil, nil)
|
||
}
|
||
|
||
// Warn级别错误
|
||
func (b *ErrorBuilder) Warn(errorType ErrorType, code, message string) *AppError {
|
||
return b.buildError(LevelWarn, errorType, code, message, nil, nil)
|
||
}
|
||
|
||
// Error级别错误
|
||
func (b *ErrorBuilder) Error(errorType ErrorType, code, message string, cause error) *AppError {
|
||
return b.buildError(LevelError, errorType, code, message, cause, nil)
|
||
}
|
||
|
||
// Fatal级别错误
|
||
func (b *ErrorBuilder) Fatal(errorType ErrorType, code, message string, cause error) *AppError {
|
||
return b.buildError(LevelFatal, errorType, code, message, cause, nil)
|
||
}
|
||
|
||
// Panic级别错误
|
||
func (b *ErrorBuilder) Panic(errorType ErrorType, code, message string, cause error) *AppError {
|
||
return b.buildError(LevelPanic, errorType, code, message, cause, nil)
|
||
}
|
||
|
||
// 业务错误(常用)
|
||
func (b *ErrorBuilder) BusinessError(code, message string) *AppError {
|
||
return b.buildError(LevelError, ErrorTypeBusiness, code, message, nil, nil)
|
||
}
|
||
|
||
// 参数校验错误(常用)
|
||
func (b *ErrorBuilder) ValidationError(code, message string, details interface{}) *AppError {
|
||
return b.buildError(LevelWarn, ErrorTypeValidation, code, message, nil, details)
|
||
}
|
||
|
||
// 权限错误(常用)
|
||
func (b *ErrorBuilder) PermissionError(code, message string) *AppError {
|
||
return b.buildError(LevelWarn, ErrorTypePermission, code, message, nil, nil)
|
||
}
|
||
|
||
// 系统错误(常用)
|
||
func (b *ErrorBuilder) SystemError(code, message string, cause error) *AppError {
|
||
return b.buildError(LevelFatal, ErrorTypeSystem, code, message, cause, nil)
|
||
}
|
||
|
||
// 构建错误
|
||
func (b *ErrorBuilder) buildError(level ErrorLevel, errorType ErrorType, code, message string, cause error, details interface{}) *AppError {
|
||
appErr := &AppError{
|
||
Code: code,
|
||
Message: message,
|
||
Level: level,
|
||
Type: errorType,
|
||
Service: b.service,
|
||
Method: b.method,
|
||
Timestamp: time.Now(),
|
||
Details: details,
|
||
Cause: cause,
|
||
}
|
||
|
||
// 获取链路追踪信息
|
||
if traceId := trace.TraceIDFromContext(ctx); traceId != "" {
|
||
appErr.TraceId = traceId
|
||
}
|
||
if spanId := trace.SpanIDFromContext(ctx); spanId != "" {
|
||
appErr.SpanId = spanId
|
||
}
|
||
|
||
// 错误级别以上记录堆栈信息
|
||
if level >= LevelError {
|
||
appErr.Stack = getStackTrace()
|
||
}
|
||
|
||
return appErr
|
||
}
|
||
|
||
// getStackTrace returns the stack trace of the calling goroutine.
//
// runtime.Stack writes at most len(buf) bytes and silently truncates anything
// beyond that, so the buffer is doubled until the trace fits with room to
// spare instead of cutting deep stacks off at a fixed size.
func getStackTrace() string {
	buf := make([]byte, 4096)
	for {
		n := runtime.Stack(buf, false)
		if n < len(buf) {
			return string(buf[:n])
		}
		// A completely filled buffer means the trace may have been truncated.
		buf = make([]byte, 2*len(buf))
	}
}
|
||
```
|
||
|
||
## 2. 链路追踪集成
|
||
|
||
### 2.1 链路追踪配置
|
||
|
||
```yaml
|
||
# etc/client-api.yaml
|
||
Name: client-api
|
||
Host: 0.0.0.0
|
||
Port: 8080
|
||
|
||
# 链路追踪配置
|
||
Telemetry:
|
||
Name: client-api
|
||
Endpoint: http://jaeger:14268/api/traces
|
||
Sampler: 1.0
|
||
Batcher: jaeger
|
||
|
||
# 日志配置
|
||
Log:
|
||
ServiceName: client-api
|
||
Mode: file
|
||
Level: info
|
||
Path: logs
|
||
MaxSize: 100
|
||
MaxAge: 7
|
||
MaxBackups: 5
|
||
Compress: true
|
||
```
|
||
|
||
### 2.2 链路追踪中间件
|
||
|
||
```go
|
||
// shared/middleware/trace_middleware.go
|
||
package middleware
|
||
|
||
import (
|
||
"context"
|
||
"net/http"
|
||
"github.com/zeromicro/go-zero/core/trace"
|
||
"github.com/zeromicro/go-zero/rest/httpx"
|
||
"go.opentelemetry.io/otel"
|
||
"go.opentelemetry.io/otel/attribute"
|
||
"go.opentelemetry.io/otel/codes"
|
||
"tianyuan/shared/errcode"
|
||
)
|
||
|
||
// HTTP链路追踪中间件
|
||
func TraceMiddleware(serviceName string) func(http.Handler) http.Handler {
|
||
return func(next http.Handler) http.Handler {
|
||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||
tracer := otel.Tracer(serviceName)
|
||
|
||
// 开始span
|
||
ctx, span := tracer.Start(r.Context(), r.URL.Path)
|
||
defer span.End()
|
||
|
||
// 设置span属性
|
||
span.SetAttributes(
|
||
attribute.String("http.method", r.Method),
|
||
attribute.String("http.url", r.URL.String()),
|
||
attribute.String("http.user_agent", r.UserAgent()),
|
||
attribute.String("service.name", serviceName),
|
||
)
|
||
|
||
// 将链路信息注入上下文
|
||
r = r.WithContext(ctx)
|
||
|
||
// 创建响应包装器
|
||
wrapper := &responseWrapper{
|
||
ResponseWriter: w,
|
||
statusCode: http.StatusOK,
|
||
}
|
||
|
||
// 执行下一个处理器
|
||
next.ServeHTTP(wrapper, r)
|
||
|
||
// 设置响应属性
|
||
span.SetAttributes(
|
||
attribute.Int("http.status_code", wrapper.statusCode),
|
||
)
|
||
|
||
// 如果是错误状态码,设置span状态
|
||
if wrapper.statusCode >= 400 {
|
||
span.SetStatus(codes.Error, http.StatusText(wrapper.statusCode))
|
||
}
|
||
})
|
||
}
|
||
}
|
||
|
||
// responseWrapper decorates an http.ResponseWriter to remember the status
// code that is actually sent to the client.
type responseWrapper struct {
	http.ResponseWriter
	statusCode int
	// wroteHeader reports whether a final (non-informational) status has
	// already been recorded, either explicitly or implicitly by Write.
	wroteHeader bool
}

// WriteHeader records statusCode and forwards the call.
//
// net/http only honours the first final WriteHeader call, so later calls must
// not overwrite the recorded value. Informational 1xx codes (other than 101)
// may precede the final status and are therefore not recorded.
func (w *responseWrapper) WriteHeader(statusCode int) {
	informational := statusCode >= 100 && statusCode < 200 &&
		statusCode != http.StatusSwitchingProtocols
	if !w.wroteHeader && !informational {
		w.statusCode = statusCode
		w.wroteHeader = true
	}
	w.ResponseWriter.WriteHeader(statusCode)
}

// Write forwards p to the wrapped writer. A Write without a preceding
// WriteHeader implicitly sends 200, which is recorded so that a later
// WriteHeader call cannot change the captured status.
func (w *responseWrapper) Write(p []byte) (int, error) {
	if !w.wroteHeader {
		w.statusCode = http.StatusOK
		w.wroteHeader = true
	}
	return w.ResponseWriter.Write(p)
}
|
||
```
|
||
|
||
### 2.3 RPC 链路追踪拦截器
|
||
|
||
```go
|
||
// shared/interceptor/trace_interceptor.go
|
||
package interceptor
|
||
|
||
import (
|
||
"context"
|
||
"google.golang.org/grpc"
|
||
"google.golang.org/grpc/codes"
|
||
"google.golang.org/grpc/status"
|
||
"go.opentelemetry.io/otel"
|
||
"go.opentelemetry.io/otel/attribute"
|
||
"go.opentelemetry.io/otel/codes"
|
||
"tianyuan/shared/errcode"
|
||
)
|
||
|
||
// RPC客户端链路追踪拦截器
|
||
func TraceClientInterceptor(serviceName string) grpc.UnaryClientInterceptor {
|
||
return func(ctx context.Context, method string, req, reply interface{},
|
||
cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
|
||
|
||
tracer := otel.Tracer(serviceName)
|
||
ctx, span := tracer.Start(ctx, method)
|
||
defer span.End()
|
||
|
||
// 设置span属性
|
||
span.SetAttributes(
|
||
attribute.String("rpc.method", method),
|
||
attribute.String("rpc.service", serviceName),
|
||
attribute.String("rpc.system", "grpc"),
|
||
)
|
||
|
||
// 调用RPC
|
||
err := invoker(ctx, method, req, reply, cc, opts...)
|
||
|
||
// 处理错误
|
||
if err != nil {
|
||
span.SetStatus(codes.Error, err.Error())
|
||
span.SetAttributes(
|
||
attribute.String("rpc.grpc.status_code", status.Code(err).String()),
|
||
)
|
||
}
|
||
|
||
return err
|
||
}
|
||
}
|
||
|
||
// RPC服务端链路追踪拦截器
|
||
func TraceServerInterceptor(serviceName string) grpc.UnaryServerInterceptor {
|
||
return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo,
|
||
handler grpc.UnaryHandler) (interface{}, error) {
|
||
|
||
tracer := otel.Tracer(serviceName)
|
||
ctx, span := tracer.Start(ctx, info.FullMethod)
|
||
defer span.End()
|
||
|
||
// 设置span属性
|
||
span.SetAttributes(
|
||
attribute.String("rpc.method", info.FullMethod),
|
||
attribute.String("rpc.service", serviceName),
|
||
attribute.String("rpc.system", "grpc"),
|
||
)
|
||
|
||
// 调用处理器
|
||
resp, err := handler(ctx, req)
|
||
|
||
// 处理错误
|
||
if err != nil {
|
||
span.SetStatus(codes.Error, err.Error())
|
||
|
||
// 如果是自定义错误,记录更多信息
|
||
if appErr, ok := err.(*errcode.AppError); ok {
|
||
span.SetAttributes(
|
||
attribute.String("error.type", string(appErr.Type)),
|
||
attribute.String("error.code", appErr.Code),
|
||
attribute.String("error.level", appErr.Level.String()),
|
||
)
|
||
}
|
||
}
|
||
|
||
return resp, err
|
||
}
|
||
}
|
||
```
|
||
|
||
## 3. 日志集成
|
||
|
||
### 3.1 结构化日志
|
||
|
||
```go
|
||
// shared/logger/logger.go
|
||
package logger
|
||
|
||
import (
|
||
"context"
|
||
"github.com/zeromicro/go-zero/core/logx"
|
||
"github.com/zeromicro/go-zero/core/trace"
|
||
"tianyuan/shared/errcode"
|
||
)
|
||
|
||
// 日志字段
|
||
type LogFields map[string]interface{}
|
||
|
||
// 结构化日志器
|
||
type StructuredLogger struct {
|
||
service string
|
||
method string
|
||
}
|
||
|
||
func NewStructuredLogger(service, method string) *StructuredLogger {
|
||
return &StructuredLogger{
|
||
service: service,
|
||
method: method,
|
||
}
|
||
}
|
||
|
||
// 记录错误日志
|
||
func (l *StructuredLogger) LogError(ctx context.Context, err error, fields LogFields) {
|
||
logFields := l.buildBaseFields(ctx, fields)
|
||
|
||
if appErr, ok := err.(*errcode.AppError); ok {
|
||
// 自定义错误
|
||
logFields["error_code"] = appErr.Code
|
||
logFields["error_type"] = appErr.Type
|
||
logFields["error_level"] = appErr.Level.String()
|
||
logFields["error_details"] = appErr.Details
|
||
|
||
// 根据错误级别选择日志方法
|
||
switch appErr.Level {
|
||
case errcode.LevelDebug:
|
||
logx.WithContext(ctx).WithFields(logFields).Info(appErr.Message)
|
||
case errcode.LevelInfo:
|
||
logx.WithContext(ctx).WithFields(logFields).Info(appErr.Message)
|
||
case errcode.LevelWarn:
|
||
logx.WithContext(ctx).WithFields(logFields).Slow(appErr.Message)
|
||
case errcode.LevelError:
|
||
logx.WithContext(ctx).WithFields(logFields).Error(appErr.Message)
|
||
case errcode.LevelFatal, errcode.LevelPanic:
|
||
logx.WithContext(ctx).WithFields(logFields).Severe(appErr.Message)
|
||
}
|
||
} else {
|
||
// 普通错误
|
||
logFields["error"] = err.Error()
|
||
logx.WithContext(ctx).WithFields(logFields).Error(err.Error())
|
||
}
|
||
}
|
||
|
||
// 记录业务日志
|
||
func (l *StructuredLogger) LogInfo(ctx context.Context, message string, fields LogFields) {
|
||
logFields := l.buildBaseFields(ctx, fields)
|
||
logx.WithContext(ctx).WithFields(logFields).Info(message)
|
||
}
|
||
|
||
// 记录警告日志
|
||
func (l *StructuredLogger) LogWarn(ctx context.Context, message string, fields LogFields) {
|
||
logFields := l.buildBaseFields(ctx, fields)
|
||
logx.WithContext(ctx).WithFields(logFields).Slow(message)
|
||
}
|
||
|
||
// 构建基础日志字段
|
||
func (l *StructuredLogger) buildBaseFields(ctx context.Context, fields LogFields) logx.LogField {
|
||
baseFields := logx.LogField{
|
||
"service": l.service,
|
||
"method": l.method,
|
||
}
|
||
|
||
// 添加链路追踪信息
|
||
if traceId := trace.TraceIDFromContext(ctx); traceId != "" {
|
||
baseFields["trace_id"] = traceId
|
||
}
|
||
if spanId := trace.SpanIDFromContext(ctx); spanId != "" {
|
||
baseFields["span_id"] = spanId
|
||
}
|
||
|
||
// 合并自定义字段
|
||
for k, v := range fields {
|
||
baseFields[k] = v
|
||
}
|
||
|
||
return baseFields
|
||
}
|
||
```
|
||
|
||
### 3.2 日志中间件
|
||
|
||
```go
|
||
// shared/middleware/log_middleware.go
|
||
package middleware
|
||
|
||
import (
|
||
"context"
|
||
"net/http"
|
||
"time"
|
||
"github.com/zeromicro/go-zero/rest/httpx"
|
||
"tianyuan/shared/logger"
|
||
)
|
||
|
||
// HTTP日志中间件
|
||
func LogMiddleware(serviceName string) func(http.Handler) http.Handler {
|
||
return func(next http.Handler) http.Handler {
|
||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||
start := time.Now()
|
||
|
||
logger := logger.NewStructuredLogger(serviceName, r.URL.Path)
|
||
|
||
// 记录请求开始
|
||
logger.LogInfo(r.Context(), "request_start", logger.LogFields{
|
||
"method": r.Method,
|
||
"path": r.URL.Path,
|
||
"query": r.URL.RawQuery,
|
||
"user_agent": r.UserAgent(),
|
||
"remote_ip": httpx.GetRemoteAddr(r),
|
||
})
|
||
|
||
// 创建响应包装器
|
||
wrapper := &responseWrapper{
|
||
ResponseWriter: w,
|
||
statusCode: http.StatusOK,
|
||
}
|
||
|
||
// 执行请求
|
||
next.ServeHTTP(wrapper, r)
|
||
|
||
// 记录请求结束
|
||
duration := time.Since(start)
|
||
fields := logger.LogFields{
|
||
"status_code": wrapper.statusCode,
|
||
"duration_ms": duration.Milliseconds(),
|
||
}
|
||
|
||
if wrapper.statusCode >= 400 {
|
||
logger.LogWarn(r.Context(), "request_error", fields)
|
||
} else {
|
||
logger.LogInfo(r.Context(), "request_success", fields)
|
||
}
|
||
})
|
||
}
|
||
}
|
||
```
|
||
|
||
## 4. 使用示例
|
||
|
||
### 4.1 在 Handler 中使用
|
||
|
||
```go
|
||
// client/internal/handler/product/getproductlisthandler.go
|
||
func (h *GetProductListHandler) GetProductList(w http.ResponseWriter, r *http.Request) {
|
||
// 创建错误构造器
|
||
errBuilder := errcode.NewErrorBuilder("client-api", "GetProductList")
|
||
logger := logger.NewStructuredLogger("client-api", "GetProductList")
|
||
|
||
var req types.GetProductListReq
|
||
|
||
// 参数校验
|
||
if err := validator.ValidateAndParse(r, &req); err != nil {
|
||
appErr := errBuilder.ValidationError("PARAM_INVALID", "参数校验失败", err)
|
||
logger.LogError(r.Context(), appErr, logger.LogFields{
|
||
"request": req,
|
||
})
|
||
response.ErrorResponse(w, appErr)
|
||
return
|
||
}
|
||
|
||
// 调用Logic层
|
||
resp, err := h.logic.GetProductList(r.Context(), &req)
|
||
if err != nil {
|
||
logger.LogError(r.Context(), err, logger.LogFields{
|
||
"request": req,
|
||
})
|
||
response.ErrorResponse(w, err)
|
||
return
|
||
}
|
||
|
||
// 记录成功日志
|
||
logger.LogInfo(r.Context(), "get_product_list_success", logger.LogFields{
|
||
"request": req,
|
||
"result_count": len(resp.List),
|
||
})
|
||
|
||
response.SuccessResponse(w, resp)
|
||
}
|
||
```
|
||
|
||
### 4.2 在 RPC Logic 中使用
|
||
|
||
```go
|
||
// domains/product/rpc/internal/logic/getproductlistlogic.go
|
||
func (l *GetProductListLogic) GetProductList(ctx context.Context, req *product.GetProductListReq) (*product.GetProductListResp, error) {
|
||
errBuilder := errcode.NewErrorBuilder("product-rpc", "GetProductList")
|
||
logger := logger.NewStructuredLogger("product-rpc", "GetProductList")
|
||
|
||
// 业务校验
|
||
validator := validator.NewProductValidator(ctx, l.svcCtx)
|
||
if err := validator.ValidateGetProductListRequest(req); err != nil {
|
||
appErr := errBuilder.BusinessError("VALIDATION_FAILED", err.Error())
|
||
logger.LogError(ctx, appErr, logger.LogFields{
|
||
"request": req,
|
||
})
|
||
return nil, appErr
|
||
}
|
||
|
||
// 查询数据库
|
||
products, err := l.svcCtx.ProductModel.FindList(ctx, req)
|
||
if err != nil {
|
||
appErr := errBuilder.SystemError("DB_QUERY_FAILED", "查询产品列表失败", err)
|
||
logger.LogError(ctx, appErr, logger.LogFields{
|
||
"request": req,
|
||
"db_error": err.Error(),
|
||
})
|
||
return nil, appErr
|
||
}
|
||
|
||
logger.LogInfo(ctx, "get_product_list_success", logger.LogFields{
|
||
"request": req,
|
||
"result_count": len(products),
|
||
})
|
||
|
||
return &product.GetProductListResp{
|
||
List: products,
|
||
Total: int64(len(products)),
|
||
}, nil
|
||
}
|
||
```
|
||
|
||
## 5. 监控和告警
|
||
|
||
### 5.1 错误监控配置
|
||
|
||
```go
|
||
// shared/monitor/error_monitor.go
|
||
package monitor
|
||
|
||
import (
|
||
"context"
|
||
"github.com/zeromicro/go-zero/core/metric"
|
||
"tianyuan/shared/errcode"
|
||
)
|
||
|
||
var (
|
||
// 错误计数器
|
||
ErrorCounter = metric.NewCounterVec(&metric.CounterVecOpts{
|
||
Namespace: "tianyuan",
|
||
Subsystem: "error",
|
||
Name: "total",
|
||
Help: "Total number of errors",
|
||
Labels: []string{"service", "type", "level", "code"},
|
||
})
|
||
|
||
// 错误率直方图
|
||
ErrorRateHistogram = metric.NewHistogramVec(&metric.HistogramVecOpts{
|
||
Namespace: "tianyuan",
|
||
Subsystem: "error",
|
||
Name: "rate",
|
||
Help: "Error rate histogram",
|
||
Labels: []string{"service", "method"},
|
||
})
|
||
)
|
||
|
||
// 记录错误指标
|
||
func RecordError(appErr *errcode.AppError) {
|
||
ErrorCounter.Inc(
|
||
appErr.Service,
|
||
string(appErr.Type),
|
||
appErr.Level.String(),
|
||
appErr.Code,
|
||
)
|
||
}
|
||
```
|
||
|
||
### 5.2 告警规则
|
||
|
||
```yaml
|
||
# prometheus告警规则
|
||
groups:
|
||
- name: tianyuan-errors
|
||
rules:
|
||
# 错误率告警
|
||
- alert: HighErrorRate
|
||
expr: rate(tianyuan_error_total[5m]) > 0.1
|
||
for: 2m
|
||
labels:
|
||
severity: warning
|
||
annotations:
|
||
summary: "High error rate detected"
|
||
description: "Error rate is {{ $value }} for service {{ $labels.service }}"
|
||
|
||
# 致命错误告警
|
||
- alert: FatalError
|
||
expr: increase(tianyuan_error_total{level="FATAL"}[1m]) > 0
|
||
for: 0m
|
||
labels:
|
||
severity: critical
|
||
annotations:
|
||
summary: "Fatal error detected"
|
||
description: "Fatal error in service {{ $labels.service }}: {{ $labels.code }}"
|
||
```
|
||
|
||
## 6. 最佳实践总结
|
||
|
||
1. **错误分级原则**:

   - DEBUG/INFO:开发调试信息与一般业务信息
   - WARN:需要关注但不影响业务
   - ERROR:业务错误,需要处理
   - FATAL/PANIC:系统级错误,需要立即处理

2. **链路追踪要点**:

   - 每个请求都有唯一的 TraceId
   - 跨服务调用保持链路连续性
   - 关键操作添加自定义 Span

3. **日志记录规范**:

   - 结构化日志,便于查询分析
   - 包含链路追踪信息
   - 敏感信息脱敏处理

4. **监控告警策略**:

   - 错误率监控
   - 关键错误实时告警
   - 链路追踪性能监控
|