mirror of
https://github.com/zitadel/zitadel.git
synced 2025-08-11 21:17:32 +00:00
fix(OTEL): reduce high cardinality in traces and metrics (#9286)
# Which Problems Are Solved There were multiple issues in the OpenTelemetry (OTEL) implementation and usage for tracing and metrics, which led to high cardinality and potential memory leaks: - incorrectly initialized tracing interceptors - high cardinality in traces: - HTTP/1.1 endpoints containing host names - HTTP/1.1 endpoints containing object IDs like userID (e.g. `/management/v1/users/2352839823/`) - a large number of traces from internal processes (spooler) - high cardinality in the metrics endpoint: - gRPC entries containing host names - notification metrics containing instanceIDs and error messages # How the Problems Are Solved - Properly initialize the interceptors once and update them to use the gRPC stats handler (unary interceptors were deprecated). - Remove host names from HTTP/1.1 span names and use the path as default. - Set / overwrite the URI for spans on the grpc-gateway with the URI pattern (`/management/v1/users/{user_id}`). This is used for spans in traces and metric entries. - Create a new sampler which will only sample spans in the following cases: - remote was already sampled - remote was not sampled, the root span is of kind `Server`, and sampling is based on the fraction set in the runtime configuration - This will prevent having a lot of spans from the spooler background jobs if they were not started by a client call querying an object (e.g. UserByID). - Filter out host names and the like from OTEL-generated metrics (using a `view`). - Remove instance and error messages from notification metrics. # Additional Changes Fixed the middleware handling for serving Console. Telemetry and instance selection are only used for the environment.json, but not on statically served files. # Additional Context - closes #8096 - relates to #9074 - backports to at least 2.66.x, 2.67.x and 2.68.x
This commit is contained in:
@@ -10,6 +10,7 @@ import (
|
||||
|
||||
"github.com/grpc-ecosystem/grpc-gateway/v2/runtime"
|
||||
"github.com/zitadel/logging"
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/credentials"
|
||||
@@ -56,6 +57,13 @@ var (
|
||||
},
|
||||
)
|
||||
|
||||
// we need the errorHandler to set the request URI pattern in case of an error
|
||||
errorHandler = runtime.ErrorHandlerFunc(
|
||||
func(ctx context.Context, mux *runtime.ServeMux, marshaler runtime.Marshaler, w http.ResponseWriter, r *http.Request, err error) {
|
||||
setRequestURIPattern(ctx)
|
||||
runtime.DefaultHTTPErrorHandler(ctx, mux, marshaler, w, r, err)
|
||||
})
|
||||
|
||||
serveMuxOptions = func(hostHeaders []string) []runtime.ServeMuxOption {
|
||||
return []runtime.ServeMuxOption{
|
||||
runtime.WithMarshalerOption(jsonMarshaler.ContentType(nil), jsonMarshaler),
|
||||
@@ -65,6 +73,7 @@ var (
|
||||
runtime.WithOutgoingHeaderMatcher(runtime.DefaultHeaderMatcher),
|
||||
runtime.WithForwardResponseOption(responseForwarder),
|
||||
runtime.WithRoutingErrorHandler(httpErrorHandler),
|
||||
runtime.WithErrorHandler(errorHandler),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,6 +90,7 @@ var (
|
||||
}
|
||||
|
||||
responseForwarder = func(ctx context.Context, w http.ResponseWriter, resp proto.Message) error {
|
||||
setRequestURIPattern(ctx)
|
||||
t, ok := resp.(CustomHTTPResponse)
|
||||
if ok {
|
||||
// TODO: find a way to return a location header if needed w.Header().Set("location", t.Location())
|
||||
@@ -118,9 +128,9 @@ func CreateGatewayWithPrefix(
|
||||
opts := []grpc.DialOption{
|
||||
grpc.WithTransportCredentials(grpcCredentials(tlsConfig)),
|
||||
grpc.WithChainUnaryInterceptor(
|
||||
client_middleware.DefaultTracingClient(),
|
||||
client_middleware.UnaryActivityClientInterceptor(),
|
||||
),
|
||||
grpc.WithStatsHandler(client_middleware.DefaultTracingClient()),
|
||||
}
|
||||
connection, err := dial(ctx, port, opts)
|
||||
if err != nil {
|
||||
@@ -145,9 +155,9 @@ func CreateGateway(
|
||||
[]grpc.DialOption{
|
||||
grpc.WithTransportCredentials(grpcCredentials(tlsConfig)),
|
||||
grpc.WithChainUnaryInterceptor(
|
||||
client_middleware.DefaultTracingClient(),
|
||||
client_middleware.UnaryActivityClientInterceptor(),
|
||||
),
|
||||
grpc.WithStatsHandler(client_middleware.DefaultTracingClient()),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -260,3 +270,13 @@ func grpcCredentials(tlsConfig *tls.Config) credentials.TransportCredentials {
|
||||
}
|
||||
return creds
|
||||
}
|
||||
|
||||
// setRequestURIPattern looks up the grpc-gateway HTTP path pattern stored in
// ctx (via runtime.HTTPPathPattern). When a pattern is present, it renames the
// span obtained from ctx to that pattern and passes the same pattern to
// metrics.SetRequestURIPattern. When ctx carries no pattern, it does nothing.
func setRequestURIPattern(ctx context.Context) {
|
||||
pattern, ok := runtime.HTTPPathPattern(ctx)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
span := trace.SpanFromContext(ctx)
|
||||
span.SetName(pattern)
|
||||
metrics.SetRequestURIPattern(ctx, pattern)
|
||||
}
|
||||
|
@@ -1,34 +1,29 @@
|
||||
package middleware
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
|
||||
grpc_trace "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/stats"
|
||||
|
||||
grpc_utils "github.com/zitadel/zitadel/internal/api/grpc"
|
||||
)
|
||||
|
||||
type GRPCMethod string
|
||||
|
||||
func DefaultTracingServer() grpc.UnaryServerInterceptor {
|
||||
func DefaultTracingServer() stats.Handler {
|
||||
return TracingServer(grpc_utils.Healthz, grpc_utils.Readiness, grpc_utils.Validation)
|
||||
}
|
||||
|
||||
func TracingServer(ignoredMethods ...GRPCMethod) grpc.UnaryServerInterceptor {
|
||||
return func(
|
||||
ctx context.Context,
|
||||
req interface{},
|
||||
info *grpc.UnaryServerInfo,
|
||||
handler grpc.UnaryHandler,
|
||||
) (interface{}, error) {
|
||||
|
||||
for _, ignoredMethod := range ignoredMethods {
|
||||
if strings.HasSuffix(info.FullMethod, string(ignoredMethod)) {
|
||||
return handler(ctx, req)
|
||||
func TracingServer(ignoredMethods ...GRPCMethod) stats.Handler {
|
||||
return grpc_trace.NewServerHandler(grpc_trace.WithFilter(
|
||||
func(info *stats.RPCTagInfo) bool {
|
||||
for _, ignoredMethod := range ignoredMethods {
|
||||
if strings.HasSuffix(info.FullMethodName, string(ignoredMethod)) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
return grpc_trace.UnaryServerInterceptor()(ctx, req, info, handler)
|
||||
}
|
||||
return true
|
||||
},
|
||||
))
|
||||
}
|
||||
|
@@ -47,7 +47,6 @@ func CreateServer(
|
||||
grpc.UnaryInterceptor(
|
||||
grpc_middleware.ChainUnaryServer(
|
||||
middleware.CallDurationHandler(),
|
||||
middleware.DefaultTracingServer(),
|
||||
middleware.MetricsHandler(metricTypes, grpc_api.Probes...),
|
||||
middleware.NoCacheInterceptor(),
|
||||
middleware.InstanceInterceptor(queries, externalDomain, system_pb.SystemService_ServiceDesc.ServiceName, healthpb.Health_ServiceDesc.ServiceName),
|
||||
@@ -63,6 +62,7 @@ func CreateServer(
|
||||
middleware.ActivityInterceptor(),
|
||||
),
|
||||
),
|
||||
grpc.StatsHandler(middleware.DefaultTracingServer()),
|
||||
}
|
||||
if tlsConfig != nil {
|
||||
serverOptions = append(serverOptions, grpc.Creds(credentials.NewTLS(tlsConfig)))
|
||||
|
Reference in New Issue
Block a user