fix(OTEL): reduce high cardinality in traces and metrics (#9286)

# Which Problems Are Solved

There were multiple issues in the OpenTelemetry (OTEL) implementation
and usage for tracing and metrics, which led to high cardinality and
potential memory leaks:
- wrongly initiated tracing interceptors
- high cardinality in traces:
  - HTTP/1.1 endpoints containing host names
  - HTTP/1.1 endpoints containing object IDs like userID (e.g.
    `/management/v1/users/2352839823/`)
  - high number of traces from internal processes (spooler)
- high cardinality in metrics endpoint:
  - GRPC entries containing host names
  - notification metrics containing instanceIDs and error messages

# How the Problems Are Solved

- Properly initialize the interceptors once and update them to use the
grpc stats handler (unary interceptors were deprecated).
- Remove host names from HTTP/1.1 span names and use path as default.
- Set / overwrite the uri for spans on the grpc-gateway with the uri
pattern (`/management/v1/users/{user_id}`). This is used for spans in
traces and metric entries.
- Created a new sampler which will only sample spans in the following
cases:
  - remote was already sampled
  - remote was not sampled, the root span is of kind `Server`, and the
    decision is based on the fraction set in the runtime configuration
- This will prevent having a lot of spans from the spooler background
jobs if they were not started by a client call querying an object (e.g.
UserByID).
- Filter out host names and the like from OTEL generated metrics (using a
`view`).
- Removed instance and error messages from notification metrics.

# Additional Changes

Fixed the middleware handling for serving Console. Telemetry and
instance selection are only used for the environment.json, but not on
statically served files.

# Additional Context

- closes #8096 
- relates to #9074
- backports to at least 2.66.x, 2.67.x and 2.68.x
This commit is contained in:
Livio Spring
2025-02-04 09:55:26 +01:00
committed by GitHub
parent 04b9e9b144
commit 990e1982c7
14 changed files with 237 additions and 60 deletions

View File

@@ -1,36 +1,29 @@
package middleware
import (
"context"
"strings"
grpc_trace "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
"google.golang.org/grpc"
"google.golang.org/grpc/stats"
grpc_utils "github.com/zitadel/zitadel/internal/api/grpc"
)
type GRPCMethod string
func DefaultTracingClient() grpc.UnaryClientInterceptor {
return TracingServer(grpc_utils.Healthz, grpc_utils.Readiness, grpc_utils.Validation)
// DefaultTracingClient returns the client-side stats.Handler produced by
// TracingClient, passing grpc_utils.Healthz, grpc_utils.Readiness and
// grpc_utils.Validation as the methods to ignore.
func DefaultTracingClient() stats.Handler {
return TracingClient(grpc_utils.Healthz, grpc_utils.Readiness, grpc_utils.Validation)
}
func TracingServer(ignoredMethods ...GRPCMethod) grpc.UnaryClientInterceptor {
return func(
ctx context.Context,
method string,
req, reply interface{},
cc *grpc.ClientConn,
invoker grpc.UnaryInvoker,
opts ...grpc.CallOption,
) error {
for _, ignoredMethod := range ignoredMethods {
if strings.HasSuffix(method, string(ignoredMethod)) {
return invoker(ctx, method, req, reply, cc, opts...)
// TracingClient builds a gRPC client stats.Handler via
// grpc_trace.NewClientHandler. The handler is configured with a filter that
// returns false for every RPC whose full method name ends with one of
// ignoredMethods and true for all other RPCs.
func TracingClient(ignoredMethods ...GRPCMethod) stats.Handler {
return grpc_trace.NewClientHandler(grpc_trace.WithFilter(
func(info *stats.RPCTagInfo) bool {
for _, ignoredMethod := range ignoredMethods {
// Suffix comparison against the RPC's full method name.
if strings.HasSuffix(info.FullMethodName, string(ignoredMethod)) {
// NOTE(review): per otelgrpc's filter contract a false result should
// exclude the RPC from instrumentation — confirm against the library docs.
return false
}
}
}
return grpc_trace.UnaryClientInterceptor()(ctx, method, req, reply, cc, invoker, opts...)
}
return true
},
))
}