fix(OTEL): reduce high cardinality in traces and metrics (#9286)

# Which Problems Are Solved

There were multiple issues in the OpenTelemetry (OTEL) implementation
and usage for tracing and metrics, which led to high cardinality and
potential memory leaks:
- incorrectly initialized tracing interceptors
- high cardinality in traces:
  - HTTP/1.1 endpoints containing host names
- HTTP/1.1 endpoints containing object IDs like userID (e.g.
`/management/v1/users/2352839823/`)
- high number of traces from internal processes (spooler)
- high cardinality in metrics endpoint:
  - GRPC entries containing host names
  - notification metrics containing instanceIDs and error messages

# How the Problems Are Solved

- Properly initialize the interceptors once and update them to use the
grpc stats handler (unary interceptors were deprecated).
- Remove host names from HTTP/1.1 span names and use path as default.
- Set / overwrite the uri for spans on the grpc-gateway with the uri
pattern (`/management/v1/users/{user_id}`). This is used for spans in
traces and metric entries.
- Created a new sampler which will only sample spans in the following
cases:
  - remote was already sampled
- remote was not sampled and the root span is of kind `Server`; sampling
is then based on the fraction set in the runtime configuration
- This prevents a large number of spans from the spooler background
jobs if they were not started by a client call querying an object (e.g.
UserByID).
- Filter out host names and the like from OTEL generated metrics (using a
`view`).
- Removed instance and error messages from notification metrics.

# Additional Changes

Fixed the middleware handling for serving Console. Telemetry and
instance selection are now only applied to the environment.json, not to
statically served files.

# Additional Context

- closes #8096 
- relates to #9074
- backports to at least 2.66.x, 2.67.x and 2.68.x
This commit is contained in:
Livio Spring
2025-02-04 09:55:26 +01:00
committed by GitHub
parent 04b9e9b144
commit 990e1982c7
14 changed files with 237 additions and 60 deletions

View File

@@ -10,6 +10,7 @@ import (
"github.com/grpc-ecosystem/grpc-gateway/v2/runtime"
"github.com/zitadel/logging"
"go.opentelemetry.io/otel/trace"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/credentials"
@@ -56,6 +57,13 @@ var (
},
)
// errorHandler makes sure the request URI pattern is also set when a call
// fails: it calls setRequestURIPattern first and then hands the error over to
// the grpc-gateway default HTTP error handler, which writes the response.
errorHandler = runtime.ErrorHandlerFunc(
	func(ctx context.Context, serveMux *runtime.ServeMux, m runtime.Marshaler, rw http.ResponseWriter, req *http.Request, handlerErr error) {
		// The pattern must be set before the default handler runs.
		setRequestURIPattern(ctx)
		runtime.DefaultHTTPErrorHandler(ctx, serveMux, m, rw, req, handlerErr)
	})
serveMuxOptions = func(hostHeaders []string) []runtime.ServeMuxOption {
return []runtime.ServeMuxOption{
runtime.WithMarshalerOption(jsonMarshaler.ContentType(nil), jsonMarshaler),
@@ -65,6 +73,7 @@ var (
runtime.WithOutgoingHeaderMatcher(runtime.DefaultHeaderMatcher),
runtime.WithForwardResponseOption(responseForwarder),
runtime.WithRoutingErrorHandler(httpErrorHandler),
runtime.WithErrorHandler(errorHandler),
}
}
@@ -81,6 +90,7 @@ var (
}
responseForwarder = func(ctx context.Context, w http.ResponseWriter, resp proto.Message) error {
setRequestURIPattern(ctx)
t, ok := resp.(CustomHTTPResponse)
if ok {
// TODO: find a way to return a location header if needed w.Header().Set("location", t.Location())
@@ -118,9 +128,9 @@ func CreateGatewayWithPrefix(
opts := []grpc.DialOption{
grpc.WithTransportCredentials(grpcCredentials(tlsConfig)),
grpc.WithChainUnaryInterceptor(
client_middleware.DefaultTracingClient(),
client_middleware.UnaryActivityClientInterceptor(),
),
grpc.WithStatsHandler(client_middleware.DefaultTracingClient()),
}
connection, err := dial(ctx, port, opts)
if err != nil {
@@ -145,9 +155,9 @@ func CreateGateway(
[]grpc.DialOption{
grpc.WithTransportCredentials(grpcCredentials(tlsConfig)),
grpc.WithChainUnaryInterceptor(
client_middleware.DefaultTracingClient(),
client_middleware.UnaryActivityClientInterceptor(),
),
grpc.WithStatsHandler(client_middleware.DefaultTracingClient()),
})
if err != nil {
return nil, err
@@ -260,3 +270,13 @@ func grpcCredentials(tlsConfig *tls.Config) credentials.TransportCredentials {
}
return creds
}
// setRequestURIPattern looks up the grpc-gateway HTTP path pattern stored in
// ctx and, if one is present, uses it as the name of the span carried by ctx
// and passes it to metrics.SetRequestURIPattern. Without a pattern in ctx the
// function does nothing.
func setRequestURIPattern(ctx context.Context) {
	if uriPattern, found := runtime.HTTPPathPattern(ctx); found {
		// Use the route template instead of the concrete request path.
		trace.SpanFromContext(ctx).SetName(uriPattern)
		metrics.SetRequestURIPattern(ctx, uriPattern)
	}
}