mirror of
https://github.com/zitadel/zitadel.git
synced 2025-08-13 18:59:06 +00:00
fix(OTEL): reduce high cardinality in traces and metrics (#9286)
# Which Problems Are Solved
There were multiple issues in the OpenTelemetry (OTEL) implementation
and usage for tracing and metrics, which led to high cardinality and
potential memory leaks:
- incorrectly initialized tracing interceptors
- high cardinality in traces:
- HTTP/1.1 endpoints containing host names
- HTTP/1.1 endpoints containing object IDs like userID (e.g.
`/management/v1/users/2352839823/`)
- high number of traces from internal processes (spooler)
- high cardinality in metrics endpoint:
- GRPC entries containing host names
- notification metrics containing instanceIDs and error messages
# How the Problems Are Solved
- Properly initialize the interceptors once and update them to use the
grpc stats handler (unary interceptors were deprecated).
- Remove host names from HTTP/1.1 span names and use path as default.
- Set / overwrite the uri for spans on the grpc-gateway with the uri
pattern (`/management/v1/users/{user_id}`). This is used for spans in
traces and metric entries.
- Created a new sampler which will only sample spans in the following
cases:
- remote was already sampled
- remote was not sampled, root span is of kind `Server` and based on
fraction set in the runtime configuration
- This will prevent having a lot of spans from the spooler background
jobs if they were not started by a client call querying an object (e.g.
UserByID).
- Filter out host names and the like from OTEL-generated metrics (using a
`view`).
- Removed instance and error messages from notification metrics.
# Additional Changes
Fixed the middleware handling for serving Console. Telemetry and
instance selection are only used for the environment.json, but not on
statically served files.
# Additional Context
- closes #8096
- relates to #9074
- backports to at least 2.66.x, 2.67.x and 2.68.x
(cherry picked from commit 990e1982c7)
This commit is contained in:
@@ -6,7 +6,6 @@ import (
|
||||
"github.com/zitadel/logging"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
|
||||
"github.com/zitadel/zitadel/internal/api/authz"
|
||||
"github.com/zitadel/zitadel/internal/notification/channels"
|
||||
"github.com/zitadel/zitadel/internal/telemetry/metrics"
|
||||
)
|
||||
@@ -18,18 +17,14 @@ func countMessages(ctx context.Context, channel channels.NotificationChannel, su
|
||||
if err != nil {
|
||||
metricName = errorMetricName
|
||||
}
|
||||
addCount(ctx, metricName, message, err)
|
||||
addCount(ctx, metricName, message)
|
||||
return err
|
||||
})
|
||||
}
|
||||
|
||||
func addCount(ctx context.Context, metricName string, message channels.Message, err error) {
|
||||
func addCount(ctx context.Context, metricName string, message channels.Message) {
|
||||
labels := map[string]attribute.Value{
|
||||
"triggering_event_typey": attribute.StringValue(string(message.GetTriggeringEvent().Type())),
|
||||
"instance": attribute.StringValue(authz.GetInstance(ctx).InstanceID()),
|
||||
}
|
||||
if err != nil {
|
||||
labels["error"] = attribute.StringValue(err.Error())
|
||||
"triggering_event_type": attribute.StringValue(string(message.GetTriggeringEvent().Type())),
|
||||
}
|
||||
addCountErr := metrics.AddCount(ctx, metricName, 1, labels)
|
||||
logging.WithFields("name", metricName, "labels", labels).OnError(addCountErr).Error("incrementing counter metric failed")
|
||||
|
Reference in New Issue
Block a user