mirror of
https://github.com/zitadel/zitadel.git
synced 2025-08-11 21:37:32 +00:00
fix(OTEL): reduce high cardinality in traces and metrics (#9286)
# Which Problems Are Solved

There were multiple issues in the OpenTelemetry (OTEL) implementation and usage for tracing and metrics, which led to high cardinality and potential memory leaks:

- incorrectly initialized tracing interceptors
- high cardinality in traces:
  - HTTP/1.1 endpoints containing host names
  - HTTP/1.1 endpoints containing object IDs like userID (e.g. `/management/v1/users/2352839823/`)
  - a high number of traces from internal processes (spooler)
- high cardinality in the metrics endpoint:
  - gRPC entries containing host names
  - notification metrics containing instanceIDs and error messages

# How the Problems Are Solved

- Properly initialize the interceptors once and update them to use the gRPC stats handler (unary interceptors were deprecated).
- Remove host names from HTTP/1.1 span names and use the path as default.
- Set / overwrite the URI for spans on the grpc-gateway with the URI pattern (`/management/v1/users/{user_id}`). This is used for spans in traces and metric entries.
- Created a new sampler which will only sample spans in the following cases:
  - remote was already sampled
  - remote was not sampled, root span is of kind `Server` and based on the fraction set in the runtime configuration
  - This will prevent having a lot of spans from the spooler background jobs if they were not started by a client call querying an object (e.g. UserByID).
- Filter out host names and the like from OTEL-generated metrics (using a `view`).
- Removed instance and error messages from notification metrics.

# Additional Changes

Fixed the middleware handling for serving Console. Telemetry and instance selection are only used for the environment.json, but not for statically served files.

# Additional Context

- closes #8096
- relates to #9074
- backports to at least 2.66.x, 2.67.x and 2.68.x
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
@@ -35,7 +36,8 @@ const (
|
||||
|
||||
// StatusRecorder wraps an http.ResponseWriter and carries the per-request
// values that the metric counters in this package use as labels.
type StatusRecorder struct {
	http.ResponseWriter
	// RequestURI points to the URI reported as the metric label. ServeHTTP
	// shares the same pointer through the request context, so the value can be
	// replaced while the wrapped handler runs.
	RequestURI *string
	// Status is the HTTP status code reported as the return-code label.
	// ServeHTTP initialises it to 200.
	Status int
}
|
||||
|
||||
func (r *StatusRecorder) WriteHeader(status int) {
|
||||
@@ -56,6 +58,18 @@ func NewMetricsHandler(handler http.Handler, metricMethods []MetricType, ignored
|
||||
return &h
|
||||
}
|
||||
|
||||
// key is the unexported type of the context keys owned by this package, so
// they cannot collide with keys defined by other packages.
type key int

// requestURI is the context key under which ServeHTTP stores a pointer to the
// URI string used as the metric label of the current request.
const requestURI key = iota

// SetRequestURIPattern replaces the URI recorded for the request carried by
// ctx with pattern (for example a route template instead of a concrete path).
//
// It writes through the *string stored under the requestURI key. If ctx holds
// no such value, holds a value of another type, or holds a nil pointer, the
// call is a no-op.
func SetRequestURIPattern(ctx context.Context, pattern string) {
	uri, ok := ctx.Value(requestURI).(*string)
	// A typed-nil *string passes the type assertion, so guard against it
	// explicitly instead of panicking on the dereference below.
	if !ok || uri == nil {
		return
	}
	*uri = pattern
}
|
||||
|
||||
// ServeHTTP serves HTTP requests (http.Handler)
|
||||
func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
if len(h.methods) == 0 {
|
||||
@@ -69,13 +83,16 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
}
|
||||
uri := strings.Split(r.RequestURI, "?")[0]
|
||||
recorder := &StatusRecorder{
|
||||
ResponseWriter: w,
|
||||
RequestURI: &uri,
|
||||
Status: 200,
|
||||
}
|
||||
r = r.WithContext(context.WithValue(r.Context(), requestURI, &uri))
|
||||
h.handler.ServeHTTP(recorder, r)
|
||||
if h.containsMetricsMethod(MetricTypeRequestCount) {
|
||||
RegisterRequestCounter(r)
|
||||
RegisterRequestCounter(recorder, r)
|
||||
}
|
||||
if h.containsMetricsMethod(MetricTypeTotalCount) {
|
||||
RegisterTotalRequestCounter(r)
|
||||
@@ -94,9 +111,9 @@ func (h *Handler) containsMetricsMethod(method MetricType) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func RegisterRequestCounter(r *http.Request) {
|
||||
func RegisterRequestCounter(recorder *StatusRecorder, r *http.Request) {
|
||||
var labels = map[string]attribute.Value{
|
||||
URI: attribute.StringValue(strings.Split(r.RequestURI, "?")[0]),
|
||||
URI: attribute.StringValue(*recorder.RequestURI),
|
||||
Method: attribute.StringValue(r.Method),
|
||||
}
|
||||
RegisterCounter(RequestCounter, RequestCountDescription)
|
||||
@@ -110,7 +127,7 @@ func RegisterTotalRequestCounter(r *http.Request) {
|
||||
|
||||
func RegisterRequestCodeCounter(recorder *StatusRecorder, r *http.Request) {
|
||||
var labels = map[string]attribute.Value{
|
||||
URI: attribute.StringValue(strings.Split(r.RequestURI, "?")[0]),
|
||||
URI: attribute.StringValue(*recorder.RequestURI),
|
||||
Method: attribute.StringValue(r.Method),
|
||||
ReturnCode: attribute.IntValue(recorder.Status),
|
||||
}
|
||||
|
@@ -6,9 +6,11 @@ import (
|
||||
"sync"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/exporters/prometheus"
|
||||
"go.opentelemetry.io/otel/metric"
|
||||
"go.opentelemetry.io/otel/sdk/instrumentation"
|
||||
sdk_metric "go.opentelemetry.io/otel/sdk/metric"
|
||||
|
||||
"github.com/zitadel/zitadel/internal/telemetry/metrics"
|
||||
@@ -33,9 +35,19 @@ func NewMetrics(meterName string) (metrics.Metrics, error) {
|
||||
if err != nil {
|
||||
return &Metrics{}, err
|
||||
}
|
||||
// create a view to filter out unwanted attributes
|
||||
view := sdk_metric.NewView(
|
||||
sdk_metric.Instrument{
|
||||
Scope: instrumentation.Scope{Name: otelhttp.ScopeName},
|
||||
},
|
||||
sdk_metric.Stream{
|
||||
AttributeFilter: attribute.NewAllowKeysFilter("http.method", "http.status_code", "http.target"),
|
||||
},
|
||||
)
|
||||
meterProvider := sdk_metric.NewMeterProvider(
|
||||
sdk_metric.WithReader(exporter),
|
||||
sdk_metric.WithResource(resource),
|
||||
sdk_metric.WithView(view),
|
||||
)
|
||||
return &Metrics{
|
||||
Provider: meterProvider,
|
||||
|
Reference in New Issue
Block a user