zitadel/internal/telemetry/metrics/http_handler.go

package metrics

import (
"context"
"net/http"
"strings"
"go.opentelemetry.io/otel/attribute"
)
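
// Metric names, descriptions and label keys used by the HTTP metrics handler.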
const (
	RequestCounter               = "http.server.request_count"
	RequestCountDescription      = "Request counter"
	TotalRequestCounter          = "http.server.total_request_count"
	TotalRequestDescription      = "Total return code counter"
	ReturnCodeCounter            = "http.server.return_code_counter"
	ReturnCodeCounterDescription = "Return code counter"
	Method                       = "method"
	URI                          = "uri"
	ReturnCode                   = "return_code"
)
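
// Handler instruments the wrapped http.Handler and records the configured
// request metrics for every request that passes all filters.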
type Handler struct {
	handler http.Handler
	methods []MetricType
	filters []Filter
}
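
// MetricType selects which counters a Handler records.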
type MetricType int32

const (
	MetricTypeTotalCount MetricType = iota
	MetricTypeStatusCode
	MetricTypeRequestCount
)
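
// StatusRecorder wraps an http.ResponseWriter to capture the response status
// code and carries a pointer to the URI used as metric label, which can be
// rewritten via SetRequestURIPattern.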
type StatusRecorder struct {
	http.ResponseWriter
	RequestURI *string
	Status     int
}
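
// WriteHeader captures the status code before delegating to the wrapped ResponseWriter.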
func (r *StatusRecorder) WriteHeader(status int) {
	r.Status = status
	r.ResponseWriter.WriteHeader(status)
}
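
// Filter reports whether a request should be measured; a false result
// bypasses metrics collection for that request.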
type Filter func(*http.Request) bool
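
// NewMetricsHandler wraps handler so that the given metric types are recorded
// for each request. Requests whose URI starts with one of ignoredEndpoints are
// served without being measured.
//
// A minimal usage sketch (mux stands in for your own handler):
//
//	mux := http.NewServeMux()
//	instrumented := NewMetricsHandler(mux, []MetricType{MetricTypeRequestCount, MetricTypeStatusCode}, "/debug")
//	_ = http.ListenAndServe(":8080", instrumented)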
func NewMetricsHandler(handler http.Handler, metricMethods []MetricType, ignoredEndpoints ...string) http.Handler {
	h := Handler{
		handler: handler,
		methods: metricMethods,
	}
	if len(ignoredEndpoints) > 0 {
		h.filters = append(h.filters, shouldNotIgnore(ignoredEndpoints...))
	}
	return &h
}
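
// key is an unexported context key type, preventing collisions with context
// values set by other packages.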
type key int

const requestURI key = iota
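
// SetRequestURIPattern overwrites the URI recorded for the request in ctx with
// a low-cardinality pattern, e.g. a grpc-gateway route template such as
// /management/v1/users/{user_id}. It is a no-op if the context does not carry
// a URI set by Handler.ServeHTTP.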
func SetRequestURIPattern(ctx context.Context, pattern string) {
	uri, ok := ctx.Value(requestURI).(*string)
	if !ok {
		return
	}
	*uri = pattern
}

// ServeHTTP implements http.Handler: it serves the request and records the
// configured metrics for it.
func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	if len(h.methods) == 0 {
		h.handler.ServeHTTP(w, r)
		return
	}
	for _, f := range h.filters {
		if !f(r) {
			// Simply pass through to the handler if a filter rejects the request.
			h.handler.ServeHTTP(w, r)
			return
		}
	}
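	// Strip the query string so only the path is used as the uri label.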
	uri := strings.Split(r.RequestURI, "?")[0]
	recorder := &StatusRecorder{
		ResponseWriter: w,
		RequestURI:     &uri,
		Status:         200,
	}
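	// Store a pointer to the URI in the context so SetRequestURIPattern can
	// overwrite it with a route pattern before the metrics are recorded.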
	r = r.WithContext(context.WithValue(r.Context(), requestURI, &uri))
	h.handler.ServeHTTP(recorder, r)
	if h.containsMetricsMethod(MetricTypeRequestCount) {
		RegisterRequestCounter(recorder, r)
	}
	if h.containsMetricsMethod(MetricTypeTotalCount) {
		RegisterTotalRequestCounter(r)
	}
	if h.containsMetricsMethod(MetricTypeStatusCode) {
		RegisterRequestCodeCounter(recorder, r)
	}
}

func (h *Handler) containsMetricsMethod(method MetricType) bool {
	for _, m := range h.methods {
		if m == method {
			return true
		}
	}
	return false
}
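
// RegisterRequestCounter counts the request, labeled by URI and method.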
func RegisterRequestCounter(recorder *StatusRecorder, r *http.Request) {
	var labels = map[string]attribute.Value{
		URI:    attribute.StringValue(*recorder.RequestURI),
		Method: attribute.StringValue(r.Method),
	}
	RegisterCounter(RequestCounter, RequestCountDescription)
	AddCount(r.Context(), RequestCounter, 1, labels)
}
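
// RegisterTotalRequestCounter counts every request without any labels.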
func RegisterTotalRequestCounter(r *http.Request) {
	RegisterCounter(TotalRequestCounter, TotalRequestDescription)
	AddCount(r.Context(), TotalRequestCounter, 1, nil)
}
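
// RegisterRequestCodeCounter counts the request, labeled by URI, method and
// response status code.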
func RegisterRequestCodeCounter(recorder *StatusRecorder, r *http.Request) {
	var labels = map[string]attribute.Value{
		URI:        attribute.StringValue(*recorder.RequestURI),
		Method:     attribute.StringValue(r.Method),
		ReturnCode: attribute.IntValue(recorder.Status),
	}
	RegisterCounter(ReturnCodeCounter, ReturnCodeCounterDescription)
	AddCount(r.Context(), ReturnCodeCounter, 1, labels)
}
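
// shouldNotIgnore returns a Filter that rejects requests whose URI starts with
// one of the given endpoint prefixes.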
func shouldNotIgnore(endpoints ...string) func(r *http.Request) bool {
	return func(r *http.Request) bool {
		for _, endpoint := range endpoints {
			if strings.HasPrefix(r.URL.RequestURI(), endpoint) {
				return false
			}
		}
		return true
	}
}