zitadel/cmd/setup/config.go
Zach Hirschtritt 61ddecee31
fix: add prometheus metrics on projection handlers (#9561)
# Which Problems Are Solved

With current provided telemetry it's difficult to predict when a
projection handler is under increased load until it's too late and
causes downstream issues. Importantly, projection updating is in the
critical path for many login flows and increased latency there can
result in system downtime for users.

# How the Problems Are Solved

This PR adds three new prometheus-style metrics:
1. **projection_events_processed** (_labels: projection, success_) -
This metric gives us a counter of the number of events processed per
projection update run and whether they we're processed without error. A
high number of events being processed can let us know how busy a
particular projection handler is.

2. **projection_handle_timer** _(labels: projection)_ - This is the time
it takes to process a projection update given a batch of events - time
to take the current_states lock, query for new events, reduce,
update_the projection, and update current_states.

3. **projection_state_latency** _(labels: projection)_ - This is the
time from the last event processed in the current_states table for a
given projection. It tells us how old was the last event you processed?
Or, how far behind are you running for this projection? Higher latencies
could mean high load or stalled projection handling.

# Additional Changes

I also had to initialize the global otel metrics provider (`metrics.M`)
in the `setup` step additionally to `start` since projection handlers
are initialized at setup. The initialization checks if a metrics
provider is already set (in case of `start-from-setup` or
`start-from-init` to prevent overwriting, which causes the otel metrics
provider to stop working.

# Additional Context

## Example Dashboards

![image](https://github.com/user-attachments/assets/94ba5c2b-9c62-44cd-83ee-4db4a8859073)

![image](https://github.com/user-attachments/assets/60a1b406-a8c6-48dc-a925-575359f97e1e)

---------

Co-authored-by: Silvan <27845747+adlerhurst@users.noreply.github.com>
Co-authored-by: Livio Spring <livio.a@gmail.com>
(cherry picked from commit c1535b7b4916abd415a29e034d266c4ea1dc3645)
2025-03-28 07:58:47 +01:00

179 lines
7.1 KiB
Go

package setup
import (
"bytes"
"strings"
"time"
"github.com/mitchellh/mapstructure"
"github.com/spf13/viper"
"github.com/zitadel/logging"
"github.com/zitadel/zitadel/cmd/encryption"
"github.com/zitadel/zitadel/cmd/hooks"
"github.com/zitadel/zitadel/internal/actions"
internal_authz "github.com/zitadel/zitadel/internal/api/authz"
"github.com/zitadel/zitadel/internal/api/oidc"
"github.com/zitadel/zitadel/internal/api/ui/login"
"github.com/zitadel/zitadel/internal/cache/connector"
"github.com/zitadel/zitadel/internal/command"
"github.com/zitadel/zitadel/internal/config/hook"
"github.com/zitadel/zitadel/internal/config/systemdefaults"
"github.com/zitadel/zitadel/internal/database"
"github.com/zitadel/zitadel/internal/domain"
"github.com/zitadel/zitadel/internal/eventstore"
"github.com/zitadel/zitadel/internal/id"
"github.com/zitadel/zitadel/internal/notification/handlers"
"github.com/zitadel/zitadel/internal/query/projection"
static_config "github.com/zitadel/zitadel/internal/static/config"
metrics "github.com/zitadel/zitadel/internal/telemetry/metrics/config"
)
type Config struct {
ForMirror bool
Database database.Config
Caches *connector.CachesConfig
SystemDefaults systemdefaults.SystemDefaults
InternalAuthZ internal_authz.Config
ExternalDomain string
ExternalPort uint16
ExternalSecure bool
Log *logging.Config
Metrics metrics.Config
EncryptionKeys *encryption.EncryptionKeyConfig
DefaultInstance command.InstanceSetup
Machine *id.Config
Projections projection.Config
Notifications handlers.WorkerConfig
Eventstore *eventstore.Config
InitProjections InitProjections
AssetStorage static_config.AssetStorageConfig
OIDC oidc.Config
Login login.Config
WebAuthNName string
Telemetry *handlers.TelemetryPusherConfig
SystemAPIUsers map[string]*internal_authz.SystemAPIUser
}
type InitProjections struct {
Enabled bool
RetryFailedAfter time.Duration
MaxFailureCount uint8
BulkLimit uint64
}
func MustNewConfig(v *viper.Viper) *Config {
config := new(Config)
err := v.Unmarshal(config,
viper.DecodeHook(mapstructure.ComposeDecodeHookFunc(
hooks.SliceTypeStringDecode[*domain.CustomMessageText],
hooks.SliceTypeStringDecode[internal_authz.RoleMapping],
hooks.MapTypeStringDecode[string, *internal_authz.SystemAPIUser],
hooks.MapHTTPHeaderStringDecode,
database.DecodeHook,
actions.HTTPConfigDecodeHook,
hook.EnumHookFunc(internal_authz.MemberTypeString),
hook.Base64ToBytesHookFunc(),
hook.TagToLanguageHookFunc(),
mapstructure.StringToTimeDurationHookFunc(),
mapstructure.StringToTimeHookFunc(time.RFC3339),
mapstructure.StringToSliceHookFunc(","),
mapstructure.TextUnmarshallerHookFunc(),
)),
)
logging.OnError(err).Fatal("unable to read default config")
err = config.Log.SetLogger()
logging.OnError(err).Fatal("unable to set logger")
err = config.Metrics.NewMeter()
logging.OnError(err).Fatal("unable to set meter")
id.Configure(config.Machine)
// Copy the global role permissions mappings to the instance until we allow instance-level configuration over the API.
config.DefaultInstance.RolePermissionMappings = config.InternalAuthZ.RolePermissionMappings
return config
}
type Steps struct {
s1ProjectionTable *ProjectionTable
s2AssetsTable *AssetTable
FirstInstance *FirstInstance
s5LastFailed *LastFailed
s6OwnerRemoveColumns *OwnerRemoveColumns
s7LogstoreTables *LogstoreTables
s8AuthTokens *AuthTokenIndexes
CorrectCreationDate *CorrectCreationDate
s12AddOTPColumns *AddOTPColumns
s13FixQuotaProjection *FixQuotaConstraints
s14NewEventsTable *NewEventsTable
s15CurrentStates *CurrentProjectionState
s16UniqueConstraintsLower *UniqueConstraintToLower
s17AddOffsetToUniqueConstraints *AddOffsetToCurrentStates
s18AddLowerFieldsToLoginNames *AddLowerFieldsToLoginNames
s19AddCurrentStatesIndex *AddCurrentSequencesIndex
s20AddByUserSessionIndex *AddByUserIndexToSession
s21AddBlockFieldToLimits *AddBlockFieldToLimits
s22ActiveInstancesIndex *ActiveInstanceEvents
s23CorrectGlobalUniqueConstraints *CorrectGlobalUniqueConstraints
s24AddActorToAuthTokens *AddActorToAuthTokens
s25User11AddLowerFieldsToVerifiedEmail *User11AddLowerFieldsToVerifiedEmail
s26AuthUsers3 *AuthUsers3
s27IDPTemplate6SAMLNameIDFormat *IDPTemplate6SAMLNameIDFormat
s28AddFieldTable *AddFieldTable
s29FillFieldsForProjectGrant *FillFieldsForProjectGrant
s30FillFieldsForOrgDomainVerified *FillFieldsForOrgDomainVerified
s31AddAggregateIndexToFields *AddAggregateIndexToFields
s32AddAuthSessionID *AddAuthSessionID
s33SMSConfigs3TwilioAddVerifyServiceSid *SMSConfigs3TwilioAddVerifyServiceSid
s34AddCacheSchema *AddCacheSchema
s35AddPositionToIndexEsWm *AddPositionToIndexEsWm
s36FillV2Milestones *FillV3Milestones
s37Apps7OIDConfigsBackChannelLogoutURI *Apps7OIDConfigsBackChannelLogoutURI
s38BackChannelLogoutNotificationStart *BackChannelLogoutNotificationStart
s40InitPushFunc *InitPushFunc
s42Apps7OIDCConfigsLoginVersion *Apps7OIDCConfigsLoginVersion
s43CreateFieldsDomainIndex *CreateFieldsDomainIndex
s44ReplaceCurrentSequencesIndex *ReplaceCurrentSequencesIndex
s45CorrectProjectOwners *CorrectProjectOwners
s46InitPermissionFunctions *InitPermissionFunctions
s47FillMembershipFields *FillMembershipFields
s48Apps7SAMLConfigsLoginVersion *Apps7SAMLConfigsLoginVersion
s49InitPermittedOrgsFunction *InitPermittedOrgsFunction
s50IDPTemplate6UsePKCE *IDPTemplate6UsePKCE
s51IDPTemplate6RootCA *IDPTemplate6RootCA
s52IDPTemplate6LDAP2 *IDPTemplate6LDAP2
}
func MustNewSteps(v *viper.Viper) *Steps {
v.AutomaticEnv()
v.SetEnvPrefix("ZITADEL")
v.SetEnvKeyReplacer(strings.NewReplacer(".", "_"))
v.SetConfigType("yaml")
err := v.ReadConfig(bytes.NewBuffer(defaultSteps))
logging.OnError(err).Fatal("unable to read setup steps")
for _, file := range stepFiles {
v.SetConfigFile(file)
err := v.MergeInConfig()
logging.WithFields("file", file).OnError(err).Warn("unable to read setup file")
}
steps := new(Steps)
err = v.Unmarshal(steps,
viper.DecodeHook(mapstructure.ComposeDecodeHookFunc(
hook.Base64ToBytesHookFunc(),
hook.TagToLanguageHookFunc(),
mapstructure.StringToTimeDurationHookFunc(),
mapstructure.StringToTimeHookFunc(time.RFC3339),
mapstructure.StringToSliceHookFunc(","),
mapstructure.TextUnmarshallerHookFunc(),
)),
)
logging.OnError(err).Fatal("unable to read steps")
return steps
}