refactor(handler): cache active instances (#9008)

# Which Problems Are Solved

Scheduled handlers use `eventstore.InstanceIDs` to get the all active
instances within a given timeframe. This function scrapes through all
events written within that time frame which can cause heavy load on the
database.

# How the Problems Are Solved

A new query cache `activeInstances` is introduced which caches the ids
of all instances queried by id or host within the configured timeframe.

# Additional Changes

- Changed `default.yaml`
  - Removed `HandleActiveInstances` from custom handler configs
- Added `MaxActiveInstances` to define the maximal amount of cached
instance ids
- fixed start-from-init and start-from-setup to start auth and admin
projections twice
- fixed org cache invalidation to use correct index

# Additional Context

- part of #8999
This commit is contained in:
Silvan
2024-12-06 12:32:53 +01:00
committed by GitHub
parent a81d42a61a
commit 77cd430b3a
25 changed files with 181 additions and 188 deletions

View File

@@ -400,6 +400,9 @@ Projections:
# from HandleActiveInstances duration in the past until the projection's current time
# If set to 0 (default), every instance is always considered active
HandleActiveInstances: 0s # ZITADEL_PROJECTIONS_HANDLEACTIVEINSTANCES
# Maximum amount of instances cached as active
# If set to 0, every instance is always considered active
MaxActiveInstances: 0 # ZITADEL_PROJECTIONS_MAXACTIVEINSTANCES
# In the Customizations section, all settings from above can be overwritten for each specific projection
Customizations:
custom_texts:
@@ -423,11 +426,6 @@ Projections:
TransactionDuration: 2s # ZITADEL_PROJECTIONS_CUSTOMIZATIONS_LOCKOUT_POLICY_TRANSACTIONDURATION
# The NotificationsQuotas projection is used for calling quota webhooks
NotificationsQuotas:
# In case of failed deliveries, ZITADEL retries to send the data points to the configured endpoints, but only for active instances.
# An instance is active, as long as there are projected events on the instance, that are not older than the HandleActiveInstances duration.
# Delivery guarantee requirements are higher for quota webhooks
# If set to 0 (default), every instance is always considered active
HandleActiveInstances: 0s # ZITADEL_PROJECTIONS_CUSTOMIZATIONS_NOTIFICATIONSQUOTAS_HANDLEACTIVEINSTANCES
# As quota notification projections don't result in database statements, retries don't have an effect
MaxFailureCount: 10 # ZITADEL_PROJECTIONS_CUSTOMIZATIONS_NOTIFICATIONSQUOTAS_MAXFAILURECOUNT
# Quota notifications are not so time critical. Setting RequeueEvery every five minutes doesn't annoy the db too much.
@@ -438,11 +436,6 @@ Projections:
BulkLimit: 50
# The Telemetry projection is used for calling telemetry webhooks
Telemetry:
# In case of failed deliveries, ZITADEL retries to send the data points to the configured endpoints, but only for active instances.
# An instance is active, as long as there are projected events on the instance, that are not older than the HandleActiveInstances duration.
# Telemetry delivery guarantee requirements are a bit higher than normal data projections, as they are not interactively retryable.
# If set to 0 (default), every instance is always considered active
HandleActiveInstances: 0s # ZITADEL_PROJECTIONS_CUSTOMIZATIONS_TELEMETRY_HANDLEACTIVEINSTANCES
# As sending telemetry data doesn't result in database statements, retries don't have any effects
MaxFailureCount: 0 # ZITADEL_PROJECTIONS_CUSTOMIZATIONS_TELEMETRY_MAXFAILURECOUNT
# Telemetry data synchronization is not time critical. Setting RequeueEvery to 55 minutes doesn't annoy the database too much.
@@ -497,10 +490,6 @@ Auth:
BulkLimit: 100 #ZITADEL_AUTH_SPOOLER_BULKLIMIT
# See Projections.MaxFailureCount
FailureCountUntilSkip: 5 #ZITADEL_AUTH_SPOOLER_FAILURECOUNTUNTILSKIP
# Only instance are projected, for which at least a projection relevant event exists withing the timeframe
# from HandleActiveInstances duration in the past until the projections current time
# If set to 0 (default), every instance is always considered active
HandleActiveInstances: 0s #ZITADEL_AUTH_SPOOLER_HANDLEACTIVEINSTANCES
# Defines the amount of auth requests stored in the LRU caches.
# There are two caches implemented one for id and one for code
AmountOfCachedAuthRequests: 0 #ZITADEL_AUTH_AMOUNTOFCACHEDAUTHREQUESTS
@@ -515,10 +504,6 @@ Admin:
BulkLimit: 200
# See Projections.MaxFailureCount
FailureCountUntilSkip: 5
# Only instance are projected, for which at least a projection relevant event exists withing the timeframe
# from HandleActiveInstances duration in the past until the projections current time
# If set to 0 (default), every instance is always considered active
HandleActiveInstances: 0s
UserAgentCookie:
Name: zitadel.useragent # ZITADEL_USERAGENTCOOKIE_NAME

View File

@@ -16,8 +16,6 @@ type FillFieldsForProjectGrant struct {
func (mig *FillFieldsForProjectGrant) Execute(ctx context.Context, _ eventstore.Event) error {
instances, err := mig.eventstore.InstanceIDs(
ctx,
0,
true,
eventstore.NewSearchQueryBuilder(eventstore.ColumnsInstanceIDs).
OrderDesc().
AddQuery().

View File

@@ -16,8 +16,6 @@ type FillFieldsForOrgDomainVerified struct {
func (mig *FillFieldsForOrgDomainVerified) Execute(ctx context.Context, _ eventstore.Event) error {
instances, err := mig.eventstore.InstanceIDs(
ctx,
0,
true,
eventstore.NewSearchQueryBuilder(eventstore.ColumnsInstanceIDs).
OrderDesc().
AddQuery().

View File

@@ -16,8 +16,6 @@ type FillFieldsForInstanceDomains struct {
func (mig *FillFieldsForInstanceDomains) Execute(ctx context.Context, _ eventstore.Event) error {
instances, err := mig.eventstore.InstanceIDs(
ctx,
0,
true,
eventstore.NewSearchQueryBuilder(eventstore.ColumnsInstanceIDs).
OrderDesc().
AddQuery().

View File

@@ -405,6 +405,7 @@ func startAPIs(
config.Auth.Spooler.Client = dbClient
config.Auth.Spooler.Eventstore = eventstore
config.Auth.Spooler.ActiveInstancer = queries
authRepo, err := auth_es.Start(ctx, config.Auth, config.SystemDefaults, commands, queries, dbClient, eventstore, keys.OIDC, keys.User)
if err != nil {
return nil, fmt.Errorf("error starting auth repo: %w", err)
@@ -412,7 +413,8 @@ func startAPIs(
config.Admin.Spooler.Client = dbClient
config.Admin.Spooler.Eventstore = eventstore
err = admin_es.Start(ctx, config.Admin, store, dbClient)
config.Admin.Spooler.ActiveInstancer = queries
err = admin_es.Start(ctx, config.Admin, store, dbClient, queries)
if err != nil {
return nil, fmt.Errorf("error starting admin repo: %w", err)
}