feat(notification): use event worker pool (#8962)

# Which Problems Are Solved

The current handling of notification follows the same pattern as all
other projections:
Created events are handled sequentially (based on "position") by a
handler. During the process, a lot of information is aggregated (user,
texts, templates, ...).
This leads to back pressure on the projection since the handling of
events might take longer than the time before a new event (to be
handled) is created.

# How the Problems Are Solved

- The current user notification handler creates separate notification
events based on the user / session events.
- These events contain all the present and required information
including the userID.
- These notification events get processed by notification workers, which
gather the necessary information (recipient address, texts, templates)
to send out these notifications.
- If a notification fails, a retry event is created based on the current
notification request including the current state of the user (this
prevents race conditions, where a user is changed in the meantime and
the notification already gets the new state).
- The retry event will be handled after a backoff delay. This delay
increases with every attempt.
- If the configured amount of attempts is reached or the message expired
(based on config), a cancel event is created, letting the workers know,
the notification must no longer be handled.
- In case of successful send, a sent event is created for the
notification aggregate and the existing "sent" events for the user /
session object is stored.
- The following is added to the defaults.yaml to allow configuration of
the notification workers:
```yaml

Notifications:
  # The amount of workers processing the notification request events.
  # If set to 0, no notification request events will be handled. This can be useful when running in
  # multi binary / pod setup and allowing only certain executables to process the events.
  Workers: 1 # ZITADEL_NOTIFIACATIONS_WORKERS
  # The amount of events a single worker will process in a run.
  BulkLimit: 10 # ZITADEL_NOTIFIACATIONS_BULKLIMIT
  # Time interval between scheduled notifications for request events
  RequeueEvery: 2s # ZITADEL_NOTIFIACATIONS_REQUEUEEVERY
  # The amount of workers processing the notification retry events.
  # If set to 0, no notification retry events will be handled. This can be useful when running in
  # multi binary / pod setup and allowing only certain executables to process the events.
  RetryWorkers: 1 # ZITADEL_NOTIFIACATIONS_RETRYWORKERS
  # Time interval between scheduled notifications for retry events
  RetryRequeueEvery: 2s # ZITADEL_NOTIFIACATIONS_RETRYREQUEUEEVERY
  # Only instances are projected, for which at least a projection-relevant event exists within the timeframe
  # from HandleActiveInstances duration in the past until the projection's current time
  # If set to 0 (default), every instance is always considered active
  HandleActiveInstances: 0s # ZITADEL_NOTIFIACATIONS_HANDLEACTIVEINSTANCES
  # The maximum duration a transaction remains open
  # before it spots left folding additional events
  # and updates the table.
  TransactionDuration: 1m # ZITADEL_NOTIFIACATIONS_TRANSACTIONDURATION
  # Automatically cancel the notification after the amount of failed attempts
  MaxAttempts: 3 # ZITADEL_NOTIFIACATIONS_MAXATTEMPTS
  # Automatically cancel the notification if it cannot be handled within a specific time
  MaxTtl: 5m  # ZITADEL_NOTIFIACATIONS_MAXTTL
  # Failed attempts are retried after a confogired delay (with exponential backoff).
  # Set a minimum and maximum delay and a factor for the backoff
  MinRetryDelay: 1s  # ZITADEL_NOTIFIACATIONS_MINRETRYDELAY
  MaxRetryDelay: 20s # ZITADEL_NOTIFIACATIONS_MAXRETRYDELAY
  # Any factor below 1 will be set to 1
  RetryDelayFactor: 1.5 # ZITADEL_NOTIFIACATIONS_RETRYDELAYFACTOR
```


# Additional Changes

None

# Additional Context

- closes #8931
This commit is contained in:
Livio Spring
2024-11-27 16:01:17 +01:00
committed by GitHub
parent 4413efd82c
commit 8537805ea5
45 changed files with 4005 additions and 2158 deletions

View File

@@ -448,6 +448,40 @@ Projections:
# Telemetry data synchronization is not time critical. Setting RequeueEvery to 55 minutes doesn't annoy the database too much.
RequeueEvery: 3300s # ZITADEL_PROJECTIONS_CUSTOMIZATIONS_TELEMETRY_REQUEUEEVERY
Notifications:
# The amount of workers processing the notification request events.
# If set to 0, no notification request events will be handled. This can be useful when running in
# multi binary / pod setup and allowing only certain executables to process the events.
Workers: 1 # ZITADEL_NOTIFIACATIONS_WORKERS
# The amount of events a single worker will process in a run.
BulkLimit: 10 # ZITADEL_NOTIFIACATIONS_BULKLIMIT
# Time interval between scheduled notifications for request events
RequeueEvery: 2s # ZITADEL_NOTIFIACATIONS_REQUEUEEVERY
# The amount of workers processing the notification retry events.
# If set to 0, no notification retry events will be handled. This can be useful when running in
# multi binary / pod setup and allowing only certain executables to process the events.
RetryWorkers: 1 # ZITADEL_NOTIFIACATIONS_RETRYWORKERS
# Time interval between scheduled notifications for retry events
RetryRequeueEvery: 2s # ZITADEL_NOTIFIACATIONS_RETRYREQUEUEEVERY
# Only instances are projected, for which at least a projection-relevant event exists within the timeframe
# from HandleActiveInstances duration in the past until the projection's current time
# If set to 0 (default), every instance is always considered active
HandleActiveInstances: 0s # ZITADEL_NOTIFIACATIONS_HANDLEACTIVEINSTANCES
# The maximum duration a transaction remains open
# before it spots left folding additional events
# and updates the table.
TransactionDuration: 1m # ZITADEL_NOTIFIACATIONS_TRANSACTIONDURATION
# Automatically cancel the notification after the amount of failed attempts
MaxAttempts: 3 # ZITADEL_NOTIFIACATIONS_MAXATTEMPTS
# Automatically cancel the notification if it cannot be handled within a specific time
MaxTtl: 5m # ZITADEL_NOTIFIACATIONS_MAXTTL
# Failed attempts are retried after a confogired delay (with exponential backoff).
# Set a minimum and maximum delay and a factor for the backoff
MinRetryDelay: 1s # ZITADEL_NOTIFIACATIONS_MINRETRYDELAY
MaxRetryDelay: 20s # ZITADEL_NOTIFIACATIONS_MAXRETRYDELAY
# Any factor below 1 will be set to 1
RetryDelayFactor: 1.5 # ZITADEL_NOTIFIACATIONS_RETRYDELAYFACTOR
Auth:
# See Projections.BulkLimit
SearchLimit: 1000 # ZITADEL_AUTH_SEARCHLIMIT

View File

@@ -69,6 +69,7 @@ func projectionsCmd() *cobra.Command {
type ProjectionsConfig struct {
Destination database.Config
Projections projection.Config
Notifications handlers.WorkerConfig
EncryptionKeys *encryption.EncryptionKeyConfig
SystemAPIUsers map[string]*internal_authz.SystemAPIUser
Eventstore *eventstore.Config
@@ -205,6 +206,7 @@ func projections(
config.Projections.Customizations["notificationsquotas"],
config.Projections.Customizations["backchannel"],
config.Projections.Customizations["telemetry"],
config.Notifications,
*config.Telemetry,
config.ExternalDomain,
config.ExternalPort,
@@ -219,6 +221,7 @@ func projections(
keys.SMS,
keys.OIDC,
config.OIDC.DefaultBackChannelLogoutLifetime,
client,
)
config.Auth.Spooler.Client = client

View File

@@ -42,6 +42,7 @@ type Config struct {
DefaultInstance command.InstanceSetup
Machine *id.Config
Projections projection.Config
Notifications handlers.WorkerConfig
Eventstore *eventstore.Config
InitProjections InitProjections

View File

@@ -437,6 +437,7 @@ func initProjections(
config.Projections.Customizations["notificationsquotas"],
config.Projections.Customizations["backchannel"],
config.Projections.Customizations["telemetry"],
config.Notifications,
*config.Telemetry,
config.ExternalDomain,
config.ExternalPort,
@@ -451,6 +452,7 @@ func initProjections(
keys.SMS,
keys.OIDC,
config.OIDC.DefaultBackChannelLogoutLifetime,
queryDBClient,
)
for _, p := range notify_handler.Projections() {
err := migration.Migrate(ctx, eventstoreClient, p)

View File

@@ -54,6 +54,7 @@ type Config struct {
Metrics metrics.Config
Profiler profiler.Config
Projections projection.Config
Notifications handlers.WorkerConfig
Auth auth_es.Config
Admin admin_es.Config
UserAgentCookie *middleware.UserAgentCookieConfig

View File

@@ -277,6 +277,7 @@ func startZitadel(ctx context.Context, config *Config, masterKey string, server
config.Projections.Customizations["notificationsquotas"],
config.Projections.Customizations["backchannel"],
config.Projections.Customizations["telemetry"],
config.Notifications,
*config.Telemetry,
config.ExternalDomain,
config.ExternalPort,
@@ -291,6 +292,7 @@ func startZitadel(ctx context.Context, config *Config, masterKey string, server
keys.SMS,
keys.OIDC,
config.OIDC.DefaultBackChannelLogoutLifetime,
queryDBClient,
)
notification.Start(ctx)