mirror of
https://github.com/zitadel/zitadel.git
synced 2025-01-05 14:37:45 +00:00
77cd430b3a
# Which Problems Are Solved Scheduled handlers use `eventstore.InstanceIDs` to get all active instances within a given time frame. This function scrapes through all events written within that time frame, which can cause heavy load on the database. # How the Problems Are Solved A new query cache `activeInstances` is introduced which caches the ids of all instances queried by id or host within the configured time frame. # Additional Changes - Changed `default.yaml` - Removed `HandleActiveInstances` from custom handler configs - Added `MaxActiveInstances` to define the maximal amount of cached instance ids - fixed start-from-init and start-from-setup to start auth and admin projections twice - fixed org cache invalidation to use correct index # Additional Context - part of #8999
499 lines
17 KiB
Go
499 lines
17 KiB
Go
package handlers
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"errors"
|
|
"math/rand/v2"
|
|
"slices"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/zitadel/logging"
|
|
|
|
"github.com/zitadel/zitadel/internal/api/authz"
|
|
"github.com/zitadel/zitadel/internal/api/call"
|
|
"github.com/zitadel/zitadel/internal/command"
|
|
"github.com/zitadel/zitadel/internal/crypto"
|
|
"github.com/zitadel/zitadel/internal/database"
|
|
"github.com/zitadel/zitadel/internal/domain"
|
|
"github.com/zitadel/zitadel/internal/eventstore"
|
|
"github.com/zitadel/zitadel/internal/notification/channels"
|
|
"github.com/zitadel/zitadel/internal/notification/senders"
|
|
"github.com/zitadel/zitadel/internal/notification/types"
|
|
"github.com/zitadel/zitadel/internal/query"
|
|
"github.com/zitadel/zitadel/internal/repository/instance"
|
|
"github.com/zitadel/zitadel/internal/repository/notification"
|
|
)
|
|
|
|
const (
	// Code is the template argument key under which the decrypted code of a
	// notification request is stored before the message is rendered.
	Code = "Code"
	// OTP is an additional argument key that receives the same code when the
	// request is flagged as OTP (existing notifications use it for the code).
	OTP = "OTP"
)
|
|
|
|
// NotificationWorker periodically searches for notification request and retry
// events per instance and tries to deliver them through the configured channels.
type NotificationWorker struct {
	// commands is used to push the sent, canceled and retry-requested events.
	commands Commands
	// queries loads the notify user, policies, texts and templates and
	// provides the list of active instances.
	queries *NotificationQueries
	// es is the eventstore the pending notification events are filtered from.
	es *eventstore.Eventstore
	// client is the database handle used to begin the per-run transaction.
	client *database.DB
	// channels is handed to the email / SMS senders.
	channels types.ChannelChains
	config   WorkerConfig
	// now returns the current time; it is a field so it can be replaced.
	now nowFunc
	// backOff computes the next retry delay from the current one.
	// NewNotificationWorker sets it to exponentialBackOff.
	backOff func(current time.Duration) time.Duration
}
|
|
|
|
// WorkerConfig holds the settings of a NotificationWorker.
type WorkerConfig struct {
	// LegacyEnabled makes Start return without starting any worker.
	LegacyEnabled bool
	// Workers is the number of goroutines handling newly requested notifications.
	Workers uint8
	// BulkLimit limits the number of requested events fetched per run.
	BulkLimit uint16
	// RequeueEvery is the pause between two runs of a (non-retry) worker.
	RequeueEvery time.Duration
	// RetryWorkers is the number of goroutines handling retry requests.
	RetryWorkers uint8
	// RetryRequeueEvery is the pause between two runs of a retry worker.
	RetryRequeueEvery time.Duration
	// TransactionDuration, if greater than 0, is applied as timeout to the
	// context used for processing the events of one run.
	TransactionDuration time.Duration
	// MaxAttempts limits the delivery attempts; a value <= 1 disables retries.
	MaxAttempts uint8
	// MaxTtl is the maximum age of an event; older notifications are canceled
	// or ignored.
	MaxTtl time.Duration
	// MinRetryDelay is the lower bound the back-off starts from.
	MinRetryDelay time.Duration
	// MaxRetryDelay is the upper bound of the back-off.
	MaxRetryDelay time.Duration
	// RetryDelayFactor bounds the growth of the delay per attempt.
	// NewNotificationWorker raises values below 1 to 1.
	RetryDelayFactor float32
}
|
|
|
|
// nowFunc makes [time.Now] mockable.
// The worker calls it wherever it needs the current time (TTL and back-off checks).
type nowFunc func() time.Time
|
|
|
|
// Sent is the callback invoked after a notification was delivered and its sent
// event was pushed. It receives the id and resource owner of the aggregate the
// notification belongs to, the code generator info filled by the SMS sender
// and the template arguments used for the message.
type Sent func(ctx context.Context, commands Commands, id, orgID string, generatorInfo *senders.CodeGeneratorInfo, args map[string]any) error
|
|
|
|
// sentHandlers maps the event type of a notification request to its Sent
// callback. It is filled through RegisterSentHandler and is nil until the
// first registration.
var sentHandlers map[eventstore.EventType]Sent
|
|
|
|
func RegisterSentHandler(eventType eventstore.EventType, sent Sent) {
|
|
if sentHandlers == nil {
|
|
sentHandlers = make(map[eventstore.EventType]Sent)
|
|
}
|
|
sentHandlers[eventType] = sent
|
|
}
|
|
|
|
func NewNotificationWorker(
|
|
config WorkerConfig,
|
|
commands Commands,
|
|
queries *NotificationQueries,
|
|
es *eventstore.Eventstore,
|
|
client *database.DB,
|
|
channels types.ChannelChains,
|
|
) *NotificationWorker {
|
|
// make sure the delay does not get less
|
|
if config.RetryDelayFactor < 1 {
|
|
config.RetryDelayFactor = 1
|
|
}
|
|
w := &NotificationWorker{
|
|
config: config,
|
|
commands: commands,
|
|
queries: queries,
|
|
es: es,
|
|
client: client,
|
|
channels: channels,
|
|
now: time.Now,
|
|
}
|
|
w.backOff = w.exponentialBackOff
|
|
return w
|
|
}
|
|
|
|
func (w *NotificationWorker) Start(ctx context.Context) {
|
|
if w.config.LegacyEnabled {
|
|
return
|
|
}
|
|
for i := 0; i < int(w.config.Workers); i++ {
|
|
go w.schedule(ctx, i, false)
|
|
}
|
|
for i := 0; i < int(w.config.RetryWorkers); i++ {
|
|
go w.schedule(ctx, i, true)
|
|
}
|
|
}
|
|
|
|
func (w *NotificationWorker) reduceNotificationRequested(ctx, txCtx context.Context, tx *sql.Tx, event *notification.RequestedEvent) (err error) {
|
|
ctx = ContextWithNotifier(ctx, event.Aggregate())
|
|
|
|
// if the notification is too old, we can directly cancel
|
|
if event.CreatedAt().Add(w.config.MaxTtl).Before(w.now()) {
|
|
return w.commands.NotificationCanceled(txCtx, tx, event.Aggregate().ID, event.Aggregate().ResourceOwner, nil)
|
|
}
|
|
|
|
// Get the notify user first, so if anything fails afterward we have the current state of the user
|
|
// and can pass that to the retry request.
|
|
// We do not trigger the projection to reduce load on the database. By the time the notification is processed,
|
|
// the user should be projected anyway. If not, it will just wait for the next run.
|
|
notifyUser, err := w.queries.GetNotifyUserByID(ctx, false, event.UserID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// The domain claimed event requires the domain as argument, but lacks the user when creating the request event.
|
|
// Since we set it into the request arguments, it will be passed into a potential retry event.
|
|
if event.RequiresPreviousDomain && event.Request.Args != nil && event.Request.Args.Domain == "" {
|
|
index := strings.LastIndex(notifyUser.LastEmail, "@")
|
|
event.Request.Args.Domain = notifyUser.LastEmail[index+1:]
|
|
}
|
|
|
|
err = w.sendNotification(ctx, txCtx, tx, event.Request, notifyUser, event)
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
// if retries are disabled or if the error explicitly specifies, we cancel the notification
|
|
if w.config.MaxAttempts <= 1 || errors.Is(err, &channels.CancelError{}) {
|
|
return w.commands.NotificationCanceled(txCtx, tx, event.Aggregate().ID, event.Aggregate().ResourceOwner, err)
|
|
}
|
|
// otherwise we retry after a backoff delay
|
|
return w.commands.NotificationRetryRequested(
|
|
txCtx,
|
|
tx,
|
|
event.Aggregate().ID,
|
|
event.Aggregate().ResourceOwner,
|
|
notificationEventToRequest(event.Request, notifyUser, w.backOff(0)),
|
|
err,
|
|
)
|
|
}
|
|
|
|
func (w *NotificationWorker) reduceNotificationRetry(ctx, txCtx context.Context, tx *sql.Tx, event *notification.RetryRequestedEvent) (err error) {
|
|
ctx = ContextWithNotifier(ctx, event.Aggregate())
|
|
|
|
// if the notification is too old, we can directly cancel
|
|
if event.CreatedAt().Add(w.config.MaxTtl).Before(w.now()) {
|
|
return w.commands.NotificationCanceled(txCtx, tx, event.Aggregate().ID, event.Aggregate().ResourceOwner, err)
|
|
}
|
|
|
|
if event.CreatedAt().Add(event.BackOff).After(w.now()) {
|
|
return nil
|
|
}
|
|
err = w.sendNotification(ctx, txCtx, tx, event.Request, event.NotifyUser, event)
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
// if the max attempts are reached or if the error explicitly specifies, we cancel the notification
|
|
if event.Sequence() >= uint64(w.config.MaxAttempts) || errors.Is(err, &channels.CancelError{}) {
|
|
return w.commands.NotificationCanceled(txCtx, tx, event.Aggregate().ID, event.Aggregate().ResourceOwner, err)
|
|
}
|
|
// otherwise we retry after a backoff delay
|
|
return w.commands.NotificationRetryRequested(txCtx, tx, event.Aggregate().ID, event.Aggregate().ResourceOwner, notificationEventToRequest(
|
|
event.Request,
|
|
event.NotifyUser,
|
|
w.backOff(event.BackOff),
|
|
), err)
|
|
}
|
|
|
|
func (w *NotificationWorker) sendNotification(ctx, txCtx context.Context, tx *sql.Tx, request notification.Request, notifyUser *query.NotifyUser, e eventstore.Event) error {
|
|
ctx, err := enrichCtx(ctx, request.TriggeredAtOrigin)
|
|
if err != nil {
|
|
return channels.NewCancelError(err)
|
|
}
|
|
|
|
// check early that a "sent" handler exists, otherwise we can cancel early
|
|
sentHandler, ok := sentHandlers[request.EventType]
|
|
if !ok {
|
|
logging.Errorf(`no "sent" handler registered for %s`, request.EventType)
|
|
return channels.NewCancelError(err)
|
|
}
|
|
|
|
var code string
|
|
if request.Code != nil {
|
|
code, err = crypto.DecryptString(request.Code, w.queries.UserDataCrypto)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
colors, err := w.queries.ActiveLabelPolicyByOrg(ctx, request.UserResourceOwner, false)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
translator, err := w.queries.GetTranslatorWithOrgTexts(ctx, request.UserResourceOwner, request.MessageType)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
generatorInfo := new(senders.CodeGeneratorInfo)
|
|
var notify types.Notify
|
|
switch request.NotificationType {
|
|
case domain.NotificationTypeEmail:
|
|
template, err := w.queries.MailTemplateByOrg(ctx, notifyUser.ResourceOwner, false)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
notify = types.SendEmail(ctx, w.channels, string(template.Template), translator, notifyUser, colors, e)
|
|
case domain.NotificationTypeSms:
|
|
notify = types.SendSMS(ctx, w.channels, translator, notifyUser, colors, e, generatorInfo)
|
|
}
|
|
|
|
args := request.Args.ToMap()
|
|
args[Code] = code
|
|
// existing notifications use `OTP` as argument for the code
|
|
if request.IsOTP {
|
|
args[OTP] = code
|
|
}
|
|
|
|
if err := notify(request.URLTemplate, args, request.MessageType, request.UnverifiedNotificationChannel); err != nil {
|
|
return err
|
|
}
|
|
err = w.commands.NotificationSent(txCtx, tx, e.Aggregate().ID, e.Aggregate().ResourceOwner)
|
|
if err != nil {
|
|
// In case the notification event cannot be pushed, we most likely cannot create a retry or cancel event.
|
|
// Therefore, we'll only log the error and also do not need to try to push to the user / session.
|
|
logging.WithFields("instanceID", authz.GetInstance(ctx).InstanceID(), "notification", e.Aggregate().ID).
|
|
OnError(err).Error("could not set sent notification event")
|
|
return nil
|
|
}
|
|
err = sentHandler(txCtx, w.commands, request.NotificationAggregateID(), request.NotificationAggregateResourceOwner(), generatorInfo, args)
|
|
logging.WithFields("instanceID", authz.GetInstance(ctx).InstanceID(), "notification", e.Aggregate().ID).
|
|
OnError(err).Error("could not set notification event on aggregate")
|
|
return nil
|
|
}
|
|
|
|
func (w *NotificationWorker) exponentialBackOff(current time.Duration) time.Duration {
|
|
if current >= w.config.MaxRetryDelay {
|
|
return w.config.MaxRetryDelay
|
|
}
|
|
if current < w.config.MinRetryDelay {
|
|
current = w.config.MinRetryDelay
|
|
}
|
|
t := time.Duration(rand.Int64N(int64(w.config.RetryDelayFactor*float32(current.Nanoseconds()))-current.Nanoseconds()) + current.Nanoseconds())
|
|
if t > w.config.MaxRetryDelay {
|
|
return w.config.MaxRetryDelay
|
|
}
|
|
return t
|
|
}
|
|
|
|
func notificationEventToRequest(e notification.Request, notifyUser *query.NotifyUser, backoff time.Duration) *command.NotificationRetryRequest {
|
|
return &command.NotificationRetryRequest{
|
|
NotificationRequest: command.NotificationRequest{
|
|
UserID: e.UserID,
|
|
UserResourceOwner: e.UserResourceOwner,
|
|
TriggerOrigin: e.TriggeredAtOrigin,
|
|
URLTemplate: e.URLTemplate,
|
|
Code: e.Code,
|
|
CodeExpiry: e.CodeExpiry,
|
|
EventType: e.EventType,
|
|
NotificationType: e.NotificationType,
|
|
MessageType: e.MessageType,
|
|
UnverifiedNotificationChannel: e.UnverifiedNotificationChannel,
|
|
Args: e.Args,
|
|
AggregateID: e.AggregateID,
|
|
AggregateResourceOwner: e.AggregateResourceOwner,
|
|
IsOTP: e.IsOTP,
|
|
},
|
|
BackOff: backoff,
|
|
NotifyUser: notifyUser,
|
|
}
|
|
}
|
|
|
|
func (w *NotificationWorker) schedule(ctx context.Context, workerID int, retry bool) {
|
|
t := time.NewTimer(0)
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
t.Stop()
|
|
w.log(workerID, retry).Info("scheduler stopped")
|
|
return
|
|
case <-t.C:
|
|
instances, err := w.queryInstances(ctx, retry)
|
|
w.log(workerID, retry).OnError(err).Error("unable to query instances")
|
|
|
|
w.triggerInstances(call.WithTimestamp(ctx), instances, workerID, retry)
|
|
if retry {
|
|
t.Reset(w.config.RetryRequeueEvery)
|
|
continue
|
|
}
|
|
t.Reset(w.config.RequeueEvery)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (w *NotificationWorker) log(workerID int, retry bool) *logging.Entry {
|
|
return logging.WithFields("notification worker", workerID, "retries", retry)
|
|
}
|
|
|
|
func (w *NotificationWorker) queryInstances(ctx context.Context, retry bool) ([]string, error) {
|
|
return w.queries.ActiveInstances(), nil
|
|
}
|
|
|
|
func (w *NotificationWorker) triggerInstances(ctx context.Context, instances []string, workerID int, retry bool) {
|
|
for _, instance := range instances {
|
|
instanceCtx := authz.WithInstanceID(ctx, instance)
|
|
|
|
err := w.trigger(instanceCtx, workerID, retry)
|
|
w.log(workerID, retry).WithField("instance", instance).OnError(err).Info("trigger failed")
|
|
}
|
|
}
|
|
|
|
// trigger handles one run for the instance set on ctx: it opens a
// transaction, searches (and locks) the pending notification events and
// reduces them one by one.
//
// Errors of a single event are logged and rolled back to that event's
// savepoint; they do not abort the run. The returned error only reflects
// failures to begin the transaction, to search the events, or whatever
// database.CloseTransaction returns when closing the transaction.
func (w *NotificationWorker) trigger(ctx context.Context, workerID int, retry bool) (err error) {
	// txCtx is used for everything executed on the transaction. If a
	// transaction duration is configured, only ctx gets the timeout, so the
	// transaction can still be used (e.g. to roll back to a savepoint) after
	// the processing time is over.
	txCtx := ctx
	if w.config.TransactionDuration > 0 {
		var cancel, cancelTx func()
		txCtx, cancelTx = context.WithCancel(ctx)
		defer cancelTx()
		ctx, cancel = context.WithTimeout(ctx, w.config.TransactionDuration)
		defer cancel()
	}
	tx, err := w.client.BeginTx(txCtx, nil)
	if err != nil {
		return err
	}
	// The named result is passed to CloseTransaction and replaced by its
	// return value (presumably commit on nil, rollback otherwise — confirm
	// against the database package).
	defer func() {
		err = database.CloseTransaction(tx, err)
	}()

	events, err := w.searchEvents(txCtx, tx, retry)
	if err != nil {
		return err
	}

	// If there aren't any events or no unlocked event terminate early and start a new run.
	if len(events) == 0 {
		return nil
	}

	w.log(workerID, retry).
		WithField("instanceID", authz.GetInstance(ctx).InstanceID()).
		WithField("events", len(events)).
		Info("handling notification events")

	for _, event := range events {
		// This err shadows the named result on purpose: a failing event must
		// not influence how the transaction is closed.
		var err error
		switch e := event.(type) {
		case *notification.RequestedEvent:
			w.createSavepoint(txCtx, tx, event, workerID, retry)
			err = w.reduceNotificationRequested(ctx, txCtx, tx, e)
		case *notification.RetryRequestedEvent:
			w.createSavepoint(txCtx, tx, event, workerID, retry)
			err = w.reduceNotificationRetry(ctx, txCtx, tx, e)
		}
		if err != nil {
			w.log(workerID, retry).OnError(err).
				WithField("instanceID", authz.GetInstance(ctx).InstanceID()).
				WithField("notificationID", event.Aggregate().ID).
				WithField("sequence", event.Sequence()).
				WithField("type", event.Type()).
				Error("could not handle notification event")
			// if we have an error, we rollback to the savepoint and continue with the next event
			// we use the txCtx to make sure we can rollback the transaction in case the ctx is canceled
			w.rollbackToSavepoint(txCtx, tx, event, workerID, retry)
		}
		// if the context is canceled, we stop the processing
		if ctx.Err() != nil {
			return nil
		}
	}
	return nil
}
|
|
|
|
func (w *NotificationWorker) latestRetries(events []eventstore.Event) []eventstore.Event {
|
|
for i := len(events) - 1; i > 0; i-- {
|
|
// since we delete during the iteration, we need to make sure we don't panic
|
|
if len(events) <= i {
|
|
continue
|
|
}
|
|
// delete all the previous retries of the same notification
|
|
events = slices.DeleteFunc(events, func(e eventstore.Event) bool {
|
|
return e.Aggregate().ID == events[i].Aggregate().ID &&
|
|
e.Sequence() < events[i].Sequence()
|
|
})
|
|
}
|
|
return events
|
|
}
|
|
|
|
func (w *NotificationWorker) createSavepoint(ctx context.Context, tx *sql.Tx, event eventstore.Event, workerID int, retry bool) {
|
|
_, err := tx.ExecContext(ctx, "SAVEPOINT notification_send")
|
|
w.log(workerID, retry).OnError(err).
|
|
WithField("instanceID", authz.GetInstance(ctx).InstanceID()).
|
|
WithField("notificationID", event.Aggregate().ID).
|
|
WithField("sequence", event.Sequence()).
|
|
WithField("type", event.Type()).
|
|
Error("could not create savepoint for notification event")
|
|
}
|
|
|
|
func (w *NotificationWorker) rollbackToSavepoint(ctx context.Context, tx *sql.Tx, event eventstore.Event, workerID int, retry bool) {
|
|
_, err := tx.ExecContext(ctx, "ROLLBACK TO SAVEPOINT notification_send")
|
|
w.log(workerID, retry).OnError(err).
|
|
WithField("instanceID", authz.GetInstance(ctx).InstanceID()).
|
|
WithField("notificationID", event.Aggregate().ID).
|
|
WithField("sequence", event.Sequence()).
|
|
WithField("type", event.Type()).
|
|
Error("could not rollback to savepoint for notification event")
|
|
}
|
|
|
|
func (w *NotificationWorker) searchEvents(ctx context.Context, tx *sql.Tx, retry bool) ([]eventstore.Event, error) {
|
|
if retry {
|
|
return w.searchRetryEvents(ctx, tx)
|
|
}
|
|
// query events and lock them for update (with skip locked)
|
|
searchQuery := eventstore.NewSearchQueryBuilder(eventstore.ColumnsEvent).
|
|
LockRowsDuringTx(tx, eventstore.LockOptionSkipLocked).
|
|
// Messages older than the MaxTTL, we can be ignored.
|
|
// The first attempt of a retry might still be older than the TTL and needs to be filtered out later on.
|
|
CreationDateAfter(w.now().Add(-1*w.config.MaxTtl)).
|
|
Limit(uint64(w.config.BulkLimit)).
|
|
AddQuery().
|
|
AggregateTypes(notification.AggregateType).
|
|
EventTypes(notification.RequestedType).
|
|
Builder().
|
|
ExcludeAggregateIDs().
|
|
EventTypes(notification.RetryRequestedType, notification.CanceledType, notification.SentType).
|
|
Builder()
|
|
//nolint:staticcheck
|
|
return w.es.Filter(ctx, searchQuery)
|
|
}
|
|
|
|
func (w *NotificationWorker) searchRetryEvents(ctx context.Context, tx *sql.Tx) ([]eventstore.Event, error) {
|
|
// query events and lock them for update (with skip locked)
|
|
searchQuery := eventstore.NewSearchQueryBuilder(eventstore.ColumnsEvent).
|
|
LockRowsDuringTx(tx, eventstore.LockOptionSkipLocked).
|
|
// Messages older than the MaxTTL, we can be ignored.
|
|
// The first attempt of a retry might still be older than the TTL and needs to be filtered out later on.
|
|
CreationDateAfter(w.now().Add(-1*w.config.MaxTtl)).
|
|
AddQuery().
|
|
AggregateTypes(notification.AggregateType).
|
|
EventTypes(notification.RetryRequestedType).
|
|
Builder().
|
|
ExcludeAggregateIDs().
|
|
EventTypes(notification.CanceledType, notification.SentType).
|
|
Builder()
|
|
//nolint:staticcheck
|
|
events, err := w.es.Filter(ctx, searchQuery)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return w.latestRetries(events), nil
|
|
}
|
|
|
|
// existingInstances collects the ids of all instances that were added and
// not removed again. It is filled by reducing instance events, see its
// AppendEvents and Query methods.
type existingInstances []string
|
|
|
|
// AppendEvents implements eventstore.QueryReducer.
|
|
func (ai *existingInstances) AppendEvents(events ...eventstore.Event) {
|
|
for _, event := range events {
|
|
switch event.Type() {
|
|
case instance.InstanceAddedEventType:
|
|
*ai = append(*ai, event.Aggregate().InstanceID)
|
|
case instance.InstanceRemovedEventType:
|
|
*ai = slices.DeleteFunc(*ai, func(s string) bool {
|
|
return s == event.Aggregate().InstanceID
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
// Query implements eventstore.QueryReducer.
|
|
func (*existingInstances) Query() *eventstore.SearchQueryBuilder {
|
|
return eventstore.NewSearchQueryBuilder(eventstore.ColumnsEvent).
|
|
AddQuery().
|
|
AggregateTypes(instance.AggregateType).
|
|
EventTypes(
|
|
instance.InstanceAddedEventType,
|
|
instance.InstanceRemovedEventType,
|
|
).
|
|
Builder()
|
|
}
|
|
|
|
// Reduce implements eventstore.QueryReducer.
// It is a no-op that always returns nil, because the events are already
// reduced during AppendEvents.
func (*existingInstances) Reduce() error {
	return nil
}
|