Files
zitadel/internal/eventstore/handler/v2/state.go

93 lines
2.4 KiB
Go
Raw Normal View History

package handler
import (
"context"
"database/sql"
_ "embed"
"errors"
"time"
"github.com/shopspring/decimal"
"github.com/zitadel/zitadel/internal/api/authz"
"github.com/zitadel/zitadel/internal/eventstore"
"github.com/zitadel/zitadel/internal/zerrors"
)
// state holds the per-projection, per-instance values that currentState reads
// from the database and setState writes back.
type state struct {
// instanceID is the instance the state belongs to; currentState fills it
// from the instance found in the request context.
instanceID string
// position is scanned from a nullable decimal column.
// NOTE(review): presumably the eventstore position of the last handled event — confirm.
position decimal.Decimal
// eventTimestamp is scanned from a nullable time column.
// NOTE(review): presumably the creation time of the last handled event — confirm.
eventTimestamp time.Time
// aggregateType, aggregateID and sequence are scanned from nullable columns.
// NOTE(review): presumably they identify the last handled event — confirm.
aggregateType eventstore.AggregateType
aggregateID string
sequence uint64
// offset is read from the database as a signed integer and converted to
// uint32, because the database does not provide unsigned numbers.
offset uint32
}
var (
// currentStateStmt is the SQL text embedded from state_get.sql. currentState
// runs it with the instance ID and the projection name as arguments.
//go:embed state_get.sql
currentStateStmt string
// updateStateStmt is the SQL text embedded from state_set.sql. setState
// executes it with the projection name followed by the fields of the state.
//go:embed state_set.sql
updateStateStmt string
)
fix(projections): overhaul the event projection system (#10560) This PR overhauls our event projection system to make it more robust and prevent skipped events under high load. The core change replaces our custom, transaction-based locking with standard PostgreSQL advisory locks. We also introduce a worker pool to manage concurrency and prevent database connection exhaustion. ### Key Changes * **Advisory Locks for Projections:** Replaces exclusive row locks and inspection of `pg_stat_activity` with PostgreSQL advisory locks for managing projection state. This is a more reliable and standard approach to distributed locking. * **Simplified Await Logic:** Removes the complex logic for awaiting open transactions, simplifying it to a more straightforward time-based filtering of events. * **Projection Worker Pool:** Implements a worker pool to limit concurrent projection triggers, preventing connection exhaustion and improving stability under load. A new `MaxParallelTriggers` configuration option is introduced. ### Problem Solved Under high throughput, a race condition could cause projections to miss events from the eventstore. This led to inconsistent data in projection tables (e.g., a user grant might be missing). This PR fixes the underlying locking and concurrency issues to ensure all events are processed reliably. ### How it Works 1. **Event Writing:** When writing events, a *shared* advisory lock is taken. This signals that a write is in progress. 2. **Event Handling (Projections):** * A projection worker attempts to acquire an *exclusive* advisory lock for that specific projection. If the lock is already held, it means another worker is on the job, so the current one backs off. * Once the lock is acquired, the worker briefly acquires and releases the same *shared* lock used by event writers. This acts as a barrier, ensuring it waits for any in-flight writes to complete. * Finally, it processes all events that occurred before its transaction began. 
### Additional Information * ZITADEL no longer modifies the `application_name` PostgreSQL variable during event writes. * The lock on the `current_states` table is now `FOR NO KEY UPDATE`. * Fixes https://github.com/zitadel/zitadel/issues/8509 --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Tim Möhlmann <tim+github@zitadel.com> (cherry picked from commit 0575f67e942c3192b36e39fd6ae06b1502bc0f5f)
2025-09-03 17:29:00 +02:00
func (h *Handler) currentState(ctx context.Context, tx *sql.Tx) (currentState *state, err error) {
currentState = &state{
instanceID: authz.GetInstance(ctx).InstanceID(),
}
var (
aggregateID = new(sql.NullString)
aggregateType = new(sql.NullString)
sequence = new(sql.NullInt64)
timestamp = new(sql.NullTime)
position = new(decimal.NullDecimal)
offset = new(sql.NullInt64)
)
fix(projections): overhaul the event projection system (#10560) This PR overhauls our event projection system to make it more robust and prevent skipped events under high load. The core change replaces our custom, transaction-based locking with standard PostgreSQL advisory locks. We also introduce a worker pool to manage concurrency and prevent database connection exhaustion. ### Key Changes * **Advisory Locks for Projections:** Replaces exclusive row locks and inspection of `pg_stat_activity` with PostgreSQL advisory locks for managing projection state. This is a more reliable and standard approach to distributed locking. * **Simplified Await Logic:** Removes the complex logic for awaiting open transactions, simplifying it to a more straightforward time-based filtering of events. * **Projection Worker Pool:** Implements a worker pool to limit concurrent projection triggers, preventing connection exhaustion and improving stability under load. A new `MaxParallelTriggers` configuration option is introduced. ### Problem Solved Under high throughput, a race condition could cause projections to miss events from the eventstore. This led to inconsistent data in projection tables (e.g., a user grant might be missing). This PR fixes the underlying locking and concurrency issues to ensure all events are processed reliably. ### How it Works 1. **Event Writing:** When writing events, a *shared* advisory lock is taken. This signals that a write is in progress. 2. **Event Handling (Projections):** * A projection worker attempts to acquire an *exclusive* advisory lock for that specific projection. If the lock is already held, it means another worker is on the job, so the current one backs off. * Once the lock is acquired, the worker briefly acquires and releases the same *shared* lock used by event writers. This acts as a barrier, ensuring it waits for any in-flight writes to complete. * Finally, it processes all events that occurred before its transaction began. 
### Additional Information * ZITADEL no longer modifies the `application_name` PostgreSQL variable during event writes. * The lock on the `current_states` table is now `FOR NO KEY UPDATE`. * Fixes https://github.com/zitadel/zitadel/issues/8509 --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Tim Möhlmann <tim+github@zitadel.com> (cherry picked from commit 0575f67e942c3192b36e39fd6ae06b1502bc0f5f)
2025-09-03 17:29:00 +02:00
row := tx.QueryRow(currentStateStmt, currentState.instanceID, h.projection.Name())
err = row.Scan(
aggregateID,
aggregateType,
sequence,
timestamp,
position,
offset,
)
fix(projections): overhaul the event projection system (#10560) This PR overhauls our event projection system to make it more robust and prevent skipped events under high load. The core change replaces our custom, transaction-based locking with standard PostgreSQL advisory locks. We also introduce a worker pool to manage concurrency and prevent database connection exhaustion. ### Key Changes * **Advisory Locks for Projections:** Replaces exclusive row locks and inspection of `pg_stat_activity` with PostgreSQL advisory locks for managing projection state. This is a more reliable and standard approach to distributed locking. * **Simplified Await Logic:** Removes the complex logic for awaiting open transactions, simplifying it to a more straightforward time-based filtering of events. * **Projection Worker Pool:** Implements a worker pool to limit concurrent projection triggers, preventing connection exhaustion and improving stability under load. A new `MaxParallelTriggers` configuration option is introduced. ### Problem Solved Under high throughput, a race condition could cause projections to miss events from the eventstore. This led to inconsistent data in projection tables (e.g., a user grant might be missing). This PR fixes the underlying locking and concurrency issues to ensure all events are processed reliably. ### How it Works 1. **Event Writing:** When writing events, a *shared* advisory lock is taken. This signals that a write is in progress. 2. **Event Handling (Projections):** * A projection worker attempts to acquire an *exclusive* advisory lock for that specific projection. If the lock is already held, it means another worker is on the job, so the current one backs off. * Once the lock is acquired, the worker briefly acquires and releases the same *shared* lock used by event writers. This acts as a barrier, ensuring it waits for any in-flight writes to complete. * Finally, it processes all events that occurred before its transaction began. 
### Additional Information * ZITADEL no longer modifies the `application_name` PostgreSQL variable during event writes. * The lock on the `current_states` table is now `FOR NO KEY UPDATE`. * Fixes https://github.com/zitadel/zitadel/issues/8509 --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Tim Möhlmann <tim+github@zitadel.com> (cherry picked from commit 0575f67e942c3192b36e39fd6ae06b1502bc0f5f)
2025-09-03 17:29:00 +02:00
if err != nil && !errors.Is(err, sql.ErrNoRows) {
h.log().WithError(err).Debug("unable to query current state")
return nil, err
}
currentState.aggregateID = aggregateID.String
currentState.aggregateType = eventstore.AggregateType(aggregateType.String)
currentState.sequence = uint64(sequence.Int64)
currentState.eventTimestamp = timestamp.Time
currentState.position = position.Decimal
// psql does not provide unsigned numbers so we work around it
currentState.offset = uint32(offset.Int64)
return currentState, nil
}
// setState writes updatedState for this handler's projection by executing
// updateStateStmt inside tx.
//
// It returns an internal error when the statement fails, when the number of
// affected rows cannot be determined, or when no row was affected.
func (h *Handler) setState(tx *sql.Tx, updatedState *state) error {
	res, err := tx.Exec(updateStateStmt,
		h.projection.Name(),
		updatedState.instanceID,
		updatedState.aggregateID,
		updatedState.aggregateType,
		updatedState.sequence,
		updatedState.eventTimestamp,
		updatedState.position,
		updatedState.offset,
	)
	if err != nil {
		h.log().WithError(err).Warn("unable to update state")
		return zerrors.ThrowInternal(err, "V2-WF23g2", "unable to update state")
	}

	// Treat a failure to read the affected-row count like an update that
	// touched nothing; the error must not be dropped just because the
	// reported count happens to be non-zero.
	affected, err := res.RowsAffected()
	if err != nil || affected == 0 {
		h.log().OnError(err).Error("unable to check if states are updated")
		return zerrors.ThrowInternal(err, "V2-FGEKi", "unable to update state")
	}
	return nil
}