mirror of
https://github.com/zitadel/zitadel.git
synced 2025-12-06 09:12:19 +00:00
This PR overhauls our event projection system to make it more robust and
prevent skipped events under high load. The core change replaces our
custom, transaction-based locking with standard PostgreSQL advisory
locks. We also introduce a worker pool to manage concurrency and prevent
database connection exhaustion.
### Key Changes
* **Advisory Locks for Projections:** Replaces exclusive row locks and
inspection of `pg_stat_activity` with PostgreSQL advisory locks for
managing projection state. This is a more reliable and standard approach
to distributed locking.
* **Simplified Await Logic:** Removes the complex logic for awaiting
open transactions, simplifying it to a more straightforward time-based
filtering of events.
* **Projection Worker Pool:** Implements a worker pool to limit
concurrent projection triggers, preventing connection exhaustion and
improving stability under load. A new `MaxParallelTriggers`
configuration option is introduced.
### Problem Solved
Under high throughput, a race condition could cause projections to miss
events from the eventstore. This led to inconsistent data in projection
tables (e.g., a user grant might be missing). This PR fixes the
underlying locking and concurrency issues to ensure all events are
processed reliably.
### How it Works
1. **Event Writing:** When writing events, a *shared* advisory lock is
taken. This signals that a write is in progress.
2. **Event Handling (Projections):**
* A projection worker attempts to acquire an *exclusive* advisory lock
for that specific projection. If the lock is already held, it means
another worker is on the job, so the current one backs off.
* Once the lock is acquired, the worker briefly acquires and releases
the same *shared* lock used by event writers. This acts as a barrier,
ensuring it waits for any in-flight writes to complete.
* Finally, it processes all events that occurred before its transaction
began.
### Additional Information
* ZITADEL no longer modifies the `application_name` PostgreSQL variable
during event writes.
* The lock on the `current_states` table is now `FOR NO KEY UPDATE`.
* Fixes https://github.com/zitadel/zitadel/issues/8509
---------
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Tim Möhlmann <tim+github@zitadel.com>
(cherry picked from commit 0575f67e94)
163 lines
4.1 KiB
Go
163 lines
4.1 KiB
Go
package eventstore
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
_ "embed"
|
|
"fmt"
|
|
|
|
"github.com/riverqueue/river"
|
|
"github.com/zitadel/logging"
|
|
|
|
"github.com/zitadel/zitadel/internal/api/authz"
|
|
"github.com/zitadel/zitadel/internal/database"
|
|
"github.com/zitadel/zitadel/internal/eventstore"
|
|
"github.com/zitadel/zitadel/internal/queue"
|
|
exec_repo "github.com/zitadel/zitadel/internal/repository/execution"
|
|
"github.com/zitadel/zitadel/internal/telemetry/tracing"
|
|
)
|
|
|
|
func (es *Eventstore) Push(ctx context.Context, client database.ContextQueryExecuter, commands ...eventstore.Command) (events []eventstore.Event, err error) {
|
|
ctx, span := tracing.NewSpan(ctx)
|
|
defer func() { span.EndWithError(err) }()
|
|
|
|
events, err = es.writeCommands(ctx, client, commands)
|
|
if isSetupNotExecutedError(err) {
|
|
return es.pushWithoutFunc(ctx, client, commands...)
|
|
}
|
|
|
|
return events, err
|
|
}
|
|
|
|
// writeCommands pins a database connection, starts the push transaction,
// takes the instance-scoped shared advisory lock that signals an in-flight
// write, and then writes the events, unique constraints, field commands and
// execution queue jobs for the given commands.
//
// The error from the transaction close function overrides err via the named
// return value, so a failed commit is reported to the caller.
func (es *Eventstore) writeCommands(ctx context.Context, client database.ContextQueryExecuter, commands []eventstore.Command) (_ []eventstore.Event, err error) {
	var conn *sql.Conn
	switch c := client.(type) {
	case database.Client:
		// Caller passed a pooled client: pin a dedicated connection.
		conn, err = c.Conn(ctx)
	case nil:
		// No client given: fall back to the eventstore's own client.
		conn, err = es.client.Conn(ctx)
		client = conn
	}
	if err != nil {
		return nil, err
	}
	if conn != nil {
		// Release the pinned connection back to the pool when done.
		defer conn.Close()
	}

	tx, close, err := es.pushTx(ctx, client)
	if err != nil {
		return nil, err
	}
	if close != nil {
		// close commits or rolls back depending on err and may replace err
		// (named return) with the commit/rollback error.
		defer func() {
			err = close(err)
		}()
	}

	// lock the instance for reading events if await events is set for the duration of the transaction.
	_, err = tx.ExecContext(ctx, "SELECT pg_advisory_xact_lock_shared('eventstore.events2'::REGCLASS::OID::INTEGER, hashtext($1))", authz.GetInstance(ctx).InstanceID())
	if err != nil {
		return nil, err
	}

	events, err := writeEvents(ctx, tx, commands)
	if err != nil {
		return nil, err
	}
	if err = handleUniqueConstraints(ctx, tx, commands); err != nil {
		return nil, err
	}

	err = es.handleFieldCommands(ctx, tx, commands)
	if err != nil {
		return nil, err
	}

	err = es.queueExecutions(ctx, tx, events)
	if err != nil {
		return nil, err
	}

	return events, nil
}
func writeEvents(ctx context.Context, tx database.Tx, commands []eventstore.Command) (_ []eventstore.Event, err error) {
|
|
ctx, span := tracing.NewSpan(ctx)
|
|
defer func() { span.EndWithError(err) }()
|
|
|
|
events, cmds, err := commandsToEvents(ctx, commands)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
rows, err := tx.QueryContext(ctx, `select owner, created_at, "sequence", position from eventstore.push($1::eventstore.command[])`, cmds)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
for i := 0; rows.Next(); i++ {
|
|
err = rows.Scan(&events[i].(*event).command.Owner, &events[i].(*event).createdAt, &events[i].(*event).sequence, &events[i].(*event).position)
|
|
if err != nil {
|
|
logging.WithError(err).Warn("failed to scan events")
|
|
return nil, err
|
|
}
|
|
}
|
|
if err = rows.Err(); err != nil {
|
|
return nil, err
|
|
}
|
|
return events, nil
|
|
}
|
|
|
|
func (es *Eventstore) queueExecutions(ctx context.Context, tx database.Tx, events []eventstore.Event) error {
|
|
if es.queue == nil {
|
|
return nil
|
|
}
|
|
|
|
sqlTx, ok := tx.(*sql.Tx)
|
|
if !ok {
|
|
types := make([]string, len(events))
|
|
for i, event := range events {
|
|
types[i] = string(event.Type())
|
|
}
|
|
logging.WithFields("event_types", types).Warningf("event executions skipped: wrong type of transaction %T", tx)
|
|
return nil
|
|
}
|
|
jobArgs, err := eventsToJobArgs(ctx, events)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if len(jobArgs) == 0 {
|
|
return nil
|
|
}
|
|
return es.queue.InsertManyFastTx(
|
|
ctx, sqlTx, jobArgs,
|
|
queue.WithQueueName(exec_repo.QueueName),
|
|
)
|
|
}
|
|
|
|
func eventsToJobArgs(ctx context.Context, events []eventstore.Event) ([]river.JobArgs, error) {
|
|
if len(events) == 0 {
|
|
return nil, nil
|
|
}
|
|
router := authz.GetInstance(ctx).ExecutionRouter()
|
|
if router.IsZero() {
|
|
return nil, nil
|
|
}
|
|
|
|
jobArgs := make([]river.JobArgs, 0, len(events))
|
|
for _, event := range events {
|
|
targets, ok := router.GetEventBestMatch(fmt.Sprintf("event/%s", event.Type()))
|
|
if !ok {
|
|
continue
|
|
}
|
|
req, err := exec_repo.NewRequest(event, targets)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
jobArgs = append(jobArgs, req)
|
|
}
|
|
return jobArgs, nil
|
|
}
|