Files
zitadel/internal/eventstore/handler/v2/state_test.go
Silvan 19d1ab9c94 fix(projections): overhaul the event projection system (#10560)
This PR overhauls our event projection system to make it more robust and
prevent skipped events under high load. The core change replaces our
custom, transaction-based locking with standard PostgreSQL advisory
locks. We also introduce a worker pool to manage concurrency and prevent
database connection exhaustion.

### Key Changes

* **Advisory Locks for Projections:** Replaces exclusive row locks and
inspection of `pg_stat_activity` with PostgreSQL advisory locks for
managing projection state. This is a more reliable and standard approach
to distributed locking.
* **Simplified Await Logic:** Removes the complex logic for awaiting
open transactions, simplifying it to a more straightforward time-based
filtering of events.
* **Projection Worker Pool:** Implements a worker pool to limit
concurrent projection triggers, preventing connection exhaustion and
improving stability under load. A new `MaxParallelTriggers`
configuration option is introduced.

### Problem Solved

Under high throughput, a race condition could cause projections to miss
events from the eventstore. This led to inconsistent data in projection
tables (e.g., a user grant might be missing). This PR fixes the
underlying locking and concurrency issues to ensure all events are
processed reliably.

### How it Works

1. **Event Writing:** When writing events, a *shared* advisory lock is
taken. This signals that a write is in progress.
2.  **Event Handling (Projections):**
* A projection worker attempts to acquire an *exclusive* advisory lock
for that specific projection. If the lock is already held, it means
another worker is on the job, so the current one backs off.
* Once the lock is acquired, the worker briefly acquires and releases
the same *shared* lock used by event writers. This acts as a barrier,
ensuring it waits for any in-flight writes to complete.
* Finally, it processes all events that occurred before its transaction
began.

### Additional Information

* ZITADEL no longer modifies the `application_name` PostgreSQL variable
during event writes.
*   The lock on the `current_states` table is now `FOR NO KEY UPDATE`.
*   Fixes https://github.com/zitadel/zitadel/issues/8509

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Tim Möhlmann <tim+github@zitadel.com>
(cherry picked from commit 0575f67e94)
2025-09-15 09:41:49 +02:00

301 lines
6.6 KiB
Go

package handler
import (
"context"
"database/sql"
"database/sql/driver"
_ "embed"
"errors"
"reflect"
"testing"
"time"
"github.com/jackc/pgx/v5/pgconn"
"github.com/shopspring/decimal"
"github.com/zitadel/zitadel/internal/api/authz"
"github.com/zitadel/zitadel/internal/database/mock"
"github.com/zitadel/zitadel/internal/eventstore"
"github.com/zitadel/zitadel/internal/zerrors"
)
func TestHandler_updateLastUpdated(t *testing.T) {
type fields struct {
projection Projection
mock *mock.SQLMock
}
type args struct {
updatedState *state
}
tests := []struct {
name string
fields fields
args args
isErr func(t *testing.T, err error)
}{
{
name: "update fails",
fields: fields{
projection: &projection{
name: "instance",
},
mock: mock.NewSQLMock(t,
mock.ExpectBegin(nil),
mock.ExcpectExec(updateStateStmt,
mock.WithExecErr(sql.ErrTxDone),
),
),
},
args: args{
updatedState: &state{
instanceID: "instance",
eventTimestamp: time.Now(),
position: decimal.NewFromInt(42),
},
},
isErr: func(t *testing.T, err error) {
if !errors.Is(err, sql.ErrTxDone) {
t.Errorf("unexpected error, want: %v, got %v", sql.ErrTxDone, err)
}
},
},
{
name: "no rows affected",
fields: fields{
projection: &projection{
name: "instance",
},
mock: mock.NewSQLMock(t,
mock.ExpectBegin(nil),
mock.ExcpectExec(updateStateStmt,
mock.WithExecNoRowsAffected(),
),
),
},
args: args{
updatedState: &state{
instanceID: "instance",
eventTimestamp: time.Now(),
position: decimal.NewFromInt(42),
},
},
isErr: func(t *testing.T, err error) {
if !errors.Is(err, zerrors.ThrowInternal(nil, "V2-FGEKi", "")) {
t.Errorf("unexpected error, want: %v, got %v", sql.ErrTxDone, err)
}
},
},
{
name: "success",
fields: fields{
projection: &projection{
name: "projection",
},
mock: mock.NewSQLMock(t,
mock.ExpectBegin(nil),
mock.ExcpectExec(updateStateStmt,
mock.WithExecArgs(
"projection",
"instance",
"aggregate id",
eventstore.AggregateType("aggregate type"),
uint64(42),
mock.AnyType[time.Time]{},
decimal.NewFromInt(42),
uint32(0),
),
mock.WithExecRowsAffected(1),
),
),
},
args: args{
updatedState: &state{
instanceID: "instance",
eventTimestamp: time.Now(),
position: decimal.NewFromInt(42),
aggregateType: "aggregate type",
aggregateID: "aggregate id",
sequence: 42,
},
},
},
}
for _, tt := range tests {
if tt.isErr == nil {
tt.isErr = func(t *testing.T, err error) {
if err != nil {
t.Error("expected no error got:", err)
}
}
}
t.Run(tt.name, func(t *testing.T) {
tx, err := tt.fields.mock.DB.BeginTx(context.Background(), nil)
if err != nil {
t.Fatalf("unable to begin transaction: %v", err)
}
h := &Handler{
projection: tt.fields.projection,
}
err = h.setState(tx, tt.args.updatedState)
tt.isErr(t, err)
tt.fields.mock.Assert(t)
})
}
}
func TestHandler_currentState(t *testing.T) {
testTime := time.Now()
type fields struct {
projection Projection
mock *mock.SQLMock
}
type args struct {
ctx context.Context
}
type want struct {
currentState *state
isErr func(t *testing.T, err error)
}
tests := []struct {
name string
fields fields
args args
want want
}{
{
name: "connection done",
fields: fields{
projection: &projection{
name: "projection",
},
mock: mock.NewSQLMock(t,
mock.ExpectBegin(nil),
mock.ExpectQuery(currentStateStmt,
mock.WithQueryArgs(
"instance",
"projection",
),
mock.WithQueryErr(sql.ErrConnDone),
),
),
},
args: args{
ctx: authz.WithInstanceID(context.Background(), "instance"),
},
want: want{
isErr: func(t *testing.T, err error) {
if !errors.Is(err, sql.ErrConnDone) {
t.Errorf("unexpected error, want: %v, got: %v", sql.ErrConnDone, err)
}
},
},
},
{
name: "state locked",
fields: fields{
projection: &projection{
name: "projection",
},
mock: mock.NewSQLMock(t,
mock.ExpectBegin(nil),
mock.ExpectQuery(currentStateStmt,
mock.WithQueryArgs(
"instance",
"projection",
),
mock.WithQueryErr(&pgconn.PgError{Code: "55P03"}),
),
),
},
args: args{
ctx: authz.WithInstanceID(context.Background(), "instance"),
},
want: want{
isErr: func(t *testing.T, err error) {
pgErr := new(pgconn.PgError)
if !errors.As(err, &pgErr) {
t.Errorf("error should be PgErr but was %T", err)
return
}
if pgErr.Code != "55P03" {
t.Errorf("expected code 55P03 got: %s", pgErr.Code)
}
},
},
},
{
name: "success",
fields: fields{
projection: &projection{
name: "projection",
},
mock: mock.NewSQLMock(t,
mock.ExpectBegin(nil),
mock.ExpectQuery(currentStateStmt,
mock.WithQueryArgs(
"instance",
"projection",
),
mock.WithQueryResult(
[]string{"aggregate_id", "aggregate_type", "event_sequence", "event_date", "position", "offset"},
[][]driver.Value{
{
"aggregate id",
"aggregate type",
int64(42),
testTime,
decimal.NewFromInt(42).String(),
uint16(10),
},
},
),
),
),
},
args: args{
ctx: authz.WithInstanceID(context.Background(), "instance"),
},
want: want{
currentState: &state{
instanceID: "instance",
eventTimestamp: testTime,
position: decimal.NewFromInt(42),
aggregateType: "aggregate type",
aggregateID: "aggregate id",
sequence: 42,
offset: 10,
},
},
},
}
for _, tt := range tests {
if tt.want.isErr == nil {
tt.want.isErr = func(t *testing.T, err error) {
if err != nil {
t.Error("expected no error got:", err)
}
}
}
t.Run(tt.name, func(t *testing.T) {
h := &Handler{
projection: tt.fields.projection,
}
tx, err := tt.fields.mock.DB.BeginTx(context.Background(), nil)
if err != nil {
t.Fatalf("unable to begin transaction: %v", err)
}
gotCurrentState, err := h.currentState(tt.args.ctx, tx)
tt.want.isErr(t, err)
if !reflect.DeepEqual(gotCurrentState, tt.want.currentState) {
t.Errorf("Handler.currentState() gotCurrentState = %v, want %v", gotCurrentState, tt.want.currentState)
}
tt.fields.mock.Assert(t)
})
}
}