From 4f313093f91813260f0526e2455769cae22ea72e Mon Sep 17 00:00:00 2001 From: Elio Bischof Date: Thu, 9 Oct 2025 13:18:34 +0200 Subject: [PATCH] feat: await initial database connection (#10869) # Which Problems Are Solved If Postgres was not ready when the API started, the API failed immediately. This made task orchestration hard, especially in a platform-agnostic way: - The current health check in the Nx target `@zitadel/api:prod` uses the `timeout` command, which is not installed on all platforms and behaves unpredictably. - The current health check in the Nx target `@zitadel/api:prod` requires the DB to have been started using `@zitadel/zitadel:db`. # How the Problems Are Solved - An additional configuration option `Database.Postgres.AwaitInitialConn` is added and defaults to *0m* for backwards compatibility. - If a duration is configured, the API retries pinging the database until a ping succeeds or the configured duration elapses. - The API sleeps for one second between pings. - It emits an info-level log with the error before each retry. - When the configured duration elapses before a ping succeeds, the last error is returned and the command exits with a failure code. - When a ping succeeds within the configured duration, the API goes on with the init, setup or start phase. # Additional Context - Relates to internally reported problems with the current DB health check command [here](https://zitadel.slack.com/archives/C07EUL5H83A/p1759915009839269?thread_ts=1759912259.410789&cid=C07EUL5H83A) and [here](https://zitadel.slack.com/archives/C07EUL5H83A/p1759918324246249?thread_ts=1759912259.410789&cid=C07EUL5H83A). 
(cherry picked from commit 7ba6870baf00ea62556b73ede28d45fc10b04b74) --- apps/api/prod-default.yaml | 1 + apps/api/project.json | 1 - apps/api/test-functional-ui.yaml | 1 + apps/api/test-integration-api.yaml | 1 + cmd/defaults.yaml | 1 + .../manage/database/_postgres.mdx | 1 + internal/database/postgres/pg.go | 33 ++++++++++++------- 7 files changed, 27 insertions(+), 12 deletions(-) diff --git a/apps/api/prod-default.yaml b/apps/api/prod-default.yaml index 7a332fbeb25..406eae4c97b 100644 --- a/apps/api/prod-default.yaml +++ b/apps/api/prod-default.yaml @@ -2,6 +2,7 @@ ExternalSecure: false TLS.Enabled: false Database.Postgres: Database: zitadel + AwaitInitialConn: 5m MaxOpenConns: 20 MaxIdleConns: 20 ConnMaxLifetime: 60m diff --git a/apps/api/project.json b/apps/api/project.json index 91b5b57654d..776b4298875 100644 --- a/apps/api/project.json +++ b/apps/api/project.json @@ -33,7 +33,6 @@ "options": { "parallel": false, "commands": [ - "timeout 300 bash -c 'until nx run @zitadel/devcontainer:compose exec ${API_AWAIT_DB_SERVICE} pg_isready -U postgres -h localhost; do echo \"Awaiting DB\"; sleep 2; done' || (echo \"Database readiness check timed out after 5 minutes\" && exit 1)", "./.artifacts/bin/$(go env GOOS)/$(go env GOARCH)/${ZITADEL_BINARY:-zitadel.local} start-from-init --config ${API_CONFIG_FILE} --steps ${API_CONFIG_FILE} --masterkey MasterkeyNeedsToHave32Characters" ] }, diff --git a/apps/api/test-functional-ui.yaml b/apps/api/test-functional-ui.yaml index 9db92d26acb..5ab9128e71e 100644 --- a/apps/api/test-functional-ui.yaml +++ b/apps/api/test-functional-ui.yaml @@ -7,6 +7,7 @@ Database: postgres: # This makes the e2e config reusable with an out-of-docker zitadel process and an /etc/hosts entry database: zitadel + AwaitInitialConn: 5m MaxOpenConns: 15 MaxIdleConns: 10 Database: zitadel diff --git a/apps/api/test-integration-api.yaml b/apps/api/test-integration-api.yaml index e381539d35b..23ca4a0fe3a 100644 --- a/apps/api/test-integration-api.yaml +++ 
b/apps/api/test-integration-api.yaml @@ -1,5 +1,6 @@ Database: Postgres: + AwaitInitialConn: 5m MaxOpenConns: 20 MaxIdleConns: 20 MaxConnLifetime: 1h diff --git a/cmd/defaults.yaml b/cmd/defaults.yaml index 6e8a179e62a..66ad2addc14 100644 --- a/cmd/defaults.yaml +++ b/cmd/defaults.yaml @@ -118,6 +118,7 @@ Database: Host: localhost # ZITADEL_DATABASE_POSTGRES_HOST Port: 5432 # ZITADEL_DATABASE_POSTGRES_PORT Database: zitadel # ZITADEL_DATABASE_POSTGRES_DATABASE + AwaitInitialConn: 0m # ZITADEL_DATABASE_POSTGRES_AWAITINITIALCONN MaxOpenConns: 10 # ZITADEL_DATABASE_POSTGRES_MAXOPENCONNS MaxIdleConns: 5 # ZITADEL_DATABASE_POSTGRES_MAXIDLECONNS MaxConnLifetime: 30m # ZITADEL_DATABASE_POSTGRES_MAXCONNLIFETIME diff --git a/docs/docs/self-hosting/manage/database/_postgres.mdx b/docs/docs/self-hosting/manage/database/_postgres.mdx index 5506c88a660..72bc16963ce 100644 --- a/docs/docs/self-hosting/manage/database/_postgres.mdx +++ b/docs/docs/self-hosting/manage/database/_postgres.mdx @@ -13,6 +13,7 @@ Database: Host: localhost Port: 5432 Database: zitadel + AwaitInitialConn: 5m MaxOpenConns: 15 MaxIdleConns: 10 MaxConnLifetime: 1h diff --git a/internal/database/postgres/pg.go b/internal/database/postgres/pg.go index 2f8bb29e178..da32f8cc47c 100644 --- a/internal/database/postgres/pg.go +++ b/internal/database/postgres/pg.go @@ -29,15 +29,16 @@ const ( ) type Config struct { - Host string - Port int32 - Database string - MaxOpenConns uint32 - MaxIdleConns uint32 - MaxConnLifetime time.Duration - MaxConnIdleTime time.Duration - User User - Admin AdminUser + Host string + Port int32 + Database string + AwaitInitialConn time.Duration + MaxOpenConns uint32 + MaxIdleConns uint32 + MaxConnLifetime time.Duration + MaxConnIdleTime time.Duration + User User + Admin AdminUser // Additional options to be appended as options= // The value will be taken as is. Multiple options are space separated. 
Options string @@ -127,8 +128,18 @@ func (c *Config) Connect(useAdmin bool) (*sql.DB, *pgxpool.Pool, error) { if err != nil { return nil, nil, err } - - if err := pool.Ping(context.Background()); err != nil { + if err = pool.Ping(context.Background()); err != nil && c.AwaitInitialConn > 0 { + waitUntil := time.Now().Add(c.AwaitInitialConn) + for time.Now().Before(waitUntil) { + logging.Infof("retrying initial database connection in a second: %v", err) + time.Sleep(time.Second) + if err = pool.Ping(context.Background()); err == nil { + break + } + } + } + if err != nil { + pool.Close() return nil, nil, err }