feat: await initial database connection (#10869)

# Which Problems Are Solved

If Postgres was not yet ready when the API started, the API failed
immediately.
This made task orchestration hard, especially in a platform agnostic
way:

- The current health check in the Nx target `@zitadel/api:prod` uses the
`timeout` command, which is not installed on all platforms and behaves
unpredictably
- The current health check in the Nx target `@zitadel/api:prod` requires
the DB to have been started using `@zitadel/zitadel:db`

# How the Problems Are Solved

- A new configuration option, `Database.Postgres.AwaitInitialConn`, is
added. It defaults to *0m* (no waiting) for backwards compatibility.
- If a duration greater than zero is configured, the API keeps pinging the
database until a ping succeeds or the configured duration elapses
- The API sleeps for one second between pings.
- It emits an info-level log with the error on each try.
- When the configured duration elapses before a ping succeeds, the last
ping error is returned and the command exits with a failure code.
- When the ping succeeds within the configured duration, the API goes on
with the init, setup or start phase.

# Additional Context

- Relates to internally reported problems with the current DB health
check command
[here](https://zitadel.slack.com/archives/C07EUL5H83A/p1759915009839269?thread_ts=1759912259.410789&cid=C07EUL5H83A)
and
[here](https://zitadel.slack.com/archives/C07EUL5H83A/p1759918324246249?thread_ts=1759912259.410789&cid=C07EUL5H83A).

(cherry picked from commit 7ba6870baf)
This commit is contained in:
Elio Bischof
2025-10-09 13:18:34 +02:00
committed by Livio Spring
parent d45d19f575
commit 4f313093f9
7 changed files with 27 additions and 12 deletions

View File

@@ -2,6 +2,7 @@ ExternalSecure: false
TLS.Enabled: false TLS.Enabled: false
Database.Postgres: Database.Postgres:
Database: zitadel Database: zitadel
AwaitInitialConn: 5m
MaxOpenConns: 20 MaxOpenConns: 20
MaxIdleConns: 20 MaxIdleConns: 20
ConnMaxLifetime: 60m ConnMaxLifetime: 60m

View File

@@ -33,7 +33,6 @@
"options": { "options": {
"parallel": false, "parallel": false,
"commands": [ "commands": [
"timeout 300 bash -c 'until nx run @zitadel/devcontainer:compose exec ${API_AWAIT_DB_SERVICE} pg_isready -U postgres -h localhost; do echo \"Awaiting DB\"; sleep 2; done' || (echo \"Database readiness check timed out after 5 minutes\" && exit 1)",
"./.artifacts/bin/$(go env GOOS)/$(go env GOARCH)/${ZITADEL_BINARY:-zitadel.local} start-from-init --config ${API_CONFIG_FILE} --steps ${API_CONFIG_FILE} --masterkey MasterkeyNeedsToHave32Characters" "./.artifacts/bin/$(go env GOOS)/$(go env GOARCH)/${ZITADEL_BINARY:-zitadel.local} start-from-init --config ${API_CONFIG_FILE} --steps ${API_CONFIG_FILE} --masterkey MasterkeyNeedsToHave32Characters"
] ]
}, },

View File

@@ -7,6 +7,7 @@ Database:
postgres: postgres:
# This makes the e2e config reusable with an out-of-docker zitadel process and an /etc/hosts entry # This makes the e2e config reusable with an out-of-docker zitadel process and an /etc/hosts entry
database: zitadel database: zitadel
AwaitInitialConn: 5m
MaxOpenConns: 15 MaxOpenConns: 15
MaxIdleConns: 10 MaxIdleConns: 10
Database: zitadel Database: zitadel

View File

@@ -1,5 +1,6 @@
Database: Database:
Postgres: Postgres:
AwaitInitialConn: 5m
MaxOpenConns: 20 MaxOpenConns: 20
MaxIdleConns: 20 MaxIdleConns: 20
MaxConnLifetime: 1h MaxConnLifetime: 1h

View File

@@ -118,6 +118,7 @@ Database:
Host: localhost # ZITADEL_DATABASE_POSTGRES_HOST Host: localhost # ZITADEL_DATABASE_POSTGRES_HOST
Port: 5432 # ZITADEL_DATABASE_POSTGRES_PORT Port: 5432 # ZITADEL_DATABASE_POSTGRES_PORT
Database: zitadel # ZITADEL_DATABASE_POSTGRES_DATABASE Database: zitadel # ZITADEL_DATABASE_POSTGRES_DATABASE
AwaitInitialConn: 0m # ZITADEL_DATABASE_POSTGRES_AWAITINITIALCONN
MaxOpenConns: 10 # ZITADEL_DATABASE_POSTGRES_MAXOPENCONNS MaxOpenConns: 10 # ZITADEL_DATABASE_POSTGRES_MAXOPENCONNS
MaxIdleConns: 5 # ZITADEL_DATABASE_POSTGRES_MAXIDLECONNS MaxIdleConns: 5 # ZITADEL_DATABASE_POSTGRES_MAXIDLECONNS
MaxConnLifetime: 30m # ZITADEL_DATABASE_POSTGRES_MAXCONNLIFETIME MaxConnLifetime: 30m # ZITADEL_DATABASE_POSTGRES_MAXCONNLIFETIME

View File

@@ -13,6 +13,7 @@ Database:
Host: localhost Host: localhost
Port: 5432 Port: 5432
Database: zitadel Database: zitadel
AwaitInitialConn: 5m
MaxOpenConns: 15 MaxOpenConns: 15
MaxIdleConns: 10 MaxIdleConns: 10
MaxConnLifetime: 1h MaxConnLifetime: 1h

View File

@@ -29,15 +29,16 @@ const (
) )
type Config struct { type Config struct {
Host string Host string
Port int32 Port int32
Database string Database string
MaxOpenConns uint32 AwaitInitialConn time.Duration
MaxIdleConns uint32 MaxOpenConns uint32
MaxConnLifetime time.Duration MaxIdleConns uint32
MaxConnIdleTime time.Duration MaxConnLifetime time.Duration
User User MaxConnIdleTime time.Duration
Admin AdminUser User User
Admin AdminUser
// Additional options to be appended as options=<Options> // Additional options to be appended as options=<Options>
// The value will be taken as is. Multiple options are space separated. // The value will be taken as is. Multiple options are space separated.
Options string Options string
@@ -127,8 +128,18 @@ func (c *Config) Connect(useAdmin bool) (*sql.DB, *pgxpool.Pool, error) {
if err != nil { if err != nil {
return nil, nil, err return nil, nil, err
} }
if err = pool.Ping(context.Background()); err != nil && c.AwaitInitialConn > 0 {
if err := pool.Ping(context.Background()); err != nil { waitUntil := time.Now().Add(c.AwaitInitialConn)
for time.Now().Before(waitUntil) {
logging.Infof("retrying initial database connection in a second: %v", err)
time.Sleep(time.Second)
if err = pool.Ping(context.Background()); err == nil {
break
}
}
}
if err != nil {
pool.Close()
return nil, nil, err return nil, nil, err
} }