feat: await initial database connection (#10869)

# Which Problems Are Solved

If Postgres was not yet ready when the API started, the API failed
immediately.
This made task orchestration hard, especially in a platform agnostic
way:

- The current health check in the Nx target `@zitadel/api:prod` uses the
timeout command, which is not installed on all platforms and behaves
unpredictably
- The current health check in the Nx target `@zitadel/api:prod` requires
the DB to have been started using `@zitadel/zitadel:db`

# How the Problems Are Solved

- Additional configuration option `Database.Postgres.AwaitInitialConn`
is added and defaults to *0m* for backwards compatibility.
- If a duration is configured, the API keeps pinging the database
until a ping succeeds or the configured duration elapses.
- The API sleeps for one second between pings.
- It emits an info-level log with the error on each try.
- When the configured duration elapses before a ping succeeds, the last
ping error is returned and the command exits with a failure code.
- When the ping succeeds within the configured duration, the API goes on
with the init, setup or start phase.

# Additional Context

- Relates to internally reported problems with the current DB health
check command
[here](https://zitadel.slack.com/archives/C07EUL5H83A/p1759915009839269?thread_ts=1759912259.410789&cid=C07EUL5H83A)
and
[here](https://zitadel.slack.com/archives/C07EUL5H83A/p1759918324246249?thread_ts=1759912259.410789&cid=C07EUL5H83A).

(cherry picked from commit 7ba6870baf)
This commit is contained in:
Elio Bischof
2025-10-09 13:18:34 +02:00
committed by Livio Spring
parent d45d19f575
commit 4f313093f9
7 changed files with 27 additions and 12 deletions

View File

@@ -2,6 +2,7 @@ ExternalSecure: false
TLS.Enabled: false
Database.Postgres:
Database: zitadel
AwaitInitialConn: 5m
MaxOpenConns: 20
MaxIdleConns: 20
ConnMaxLifetime: 60m

View File

@@ -33,7 +33,6 @@
"options": {
"parallel": false,
"commands": [
"timeout 300 bash -c 'until nx run @zitadel/devcontainer:compose exec ${API_AWAIT_DB_SERVICE} pg_isready -U postgres -h localhost; do echo \"Awaiting DB\"; sleep 2; done' || (echo \"Database readiness check timed out after 5 minutes\" && exit 1)",
"./.artifacts/bin/$(go env GOOS)/$(go env GOARCH)/${ZITADEL_BINARY:-zitadel.local} start-from-init --config ${API_CONFIG_FILE} --steps ${API_CONFIG_FILE} --masterkey MasterkeyNeedsToHave32Characters"
]
},

View File

@@ -7,6 +7,7 @@ Database:
postgres:
# This makes the e2e config reusable with an out-of-docker zitadel process and an /etc/hosts entry
database: zitadel
AwaitInitialConn: 5m
MaxOpenConns: 15
MaxIdleConns: 10
Database: zitadel

View File

@@ -1,5 +1,6 @@
Database:
Postgres:
AwaitInitialConn: 5m
MaxOpenConns: 20
MaxIdleConns: 20
MaxConnLifetime: 1h

View File

@@ -118,6 +118,7 @@ Database:
Host: localhost # ZITADEL_DATABASE_POSTGRES_HOST
Port: 5432 # ZITADEL_DATABASE_POSTGRES_PORT
Database: zitadel # ZITADEL_DATABASE_POSTGRES_DATABASE
AwaitInitialConn: 0m # ZITADEL_DATABASE_POSTGRES_AWAITINITIALCONN
MaxOpenConns: 10 # ZITADEL_DATABASE_POSTGRES_MAXOPENCONNS
MaxIdleConns: 5 # ZITADEL_DATABASE_POSTGRES_MAXIDLECONNS
MaxConnLifetime: 30m # ZITADEL_DATABASE_POSTGRES_MAXCONNLIFETIME

View File

@@ -13,6 +13,7 @@ Database:
Host: localhost
Port: 5432
Database: zitadel
AwaitInitialConn: 5m
MaxOpenConns: 15
MaxIdleConns: 10
MaxConnLifetime: 1h

View File

@@ -29,15 +29,16 @@ const (
)
type Config struct {
Host string
Port int32
Database string
MaxOpenConns uint32
MaxIdleConns uint32
MaxConnLifetime time.Duration
MaxConnIdleTime time.Duration
User User
Admin AdminUser
Host string
Port int32
Database string
AwaitInitialConn time.Duration
MaxOpenConns uint32
MaxIdleConns uint32
MaxConnLifetime time.Duration
MaxConnIdleTime time.Duration
User User
Admin AdminUser
// Additional options to be appended as options=<Options>
// The value will be taken as is. Multiple options are space separated.
Options string
@@ -127,8 +128,18 @@ func (c *Config) Connect(useAdmin bool) (*sql.DB, *pgxpool.Pool, error) {
if err != nil {
return nil, nil, err
}
if err := pool.Ping(context.Background()); err != nil {
if err = pool.Ping(context.Background()); err != nil && c.AwaitInitialConn > 0 {
waitUntil := time.Now().Add(c.AwaitInitialConn)
for time.Now().Before(waitUntil) {
logging.Infof("retrying initial database connection in a second: %v", err)
time.Sleep(time.Second)
if err = pool.Ping(context.Background()); err == nil {
break
}
}
}
if err != nil {
pool.Close()
return nil, nil, err
}