postgres-operator
No HA when node goes down
Hello, I'm trying to set up an HA cluster with 2 nodes. To test whether HA works, I disconnect one Kubernetes node to simulate an unexpected failure. I can see that the second pod takes over (its Patroni logs report that it is the leader), but while the other pod is down the StatefulSet becomes unhealthy and the service is no longer reachable inside the cluster by name. So even though the second pod has taken over, nothing in the cluster can use the database. Am I missing anything important?
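For reference, this is roughly how I observe the state during the test. The cluster name `timescale` is inferred from the secret names in the manifest below (it is actually templated via `.Values.postgresql.name`), and the label assumes the operator's default `cluster_name_label`, so treat these commands as a sketch:

```sh
# Pods of the cluster and the node they run on, after shutting down one node
kubectl get pods -l cluster-name=timescale -o wide

# StatefulSet readiness drops while the lost pod stays stuck on the dead node
kubectl get statefulset timescale

# Master/replica services created by the operator and their endpoints
kubectl get svc,endpoints timescale timescale-repl

# From another pod in the cluster, connecting by service name now fails:
#   psql -h timescale.<namespace>.svc.cluster.local -U postgres -d postgres
```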
```yaml
apiVersion: "acid.zalan.do/v1"
kind: postgresql
metadata:
  name: {{ .Values.postgresql.name}}
spec:
  teamId: "master"
  {{- if .Values.postgresql.nodeAffinity }}
  nodeAffinity:
    {{ toYaml .Values.postgresql.nodeAffinity | nindent 4 }}
  {{- end }}
  sidecars:
    - name: postgres-exporter
      image: wrouesnel/postgres_exporter:v0.8.0
      ports:
        - name: metrics
          containerPort: 9187
          protocol: TCP
      env:
        - name: POSTGRES_SERVICE_HOST
          value: "localhost"
        - name: DATA_SOURCE_URI
          value: "localhost:5432/postgres?sslmode=disable"
        - name: DATA_SOURCE_USER
          valueFrom:
            secretKeyRef:
              name: postgres.timescale.credentials.postgresql.acid.zalan.do
              key: username
        - name: DATA_SOURCE_PASS
          valueFrom:
            secretKeyRef:
              name: postgres.timescale.credentials.postgresql.acid.zalan.do
              key: password
        - name: DATA_SOURCE_NAME
          value: "postgresql://$(DATA_SOURCE_USER):$(DATA_SOURCE_PASS)@$(DATA_SOURCE_URI)"
        - name: PG_EXPORTER_AUTO_DISCOVER_DATABASES
          value: "true"
      resources:
        limits:
          cpu: 100m
          memory: 100Mi
        requests:
          cpu: 50m
          memory: 50Mi
  volume:
    size: 10Gi  # Adjust storage as needed
  numberOfInstances: {{ .Values.postgresql.numberOfInstances}}
  postgresql:
    version: "17"
    parameters:
      {{- if .Values.global.environment.postgresql_wal_archive }}
      archive_mode: {{ .Values.postgresql.wal.archive.mode | quote }}
      archive_timeout: "60"
      wal_level: "replica"
      max_wal_senders: "10"
      restore_command: "envdir /run/etc/wal-e.d/env wal-e wal-fetch %f %p"
      archive_command: "envdir /run/etc/wal-e.d/env wal-e wal-push %p"
      {{- end }}
  patroni:
    pg_hba:
      - local all all trust
      - hostssl all +zalandos 127.0.0.1/32 pam
      - host all all 127.0.0.1/32 md5
      - hostssl all +zalandos ::1/128 pam
      - host all all ::1/128 md5
      - local replication standby trust
      - hostssl replication standby all md5
      - hostnossl all all all md5
      - hostssl all +zalandos all pam
      - hostssl all all all md5
    kubernetes:
      use_endpoints: true
      bypass_api_service: false
    dcs:
      enable_kubernetes_api: true
      kubernetes:
        use_endpoints: true
        bypass_api_service: false
    ttl: 30
    loop_wait: 10
    retry_timeout: 10
    maximum_lag_on_failover: 1048576
  users: ###redacted
  databases: ###redacted
  preparedDatabases: {{ toYaml .Values.postgresql.preparedDatabases | nindent 4 }}
```
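For completeness, the Helm values this template consumes look roughly like the sketch below. The names and values shown are placeholders (the real name and prepared databases are redacted above); only the two-instance count reflects the actual test setup.

```yaml
# Sketch of the values consumed by the template above; placeholder values,
# not the redacted production settings.
global:
  environment:
    postgresql_wal_archive: true   # gates the WAL archiving parameters block
postgresql:
  name: timescale                  # placeholder; real name redacted
  numberOfInstances: 2             # two instances, matching the two pods in the test
  preparedDatabases: {}            # redacted
  wal:
    archive:
      mode: "on"                   # placeholder
```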