🐛 Operator is not setting the fsGroup ID specified through the statefulset securityContext
What happened?
Redpanda data directories are incorrectly being assigned the systemd-journald group ID and the systemd-resolve user ID on the host.
What did you expect to happen?
The value I set as fsGroup to be applied as the group ID on Redpanda's data directory.
How can we reproduce it (as minimally and precisely as possible)? Please include the values file.
COMPUTED VALUES:

affinity: {}
auditLogging:
  clientMaxBufferSize: 16777216
  enabled: false
  enabledEventTypes: null
  excludedPrincipals: null
  excludedTopics: null
  listener: internal
  partitions: 12
  queueDrainIntervalMs: 500
  queueMaxBufferSizePerShard: 1048576
  replicationFactor: null
auth:
  sasl:
    enabled: false
    mechanism: SCRAM-SHA-512
    secretRef: redpanda-superusers
    users: []
clusterDomain: cluster.local
commonLabels: {}
config:
  cluster:
    default_topic_replications: 3
    minimum_topic_replications: 3
  node:
    crash_loop_limit: 5
  pandaproxy_client: {}
  rpk: {}
  schema_registry_client: {}
  tunable:
    compacted_log_segment_size: 67108864
    group_topic_partitions: 16
    kafka_batch_max_bytes: 1048576
    kafka_connection_rate_limit: 1000
    log_segment_size: 134217728
    log_segment_size_max: 268435456
    log_segment_size_min: 16777216
    max_compacted_log_segment_size: 536870912
    topic_partitions_per_shard: 1000
connectors:
  deployment:
    create: false
  enabled: false
  test:
    create: false
console:
  config: {}
  configmap:
    create: false
  deployment:
    create: false
  enabled: false
  secret:
    create: false
enterprise:
  license: ""
  licenseSecretRef:
    key: license
    name: redpanda-license
external:
  addresses:
  - $PREFIX_TEMPLATE
  domain: camilo.panda.dev
  enabled: true
  externalDns:
    enabled: true
  prefixTemplate: rp${POD_ORDINAL}-$(echo -n $HOST_IP_ADDRESS | sha256sum | head -c 7)
  service:
    enabled: true
  type: NodePort
fullnameOverride: ""
image:
  pullPolicy: IfNotPresent
  repository: docker.redpanda.com/redpandadata/redpanda
  tag: v23.3.7
imagePullSecrets: []
license_key: ""
license_secret_ref: {}
listeners:
  admin:
    external:
      admin-api:
        advertisedPorts:
        - 30644
        authenticationMethod: sasl
        enabled: false
        port: 30644
        tls:
          cert: letsencrypt
          enabled: true
          requireClientAuth: false
      default:
        advertisedPorts:
        - 31644
        port: 9645
        tls:
          cert: external
    port: 9644
    tls:
      cert: selfsigned
      enabled: false
      requireClientAuth: false
  http:
    authenticationMethod: http_basic
    enabled: true
    external:
      default:
        advertisedPorts:
        - 30082
        authenticationMethod: null
        port: 8083
        tls:
          cert: external
          requireClientAuth: false
      http-proxy:
        advertisedPorts:
        - 31082
        authenticationMethod: http_basic
        enabled: true
        port: 31082
        tls:
          cert: letsencrypt
          enabled: true
          requireClientAuth: false
    kafkaEndpoint: default
    port: 8082
    prefixTemplate: http-proxy$POD_ORDINAL
    tls:
      cert: selfsigned
      enabled: true
      requireClientAuth: false
  kafka:
    authenticationMethod: sasl
    external:
      default:
        advertisedPorts:
        - 31092
        authenticationMethod: null
        port: 9094
        tls:
          cert: external
      kafka-api:
        advertisedPorts:
        - 32092
        authenticationMethod: sasl
        enabled: true
        port: 32092
        tls:
          cert: letsencrypt
          requireClientAuth: false
    port: 9092
    prefixTemplate: kafka-api$POD_ORDINAL
    tls:
      cert: selfsigned
      requireClientAuth: false
  rpc:
    port: 33145
    tls:
      cert: selfsigned
      requireClientAuth: false
  schemaRegistry:
    authenticationMethod: http_basic
    enabled: true
    external:
      default:
        advertisedPorts:
        - 30081
        authenticationMethod: null
        port: 8084
        tls:
          cert: external
          requireClientAuth: false
      schema-registry:
        advertisedPorts:
        - 31081
        authenticationMethod: http_basic
        enabled: true
        port: 31081
        tls:
          cert: letsencrypt
          requireClientAuth: false
    kafkaEndpoint: default
    port: 8081
    tls:
      cert: selfsigned
      requireClientAuth: false
logging:
  logLevel: debug
  usageStats:
    clusterId: 9m4e2mr0ui3e8a215n4g
    enabled: true
monitoring:
  enabled: false
  labels: {}
  scrapeInterval: 30s
  tlsConfig: {}
nameOverride: ""
nodeSelector: {}
post_install_job:
  affinity: {}
  enabled: true
post_upgrade_job:
  affinity: {}
  enabled: true
rackAwareness:
  enabled: true
  nodeAnnotation: topology.kubernetes.io/zone
rbac:
  annotations: {}
  enabled: false
resources:
  cpu:
    cores: "3"
  memory:
    container:
      max: 8Gi
      min: 8Gi
serviceAccount:
  annotations:
    azure.workload.identity/client-id: c90db393-857d-41d0-ac0d-0e61271fcaa6
  create: true
  name: id-rpcloud-9m4e2mr0ui3e8a215n4
statefulset:
  additionalRedpandaCmdFlags:
  - --abort-on-seastar-bad-alloc
  - --dump-memory-diagnostics-on-alloc-failure-kind=all
  annotations: {}
  budget:
    maxUnavailable: 1
  extraVolumeMounts: ""
  extraVolumes: ""
  initContainerImage:
    repository: busybox
    tag: latest
  initContainers:
    configurator:
      extraVolumeMounts: ""
      resources: {}
    extraInitContainers: ""
    fsValidator:
      enabled: true
      expectedFS: xfs
      extraVolumeMounts: ""
      resources: {}
    setDataDirOwnership:
      enabled: true
      extraVolumeMounts: ""
      resources: {}
    setTieredStorageCacheDirOwnership:
      extraVolumeMounts: ""
      resources: {}
    tuning:
      extraVolumeMounts: ""
      resources: {}
  livenessProbe:
    failureThreshold: 3
    initialDelaySeconds: 10
    periodSeconds: 10
  nodeSelector:
    cloud.redpanda.com/role: redpanda
  podAffinity: {}
  podAntiAffinity:
    custom: {}
    topologyKey: kubernetes.io/hostname
    type: hard
    weight: 100
  priorityClassName: ""
  readinessProbe:
    failureThreshold: 3
    initialDelaySeconds: 1
    periodSeconds: 10
    successThreshold: 1
  replicas: 3
  securityContext:
    allowPrivilegeEscalation: false
    fsGroup: 101
    fsGroupChangePolicy: OnRootMismatch
    runAsGroup: 65530
    runAsNonRoot: true
    runAsUser: 65530
  sideCars:
    configWatcher:
      enabled: true
      extraVolumeMounts: ""
      resources: {}
      securityContext: {}
    controllers:
      createRBAC: true
      enabled: false
      healthProbeAddress: :8085
      image:
        repository: docker.redpanda.com/redpandadata/redpanda-operator
        tag: v2.1.10-23.2.18
      metricsAddress: :9082
      resources: {}
      run:
      - all
      securityContext: {}
  startupProbe:
    failureThreshold: 120
    initialDelaySeconds: 1
    periodSeconds: 10
  terminationGracePeriodSeconds: 90
  tolerations:
  - effect: NoSchedule
    key: cloud.redpanda.com/role
    operator: Equal
    value: redpanda
  topologySpreadConstraints:
  - maxSkew: 1
    topologyKey: topology.kubernetes.io/zone
    whenUnsatisfiable: ScheduleAnyway
  updateStrategy:
    type: RollingUpdate
storage:
  hostPath: ""
  persistentVolume:
    annotations: {}
    enabled: true
    labels: {}
    nameOverwrite: ""
    size: 4096Gi
    storageClass: local-path
  tiered:
    config:
      cloud_storage_access_key: ""
      cloud_storage_api_endpoint: ""
      cloud_storage_azure_container: null
      cloud_storage_azure_shared_key: null
      cloud_storage_azure_storage_account: null
      cloud_storage_bucket: ""
      cloud_storage_cache_size: 5368709120
      cloud_storage_credentials_source: config_file
      cloud_storage_enable_remote_read: true
      cloud_storage_enable_remote_write: true
      cloud_storage_enabled: false
      cloud_storage_region: ""
      cloud_storage_secret_key: ""
    credentialsSecretRef:
      accessKey:
        configurationKey: cloud_storage_access_key
      secretKey:
        configurationKey: cloud_storage_secret_key
    hostPath: ""
    mountType: persistentVolume
    persistentVolume:
      annotations: {}
      labels: {}
      storageClass: local-path
tests:
  enabled: true
tls:
  certs:
    default:
      caEnabled: true
    external:
      caEnabled: true
    letsencrypt:
      caEnabled: false
      duration: 43800h0m0s
      issuerRef:
        kind: ClusterIssuer
        name: letsencrypt-dns
    selfsigned:
      caEnabled: true
      duration: 43800h0m0s
      issuerRef:
        kind: ClusterIssuer
        name: redpanda.local
  enabled: true
tolerations: []
tuning:
  tune_aio_events: false
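
To check what the chart actually rendered against these values, inspecting the StatefulSet directly works (a sketch; the redpanda release/StatefulSet and container names are assumed from the helm list output further down):

$ kubectl -n redpanda get statefulset redpanda \
    -o jsonpath='{.spec.template.spec.securityContext}'        # pod level: where fsGroup must land
$ kubectl -n redpanda get statefulset redpanda \
    -o jsonpath='{.spec.template.spec.containers[?(@.name=="redpanda")].securityContext}'   # container level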
Anything else we need to know?
The securityContext I'm setting:
statefulset:
  securityContext:
    allowPrivilegeEscalation: false
    runAsNonRoot: true
    runAsUser: 65534
    runAsGroup: 65534
    fsGroup: 65534
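
Note that in the Kubernetes API these fields live at two different levels: fsGroup and fsGroupChangePolicy exist only on the pod-level securityContext, while allowPrivilegeEscalation, capabilities, and privileged exist only on the container-level one. If a chart copies a single flat map to just one of those levels, the fields that are invalid there get silently dropped, which would match the symptom above. A minimal sketch of how the rendered StatefulSet pod template would need to split them (illustrative only, not the chart's actual output; the container name is assumed):

spec:
  template:
    spec:
      securityContext:              # PodSecurityContext
        runAsNonRoot: true
        runAsUser: 65534
        runAsGroup: 65534
        fsGroup: 65534              # only valid at this level
        fsGroupChangePolicy: OnRootMismatch
      containers:
      - name: redpanda              # name assumed for illustration
        securityContext:            # container-level SecurityContext
          allowPrivilegeEscalation: false
          privileged: false
          capabilities:
            drop:
            - ALL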
Which are the affected charts?
Redpanda, Operator
Chart Version(s)
❯ helm -n redpanda list
NAME               NAMESPACE  REVISION  UPDATED                                   STATUS    CHART            APP VERSION
redpanda           redpanda   3         2024-04-03 15:51:16.648405 -0400 EDT      deployed  redpanda-0.1.1   0.1.0
redpanda-broker    redpanda   2         2024-04-03 19:08:29.79347033 +0000 UTC    deployed  redpanda-5.7.37  v23.3.10
redpanda-operator  redpanda   1         2024-04-01 18:14:29.732128 -0400 EDT      deployed  operator-0.4.20  v2.1.15-23.3.7
Cloud provider
JIRA Link: K8S-140
@c4milo can you please run the same ls command in the containers of the redpanda pods? For example, here is what I see on a kind cluster:
$ ls -alh /var/lib/redpanda/
total 16K
drwxr-xr-x 1 redpanda redpanda 4.0K Apr 5 08:46 .
drwxr-xr-x 1 root root 4.0K Apr 5 08:46 ..
lrwxrwxrwx 1 root root 13 Apr 5 08:46 conf -> /etc/redpanda
drwxr-xr-x 2 redpanda redpanda 4.0K Apr 18 2019 coredump
drwxrwxrwx 5 root root 4.0K Apr 10 13:37 data
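
Note: ls -alh resolves names through the container's /etc/passwd, so the redpanda/root owners above say nothing about how the host sees those IDs; the numeric form is what actually crosses the container/host boundary:

$ ls -alhn /var/lib/redpanda/   # same listing, with raw UIDs/GIDs instead of names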
I would probably get the same result, but that's not the real user or group owning the data on the node, right? Basically, in Azure's case, if the node gets compromised through any of these systemd components, the intruder will have full access to the data. So, ideally we use UIDs and GIDs dedicated to Redpanda. Or better, we map them to the user and group nobody on the host (usually 65534).
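
To see which host accounts those raw IDs collide with, something like the following on the node itself should show it (a sketch; the PV path is hypothetical, and on Ubuntu hosts the low-100s UID/GID range is typically allocated to systemd service accounts, which would explain the systemd-journald/systemd-resolve ownership reported above):

# On the Kubernetes node, not inside a pod
$ stat -c '%u:%g %U:%G %n' /opt/local-path-provisioner/pvc-*   # default local-path dir; adjust to your setup
$ getent passwd 101   # may resolve to a systemd user, e.g. systemd-resolve
$ getent group 101    # may resolve to systemd-journal
$ id nobody           # typically uid=65534(nobody)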
Is this really a limitation of the operator though? We are asking that the container run as a known user. You are suggesting that the PVs be provisioned on the VM using that same user, correct? Even though that user may not exist on the VM's OS? Have you seen this work with other tools? If you have, can you point me to them? It may help me close my knowledge gap quicker.
I would think it is if it doesn't let me set what I want. I'm setting fsGroup and it doesn't seem to be taking effect.
We just found out there are more securityContext properties not being applied despite our setting them: capabilities, runAsGroup, and privileged.
FWIW, here is what I'm trying to set in most of our pods; this user and group should usually exist in both the container and the host.
podSecurityContext:
  runAsNonRoot: true
  runAsUser: 65534
  runAsGroup: 65534
  fsGroup: 65534
securityContext:
  privileged: false
  allowPrivilegeEscalation: false
  runAsNonRoot: true
  capabilities:
    drop:
    - ALL
  readOnlyRootFilesystem: true
  runAsUser: 65534
  runAsGroup: 65534
See also https://github.com/rancher/local-path-provisioner/blob/master/examples/pod-with-security-context/pod.yaml
I think you could test a simple change in the local-path-provisioner setup script https://github.com/redpanda-data/cloudv2/blob/cec154918acf04bc8fc92f7475b66a7bf67eca8d/terraform/provisioners/kubernetes-redpanda-aws/files/local-path-provisioner.yaml#L139 to change the owner.
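
For reference, upstream local-path-provisioner reads a setup script out of its ConfigMap and runs it whenever a volume directory is created, so the ownership change could live there. A minimal sketch against the upstream format (not the actual cloudv2 file; 65534:65534 is the nobody mapping suggested above):

apiVersion: v1
kind: ConfigMap
metadata:
  name: local-path-config
data:
  setup: |-
    #!/bin/sh
    set -eu
    mkdir -m 0770 -p "$VOL_DIR"      # $VOL_DIR is provided by the provisioner
    chown 65534:65534 "$VOL_DIR"     # hand new volumes to nobody:nogroup on the host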
@c4milo, when you have a second, would you mind verifying whether the issue was due to local-path-provisioner or if we need to fix something on our end?
@c4milo As far as I understand, you fixed the issue with https://github.com/redpanda-data/cloudv2/pull/17151. If you think the issue is not resolved, please re-open or create a new one.