home-ops

🔍 feat(monitoring): Optimize Prometheus metrics collection and reduce cardinality

Status: Open · axeII opened this issue 2 weeks ago · 2 comments

axeII · Nov 16 '25 17:11

--- kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: observability/kube-prometheus-stack HelmRelease: observability/kube-prometheus-stack

+++ kubernetes/apps/observability/kube-prometheus-stack/app Kustomization: observability/kube-prometheus-stack HelmRelease: observability/kube-prometheus-stack

@@ -75,31 +75,54 @@

       - pods=[*]
       - deployments=[*]
       - persistentvolumeclaims=[*]
       prometheus:
         monitor:
           enabled: true
+          interval: 60s
+          metricRelabelings:
+          - action: drop
+            regex: kube_.*_owner
+            sourceLabels:
+            - __name__
+          - action: drop
+            regex: kube_.*(annotations|labels)
+            sourceLabels:
+            - __name__
+          - action: drop
+            regex: kube_.*_resource_version
+            sourceLabels:
+            - __name__
           relabelings:
           - action: replace
             regex: (.*)
             replacement: $1
             sourceLabels:
             - __meta_kubernetes_pod_node_name
             targetLabel: kubernetes_node
+          scrapeTimeout: 55s
     kubeApiServer:
       enabled: true
       serviceMonitor:
+        interval: 60s
         metricRelabelings:
         - action: drop
           regex: (apiserver|etcd|rest_client)_request(|_sli|_slo)_duration_seconds_bucket
           sourceLabels:
           - __name__
         - action: drop
           regex: (apiserver_response_sizes_bucket|apiserver_watch_events_sizes_bucket)
           sourceLabels:
           - __name__
+        - action: drop
+          regex: apiserver_request_duration_seconds_bucket;(0.001|0.002|0.004|0.008|0.016|0.032|0.064|0.128|0.256)
+          separator: ;
+          sourceLabels:
+          - __name__
+          - le
+        scrapeTimeout: 55s
     kubeControllerManager:
       enabled: true
       endpoints:
       - 192.168.69.110
     kubeEtcd:
       enabled: false
@@ -113,42 +136,83 @@

       - 192.168.69.110
     kubeStateMetrics:
       enabled: true
     kubelet:
       enabled: true
       serviceMonitor:
+        cAdvisor: true
+        cAdvisorMetricRelabelings:
+        - action: drop
+          regex: (container_tasks_state|container_memory_failures_total|container_memory_mapped_file)
+          sourceLabels:
+          - __name__
+        - action: drop
+          regex: (exited|dead)
+          sourceLabels:
+          - container_state
+        - action: drop
+          regex: (container_.*_duration_seconds_bucket);(0.005|0.01|0.025|0.05|0.075|2.5|7.5|15|30|45)
+          separator: ;
+          sourceLabels:
+          - __name__
+          - le
+        interval: 60s
         metricRelabelings:
         - action: labeldrop
           regex: (uid)
         - action: labeldrop
           regex: (id|name)
+        - action: labeldrop
+          regex: (container_id|image_id)
         - action: drop
           regex: (rest_client_request_duration_seconds_bucket|rest_client_request_duration_seconds_sum|rest_client_request_duration_seconds_count)
           sourceLabels:
           - __name__
+        - action: drop
+          regex: (container_memory_failures_total|container_memory_usage_bytes|container_network_tcp_usage_total)
+          sourceLabels:
+          - __name__
+        - action: drop
+          regex: (container_cpu_cfs_throttled_periods_total|container_cpu_cfs_throttled_seconds_total|container_cpu_cfs_periods_total|container_cpu_usage_seconds_total|container_fs_io_time_seconds_total|container_fs_io_time_weighted_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_writes_bytes_total|container_fs_writes_total|container_last_seen|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total)
+          sourceLabels:
+          - __name__
+        - action: drop
+          regex: (kubelet_runtime_operations_duration_seconds_bucket|kubelet_runtime_operations_errors_total)
+          sourceLabels:
+          - __name__
+        scrapeTimeout: 55s
     nodeExporter:
       enabled: true
     prometheus:
       prometheusSpec:
+        enableAdminAPI: true
+        enforcedLabelLimit: 50
+        enforcedLabelNameLengthLimit: 256
+        enforcedLabelValueLengthLimit: 2048
+        enforcedSampleLimit: 500000
+        enforcedTargetLimit: 500
+        evaluationInterval: 60s
         externalUrl: https://prometheus.juno.moe
         image:
           registry: docker.io
           repository: prompp/prompp
           tag: 0.6.4
         podMonitorSelectorNilUsesHelmValues: false
         probeSelectorNilUsesHelmValues: false
+        queryLogFile: /prometheus/query.log
         replicas: 1
         resources:
           limits:
             memory: 4Gi
           requests:
-            cpu: 100m
+            cpu: 500m
         retention: 8d
         retentionSize: 23GB
         ruleSelectorNilUsesHelmValues: false
         scrapeConfigSelectorNilUsesHelmValues: false
+        scrapeInterval: 60s
         securityContext:
           fsGroup: 64535
           runAsGroup: 64535
           runAsNonRoot: true
           runAsUser: 64535
         serviceMonitorSelectorNilUsesHelmValues: false
@@ -157,12 +221,13 @@

             spec:
               resources:
                 requests:
                   storage: 25Gi
               storageClassName: ceph-block
         version: v2.55.1
+        walCompression: true
       route:
         main:
           enabled: true
           hostnames:
           - prometheus.juno.moe
           parentRefs:
@@ -175,14 +240,29 @@

         enabled: false
     prometheus-node-exporter:
       fullnameOverride: node-exporter
       prometheus:
         monitor:
           enabled: true
+          interval: 60s
+          metricRelabelings:
+          - action: drop
+            regex: node_filesystem_.*;/var/lib/kubelet/pods/.*
+            separator: ;
+            sourceLabels:
+            - __name__
+            - mountpoint
+          - action: drop
+            regex: node_network_.*;(veth.*|br.*|cilium.*|lxc.*)
+            separator: ;
+            sourceLabels:
+            - __name__
+            - device
           relabelings:
           - action: replace
             regex: (.*)
             replacement: $1
             sourceLabels:
             - __meta_kubernetes_pod_node_name
             targetLabel: kubernetes_node
+          scrapeTimeout: 50s
 

bot-akira[bot] · Nov 16 '25 17:11

--- HelmRelease: observability/kube-prometheus-stack Prometheus: observability/kube-prometheus-stack

+++ HelmRelease: observability/kube-prometheus-stack Prometheus: observability/kube-prometheus-stack

@@ -28,18 +28,20 @@

   replicas: 1
   shards: 1
   logLevel: info
   logFormat: logfmt
   listenLocal: false
   enableOTLPReceiver: false
-  enableAdminAPI: false
+  enableAdminAPI: true
+  scrapeInterval: 60s
+  evaluationInterval: 60s
   resources:
     limits:
       memory: 4Gi
     requests:
-      cpu: 100m
+      cpu: 500m
   retention: 8d
   retentionSize: 23GB
   tsdb:
     outOfOrderTimeWindow: 0s
   walCompression: true
   routePrefix: /
@@ -82,8 +84,14 @@

               - prometheus
             - key: app.kubernetes.io/instance
               operator: In
               values:
               - kube-prometheus-stack
   portName: http-web
+  queryLogFile: /prometheus/query.log
+  enforcedSampleLimit: 500000
+  enforcedTargetLimit: 500
+  enforcedLabelLimit: 50
+  enforcedLabelNameLengthLimit: 256
+  enforcedLabelValueLengthLimit: 2048
   hostNetwork: false
 
--- HelmRelease: observability/kube-prometheus-stack ServiceMonitor: observability/kube-state-metrics

+++ HelmRelease: observability/kube-prometheus-stack ServiceMonitor: observability/kube-state-metrics

@@ -16,13 +16,28 @@

   selector:
     matchLabels:
       app.kubernetes.io/name: kube-state-metrics
       app.kubernetes.io/instance: kube-prometheus-stack
   endpoints:
   - port: http
+    interval: 60s
+    scrapeTimeout: 55s
     honorLabels: true
+    metricRelabelings:
+    - action: drop
+      regex: kube_.*_owner
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: kube_.*(annotations|labels)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: kube_.*_resource_version
+      sourceLabels:
+      - __name__
     relabelings:
     - action: replace
       regex: (.*)
       replacement: $1
       sourceLabels:
       - __meta_kubernetes_pod_node_name
--- HelmRelease: observability/kube-prometheus-stack ServiceMonitor: observability/node-exporter

+++ HelmRelease: observability/kube-prometheus-stack ServiceMonitor: observability/node-exporter

@@ -19,14 +19,29 @@

       app.kubernetes.io/instance: kube-prometheus-stack
   attachMetadata:
     node: false
   endpoints:
   - port: http-metrics
     scheme: http
+    interval: 60s
+    scrapeTimeout: 50s
     relabelings:
     - action: replace
       regex: (.*)
       replacement: $1
       sourceLabels:
       - __meta_kubernetes_pod_node_name
       targetLabel: kubernetes_node
+    metricRelabelings:
+    - action: drop
+      regex: node_filesystem_.*;/var/lib/kubelet/pods/.*
+      separator: ;
+      sourceLabels:
+      - __name__
+      - mountpoint
+    - action: drop
+      regex: node_network_.*;(veth.*|br.*|cilium.*|lxc.*)
+      separator: ;
+      sourceLabels:
+      - __name__
+      - device
 
--- HelmRelease: observability/kube-prometheus-stack ServiceMonitor: observability/kube-prometheus-stack-apiserver

+++ HelmRelease: observability/kube-prometheus-stack ServiceMonitor: observability/kube-prometheus-stack-apiserver

@@ -11,23 +11,30 @@

     app.kubernetes.io/part-of: kube-prometheus-stack
     release: kube-prometheus-stack
     heritage: Helm
 spec:
   endpoints:
   - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    interval: 60s
     port: https
     scheme: https
     metricRelabelings:
     - action: drop
       regex: (apiserver|etcd|rest_client)_request(|_sli|_slo)_duration_seconds_bucket
       sourceLabels:
       - __name__
     - action: drop
       regex: (apiserver_response_sizes_bucket|apiserver_watch_events_sizes_bucket)
       sourceLabels:
       - __name__
+    - action: drop
+      regex: apiserver_request_duration_seconds_bucket;(0.001|0.002|0.004|0.008|0.016|0.032|0.064|0.128|0.256)
+      separator: ;
+      sourceLabels:
+      - __name__
+      - le
     tlsConfig:
       caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
       serverName: kubernetes
       insecureSkipVerify: false
   jobLabel: component
   namespaceSelector:
--- HelmRelease: observability/kube-prometheus-stack ServiceMonitor: observability/kube-prometheus-stack-kubelet

+++ HelmRelease: observability/kube-prometheus-stack ServiceMonitor: observability/kube-prometheus-stack-kubelet

@@ -22,87 +22,85 @@

     matchLabels:
       app.kubernetes.io/name: kubelet
       k8s-app: kubelet
   endpoints:
   - port: https-metrics
     scheme: https
+    interval: 60s
+    scrapeTimeout: 55s
     tlsConfig:
       caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
       insecureSkipVerify: true
     bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
     honorLabels: true
     honorTimestamps: true
     metricRelabelings:
     - action: labeldrop
       regex: (uid)
     - action: labeldrop
       regex: (id|name)
+    - action: labeldrop
+      regex: (container_id|image_id)
     - action: drop
       regex: (rest_client_request_duration_seconds_bucket|rest_client_request_duration_seconds_sum|rest_client_request_duration_seconds_count)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: (container_memory_failures_total|container_memory_usage_bytes|container_network_tcp_usage_total)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: (container_cpu_cfs_throttled_periods_total|container_cpu_cfs_throttled_seconds_total|container_cpu_cfs_periods_total|container_cpu_usage_seconds_total|container_fs_io_time_seconds_total|container_fs_io_time_weighted_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_writes_bytes_total|container_fs_writes_total|container_last_seen|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: (kubelet_runtime_operations_duration_seconds_bucket|kubelet_runtime_operations_errors_total)
       sourceLabels:
       - __name__
     relabelings:
     - action: replace
       sourceLabels:
       - __metrics_path__
       targetLabel: metrics_path
   - port: https-metrics
     scheme: https
     path: /metrics/cadvisor
-    interval: 10s
+    interval: 60s
+    scrapeTimeout: 55s
     honorLabels: true
     honorTimestamps: true
     trackTimestampsStaleness: true
     tlsConfig:
       caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
       insecureSkipVerify: true
     bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
     metricRelabelings:
     - action: drop
-      regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+      regex: (container_tasks_state|container_memory_failures_total|container_memory_mapped_file)
       sourceLabels:
       - __name__
     - action: drop
-      regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+      regex: (exited|dead)
+      sourceLabels:
+      - container_state
+    - action: drop
+      regex: (container_.*_duration_seconds_bucket);(0.005|0.01|0.025|0.05|0.075|2.5|7.5|15|30|45)
+      separator: ;
       sourceLabels:
       - __name__
-    - action: drop
-      regex: container_memory_(mapped_file|swap)
-      sourceLabels:
-      - __name__
-    - action: drop
-      regex: container_(file_descriptors|tasks_state|threads_max)
-      sourceLabels:
-      - __name__
-    - action: drop
-      regex: container_memory_failures_total;hierarchy
-      sourceLabels:
-      - __name__
-      - scope
-    - action: drop
-      regex: container_network_.*;(cali|cilium|cni|lxc|nodelocaldns|tunl).*
-      sourceLabels:
-      - __name__
-      - interface
-    - action: drop
-      regex: container_spec.*
-      sourceLabels:
-      - __name__
-    - action: drop
-      regex: .+;
-      sourceLabels:
-      - id
-      - pod
+      - le
     relabelings:
     - action: replace
       sourceLabels:
       - __metrics_path__
       targetLabel: metrics_path
   - port: https-metrics
     scheme: https
     path: /metrics/probes
+    interval: 60s
+    scrapeTimeout: 55s
     honorLabels: true
     honorTimestamps: true
     tlsConfig:
       caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
       insecureSkipVerify: true
     bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token

bot-akira[bot] · Nov 16 '25 17:11