k8s-gitops

feat!: update helm chart kube-prometheus-stack to 39.1.0

Open • carpenike-bot[bot] opened this issue 3 years ago • 7 comments

This PR contains the following updates:

Package                Update   Change
kube-prometheus-stack  major    35.6.2 -> 39.4.0

Configuration

📅 Schedule: Branch creation - At any time (no schedule defined), Automerge - At any time (no schedule defined).

🚦 Automerge: Disabled by config. Please merge this manually once you are satisfied.

Rebasing: Renovate will not automatically rebase this PR, because other commits have been found.

🔕 Ignore: Close this PR and you won't be reminded about this update again.


  • [ ] If you want to rebase/retry this PR, click this checkbox. ⚠ Warning: custom changes will be lost.

This PR has been generated by Renovate Bot.

carpenike-bot[bot] • Jul 29 '22 09:07

Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml Version: 35.6.2 -> 39.0.0

@@ -158,6 +158,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -251,6 +252,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -364,6 +366,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -463,6 +466,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -11355,7 +11359,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11444,7 +11448,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11705,7 +11709,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11714,7 +11718,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11723,7 +11727,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11732,7 +11736,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11741,7 +11745,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11750,7 +11754,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14120,7 +14124,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14209,7 +14213,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14470,7 +14474,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14479,7 +14483,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14488,7 +14492,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14497,7 +14501,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14506,7 +14510,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14515,7 +14519,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17371,7 +17375,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17379,7 +17383,7 @@
 "step": 10
 },
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17468,7 +17472,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17476,7 +17480,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17937,7 +17941,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17946,7 +17950,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17955,7 +17959,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17964,7 +17968,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17973,7 +17977,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -30449,21 +30453,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -31396,21 +31512,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -39322,13 +39550,27 @@
 containerPort: 9100
 protocol: TCP
 livenessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 readinessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 resources: {}
 volumeMounts:
 - name: proc
@@ -39400,7 +39642,7 @@
 - --metric-labels-allowlist=persistentvolumeclaims=[*]
 - --telemetry-port=8081
 imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
 ports:
 - containerPort: 8080
 name: "http"
@@ -39448,17 +39690,17 @@
 spec:
 containers:
 - name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
 imagePullPolicy: "IfNotPresent"
 args:
 - --kubelet-service=kube-system/prometheus-kubelet
 - --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
 - --config-reloader-cpu-request=200m
 - --config-reloader-cpu-limit=200m
 - --config-reloader-memory-request=50Mi
 - --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
 - --web.enable-tls=true
 - --web.cert-file=/cert/cert
 - --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10252
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10251
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
 port: http-web
 pathPrefix: "/"
 apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
 replicaExternalLabelName: "replica"
 externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
 paused: false
@@ -41126,11 +41368,11 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
 kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
 )
 ) > 0
 for: 15m
@@ -41259,7 +41501,7 @@
 description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
 for: 1h
 labels:
 severity: warning
@@ -41290,7 +41532,7 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
 summary: Job did not complete in time
 expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
 and
 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
 labels:
@@ -41608,7 +41850,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
 summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
 labels:
 severity: warning
 - alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
 summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
 for: 5m
 labels:
 severity: warning
@@ -41880,7 +42122,7 @@
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
 for: 15m
 labels:
 severity: warning
@@ -41953,9 +42195,9 @@
 record: instance:node_memory_utilisation:ratio
 - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
 record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
 sum without (device) (
@@ -42031,10 +42273,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42045,10 +42287,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42156,15 +42398,15 @@
 summary: Clock skew detected.
 expr: |-
 (
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
 and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
 )
 or
 (
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
 and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
 )
 for: 10m
 labels:
@@ -42175,9 +42417,9 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
 and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
 for: 10m
 labels:
 severity: warning
@@ -42186,7 +42428,7 @@
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
 for: 15m
 labels:
 severity: critical
@@ -42195,7 +42437,7 @@
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
 labels:
 severity: warning
 - alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
 summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
 for: 5m
 labels:
 severity: warning
@@ -42620,6 +42862,15 @@
 for: 5m
 labels:
 severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43009,12 @@
 - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 port: https
 scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
 tlsConfig:
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 serverName: kubernetes
@@ -42914,6 +43171,32 @@
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 insecureSkipVerify: true
 bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
 relabelings:
 - sourceLabels:
 - __metrics_path__
@@ -43002,10 +43285,10 @@
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
 namespace: default
 labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
 app.kubernetes.io/managed-by: Helm
 app.kubernetes.io/instance: kube-prometheus-stack
 app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
 spec:
 containers:
 - name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - create
@@ -43247,7 +43530,7 @@
 spec:
 containers:
 - name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - patch

carpenike-bot[bot] • Jul 29 '22 09:07

Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml Version: 35.6.2 -> 39.1.0

@@ -158,6 +158,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -251,6 +252,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -364,6 +366,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -463,6 +466,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -11355,7 +11359,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11444,7 +11448,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11705,7 +11709,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11714,7 +11718,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11723,7 +11727,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11732,7 +11736,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11741,7 +11745,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11750,7 +11754,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14120,7 +14124,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14209,7 +14213,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14470,7 +14474,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14479,7 +14483,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14488,7 +14492,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14497,7 +14501,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14506,7 +14510,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14515,7 +14519,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17371,7 +17375,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17379,7 +17383,7 @@
 "step": 10
 },
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17468,7 +17472,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17476,7 +17480,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17937,7 +17941,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17946,7 +17950,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17955,7 +17959,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17964,7 +17968,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17973,7 +17977,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -30449,21 +30453,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -31396,21 +31512,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -39322,13 +39550,27 @@
 containerPort: 9100
 protocol: TCP
 livenessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 readinessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 resources: {}
 volumeMounts:
 - name: proc
@@ -39400,7 +39642,7 @@
 - --metric-labels-allowlist=persistentvolumeclaims=[*]
 - --telemetry-port=8081
 imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
 ports:
 - containerPort: 8080
 name: "http"
@@ -39448,17 +39690,17 @@
 spec:
 containers:
 - name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
 imagePullPolicy: "IfNotPresent"
 args:
 - --kubelet-service=kube-system/prometheus-kubelet
 - --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
 - --config-reloader-cpu-request=200m
 - --config-reloader-cpu-limit=200m
 - --config-reloader-memory-request=50Mi
 - --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
 - --web.enable-tls=true
 - --web.cert-file=/cert/cert
 - --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10252
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10251
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
 port: http-web
 pathPrefix: "/"
 apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
 replicaExternalLabelName: "replica"
 externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
 paused: false
@@ -41126,11 +41368,11 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
 kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
 )
 ) > 0
 for: 15m
@@ -41259,7 +41501,7 @@
 description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
 for: 1h
 labels:
 severity: warning
@@ -41290,7 +41532,7 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
 summary: Job did not complete in time
 expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
 and
 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
 labels:
@@ -41608,7 +41850,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
 summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
 labels:
 severity: warning
 - alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
 summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
 for: 5m
 labels:
 severity: warning
@@ -41880,7 +42122,7 @@
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
 for: 15m
 labels:
 severity: warning
@@ -41953,9 +42195,9 @@
 record: instance:node_memory_utilisation:ratio
 - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
 record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
 sum without (device) (
@@ -42031,10 +42273,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42045,10 +42287,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42156,15 +42398,15 @@
 summary: Clock skew detected.
 expr: |-
 (
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
 and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
 )
 or
 (
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
 and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
 )
 for: 10m
 labels:
@@ -42175,9 +42417,9 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
 and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
 for: 10m
 labels:
 severity: warning
@@ -42186,7 +42428,7 @@
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
 for: 15m
 labels:
 severity: critical
@@ -42195,7 +42437,7 @@
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
 labels:
 severity: warning
 - alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
 summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
 for: 5m
 labels:
 severity: warning
@@ -42620,6 +42862,15 @@
 for: 5m
 labels:
 severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43009,12 @@
 - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 port: https
 scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
 tlsConfig:
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 serverName: kubernetes
@@ -42914,6 +43171,32 @@
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 insecureSkipVerify: true
 bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
 relabelings:
 - sourceLabels:
 - __metrics_path__
@@ -43002,10 +43285,10 @@
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
 namespace: default
 labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
 app.kubernetes.io/managed-by: Helm
 app.kubernetes.io/instance: kube-prometheus-stack
 app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
 spec:
 containers:
 - name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - create
@@ -43247,7 +43530,7 @@
 spec:
 containers:
 - name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - patch

carpenike-bot[bot] avatar Jul 29 '22 13:07 carpenike-bot[bot]

 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17937,7 +17941,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17946,7 +17950,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17955,7 +17959,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17964,7 +17968,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17973,7 +17977,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -30449,21 +30453,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -31396,21 +31512,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -39322,13 +39550,27 @@
 containerPort: 9100
 protocol: TCP
 livenessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 readinessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 resources: {}
 volumeMounts:
 - name: proc
@@ -39400,7 +39642,7 @@
 - --metric-labels-allowlist=persistentvolumeclaims=[*]
 - --telemetry-port=8081
 imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
 ports:
 - containerPort: 8080
 name: "http"
@@ -39448,17 +39690,17 @@
 spec:
 containers:
 - name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
 imagePullPolicy: "IfNotPresent"
 args:
 - --kubelet-service=kube-system/prometheus-kubelet
 - --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
 - --config-reloader-cpu-request=200m
 - --config-reloader-cpu-limit=200m
 - --config-reloader-memory-request=50Mi
 - --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
 - --web.enable-tls=true
 - --web.cert-file=/cert/cert
 - --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10252
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10251
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
 port: http-web
 pathPrefix: "/"
 apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
 replicaExternalLabelName: "replica"
 externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
 paused: false
@@ -41126,11 +41368,11 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
 kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
 )
 ) > 0
 for: 15m
@@ -41259,7 +41501,7 @@
 description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
 for: 1h
 labels:
 severity: warning
@@ -41290,7 +41532,7 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
 summary: Job did not complete in time
 expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
 and
 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
 labels:
@@ -41608,7 +41850,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
 summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
 labels:
 severity: warning
 - alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
 summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
 for: 5m
 labels:
 severity: warning
@@ -41880,7 +42122,7 @@
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
 for: 15m
 labels:
 severity: warning
@@ -41953,9 +42195,9 @@
 record: instance:node_memory_utilisation:ratio
 - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
 record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
 sum without (device) (
@@ -42031,10 +42273,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42045,10 +42287,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42156,15 +42398,15 @@
 summary: Clock skew detected.
 expr: |-
 (
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
 and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
 )
 or
 (
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
 and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
 )
 for: 10m
 labels:
@@ -42175,9 +42417,9 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
 and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
 for: 10m
 labels:
 severity: warning
@@ -42186,7 +42428,7 @@
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
 for: 15m
 labels:
 severity: critical
@@ -42195,7 +42437,7 @@
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
 labels:
 severity: warning
 - alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
 summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
 for: 5m
 labels:
 severity: warning
@@ -42620,6 +42862,15 @@
 for: 5m
 labels:
 severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43009,12 @@
 - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 port: https
 scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
 tlsConfig:
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 serverName: kubernetes
@@ -42914,6 +43171,32 @@
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 insecureSkipVerify: true
 bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
 relabelings:
 - sourceLabels:
 - __metrics_path__
@@ -43002,10 +43285,10 @@
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
 namespace: default
 labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
 app.kubernetes.io/managed-by: Helm
 app.kubernetes.io/instance: kube-prometheus-stack
 app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
 spec:
 containers:
 - name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - create
@@ -43247,7 +43530,7 @@
 spec:
 containers:
 - name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - patch

carpenike-bot[bot] avatar Jul 29 '22 13:07 carpenike-bot[bot]
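
Most of the dashboard and rule hunks above boil down to one functional change: the `device` label regex now accepts an optional `/dev/` prefix, so devices reported with a full path are no longer dropped. A minimal Python sketch to sanity-check the new pattern (the sample device names are made up for illustration; `fullmatch` is used because Prometheus fully anchors `=~` matchers):

```python
import re

# Device regex copied from the updated dashboards and recording rules above.
DEVICE_RE = re.compile(r"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)")

# Hypothetical device label values, roughly as node-exporter/cAdvisor might report them.
samples = ["sda1", "/dev/sda1", "nvme0n1", "/dev/nvme0n1", "dm-0", "loop0", "tmpfs"]

for name in samples:
    # fullmatch mimics Prometheus's implicit ^...$ anchoring of =~ label matchers.
    status = "matched" if DEVICE_RE.fullmatch(name) else "ignored"
    print(f"{name:<15} {status}")
```

Under the old pattern, `/dev/sda1` and friends would be ignored entirely because of that anchoring, which is presumably why upstream widened the regex rather than relying on exporters to strip the prefix.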

Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml Version: 35.6.2 -> 39.1.0

@@ -158,6 +158,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -251,6 +252,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -364,6 +366,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -463,6 +466,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -11355,7 +11359,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11444,7 +11448,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11705,7 +11709,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11714,7 +11718,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11723,7 +11727,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11732,7 +11736,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11741,7 +11745,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11750,7 +11754,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14120,7 +14124,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14209,7 +14213,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14470,7 +14474,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14479,7 +14483,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14488,7 +14492,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14497,7 +14501,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14506,7 +14510,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14515,7 +14519,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17371,7 +17375,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17379,7 +17383,7 @@
 "step": 10
 },
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17468,7 +17472,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17476,7 +17480,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17937,7 +17941,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17946,7 +17950,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17955,7 +17959,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17964,7 +17968,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17973,7 +17977,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -30449,21 +30453,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -31396,21 +31512,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -39322,13 +39550,27 @@
 containerPort: 9100
 protocol: TCP
 livenessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 readinessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 resources: {}
 volumeMounts:
 - name: proc
@@ -39400,7 +39642,7 @@
 - --metric-labels-allowlist=persistentvolumeclaims=[*]
 - --telemetry-port=8081
 imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
 ports:
 - containerPort: 8080
 name: "http"
@@ -39448,17 +39690,17 @@
 spec:
 containers:
 - name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
 imagePullPolicy: "IfNotPresent"
 args:
 - --kubelet-service=kube-system/prometheus-kubelet
 - --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
 - --config-reloader-cpu-request=200m
 - --config-reloader-cpu-limit=200m
 - --config-reloader-memory-request=50Mi
 - --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
 - --web.enable-tls=true
 - --web.cert-file=/cert/cert
 - --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10252
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10251
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
 port: http-web
 pathPrefix: "/"
 apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
 replicaExternalLabelName: "replica"
 externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
 paused: false
@@ -41126,11 +41368,11 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
 kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
 )
 ) > 0
 for: 15m
@@ -41259,7 +41501,7 @@
 description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
 for: 1h
 labels:
 severity: warning
@@ -41290,7 +41532,7 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
 summary: Job did not complete in time
 expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
 and
 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
 labels:
@@ -41608,7 +41850,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
 summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
 labels:
 severity: warning
 - alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
 summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
 for: 5m
 labels:
 severity: warning
@@ -41880,7 +42122,7 @@
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
 for: 15m
 labels:
 severity: warning
@@ -41953,9 +42195,9 @@
 record: instance:node_memory_utilisation:ratio
 - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
 record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
 sum without (device) (
@@ -42031,10 +42273,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42045,10 +42287,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42156,15 +42398,15 @@
 summary: Clock skew detected.
 expr: |-
 (
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
 and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
 )
 or
 (
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
 and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
 )
 for: 10m
 labels:
@@ -42175,9 +42417,9 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
 and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
 for: 10m
 labels:
 severity: warning
@@ -42186,7 +42428,7 @@
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
 for: 15m
 labels:
 severity: critical
@@ -42195,7 +42437,7 @@
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
 labels:
 severity: warning
 - alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
 summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
 for: 5m
 labels:
 severity: warning
@@ -42620,6 +42862,15 @@
 for: 5m
 labels:
 severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43009,12 @@
 - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 port: https
 scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
 tlsConfig:
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 serverName: kubernetes
@@ -42914,6 +43171,32 @@
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 insecureSkipVerify: true
 bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
 relabelings:
 - sourceLabels:
 - __metrics_path__
@@ -43002,10 +43285,10 @@
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
 namespace: default
 labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
 app.kubernetes.io/managed-by: Helm
 app.kubernetes.io/instance: kube-prometheus-stack
 app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
 spec:
 containers:
 - name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - create
@@ -43247,7 +43530,7 @@
 spec:
 containers:
 - name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - patch

carpenike-bot[bot] avatar Jul 29 '22 16:07 carpenike-bot[bot]

Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml Version: 35.6.2 -> 39.2.0

@@ -158,6 +158,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -251,6 +252,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -364,6 +366,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -463,6 +466,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -11355,7 +11359,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11444,7 +11448,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11705,7 +11709,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11714,7 +11718,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11723,7 +11727,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11732,7 +11736,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11741,7 +11745,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11750,7 +11754,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14120,7 +14124,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14209,7 +14213,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14470,7 +14474,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14479,7 +14483,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14488,7 +14492,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14497,7 +14501,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14506,7 +14510,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14515,7 +14519,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17371,7 +17375,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17379,7 +17383,7 @@
 "step": 10
 },
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17468,7 +17472,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17476,7 +17480,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17937,7 +17941,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17946,7 +17950,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17955,7 +17959,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17964,7 +17968,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17973,7 +17977,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -30449,21 +30453,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -31396,21 +31512,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -39322,13 +39550,27 @@
 containerPort: 9100
 protocol: TCP
 livenessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 readinessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 resources: {}
 volumeMounts:
 - name: proc
@@ -39400,7 +39642,7 @@
 - --metric-labels-allowlist=persistentvolumeclaims=[*]
 - --telemetry-port=8081
 imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
 ports:
 - containerPort: 8080
 name: "http"
@@ -39448,17 +39690,17 @@
 spec:
 containers:
 - name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
 imagePullPolicy: "IfNotPresent"
 args:
 - --kubelet-service=kube-system/prometheus-kubelet
 - --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
 - --config-reloader-cpu-request=200m
 - --config-reloader-cpu-limit=200m
 - --config-reloader-memory-request=50Mi
 - --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
 - --web.enable-tls=true
 - --web.cert-file=/cert/cert
 - --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10252
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10251
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
 port: http-web
 pathPrefix: "/"
 apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
 replicaExternalLabelName: "replica"
 externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
 paused: false
@@ -41126,11 +41368,11 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
 kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
 )
 ) > 0
 for: 15m
@@ -41259,7 +41501,7 @@
 description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
 for: 1h
 labels:
 severity: warning
@@ -41290,7 +41532,7 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
 summary: Job did not complete in time
 expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
 and
 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
 labels:
@@ -41608,7 +41850,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
 summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
 labels:
 severity: warning
 - alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
 summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
 for: 5m
 labels:
 severity: warning
@@ -41880,7 +42122,7 @@
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
 for: 15m
 labels:
 severity: warning
@@ -41953,9 +42195,9 @@
 record: instance:node_memory_utilisation:ratio
 - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
 record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
 sum without (device) (
@@ -42031,10 +42273,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42045,10 +42287,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42156,15 +42398,15 @@
 summary: Clock skew detected.
 expr: |-
 (
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
 and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
 )
 or
 (
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
 and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
 )
 for: 10m
 labels:
@@ -42175,9 +42417,9 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
 and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
 for: 10m
 labels:
 severity: warning
@@ -42186,7 +42428,7 @@
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
 for: 15m
 labels:
 severity: critical
@@ -42195,7 +42437,7 @@
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
 labels:
 severity: warning
 - alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
 summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
 for: 5m
 labels:
 severity: warning
@@ -42620,6 +42862,15 @@
 for: 5m
 labels:
 severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43009,12 @@
 - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 port: https
 scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
 tlsConfig:
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 serverName: kubernetes
@@ -42914,6 +43171,32 @@
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 insecureSkipVerify: true
 bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
 relabelings:
 - sourceLabels:
 - __metrics_path__
@@ -43002,10 +43285,10 @@
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
 namespace: default
 labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
 app.kubernetes.io/managed-by: Helm
 app.kubernetes.io/instance: kube-prometheus-stack
 app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
 spec:
 containers:
 - name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - create
@@ -43247,7 +43530,7 @@
 spec:
 containers:
 - name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - patch

carpenike-bot[bot] avatar Jul 31 '22 12:07 carpenike-bot[bot]
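A change that recurs throughout both diffs is the disk device regex gaining an optional `/dev/` prefix: `(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)`, which appears intended to cover exporters that report the full device path rather than the bare device name. As a quick local sanity check, a minimal Python sketch (Python's `re` standing in for Prometheus' fully anchored RE2 matching; the sample device names below are made up, not taken from this cluster):

```python
import re

# Prometheus label matchers are fully anchored, so anchor the pattern explicitly here.
# Pattern copied from the updated dashboards/recording rules in this diff.
DEVICE_RE = re.compile(
    r"^(?:/dev/)?(?:mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)$"
)

samples = ["sda1", "/dev/sda1", "nvme0n1", "/dev/nvme0n1", "dm-0", "loop0", "/dev/sr0"]
for dev in samples:
    print(f"{dev:>12} -> {'match' if DEVICE_RE.match(dev) else 'no match'}")
# Expected: everything except loop0 and /dev/sr0 matches, i.e. the optional /dev/
# prefix only widens the match to prefixed device labels without pulling in
# loop or optical devices.
```

If this behaves as expected against the device labels actually exposed by node-exporter and cAdvisor in the cluster, the regex change should be a no-op for existing panels and only add coverage for prefixed device names.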

Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml Version: 35.6.2 -> 39.2.1

@@ -158,6 +158,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -251,6 +252,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -364,6 +366,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -463,6 +466,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -11355,7 +11359,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11444,7 +11448,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11705,7 +11709,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11714,7 +11718,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11723,7 +11727,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11732,7 +11736,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11741,7 +11745,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11750,7 +11754,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14120,7 +14124,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14209,7 +14213,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14470,7 +14474,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14479,7 +14483,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14488,7 +14492,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14497,7 +14501,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14506,7 +14510,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14515,7 +14519,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17371,7 +17375,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17379,7 +17383,7 @@
 "step": 10
 },
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17468,7 +17472,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17476,7 +17480,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17937,7 +17941,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17946,7 +17950,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17955,7 +17959,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17964,7 +17968,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17973,7 +17977,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -30449,21 +30453,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -31396,21 +31512,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -39322,13 +39550,27 @@
 containerPort: 9100
 protocol: TCP
 livenessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 readinessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 resources: {}
 volumeMounts:
 - name: proc
@@ -39400,7 +39642,7 @@
 - --metric-labels-allowlist=persistentvolumeclaims=[*]
 - --telemetry-port=8081
 imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
 ports:
 - containerPort: 8080
 name: "http"
@@ -39448,17 +39690,17 @@
 spec:
 containers:
 - name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
 imagePullPolicy: "IfNotPresent"
 args:
 - --kubelet-service=kube-system/prometheus-kubelet
 - --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
 - --config-reloader-cpu-request=200m
 - --config-reloader-cpu-limit=200m
 - --config-reloader-memory-request=50Mi
 - --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
 - --web.enable-tls=true
 - --web.cert-file=/cert/cert
 - --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10252
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10251
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
 port: http-web
 pathPrefix: "/"
 apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
 replicaExternalLabelName: "replica"
 externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
 paused: false
@@ -41126,11 +41368,11 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
 kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
 )
 ) > 0
 for: 15m
@@ -41259,7 +41501,7 @@
 description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
 for: 1h
 labels:
 severity: warning
@@ -41290,7 +41532,7 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
 summary: Job did not complete in time
 expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
 and
 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
 labels:
@@ -41608,7 +41850,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
 summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
 labels:
 severity: warning
 - alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
 summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
 for: 5m
 labels:
 severity: warning
@@ -41880,7 +42122,7 @@
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
 for: 15m
 labels:
 severity: warning
@@ -41953,9 +42195,9 @@
 record: instance:node_memory_utilisation:ratio
 - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
 record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
 sum without (device) (
@@ -42031,10 +42273,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42045,10 +42287,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42156,15 +42398,15 @@
 summary: Clock skew detected.
 expr: |-
 (
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
 and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
 )
 or
 (
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
 and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
 )
 for: 10m
 labels:
@@ -42175,9 +42417,9 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
 and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
 for: 10m
 labels:
 severity: warning
@@ -42186,7 +42428,7 @@
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
 for: 15m
 labels:
 severity: critical
@@ -42195,7 +42437,7 @@
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
 labels:
 severity: warning
 - alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
 summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
 for: 5m
 labels:
 severity: warning
@@ -42620,6 +42862,15 @@
 for: 5m
 labels:
 severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43009,12 @@
 - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 port: https
 scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
 tlsConfig:
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 serverName: kubernetes
@@ -42914,6 +43171,32 @@
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 insecureSkipVerify: true
 bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
 relabelings:
 - sourceLabels:
 - __metrics_path__
@@ -43002,10 +43285,10 @@
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
 namespace: default
 labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
 app.kubernetes.io/managed-by: Helm
 app.kubernetes.io/instance: kube-prometheus-stack
 app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
 spec:
 containers:
 - name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - create
@@ -43247,7 +43530,7 @@
 spec:
 containers:
 - name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - patch

carpenike-bot[bot] avatar Aug 01 '22 12:08 carpenike-bot[bot]
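
Note on the device-regex change that recurs throughout these diffs: the new pattern (/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+) matches device labels with or without a leading /dev/ prefix, presumably to cover environments where node-exporter reports the full device path. A minimal PromQL sketch of the widened matcher, using the chart's default job label:

rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])  # matches "sda1" and "nvme0n1" as well as "/dev/sda1" and "/dev/nvme0n1"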

Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml Version: 35.6.2 -> 39.4.0

@@ -158,6 +158,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -251,6 +252,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -364,6 +366,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -463,6 +466,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -11355,7 +11359,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11444,7 +11448,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11705,7 +11709,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11714,7 +11718,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11723,7 +11727,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11732,7 +11736,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11741,7 +11745,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11750,7 +11754,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14120,7 +14124,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14209,7 +14213,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14470,7 +14474,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14479,7 +14483,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14488,7 +14492,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14497,7 +14501,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14506,7 +14510,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14515,7 +14519,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17371,7 +17375,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17379,7 +17383,7 @@
 "step": 10
 },
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17468,7 +17472,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17476,7 +17480,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17937,7 +17941,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17946,7 +17950,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17955,7 +17959,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17964,7 +17968,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17973,7 +17977,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -30449,21 +30453,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -31396,21 +31512,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -39322,13 +39550,27 @@
 containerPort: 9100
 protocol: TCP
 livenessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 readinessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 resources: {}
 volumeMounts:
 - name: proc
@@ -39400,7 +39642,7 @@
 - --metric-labels-allowlist=persistentvolumeclaims=[*]
 - --telemetry-port=8081
 imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
 ports:
 - containerPort: 8080
 name: "http"
@@ -39448,17 +39690,17 @@
 spec:
 containers:
 - name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
 imagePullPolicy: "IfNotPresent"
 args:
 - --kubelet-service=kube-system/prometheus-kubelet
 - --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
 - --config-reloader-cpu-request=200m
 - --config-reloader-cpu-limit=200m
 - --config-reloader-memory-request=50Mi
 - --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
 - --web.enable-tls=true
 - --web.cert-file=/cert/cert
 - --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10252
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10251
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
 port: http-web
 pathPrefix: "/"
 apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
 replicaExternalLabelName: "replica"
 externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
 paused: false
@@ -41126,11 +41368,11 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
 kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
 )
 ) > 0
 for: 15m
@@ -41259,7 +41501,7 @@
 description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
 for: 1h
 labels:
 severity: warning
@@ -41290,7 +41532,7 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
 summary: Job did not complete in time
 expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
 and
 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
 labels:
@@ -41608,7 +41850,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
 summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
 labels:
 severity: warning
 - alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
 summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
 for: 5m
 labels:
 severity: warning
@@ -41880,7 +42122,7 @@
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
 for: 15m
 labels:
 severity: warning
@@ -41953,9 +42195,9 @@
 record: instance:node_memory_utilisation:ratio
 - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
 record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
 sum without (device) (
@@ -42031,10 +42273,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42045,10 +42287,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42156,15 +42398,15 @@
 summary: Clock skew detected.
 expr: |-
 (
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
 and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
 )
 or
 (
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
 and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
 )
 for: 10m
 labels:
@@ -42175,9 +42417,9 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
 and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
 for: 10m
 labels:
 severity: warning
@@ -42186,7 +42428,7 @@
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
 for: 15m
 labels:
 severity: critical
@@ -42195,7 +42437,7 @@
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
 labels:
 severity: warning
 - alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
 summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
 for: 5m
 labels:
 severity: warning
@@ -42620,6 +42862,15 @@
 for: 5m
 labels:
 severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
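Note on the new PrometheusHighQueryLoad rule above: it fires when the average number of in-flight queries over five minutes uses more than 80% of the engine's concurrency limit. A rough ad-hoc check of the same ratio (reusing the job/namespace selectors rendered above) is:

avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m])
  /
max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m])

Values approaching 1 mean the engine is saturating its --query.max-concurrency budget and new queries start to queue.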
@@ -42758,6 +43009,12 @@
 - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 port: https
 scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
 tlsConfig:
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 serverName: kubernetes
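The metricRelabelings block added above drops a fixed set of apiserver_request_duration_seconds_bucket series by matching on the concatenated __name__ and le values, trimming the cardinality of that very wide histogram while leaving the other buckets intact. Which bucket boundaries survive after the change can be checked with a query along the lines of:

count by (le) (apiserver_request_duration_seconds_bucket)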
@@ -42914,6 +43171,32 @@
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 insecureSkipVerify: true
 bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
 relabelings:
 - sourceLabels:
 - __metrics_path__
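The cAdvisor metricRelabelings added above drop several container_* metric families, presumably to cut cAdvisor's series count. The last rule concatenates the id and pod labels (with the default ; separator), so the regex .+; matches series that carry a cgroup id but no pod label, i.e. cgroup-level series not attributable to any pod. A query roughly equivalent to what that rule removes is:

count({__name__=~"container_.+", id!="", pod=""})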
@@ -43002,10 +43285,10 @@
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
 namespace: default
 labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
 app.kubernetes.io/managed-by: Helm
 app.kubernetes.io/instance: kube-prometheus-stack
 app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
 spec:
 containers:
 - name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - create
@@ -43247,7 +43530,7 @@
 spec:
 containers:
 - name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - patch

carpenike-bot[bot] avatar Aug 02 '22 15:08 carpenike-bot[bot]

Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml Version: 35.6.2 -> 39.4.1

@@ -158,6 +158,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -251,6 +252,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -364,6 +366,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -463,6 +466,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -5685,7 +5689,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[5m]))",
+ "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "RPC Rate",
@@ -5694,7 +5698,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))",
+ "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "RPC Failed Rate",
@@ -5941,7 +5945,7 @@
 "steppedLine": true,
 "targets": [
 {
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
 "hide": false,
 "intervalFactor": 2,
 "legendFormat": "{{instance}} WAL fsync",
@@ -5950,7 +5954,7 @@
 "step": 4
 },
 {
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} DB fsync",
 "metric": "etcd_disk_backend_commit_duration_seconds_bucket",
@@ -6108,7 +6112,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[$__rate_interval])",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Client Traffic In",
 "metric": "etcd_network_client_grpc_received_bytes_total",
@@ -6184,7 +6188,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Client Traffic Out",
 "metric": "etcd_network_client_grpc_sent_bytes_total",
@@ -6260,7 +6264,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Peer Traffic In",
 "metric": "etcd_network_peer_received_bytes_total",
@@ -6337,7 +6341,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
 "hide": false,
 "interval": "",
 "intervalFactor": 2,
@@ -6421,7 +6425,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Failure Rate",
 "metric": "etcd_server_proposals_failed_total",
@@ -6437,7 +6441,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Commit Rate",
 "metric": "etcd_server_proposals_committed_total",
@@ -6445,7 +6449,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Apply Rate",
 "refId": "D",
@@ -6566,6 +6570,115 @@
 "show": true
 }
 ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.4.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{job=\"$cluster\"}[$__rate_interval])))",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} Peer round trip time",
+ "metric": "etcd_network_peer_round_trip_time_seconds_bucket",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Peer round trip time",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:925",
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:926",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
 }
 ],
 "title": "New row"
@@ -6574,7 +6687,9 @@
 "schemaVersion": 13,
 "sharedCrosshair": false,
 "style": "dark",
- "tags": [],
+ "tags": [
+ "etcd-mixin"
+ ],
 "templating": {
 "list": [
 {
@@ -6583,7 +6698,7 @@
 "value": "Prometheus"
 },
 "hide": 0,
- "label": null,
+ "label": "Data Source",
 "name": "datasource",
 "options": [],
 "query": "prometheus",
@@ -6605,7 +6720,7 @@
 "name": "cluster",
 "options": [],
 "query": "label_values(etcd_server_has_leader, job)",
- "refresh": 1,
+ "refresh": 2,
 "regex": "",
 "sort": 2,
 "tagValuesQuery": "",
@@ -11355,7 +11470,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11444,7 +11559,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11705,7 +11820,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11714,7 +11829,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11723,7 +11838,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11732,7 +11847,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11741,7 +11856,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11750,7 +11865,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14120,7 +14235,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14209,7 +14324,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14470,7 +14585,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14479,7 +14594,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14488,7 +14603,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14497,7 +14612,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14506,7 +14621,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14515,7 +14630,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17371,7 +17486,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17379,7 +17494,7 @@
 "step": 10
 },
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17468,7 +17583,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17476,7 +17591,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17928,7 +18043,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17937,7 +18052,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17946,7 +18061,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17955,7 +18070,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17964,7 +18079,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17973,7 +18088,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -30449,21 +30564,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -30511,111 +30626,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -31396,21 +31623,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -31458,111 +31685,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -39322,13 +39661,27 @@
 containerPort: 9100
 protocol: TCP
 livenessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 readinessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 resources: {}
 volumeMounts:
 - name: proc
@@ -39400,7 +39753,7 @@
 - --metric-labels-allowlist=persistentvolumeclaims=[*]
 - --telemetry-port=8081
 imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
 ports:
 - containerPort: 8080
 name: "http"
@@ -39448,17 +39801,17 @@
 spec:
 containers:
 - name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
 imagePullPolicy: "IfNotPresent"
 args:
 - --kubelet-service=kube-system/prometheus-kubelet
 - --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
 - --config-reloader-cpu-request=200m
 - --config-reloader-cpu-limit=200m
 - --config-reloader-memory-request=50Mi
 - --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
 - --web.enable-tls=true
 - --web.cert-file=/cert/cert
 - --web.key-file=/cert/key
@@ -39631,7 +39984,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10252
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +40056,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10251
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40113,8 @@
 port: http-web
 pathPrefix: "/"
 apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
 replicaExternalLabelName: "replica"
 externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
 paused: false
@@ -40001,61 +40354,84 @@
 groups:
 - name: etcd
 rules:
+ - alert: etcdMembersDown
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
+ summary: etcd cluster members are down.
+ expr: |-
+ max without (endpoint) (
+ sum without (instance) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count without (To) (
+ sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
+ )
+ )
+ > 0
+ for: 10m
+ labels:
+ severity: critical
 - alert: etcdInsufficientMembers
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
- expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
+ summary: etcd cluster has insufficient number of members.
+ expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
 for: 3m
 labels:
 severity: critical
 - alert: etcdNoLeader
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ summary: etcd cluster has no leader.
 expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
 for: 1m
 labels:
 severity: critical
 - alert: etcdHighNumberOfLeaderChanges
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.'
- expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
- for: 15m
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ summary: etcd cluster has high number of leader changes.
+ expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
+ for: 5m
 labels:
 severity: warning
 - alert: etcdHighNumberOfFailedGRPCRequests
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
 expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
 /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
 > 1
 for: 10m
 labels:
 severity: warning
 - alert: etcdHighNumberOfFailedGRPCRequests
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
 expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
 /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
 > 5
 for: 5m
 labels:
 severity: critical
 - alert: etcdGRPCRequestsSlow
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
+ summary: etcd grpc requests are slow
 expr: |-
- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
 > 0.15
 for: 10m
 labels:
 severity: critical
 - alert: etcdMemberCommunicationSlow
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster member communication is slow.
 expr: |-
 histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.15
@@ -40064,53 +40440,64 @@
 severity: warning
 - alert: etcdHighNumberOfFailedProposals
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of proposal failures.
 expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
 for: 15m
 labels:
 severity: warning
 - alert: etcdHighFsyncDurations
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
 expr: |-
 histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.5
 for: 10m
 labels:
 severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 1
+ for: 10m
+ labels:
+ severity: critical
 - alert: etcdHighCommitDurations
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile commit durations are too high.
 expr: |-
 histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.25
 for: 10m
 labels:
 severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ - alert: etcdDatabaseQuotaLowSpace
 annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.01
+ description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+ summary: etcd cluster database is running full.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
 for: 10m
 labels:
- severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ severity: critical
+ - alert: etcdExcessiveDatabaseGrowth
 annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.05
+ description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
+ summary: etcd cluster database growing very fast.
+ expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
 for: 10m
 labels:
- severity: critical
- - alert: etcdHTTPRequestsSlow
+ severity: warning
+ - alert: etcdDatabaseHighFragmentationRatio
 annotations:
- message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
- expr: |-
- histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
- > 0.15
+ description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
+ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
+ summary: etcd database size in use is less than 50% of the actual allocated storage.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
 for: 10m
 labels:
 severity: warning
@@ -41126,11 +41513,11 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
 kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
 )
 ) > 0
 for: 15m
@@ -41259,7 +41646,7 @@
 description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
 for: 1h
 labels:
 severity: warning
@@ -41290,7 +41677,7 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
 summary: Job did not complete in time
 expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
 and
 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
 labels:
@@ -41608,7 +41995,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
 summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
 labels:
 severity: warning
 - alert: KubeAggregatedAPIDown
@@ -41616,7 +42003,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
 summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
 for: 5m
 labels:
 severity: warning
@@ -41880,7 +42267,7 @@
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
 for: 15m
 labels:
 severity: warning
@@ -41953,9 +42340,9 @@
 record: instance:node_memory_utilisation:ratio
 - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
 record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
 sum without (device) (
@@ -42031,10 +42418,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42045,10 +42432,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42156,15 +42543,15 @@
 summary: Clock skew detected.
 expr: |-
 (
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
 and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
 )
 or
 (
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
 and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
 )
 for: 10m
 labels:
@@ -42175,9 +42562,9 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
 and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
 for: 10m
 labels:
 severity: warning
@@ -42186,7 +42573,7 @@
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
 for: 15m
 labels:
 severity: critical
@@ -42195,7 +42582,7 @@
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
 labels:
 severity: warning
 - alert: NodeFileDescriptorLimit
@@ -42363,7 +42750,7 @@
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
 summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
 for: 5m
 labels:
 severity: warning
@@ -42620,6 +43007,15 @@
 for: 5m
 labels:
 severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43154,12 @@
 - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 port: https
 scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
 tlsConfig:
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 serverName: kubernetes
@@ -42914,6 +43316,32 @@
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 insecureSkipVerify: true
 bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
 relabelings:
 - sourceLabels:
 - __metrics_path__
@@ -43002,10 +43430,10 @@
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
 namespace: default
 labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
 app.kubernetes.io/managed-by: Helm
 app.kubernetes.io/instance: kube-prometheus-stack
 app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43630,7 @@
 spec:
 containers:
 - name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - create
@@ -43247,7 +43675,7 @@
 spec:
 containers:
 - name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - patch

carpenike-bot[bot] avatar Aug 05 '22 17:08 carpenike-bot[bot]

Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml Version: 35.6.2 -> 39.5.0

@@ -158,6 +158,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -251,6 +252,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -364,6 +366,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -463,6 +466,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -5685,7 +5689,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[5m]))",
+ "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "RPC Rate",
@@ -5694,7 +5698,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))",
+ "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "RPC Failed Rate",
@@ -5941,7 +5945,7 @@
 "steppedLine": true,
 "targets": [
 {
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
 "hide": false,
 "intervalFactor": 2,
 "legendFormat": "{{instance}} WAL fsync",
@@ -5950,7 +5954,7 @@
 "step": 4
 },
 {
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} DB fsync",
 "metric": "etcd_disk_backend_commit_duration_seconds_bucket",
@@ -6108,7 +6112,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[$__rate_interval])",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Client Traffic In",
 "metric": "etcd_network_client_grpc_received_bytes_total",
@@ -6184,7 +6188,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Client Traffic Out",
 "metric": "etcd_network_client_grpc_sent_bytes_total",
@@ -6260,7 +6264,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Peer Traffic In",
 "metric": "etcd_network_peer_received_bytes_total",
@@ -6337,7 +6341,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
 "hide": false,
 "interval": "",
 "intervalFactor": 2,
@@ -6421,7 +6425,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Failure Rate",
 "metric": "etcd_server_proposals_failed_total",
@@ -6437,7 +6441,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Commit Rate",
 "metric": "etcd_server_proposals_committed_total",
@@ -6445,7 +6449,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Apply Rate",
 "refId": "D",
@@ -6566,6 +6570,115 @@
 "show": true
 }
 ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.4.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{job=\"$cluster\"}[$__rate_interval])))",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} Peer round trip time",
+ "metric": "etcd_network_peer_round_trip_time_seconds_bucket",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Peer round trip time",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:925",
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:926",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
 }
 ],
 "title": "New row"
@@ -6574,7 +6687,9 @@
 "schemaVersion": 13,
 "sharedCrosshair": false,
 "style": "dark",
- "tags": [],
+ "tags": [
+ "etcd-mixin"
+ ],
 "templating": {
 "list": [
 {
@@ -6583,7 +6698,7 @@
 "value": "Prometheus"
 },
 "hide": 0,
- "label": null,
+ "label": "Data Source",
 "name": "datasource",
 "options": [],
 "query": "prometheus",
@@ -6605,7 +6720,7 @@
 "name": "cluster",
 "options": [],
 "query": "label_values(etcd_server_has_leader, job)",
- "refresh": 1,
+ "refresh": 2,
 "regex": "",
 "sort": 2,
 "tagValuesQuery": "",
@@ -11355,7 +11470,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11444,7 +11559,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11705,7 +11820,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11714,7 +11829,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11723,7 +11838,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11732,7 +11847,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11741,7 +11856,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11750,7 +11865,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14120,7 +14235,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14209,7 +14324,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14470,7 +14585,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14479,7 +14594,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14488,7 +14603,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14497,7 +14612,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14506,7 +14621,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14515,7 +14630,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17371,7 +17486,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17379,7 +17494,7 @@
 "step": 10
 },
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17468,7 +17583,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17476,7 +17591,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17928,7 +18043,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17937,7 +18052,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17946,7 +18061,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17955,7 +18070,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17964,7 +18079,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17973,7 +18088,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -30449,21 +30564,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -30511,111 +30626,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -31396,21 +31623,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -31458,111 +31685,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -39322,13 +39661,27 @@
 containerPort: 9100
 protocol: TCP
 livenessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 readinessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 resources: {}
 volumeMounts:
 - name: proc
@@ -39400,7 +39753,7 @@
 - --metric-labels-allowlist=persistentvolumeclaims=[*]
 - --telemetry-port=8081
 imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
 ports:
 - containerPort: 8080
 name: "http"
@@ -39448,17 +39801,17 @@
 spec:
 containers:
 - name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
 imagePullPolicy: "IfNotPresent"
 args:
 - --kubelet-service=kube-system/prometheus-kubelet
 - --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
 - --config-reloader-cpu-request=200m
 - --config-reloader-cpu-limit=200m
 - --config-reloader-memory-request=50Mi
 - --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
 - --web.enable-tls=true
 - --web.cert-file=/cert/cert
 - --web.key-file=/cert/key
@@ -39631,7 +39984,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10252
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +40056,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10251
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40113,8 @@
 port: http-web
 pathPrefix: "/"
 apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
 replicaExternalLabelName: "replica"
 externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
 paused: false
@@ -40001,61 +40354,84 @@
 groups:
 - name: etcd
 rules:
+ - alert: etcdMembersDown
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
+ summary: etcd cluster members are down.
+ expr: |-
+ max without (endpoint) (
+ sum without (instance) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count without (To) (
+ sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
+ )
+ )
+ > 0
+ for: 10m
+ labels:
+ severity: critical
 - alert: etcdInsufficientMembers
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
- expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
+ summary: etcd cluster has insufficient number of members.
+ expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
 for: 3m
 labels:
 severity: critical
 - alert: etcdNoLeader
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ summary: etcd cluster has no leader.
 expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
 for: 1m
 labels:
 severity: critical
 - alert: etcdHighNumberOfLeaderChanges
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.'
- expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
- for: 15m
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ summary: etcd cluster has high number of leader changes.
+ expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
+ for: 5m
 labels:
 severity: warning
 - alert: etcdHighNumberOfFailedGRPCRequests
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
 expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
 /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
 > 1
 for: 10m
 labels:
 severity: warning
 - alert: etcdHighNumberOfFailedGRPCRequests
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
 expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
 /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
 > 5
 for: 5m
 labels:
 severity: critical
 - alert: etcdGRPCRequestsSlow
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
+ summary: etcd grpc requests are slow
 expr: |-
- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
 > 0.15
 for: 10m
 labels:
 severity: critical
 - alert: etcdMemberCommunicationSlow
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster member communication is slow.
 expr: |-
 histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.15
@@ -40064,53 +40440,64 @@
 severity: warning
 - alert: etcdHighNumberOfFailedProposals
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of proposal failures.
 expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
 for: 15m
 labels:
 severity: warning
 - alert: etcdHighFsyncDurations
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
 expr: |-
 histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.5
 for: 10m
 labels:
 severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 1
+ for: 10m
+ labels:
+ severity: critical
 - alert: etcdHighCommitDurations
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile commit durations are too high.
 expr: |-
 histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.25
 for: 10m
 labels:
 severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ - alert: etcdDatabaseQuotaLowSpace
 annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.01
+ description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+ summary: etcd cluster database is running full.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
 for: 10m
 labels:
- severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ severity: critical
+ - alert: etcdExcessiveDatabaseGrowth
 annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.05
+ description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
+ summary: etcd cluster database growing very fast.
+ expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
 for: 10m
 labels:
- severity: critical
- - alert: etcdHTTPRequestsSlow
+ severity: warning
+ - alert: etcdDatabaseHighFragmentationRatio
 annotations:
- message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
- expr: |-
- histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
- > 0.15
+ description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
+ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
+ summary: etcd database size in use is less than 50% of the actual allocated storage.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
 for: 10m
 labels:
 severity: warning
@@ -41126,11 +41513,11 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
 kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
 )
 ) > 0
 for: 15m
@@ -41259,7 +41646,7 @@
 description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
 for: 1h
 labels:
 severity: warning
@@ -41290,7 +41677,7 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
 summary: Job did not complete in time
 expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
 and
 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
 labels:
@@ -41608,7 +41995,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
 summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
 labels:
 severity: warning
 - alert: KubeAggregatedAPIDown
@@ -41616,7 +42003,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
 summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
 for: 5m
 labels:
 severity: warning
@@ -41880,7 +42267,7 @@
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
 for: 15m
 labels:
 severity: warning
@@ -41953,9 +42340,9 @@
 record: instance:node_memory_utilisation:ratio
 - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
 record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
 sum without (device) (
@@ -42031,10 +42418,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42045,10 +42432,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42156,15 +42543,15 @@
 summary: Clock skew detected.
 expr: |-
 (
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
 and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
 )
 or
 (
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
 and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
 )
 for: 10m
 labels:
@@ -42175,9 +42562,9 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
 and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
 for: 10m
 labels:
 severity: warning
@@ -42186,7 +42573,7 @@
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
 for: 15m
 labels:
 severity: critical
@@ -42195,7 +42582,7 @@
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
 labels:
 severity: warning
 - alert: NodeFileDescriptorLimit
@@ -42363,7 +42750,7 @@
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
 summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
 for: 5m
 labels:
 severity: warning
@@ -42620,6 +43007,15 @@
 for: 5m
 labels:
 severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43154,12 @@
 - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 port: https
 scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
 tlsConfig:
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 serverName: kubernetes
@@ -42914,6 +43316,32 @@
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 insecureSkipVerify: true
 bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
 relabelings:
 - sourceLabels:
 - __metrics_path__
@@ -43002,10 +43430,10 @@
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
 namespace: default
 labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
 app.kubernetes.io/managed-by: Helm
 app.kubernetes.io/instance: kube-prometheus-stack
 app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43630,7 @@
 spec:
 containers:
 - name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - create
@@ -43247,7 +43675,7 @@
 spec:
 containers:
 - name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - patch

carpenike-bot[bot] avatar Aug 07 '22 11:08 carpenike-bot[bot]
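
A note on the recurring device-regex change in the rendered dashboards and recording rules above: the matcher is widened from "mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+" to "(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)", presumably so the same expressions keep matching whether or not the device label is reported with a "/dev/" prefix. A minimal sketch to sanity-check the new pattern (Python; the sample label values are hypothetical, and fullmatch is used because Prometheus anchors =~ label matchers at both ends):

import re

# Updated device regex copied from the rendered rules in this PR.
DEVICE_RE = re.compile(r"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)")

# Hypothetical sample device label values, with and without the /dev/ prefix.
samples = ["sda1", "/dev/sda1", "nvme0n1", "/dev/nvme0n1", "dm-0", "loop0"]
for device in samples:
    # fullmatch mirrors Prometheus's implicit ^...$ anchoring of =~ matchers.
    print(f"{device}: {'match' if DEVICE_RE.fullmatch(device) else 'no match'}")
# Expected: every sample matches except loop0, which the rules intentionally ignore.
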

Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml Version: 35.6.2 -> 39.6.0

@@ -158,6 +158,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -251,6 +252,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -364,6 +366,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -463,6 +466,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -5685,7 +5689,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[5m]))",
+ "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "RPC Rate",
@@ -5694,7 +5698,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))",
+ "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "RPC Failed Rate",
@@ -5941,7 +5945,7 @@
 "steppedLine": true,
 "targets": [
 {
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
 "hide": false,
 "intervalFactor": 2,
 "legendFormat": "{{instance}} WAL fsync",
@@ -5950,7 +5954,7 @@
 "step": 4
 },
 {
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} DB fsync",
 "metric": "etcd_disk_backend_commit_duration_seconds_bucket",
@@ -6108,7 +6112,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[$__rate_interval])",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Client Traffic In",
 "metric": "etcd_network_client_grpc_received_bytes_total",
@@ -6184,7 +6188,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Client Traffic Out",
 "metric": "etcd_network_client_grpc_sent_bytes_total",
@@ -6260,7 +6264,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Peer Traffic In",
 "metric": "etcd_network_peer_received_bytes_total",
@@ -6337,7 +6341,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
 "hide": false,
 "interval": "",
 "intervalFactor": 2,
@@ -6421,7 +6425,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Failure Rate",
 "metric": "etcd_server_proposals_failed_total",
@@ -6437,7 +6441,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Commit Rate",
 "metric": "etcd_server_proposals_committed_total",
@@ -6445,7 +6449,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Apply Rate",
 "refId": "D",
@@ -6566,6 +6570,115 @@
 "show": true
 }
 ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.4.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{job=\"$cluster\"}[$__rate_interval])))",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} Peer round trip time",
+ "metric": "etcd_network_peer_round_trip_time_seconds_bucket",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Peer round trip time",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:925",
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:926",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
 }
 ],
 "title": "New row"
@@ -6574,7 +6687,9 @@
 "schemaVersion": 13,
 "sharedCrosshair": false,
 "style": "dark",
- "tags": [],
+ "tags": [
+ "etcd-mixin"
+ ],
 "templating": {
 "list": [
 {
@@ -6583,7 +6698,7 @@
 "value": "Prometheus"
 },
 "hide": 0,
- "label": null,
+ "label": "Data Source",
 "name": "datasource",
 "options": [],
 "query": "prometheus",
@@ -6605,7 +6720,7 @@
 "name": "cluster",
 "options": [],
 "query": "label_values(etcd_server_has_leader, job)",
- "refresh": 1,
+ "refresh": 2,
 "regex": "",
 "sort": 2,
 "tagValuesQuery": "",
@@ -11355,7 +11470,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11444,7 +11559,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11705,7 +11820,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11714,7 +11829,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11723,7 +11838,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11732,7 +11847,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11741,7 +11856,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11750,7 +11865,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14120,7 +14235,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14209,7 +14324,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14470,7 +14585,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14479,7 +14594,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14488,7 +14603,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14497,7 +14612,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14506,7 +14621,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14515,7 +14630,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17371,7 +17486,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17379,7 +17494,7 @@
 "step": 10
 },
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17468,7 +17583,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17476,7 +17591,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17928,7 +18043,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17937,7 +18052,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17946,7 +18061,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17955,7 +18070,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17964,7 +18079,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17973,7 +18088,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -30449,21 +30564,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -30511,111 +30626,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -31396,21 +31623,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -31458,111 +31685,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -39322,13 +39661,27 @@
 containerPort: 9100
 protocol: TCP
 livenessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 readinessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 resources: {}
 volumeMounts:
 - name: proc
@@ -39400,7 +39753,7 @@
 - --metric-labels-allowlist=persistentvolumeclaims=[*]
 - --telemetry-port=8081
 imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
 ports:
 - containerPort: 8080
 name: "http"
@@ -39448,17 +39801,17 @@
 spec:
 containers:
 - name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
 imagePullPolicy: "IfNotPresent"
 args:
 - --kubelet-service=kube-system/prometheus-kubelet
 - --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
 - --config-reloader-cpu-request=200m
 - --config-reloader-cpu-limit=200m
 - --config-reloader-memory-request=50Mi
 - --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
 - --web.enable-tls=true
 - --web.cert-file=/cert/cert
 - --web.key-file=/cert/key
@@ -39631,7 +39984,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10252
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +40056,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10251
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40113,8 @@
 port: http-web
 pathPrefix: "/"
 apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
 replicaExternalLabelName: "replica"
 externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
 paused: false
@@ -40001,61 +40354,84 @@
 groups:
 - name: etcd
 rules:
+ - alert: etcdMembersDown
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
+ summary: etcd cluster members are down.
+ expr: |-
+ max without (endpoint) (
+ sum without (instance) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count without (To) (
+ sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
+ )
+ )
+ > 0
+ for: 10m
+ labels:
+ severity: critical
 - alert: etcdInsufficientMembers
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
- expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
+ summary: etcd cluster has insufficient number of members.
+ expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
 for: 3m
 labels:
 severity: critical
 - alert: etcdNoLeader
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ summary: etcd cluster has no leader.
 expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
 for: 1m
 labels:
 severity: critical
 - alert: etcdHighNumberOfLeaderChanges
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.'
- expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
- for: 15m
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ summary: etcd cluster has high number of leader changes.
+ expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
+ for: 5m
 labels:
 severity: warning
 - alert: etcdHighNumberOfFailedGRPCRequests
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
 expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
 /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
 > 1
 for: 10m
 labels:
 severity: warning
 - alert: etcdHighNumberOfFailedGRPCRequests
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
 expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
 /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
 > 5
 for: 5m
 labels:
 severity: critical
 - alert: etcdGRPCRequestsSlow
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
+ summary: etcd grpc requests are slow
 expr: |-
- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
 > 0.15
 for: 10m
 labels:
 severity: critical
 - alert: etcdMemberCommunicationSlow
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster member communication is slow.
 expr: |-
 histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.15
@@ -40064,53 +40440,64 @@
 severity: warning
 - alert: etcdHighNumberOfFailedProposals
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of proposal failures.
 expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
 for: 15m
 labels:
 severity: warning
 - alert: etcdHighFsyncDurations
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
 expr: |-
 histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.5
 for: 10m
 labels:
 severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 1
+ for: 10m
+ labels:
+ severity: critical
 - alert: etcdHighCommitDurations
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile commit durations are too high.
 expr: |-
 histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.25
 for: 10m
 labels:
 severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ - alert: etcdDatabaseQuotaLowSpace
 annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.01
+ description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+ summary: etcd cluster database is running full.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
 for: 10m
 labels:
- severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ severity: critical
+ - alert: etcdExcessiveDatabaseGrowth
 annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.05
+ description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
+ summary: etcd cluster database growing very fast.
+ expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
 for: 10m
 labels:
- severity: critical
- - alert: etcdHTTPRequestsSlow
+ severity: warning
+ - alert: etcdDatabaseHighFragmentationRatio
 annotations:
- message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
- expr: |-
- histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
- > 0.15
+ description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
+ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
+ summary: etcd database size in use is less than 50% of the actual allocated storage.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
 for: 10m
 labels:
 severity: warning
@@ -41126,11 +41513,11 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
 kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
 )
 ) > 0
 for: 15m
@@ -41259,7 +41646,7 @@
 description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
 for: 1h
 labels:
 severity: warning
@@ -41290,7 +41677,7 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
 summary: Job did not complete in time
 expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
 and
 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
 labels:
@@ -41608,7 +41995,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
 summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
 labels:
 severity: warning
 - alert: KubeAggregatedAPIDown
@@ -41616,7 +42003,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
 summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
 for: 5m
 labels:
 severity: warning
@@ -41880,7 +42267,7 @@
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
 for: 15m
 labels:
 severity: warning
@@ -41953,9 +42340,9 @@
 record: instance:node_memory_utilisation:ratio
 - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
 record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
 sum without (device) (
@@ -42031,10 +42418,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42045,10 +42432,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42156,15 +42543,15 @@
 summary: Clock skew detected.
 expr: |-
 (
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
 and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
 )
 or
 (
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
 and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
 )
 for: 10m
 labels:
@@ -42175,9 +42562,9 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
 and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
 for: 10m
 labels:
 severity: warning
@@ -42186,7 +42573,7 @@
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
 for: 15m
 labels:
 severity: critical
@@ -42195,7 +42582,7 @@
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
 labels:
 severity: warning
 - alert: NodeFileDescriptorLimit
@@ -42363,7 +42750,7 @@
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
 summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
 for: 5m
 labels:
 severity: warning
@@ -42620,6 +43007,15 @@
 for: 5m
 labels:
 severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43154,12 @@
 - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 port: https
 scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
 tlsConfig:
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 serverName: kubernetes
@@ -42914,6 +43316,32 @@
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 insecureSkipVerify: true
 bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
 relabelings:
 - sourceLabels:
 - __metrics_path__
@@ -43002,10 +43430,10 @@
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
 namespace: default
 labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
 app.kubernetes.io/managed-by: Helm
 app.kubernetes.io/instance: kube-prometheus-stack
 app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43630,7 @@
 spec:
 containers:
 - name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - create
@@ -43247,7 +43675,7 @@
 spec:
 containers:
 - name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - patch

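
Note on the recurring device-regex change in this diff: the selectors move from device=~"mmcblk.p.+|nvme.+|..." to device=~"(/dev/)?(mmcblk.p.+|nvme.+|...)", so the same rule matches device labels reported either with or without a /dev/ prefix. A minimal PromQL sketch of the effect, using a hypothetical node and device name purely for illustration:

# matched by both the old and the new selector
node_disk_read_bytes_total{device="sda", instance="node-1:9100"}
# matched only once the optional (/dev/)? prefix is allowed
node_disk_read_bytes_total{device="/dev/sda", instance="node-1:9100"}
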
carpenike-bot[bot] avatar Aug 10 '22 10:08 carpenike-bot[bot]

Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml Version: 35.6.2 -> 39.7.0

@@ -158,6 +158,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -251,6 +252,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -364,6 +366,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -463,6 +466,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -5685,7 +5689,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[5m]))",
+ "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "RPC Rate",
@@ -5694,7 +5698,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))",
+ "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "RPC Failed Rate",
@@ -5941,7 +5945,7 @@
 "steppedLine": true,
 "targets": [
 {
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
 "hide": false,
 "intervalFactor": 2,
 "legendFormat": "{{instance}} WAL fsync",
@@ -5950,7 +5954,7 @@
 "step": 4
 },
 {
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} DB fsync",
 "metric": "etcd_disk_backend_commit_duration_seconds_bucket",
@@ -6108,7 +6112,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[$__rate_interval])",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Client Traffic In",
 "metric": "etcd_network_client_grpc_received_bytes_total",
@@ -6184,7 +6188,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Client Traffic Out",
 "metric": "etcd_network_client_grpc_sent_bytes_total",
@@ -6260,7 +6264,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Peer Traffic In",
 "metric": "etcd_network_peer_received_bytes_total",
@@ -6337,7 +6341,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
 "hide": false,
 "interval": "",
 "intervalFactor": 2,
@@ -6421,7 +6425,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Failure Rate",
 "metric": "etcd_server_proposals_failed_total",
@@ -6437,7 +6441,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Commit Rate",
 "metric": "etcd_server_proposals_committed_total",
@@ -6445,7 +6449,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Apply Rate",
 "refId": "D",
@@ -6566,6 +6570,115 @@
 "show": true
 }
 ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.4.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{job=\"$cluster\"}[$__rate_interval])))",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} Peer round trip time",
+ "metric": "etcd_network_peer_round_trip_time_seconds_bucket",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Peer round trip time",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:925",
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:926",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
 }
 ],
 "title": "New row"
@@ -6574,7 +6687,9 @@
 "schemaVersion": 13,
 "sharedCrosshair": false,
 "style": "dark",
- "tags": [],
+ "tags": [
+ "etcd-mixin"
+ ],
 "templating": {
 "list": [
 {
@@ -6583,7 +6698,7 @@
 "value": "Prometheus"
 },
 "hide": 0,
- "label": null,
+ "label": "Data Source",
 "name": "datasource",
 "options": [],
 "query": "prometheus",
@@ -6605,7 +6720,7 @@
 "name": "cluster",
 "options": [],
 "query": "label_values(etcd_server_has_leader, job)",
- "refresh": 1,
+ "refresh": 2,
 "regex": "",
 "sort": 2,
 "tagValuesQuery": "",
@@ -11355,7 +11470,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11444,7 +11559,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11705,7 +11820,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11714,7 +11829,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11723,7 +11838,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11732,7 +11847,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11741,7 +11856,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11750,7 +11865,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14120,7 +14235,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14209,7 +14324,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14470,7 +14585,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14479,7 +14594,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14488,7 +14603,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14497,7 +14612,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14506,7 +14621,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14515,7 +14630,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17371,7 +17486,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17379,7 +17494,7 @@
 "step": 10
 },
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17468,7 +17583,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17476,7 +17591,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17928,7 +18043,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17937,7 +18052,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17946,7 +18061,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17955,7 +18070,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17964,7 +18079,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17973,7 +18088,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -30449,21 +30564,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -30511,111 +30626,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -31396,21 +31623,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -31458,111 +31685,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -39322,13 +39661,27 @@
 containerPort: 9100
 protocol: TCP
 livenessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 readinessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 resources: {}
 volumeMounts:
 - name: proc
@@ -39400,7 +39753,7 @@
 - --metric-labels-allowlist=persistentvolumeclaims=[*]
 - --telemetry-port=8081
 imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
 ports:
 - containerPort: 8080
 name: "http"
@@ -39448,17 +39801,17 @@
 spec:
 containers:
 - name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
 imagePullPolicy: "IfNotPresent"
 args:
 - --kubelet-service=kube-system/prometheus-kubelet
 - --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
 - --config-reloader-cpu-request=200m
 - --config-reloader-cpu-limit=200m
 - --config-reloader-memory-request=50Mi
 - --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
 - --web.enable-tls=true
 - --web.cert-file=/cert/cert
 - --web.key-file=/cert/key
@@ -39631,7 +39984,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10252
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +40056,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10251
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40113,8 @@
 port: http-web
 pathPrefix: "/"
 apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
 replicaExternalLabelName: "replica"
 externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
 paused: false
@@ -40001,61 +40354,84 @@
 groups:
 - name: etcd
 rules:
+ - alert: etcdMembersDown
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
+ summary: etcd cluster members are down.
+ expr: |-
+ max without (endpoint) (
+ sum without (instance) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count without (To) (
+ sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
+ )
+ )
+ > 0
+ for: 10m
+ labels:
+ severity: critical
 - alert: etcdInsufficientMembers
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
- expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
+ summary: etcd cluster has insufficient number of members.
+ expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
 for: 3m
 labels:
 severity: critical
 - alert: etcdNoLeader
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ summary: etcd cluster has no leader.
 expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
 for: 1m
 labels:
 severity: critical
 - alert: etcdHighNumberOfLeaderChanges
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.'
- expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
- for: 15m
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ summary: etcd cluster has high number of leader changes.
+ expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
+ for: 5m
 labels:
 severity: warning
 - alert: etcdHighNumberOfFailedGRPCRequests
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
 expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
 /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
 > 1
 for: 10m
 labels:
 severity: warning
 - alert: etcdHighNumberOfFailedGRPCRequests
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
 expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
 /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
 > 5
 for: 5m
 labels:
 severity: critical
 - alert: etcdGRPCRequestsSlow
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
+ summary: etcd grpc requests are slow
 expr: |-
- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
 > 0.15
 for: 10m
 labels:
 severity: critical
 - alert: etcdMemberCommunicationSlow
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster member communication is slow.
 expr: |-
 histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.15
@@ -40064,53 +40440,64 @@
 severity: warning
 - alert: etcdHighNumberOfFailedProposals
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of proposal failures.
 expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
 for: 15m
 labels:
 severity: warning
 - alert: etcdHighFsyncDurations
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
 expr: |-
 histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.5
 for: 10m
 labels:
 severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 1
+ for: 10m
+ labels:
+ severity: critical
 - alert: etcdHighCommitDurations
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile commit durations are too high.
 expr: |-
 histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.25
 for: 10m
 labels:
 severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ - alert: etcdDatabaseQuotaLowSpace
 annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.01
+ description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+ summary: etcd cluster database is running full.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
 for: 10m
 labels:
- severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ severity: critical
+ - alert: etcdExcessiveDatabaseGrowth
 annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.05
+ description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
+ summary: etcd cluster database growing very fast.
+ expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
 for: 10m
 labels:
- severity: critical
- - alert: etcdHTTPRequestsSlow
+ severity: warning
+ - alert: etcdDatabaseHighFragmentationRatio
 annotations:
- message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
- expr: |-
- histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
- > 0.15
+ description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
+ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
+ summary: etcd database size in use is less than 50% of the actual allocated storage.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
 for: 10m
 labels:
 severity: warning
@@ -41126,11 +41513,11 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
 kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
 )
 ) > 0
 for: 15m
@@ -41259,7 +41646,7 @@
 description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
 for: 1h
 labels:
 severity: warning
@@ -41290,7 +41677,7 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
 summary: Job did not complete in time
 expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
 and
 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
 labels:
@@ -41608,7 +41995,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
 summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
 labels:
 severity: warning
 - alert: KubeAggregatedAPIDown
@@ -41616,7 +42003,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
 summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
 for: 5m
 labels:
 severity: warning
@@ -41880,7 +42267,7 @@
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
 for: 15m
 labels:
 severity: warning
@@ -41953,9 +42340,9 @@
 record: instance:node_memory_utilisation:ratio
 - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
 record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
 sum without (device) (
@@ -42031,10 +42418,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42045,10 +42432,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42156,15 +42543,15 @@
 summary: Clock skew detected.
 expr: |-
 (
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
 and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
 )
 or
 (
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
 and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
 )
 for: 10m
 labels:
@@ -42175,9 +42562,9 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
 and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
 for: 10m
 labels:
 severity: warning
@@ -42186,7 +42573,7 @@
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
 for: 15m
 labels:
 severity: critical
@@ -42195,7 +42582,7 @@
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
 labels:
 severity: warning
 - alert: NodeFileDescriptorLimit
@@ -42363,7 +42750,7 @@
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
 summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
 for: 5m
 labels:
 severity: warning
@@ -42620,6 +43007,15 @@
 for: 5m
 labels:
 severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43154,12 @@
 - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 port: https
 scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
 tlsConfig:
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 serverName: kubernetes
@@ -42914,6 +43316,32 @@
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 insecureSkipVerify: true
 bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
 relabelings:
 - sourceLabels:
 - __metrics_path__
@@ -43002,10 +43430,10 @@
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
 namespace: default
 labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
 app.kubernetes.io/managed-by: Helm
 app.kubernetes.io/instance: kube-prometheus-stack
 app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43630,7 @@
 spec:
 containers:
 - name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - create
@@ -43247,7 +43675,7 @@
 spec:
 containers:
 - name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - patch

carpenike-bot[bot] avatar Aug 16 '22 09:08 carpenike-bot[bot]
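
Much of the dashboard and recording-rule churn in the diff above is the device-label matcher changing from `mmcblk.p.+|nvme.+|...` to `(/dev/)?(mmcblk.p.+|nvme.+|...)`, which tolerates exporters that report full `/dev/...` device paths. A minimal sketch for sanity-checking the new pattern against live series (assumption: run ad hoc in the cluster's Prometheus query UI, it is not part of the chart; metric and label names are taken from the diff):

    # Count devices selected by the updated matcher; an empty result would mean the
    # regex no longer matches this node-exporter's device labels.
    count by (device) (
      node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}
    )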

Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml Version: 35.6.2 -> 39.8.0

@@ -158,6 +158,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -251,6 +252,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -364,6 +366,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -463,6 +466,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -5685,7 +5689,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[5m]))",
+ "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "RPC Rate",
@@ -5694,7 +5698,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))",
+ "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "RPC Failed Rate",
@@ -5941,7 +5945,7 @@
 "steppedLine": true,
 "targets": [
 {
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
 "hide": false,
 "intervalFactor": 2,
 "legendFormat": "{{instance}} WAL fsync",
@@ -5950,7 +5954,7 @@
 "step": 4
 },
 {
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} DB fsync",
 "metric": "etcd_disk_backend_commit_duration_seconds_bucket",
@@ -6108,7 +6112,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[$__rate_interval])",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Client Traffic In",
 "metric": "etcd_network_client_grpc_received_bytes_total",
@@ -6184,7 +6188,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Client Traffic Out",
 "metric": "etcd_network_client_grpc_sent_bytes_total",
@@ -6260,7 +6264,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Peer Traffic In",
 "metric": "etcd_network_peer_received_bytes_total",
@@ -6337,7 +6341,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
 "hide": false,
 "interval": "",
 "intervalFactor": 2,
@@ -6421,7 +6425,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Failure Rate",
 "metric": "etcd_server_proposals_failed_total",
@@ -6437,7 +6441,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Commit Rate",
 "metric": "etcd_server_proposals_committed_total",
@@ -6445,7 +6449,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Apply Rate",
 "refId": "D",
@@ -6566,6 +6570,115 @@
 "show": true
 }
 ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.4.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{job=\"$cluster\"}[$__rate_interval])))",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} Peer round trip time",
+ "metric": "etcd_network_peer_round_trip_time_seconds_bucket",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Peer round trip time",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:925",
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:926",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
 }
 ],
 "title": "New row"
@@ -6574,7 +6687,9 @@
 "schemaVersion": 13,
 "sharedCrosshair": false,
 "style": "dark",
- "tags": [],
+ "tags": [
+ "etcd-mixin"
+ ],
 "templating": {
 "list": [
 {
@@ -6583,7 +6698,7 @@
 "value": "Prometheus"
 },
 "hide": 0,
- "label": null,
+ "label": "Data Source",
 "name": "datasource",
 "options": [],
 "query": "prometheus",
@@ -6605,7 +6720,7 @@
 "name": "cluster",
 "options": [],
 "query": "label_values(etcd_server_has_leader, job)",
- "refresh": 1,
+ "refresh": 2,
 "regex": "",
 "sort": 2,
 "tagValuesQuery": "",
@@ -11355,7 +11470,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11444,7 +11559,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11705,7 +11820,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11714,7 +11829,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11723,7 +11838,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11732,7 +11847,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11741,7 +11856,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11750,7 +11865,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14120,7 +14235,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14209,7 +14324,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14470,7 +14585,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14479,7 +14594,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14488,7 +14603,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14497,7 +14612,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14506,7 +14621,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14515,7 +14630,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17371,7 +17486,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17379,7 +17494,7 @@
 "step": 10
 },
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17468,7 +17583,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17476,7 +17591,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17928,7 +18043,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17937,7 +18052,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17946,7 +18061,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17955,7 +18070,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17964,7 +18079,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17973,7 +18088,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -30449,21 +30564,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -30511,111 +30626,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -31396,21 +31623,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -31458,111 +31685,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -39322,13 +39661,27 @@
 containerPort: 9100
 protocol: TCP
 livenessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 readinessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 resources: {}
 volumeMounts:
 - name: proc
@@ -39400,7 +39753,7 @@
 - --metric-labels-allowlist=persistentvolumeclaims=[*]
 - --telemetry-port=8081
 imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
 ports:
 - containerPort: 8080
 name: "http"
@@ -39448,17 +39801,17 @@
 spec:
 containers:
 - name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
 imagePullPolicy: "IfNotPresent"
 args:
 - --kubelet-service=kube-system/prometheus-kubelet
 - --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
 - --config-reloader-cpu-request=200m
 - --config-reloader-cpu-limit=200m
 - --config-reloader-memory-request=50Mi
 - --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
 - --web.enable-tls=true
 - --web.cert-file=/cert/cert
 - --web.key-file=/cert/key
@@ -39631,7 +39984,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10252
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +40056,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10251
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40113,8 @@
 port: http-web
 pathPrefix: "/"
 apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
 replicaExternalLabelName: "replica"
 externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
 paused: false
@@ -40001,61 +40354,84 @@
 groups:
 - name: etcd
 rules:
+ - alert: etcdMembersDown
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
+ summary: etcd cluster members are down.
+ expr: |-
+ max without (endpoint) (
+ sum without (instance) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count without (To) (
+ sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
+ )
+ )
+ > 0
+ for: 10m
+ labels:
+ severity: critical
 - alert: etcdInsufficientMembers
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
- expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
+ summary: etcd cluster has insufficient number of members.
+ expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
 for: 3m
 labels:
 severity: critical
 - alert: etcdNoLeader
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ summary: etcd cluster has no leader.
 expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
 for: 1m
 labels:
 severity: critical
 - alert: etcdHighNumberOfLeaderChanges
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.'
- expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
- for: 15m
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ summary: etcd cluster has high number of leader changes.
+ expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
+ for: 5m
 labels:
 severity: warning
 - alert: etcdHighNumberOfFailedGRPCRequests
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
 expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
 /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
 > 1
 for: 10m
 labels:
 severity: warning
 - alert: etcdHighNumberOfFailedGRPCRequests
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
 expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
 /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
 > 5
 for: 5m
 labels:
 severity: critical
 - alert: etcdGRPCRequestsSlow
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
+ summary: etcd grpc requests are slow
 expr: |-
- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
 > 0.15
 for: 10m
 labels:
 severity: critical
 - alert: etcdMemberCommunicationSlow
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster member communication is slow.
 expr: |-
 histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.15
@@ -40064,53 +40440,64 @@
 severity: warning
 - alert: etcdHighNumberOfFailedProposals
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of proposal failures.
 expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
 for: 15m
 labels:
 severity: warning
 - alert: etcdHighFsyncDurations
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
 expr: |-
 histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.5
 for: 10m
 labels:
 severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 1
+ for: 10m
+ labels:
+ severity: critical
 - alert: etcdHighCommitDurations
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile commit durations are too high.
 expr: |-
 histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.25
 for: 10m
 labels:
 severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ - alert: etcdDatabaseQuotaLowSpace
 annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.01
+ description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+ summary: etcd cluster database is running full.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
 for: 10m
 labels:
- severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ severity: critical
+ - alert: etcdExcessiveDatabaseGrowth
 annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.05
+ description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
+ summary: etcd cluster database growing very fast.
+ expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
 for: 10m
 labels:
- severity: critical
- - alert: etcdHTTPRequestsSlow
+ severity: warning
+ - alert: etcdDatabaseHighFragmentationRatio
 annotations:
- message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
- expr: |-
- histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
- > 0.15
+ description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
+ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
+ summary: etcd database size in use is less than 50% of the actual allocated storage.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
 for: 10m
 labels:
 severity: warning
@@ -41126,11 +41513,11 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
 kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
 )
 ) > 0
 for: 15m
@@ -41259,7 +41646,7 @@
 description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
 for: 1h
 labels:
 severity: warning
@@ -41290,7 +41677,7 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
 summary: Job did not complete in time
 expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
 and
 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
 labels:
@@ -41608,7 +41995,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
 summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
 labels:
 severity: warning
 - alert: KubeAggregatedAPIDown
@@ -41616,7 +42003,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
 summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
 for: 5m
 labels:
 severity: warning
@@ -41880,7 +42267,7 @@
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
 for: 15m
 labels:
 severity: warning
@@ -41953,9 +42340,9 @@
 record: instance:node_memory_utilisation:ratio
 - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
 record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
 sum without (device) (
@@ -42031,10 +42418,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42045,10 +42432,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42156,15 +42543,15 @@
 summary: Clock skew detected.
 expr: |-
 (
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
 and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
 )
 or
 (
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
 and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
 )
 for: 10m
 labels:
@@ -42175,9 +42562,9 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
 and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
 for: 10m
 labels:
 severity: warning
@@ -42186,7 +42573,7 @@
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
 for: 15m
 labels:
 severity: critical
@@ -42195,7 +42582,7 @@
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
 labels:
 severity: warning
 - alert: NodeFileDescriptorLimit
@@ -42363,7 +42750,7 @@
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
 summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
 for: 5m
 labels:
 severity: warning
@@ -42620,6 +43007,15 @@
 for: 5m
 labels:
 severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43154,12 @@
 - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 port: https
 scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
 tlsConfig:
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 serverName: kubernetes
@@ -42914,6 +43316,32 @@
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 insecureSkipVerify: true
 bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
 relabelings:
 - sourceLabels:
 - __metrics_path__
@@ -43002,10 +43430,10 @@
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
 namespace: default
 labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
 app.kubernetes.io/managed-by: Helm
 app.kubernetes.io/instance: kube-prometheus-stack
 app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43630,7 @@
 spec:
 containers:
 - name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - create
@@ -43247,7 +43675,7 @@
 spec:
 containers:
 - name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - patch

carpenike-bot[bot] avatar Aug 17 '22 12:08 carpenike-bot[bot]
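
Note on the device regex changes in the diffs above: the cAdvisor and node-exporter expressions now use (/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+), so device labels are matched with or without a /dev/ prefix. A minimal sketch to check the anchored matching locally (Python here is only an assumption for illustration, not part of the chart):

    import re

    # Prometheus label matchers (device=~"...") are fully anchored, so fullmatch mimics them.
    DEVICE_RE = re.compile(r"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)")

    for device in ("sda1", "/dev/sda1", "nvme0n1", "loop0"):
        print(device, bool(DEVICE_RE.fullmatch(device)))
    # sda1 and /dev/sda1 both match; loop0 stays excluded.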

Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml Version: 35.6.2 -> 39.9.0

@@ -158,6 +158,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -251,6 +252,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -364,6 +366,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -463,6 +466,7 @@
 "dashLength": 10,
 "dashes": false,
 "datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
 "fill": 1,
 "fillGradient": 0,
 "gridPos": {
@@ -5685,7 +5689,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[5m]))",
+ "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "RPC Rate",
@@ -5694,7 +5698,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))",
+ "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "RPC Failed Rate",
@@ -5941,7 +5945,7 @@
 "steppedLine": true,
 "targets": [
 {
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
 "hide": false,
 "intervalFactor": 2,
 "legendFormat": "{{instance}} WAL fsync",
@@ -5950,7 +5954,7 @@
 "step": 4
 },
 {
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} DB fsync",
 "metric": "etcd_disk_backend_commit_duration_seconds_bucket",
@@ -6108,7 +6112,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[$__rate_interval])",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Client Traffic In",
 "metric": "etcd_network_client_grpc_received_bytes_total",
@@ -6184,7 +6188,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Client Traffic Out",
 "metric": "etcd_network_client_grpc_sent_bytes_total",
@@ -6260,7 +6264,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
 "intervalFactor": 2,
 "legendFormat": "{{instance}} Peer Traffic In",
 "metric": "etcd_network_peer_received_bytes_total",
@@ -6337,7 +6341,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
 "hide": false,
 "interval": "",
 "intervalFactor": 2,
@@ -6421,7 +6425,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Failure Rate",
 "metric": "etcd_server_proposals_failed_total",
@@ -6437,7 +6441,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Commit Rate",
 "metric": "etcd_server_proposals_committed_total",
@@ -6445,7 +6449,7 @@
 "step": 2
 },
 {
- "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[$__rate_interval]))",
 "intervalFactor": 2,
 "legendFormat": "Proposal Apply Rate",
 "refId": "D",
@@ -6566,6 +6570,115 @@
 "show": true
 }
 ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.4.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{job=\"$cluster\"}[$__rate_interval])))",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} Peer round trip time",
+ "metric": "etcd_network_peer_round_trip_time_seconds_bucket",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Peer round trip time",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:925",
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:926",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
 }
 ],
 "title": "New row"
@@ -6574,7 +6687,9 @@
 "schemaVersion": 13,
 "sharedCrosshair": false,
 "style": "dark",
- "tags": [],
+ "tags": [
+ "etcd-mixin"
+ ],
 "templating": {
 "list": [
 {
@@ -6583,7 +6698,7 @@
 "value": "Prometheus"
 },
 "hide": 0,
- "label": null,
+ "label": "Data Source",
 "name": "datasource",
 "options": [],
 "query": "prometheus",
@@ -6605,7 +6720,7 @@
 "name": "cluster",
 "options": [],
 "query": "label_values(etcd_server_has_leader, job)",
- "refresh": 1,
+ "refresh": 2,
 "regex": "",
 "sort": 2,
 "tagValuesQuery": "",
@@ -11355,7 +11470,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11444,7 +11559,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{namespace}}",
@@ -11705,7 +11820,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11714,7 +11829,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11723,7 +11838,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11732,7 +11847,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11741,7 +11856,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -11750,7 +11865,7 @@
 "step": 10
 },
 {
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14120,7 +14235,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14209,7 +14324,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{pod}}",
@@ -14470,7 +14585,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14479,7 +14594,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14488,7 +14603,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14497,7 +14612,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14506,7 +14621,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -14515,7 +14630,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17371,7 +17486,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17379,7 +17494,7 @@
 "step": 10
 },
 {
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17468,7 +17583,7 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Reads",
@@ -17476,7 +17591,7 @@
 "step": 10
 },
 {
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "Writes",
@@ -17928,7 +18043,7 @@
 ],
 "targets": [
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17937,7 +18052,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17946,7 +18061,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17955,7 +18070,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17964,7 +18079,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -17973,7 +18088,7 @@
 "step": 10
 },
 {
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
 "format": "table",
 "instant": true,
 "intervalFactor": 2,
@@ -30449,21 +30564,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -30511,111 +30626,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -31396,21 +31623,21 @@
 "steppedLine": false,
 "targets": [
 {
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} read",
 "refId": "A"
 },
 {
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} written",
 "refId": "B"
 },
 {
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
 "format": "time_series",
 "intervalFactor": 2,
 "legendFormat": "{{device}} io time",
@@ -31458,111 +31685,223 @@
 ]
 },
 {
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
 
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
 },
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
 "gridPos": {
 
 },
 "id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
 "span": 6,
- "stack": true,
- "steppedLine": false,
 "targets": [
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
 },
 {
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
 "intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
 }
 ],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
 "title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
 
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
 },
 {
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
 }
- ]
+ ],
+ "transparent": false,
+ "type": "table"
 }
 ],
 "repeat": null,
@@ -39322,13 +39661,27 @@
 containerPort: 9100
 protocol: TCP
 livenessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 readinessProbe:
+ failureThreshold: 3
 httpGet:
+ httpHeaders:
 path: /
 port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
 resources: {}
 volumeMounts:
 - name: proc
@@ -39400,7 +39753,7 @@
 - --metric-labels-allowlist=persistentvolumeclaims=[*]
 - --telemetry-port=8081
 imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
 ports:
 - containerPort: 8080
 name: "http"
@@ -39448,17 +39801,17 @@
 spec:
 containers:
 - name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
 imagePullPolicy: "IfNotPresent"
 args:
 - --kubelet-service=kube-system/prometheus-kubelet
 - --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
 - --config-reloader-cpu-request=200m
 - --config-reloader-cpu-limit=200m
 - --config-reloader-memory-request=50Mi
 - --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
 - --web.enable-tls=true
 - --web.cert-file=/cert/cert
 - --web.key-file=/cert/key
@@ -39631,7 +39984,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10252
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +40056,7 @@
 - ip: 10.20.0.22
 ports:
 - name: http-metrics
- port:
+ port: 10251
 protocol: TCP
 ---
 # Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39736,6 +40089,7 @@
 namespace: default
 name: prometheus-operator
 path: /admission-prometheusrules/mutate
+ timeoutSeconds: 10
 admissionReviewVersions: ["v1", "v1beta1"]
 sideEffects: None
 ---
@@ -39760,8 +40114,8 @@
 port: http-web
 pathPrefix: "/"
 apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
 replicaExternalLabelName: "replica"
 externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
 paused: false
@@ -40001,61 +40355,84 @@
 groups:
 - name: etcd
 rules:
+ - alert: etcdMembersDown
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
+ summary: etcd cluster members are down.
+ expr: |-
+ max without (endpoint) (
+ sum without (instance) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count without (To) (
+ sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
+ )
+ )
+ > 0
+ for: 10m
+ labels:
+ severity: critical
 - alert: etcdInsufficientMembers
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
- expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
+ summary: etcd cluster has insufficient number of members.
+ expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
 for: 3m
 labels:
 severity: critical
 - alert: etcdNoLeader
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ summary: etcd cluster has no leader.
 expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
 for: 1m
 labels:
 severity: critical
 - alert: etcdHighNumberOfLeaderChanges
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.'
- expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
- for: 15m
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ summary: etcd cluster has high number of leader changes.
+ expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
+ for: 5m
 labels:
 severity: warning
 - alert: etcdHighNumberOfFailedGRPCRequests
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
 expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
 /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
 > 1
 for: 10m
 labels:
 severity: warning
 - alert: etcdHighNumberOfFailedGRPCRequests
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
 expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
 /
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
 > 5
 for: 5m
 labels:
 severity: critical
 - alert: etcdGRPCRequestsSlow
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
+ summary: etcd grpc requests are slow
 expr: |-
- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
 > 0.15
 for: 10m
 labels:
 severity: critical
 - alert: etcdMemberCommunicationSlow
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster member communication is slow.
 expr: |-
 histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.15
@@ -40064,53 +40441,64 @@
 severity: warning
 - alert: etcdHighNumberOfFailedProposals
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of proposal failures.
 expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
 for: 15m
 labels:
 severity: warning
 - alert: etcdHighFsyncDurations
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
 expr: |-
 histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.5
 for: 10m
 labels:
 severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 1
+ for: 10m
+ labels:
+ severity: critical
 - alert: etcdHighCommitDurations
 annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile commit durations are too high.
 expr: |-
 histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
 > 0.25
 for: 10m
 labels:
 severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ - alert: etcdDatabaseQuotaLowSpace
 annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.01
+ description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+ summary: etcd cluster database is running full.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
 for: 10m
 labels:
- severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ severity: critical
+ - alert: etcdExcessiveDatabaseGrowth
 annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.05
+ description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
+ summary: etcd cluster database growing very fast.
+ expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
 for: 10m
 labels:
- severity: critical
- - alert: etcdHTTPRequestsSlow
+ severity: warning
+ - alert: etcdDatabaseHighFragmentationRatio
 annotations:
- message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
- expr: |-
- histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
- > 0.15
+ description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
+ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
+ summary: etcd database size in use is less than 50% of the actual allocated storage.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
 for: 10m
 labels:
 severity: warning
@@ -41126,11 +41514,11 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
 summary: Pod has been in a non-ready state for more than 15 minutes.
 expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
 kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
 )
 ) > 0
 for: 15m
@@ -41259,7 +41647,7 @@
 description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
 summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
 for: 1h
 labels:
 severity: warning
@@ -41290,7 +41678,7 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
 summary: Job did not complete in time
 expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
 and
 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
 labels:
@@ -41608,7 +41996,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
 summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
 labels:
 severity: warning
 - alert: KubeAggregatedAPIDown
@@ -41616,7 +42004,7 @@
 description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
 summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
 for: 5m
 labels:
 severity: warning
@@ -41880,7 +42268,7 @@
 description: There are {{ $value }} different semantic versions of Kubernetes components running.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
 summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
 for: 15m
 labels:
 severity: warning
@@ -41953,9 +42341,9 @@
 record: instance:node_memory_utilisation:ratio
 - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
 record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
 record: instance_device:node_disk_io_time_weighted_seconds:rate5m
 - expr: |-
 sum without (device) (
@@ -42031,10 +42419,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42045,10 +42433,10 @@
 annotations:
 description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
 expr: |-
 (
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
 and
 node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
 )
@@ -42156,15 +42544,15 @@
 summary: Clock skew detected.
 expr: |-
 (
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
 and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
 )
 or
 (
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
 and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
 )
 for: 10m
 labels:
@@ -42175,9 +42563,9 @@
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
 summary: Clock not synchronising.
 expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
 and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
 for: 10m
 labels:
 severity: warning
@@ -42186,7 +42574,7 @@
 description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
 summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
 for: 15m
 labels:
 severity: critical
@@ -42195,7 +42583,7 @@
 description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
 summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
 labels:
 severity: warning
 - alert: NodeFileDescriptorLimit
@@ -42363,7 +42751,7 @@
 description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
 summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
 for: 5m
 labels:
 severity: warning
@@ -42620,6 +43008,15 @@
 for: 5m
 labels:
 severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
 - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
 annotations:
 description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43155,12 @@
 - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
 port: https
 scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
 tlsConfig:
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 serverName: kubernetes
@@ -42914,6 +43317,32 @@
 caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
 insecureSkipVerify: true
 bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
 relabelings:
 - sourceLabels:
 - __metrics_path__
@@ -43002,10 +43431,10 @@
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
 namespace: default
 labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
 app.kubernetes.io/managed-by: Helm
 app.kubernetes.io/instance: kube-prometheus-stack
 app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43631,7 @@
 spec:
 containers:
 - name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - create
@@ -43247,7 +43676,7 @@
 spec:
 containers:
 - name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
 imagePullPolicy: IfNotPresent
 args:
 - patch

carpenike-bot[bot] avatar Aug 21 '22 09:08 carpenike-bot[bot]
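
Side note for reviewers: several of the recording rules and RAID alerts in this update widen the node-exporter device regex from `mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+` to `(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)`, so device labels reported with a `/dev/` prefix keep matching. A minimal standalone sketch of that behaviour follows — illustrative only, using Python's `re` with explicit anchors (PromQL label matchers are fully anchored), and the sample device names are hypothetical:

```python
import re

# Device selectors as they appear in the chart's node-exporter rules.
# The optional "(/dev/)?" prefix is the relevant change in this upgrade.
OLD = re.compile(r"^(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)$")
NEW = re.compile(r"^(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)$")

# Hypothetical device labels as node-exporter might report them.
samples = ["nvme0n1", "/dev/nvme0n1", "sda", "/dev/sda1", "dm-0", "loop0"]

for dev in samples:
    print(f"{dev:>13}  old={bool(OLD.match(dev))}  new={bool(NEW.match(dev))}")
```

Running it shows `/dev/nvme0n1` and `/dev/sda1` matching only the new pattern (the point of the change), while `loop0` still matches neither.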