k8s-gitops
feat!: update helm chart kube-prometheus-stack to 39.1.0
This PR contains the following updates:
| Package | Update | Change |
|---|---|---|
| kube-prometheus-stack | major | 35.6.2 -> 39.4.0 |
Configuration
📅 Schedule: Branch creation - At any time (no schedule defined), Automerge - At any time (no schedule defined).
🚦 Automerge: Disabled by config. Please merge this manually once you are satisfied.
♻ Rebasing: Renovate will not automatically rebase this PR, because other commits have been found.
🔕 Ignore: Close this PR and you won't be reminded about this update again.
- [ ] If you want to rebase/retry this PR, click this checkbox. ⚠ Warning: custom changes will be lost.
This PR has been generated by Renovate Bot.
Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml
Version: 35.6.2 -> 39.0.0
@@ -158,6 +158,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -251,6 +252,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -364,6 +366,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -463,6 +466,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -11355,7 +11359,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11444,7 +11448,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11705,7 +11709,7 @@
],
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11714,7 +11718,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11723,7 +11727,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11732,7 +11736,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11741,7 +11745,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11750,7 +11754,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14120,7 +14124,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14209,7 +14213,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14470,7 +14474,7 @@
],
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14479,7 +14483,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14488,7 +14492,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14497,7 +14501,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14506,7 +14510,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14515,7 +14519,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17371,7 +17375,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17379,7 +17383,7 @@
"step": 10
},
{
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17468,7 +17472,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17476,7 +17480,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
],
"targets": [
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17937,7 +17941,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17946,7 +17950,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17955,7 +17959,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17964,7 +17968,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17973,7 +17977,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -30449,21 +30453,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -31396,21 +31512,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -39322,13 +39550,27 @@
containerPort: 9100
protocol: TCP
livenessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
readinessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
resources: {}
volumeMounts:
- name: proc
@@ -39400,7 +39642,7 @@
- --metric-labels-allowlist=persistentvolumeclaims=[*]
- --telemetry-port=8081
imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
ports:
- containerPort: 8080
name: "http"
@@ -39448,17 +39690,17 @@
spec:
containers:
- name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
imagePullPolicy: "IfNotPresent"
args:
- --kubelet-service=kube-system/prometheus-kubelet
- --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
- --config-reloader-cpu-request=200m
- --config-reloader-cpu-limit=200m
- --config-reloader-memory-request=50Mi
- --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
- --web.enable-tls=true
- --web.cert-file=/cert/cert
- --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10252
protocol: TCP
---
# Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10251
protocol: TCP
---
# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
port: http-web
pathPrefix: "/"
apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
replicaExternalLabelName: "replica"
externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
paused: false
@@ -41126,11 +41368,11 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
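The KubePodNotReady expression now aggregates by `cluster` as well; the `* on(...) group_left(owner_kind)` join it relies on keeps a non-ready pod only when `kube_pod_owner` reports an owner other than a Job. A rough Python sketch of that join, with made-up pod names:

```python
# Illustrative only: the effect of the "* on(namespace, pod, cluster) group_left(owner_kind)"
# join - a non-ready pod alerts only if its owner_kind is not "Job".
non_ready = {("default", "web-0"): 1, ("default", "migrate-abc"): 1}  # phase Pending|Unknown
owners = {("default", "web-0"): "StatefulSet", ("default", "migrate-abc"): "Job"}

alerting = {
    key: value
    for key, value in non_ready.items()
    if key in owners and owners[key] != "Job"  # kube_pod_owner{owner_kind!="Job"}
}
print(alerting)  # {('default', 'web-0'): 1} -> only the StatefulSet-owned pod can alert
```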
@@ -41259,7 +41501,7 @@
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
@@ -41290,7 +41532,7 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
@@ -41608,7 +41850,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
@@ -41880,7 +42122,7 @@
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@@ -41953,9 +42195,9 @@
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
@@ -42031,10 +42273,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42045,10 +42287,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42156,15 +42398,15 @@
summary: Clock skew detected.
expr: |-
(
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
@@ -42175,9 +42417,9 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
@@ -42186,7 +42428,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
@@ -42195,7 +42437,7 @@
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
@@ -42620,6 +42862,15 @@
for: 5m
labels:
severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
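Among the new rules, PrometheusHighQueryLoad watches how close the query engine is to its concurrency limit. A hedged sketch of the arithmetic, with made-up numbers:

```python
# Illustrative only, with made-up values: the ratio the new PrometheusHighQueryLoad
# rule computes. It fires (severity: warning) once the ratio stays above 0.8 for 15m.
avg_running = 17.0      # avg_over_time(prometheus_engine_queries[5m])
max_concurrent = 20.0   # max_over_time(prometheus_engine_queries_concurrent_max[5m])

utilisation = avg_running / max_concurrent
print(utilisation, utilisation > 0.8)  # 0.85 True -> pending, fires after 15 minutes
```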
@@ -42758,6 +43009,12 @@
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: https
scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
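The new apiserver metricRelabelings drop a set of high-cardinality histogram buckets. Prometheus joins the sourceLabels (`__name__` and `le`) with the default `;` separator and drops the series when the fully anchored regex matches; a small illustrative simulation in Python:

```python
import re

# Illustrative only: sourceLabels are joined with ";" and the relabel regex is
# fully anchored, hence fullmatch().
DROP = re.compile(
    r"apiserver_request_duration_seconds_bucket;"
    r"(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)"
)

for name, le in [
    ("apiserver_request_duration_seconds_bucket", "0.45"),  # dropped
    ("apiserver_request_duration_seconds_bucket", "0.5"),   # kept
    ("apiserver_request_total", "0.45"),                    # kept (different metric)
]:
    joined = f"{name};{le}"
    print(joined, "->", "drop" if DROP.fullmatch(joined) else "keep")
```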
@@ -42914,6 +43171,32 @@
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
relabelings:
- sourceLabels:
- __metrics_path__
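The last cAdvisor drop rule in the hunk above (`regex: .+;` over `id` and `pod`) removes machine-level cgroup series that carry an `id` but no `pod` label; a small illustrative check:

```python
import re

# Illustrative only (made-up label values): "id" and "pod" are joined with ";".
# A series with an id but an empty pod joins to "<id>;" and matches ".+;" (fully
# anchored), so it is dropped; pod-scoped series keep their pod suffix and stay.
drop = re.compile(r".+;")
print(bool(drop.fullmatch("/system.slice/containerd.service;")))   # True  -> dropped
print(bool(drop.fullmatch("/kubepods/pod1234;prometheus-k8s-0")))  # False -> kept
```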
@@ -43002,10 +43285,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
namespace: default
labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
spec:
containers:
- name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- create
@@ -43247,7 +43530,7 @@
spec:
containers:
- name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- patch
Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml
Version: 35.6.2 -> 39.1.0
@@ -158,6 +158,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -251,6 +252,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -364,6 +366,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -463,6 +466,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -11355,7 +11359,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11444,7 +11448,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11705,7 +11709,7 @@
],
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11714,7 +11718,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11723,7 +11727,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11732,7 +11736,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11741,7 +11745,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11750,7 +11754,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14120,7 +14124,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14209,7 +14213,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14470,7 +14474,7 @@
],
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14479,7 +14483,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14488,7 +14492,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14497,7 +14501,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14506,7 +14510,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14515,7 +14519,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17371,7 +17375,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17379,7 +17383,7 @@
"step": 10
},
{
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17468,7 +17472,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17476,7 +17480,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
],
"targets": [
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17937,7 +17941,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17946,7 +17950,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17955,7 +17959,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17964,7 +17968,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17973,7 +17977,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -30449,21 +30453,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -31396,21 +31512,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -39322,13 +39550,27 @@
containerPort: 9100
protocol: TCP
livenessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
readinessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
resources: {}
volumeMounts:
- name: proc
@@ -39400,7 +39642,7 @@
- --metric-labels-allowlist=persistentvolumeclaims=[*]
- --telemetry-port=8081
imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
ports:
- containerPort: 8080
name: "http"
@@ -39448,17 +39690,17 @@
spec:
containers:
- name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
imagePullPolicy: "IfNotPresent"
args:
- --kubelet-service=kube-system/prometheus-kubelet
- --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
- --config-reloader-cpu-request=200m
- --config-reloader-cpu-limit=200m
- --config-reloader-memory-request=50Mi
- --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
- --web.enable-tls=true
- --web.cert-file=/cert/cert
- --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10252
protocol: TCP
---
# Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10251
protocol: TCP
---
# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
port: http-web
pathPrefix: "/"
apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
replicaExternalLabelName: "replica"
externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
paused: false
@@ -41126,11 +41368,11 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
-            ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
-              1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+            ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+              1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
@@ -41259,7 +41501,7 @@
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
@@ -41290,7 +41532,7 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
@@ -41608,7 +41850,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
-        expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+        expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
@@ -41880,7 +42122,7 @@
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@@ -41953,9 +42195,9 @@
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
@@ -42031,10 +42273,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
expr: |-
(
-          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42045,10 +42287,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
expr: |-
(
-          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42156,15 +42398,15 @@
summary: Clock skew detected.
expr: |-
(
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
@@ -42175,9 +42417,9 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
@@ -42186,7 +42428,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
@@ -42195,7 +42437,7 @@
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
@@ -42620,6 +42862,15 @@
for: 5m
labels:
severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43009,12 @@
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: https
scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
@@ -42914,6 +43171,32 @@
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
relabelings:
- sourceLabels:
- __metrics_path__
@@ -43002,10 +43285,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
namespace: default
labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
spec:
containers:
- name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- create
@@ -43247,7 +43530,7 @@
spec:
containers:
- name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- patch
Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml
Version: 35.6.2 -> 39.0.0
@@ -158,6 +158,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -251,6 +252,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -364,6 +366,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -463,6 +466,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -11355,7 +11359,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11444,7 +11448,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11705,7 +11709,7 @@
],
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11714,7 +11718,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11723,7 +11727,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11732,7 +11736,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11741,7 +11745,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11750,7 +11754,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14120,7 +14124,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14209,7 +14213,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14470,7 +14474,7 @@
],
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14479,7 +14483,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14488,7 +14492,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14497,7 +14501,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14506,7 +14510,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14515,7 +14519,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17371,7 +17375,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17379,7 +17383,7 @@
"step": 10
},
{
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17468,7 +17472,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17476,7 +17480,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
],
"targets": [
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17937,7 +17941,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17946,7 +17950,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17955,7 +17959,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17964,7 +17968,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17973,7 +17977,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -30449,21 +30453,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -31396,21 +31512,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -39322,13 +39550,27 @@
containerPort: 9100
protocol: TCP
livenessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
readinessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
resources: {}
volumeMounts:
- name: proc
@@ -39400,7 +39642,7 @@
- --metric-labels-allowlist=persistentvolumeclaims=[*]
- --telemetry-port=8081
imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
ports:
- containerPort: 8080
name: "http"
@@ -39448,17 +39690,17 @@
spec:
containers:
- name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
imagePullPolicy: "IfNotPresent"
args:
- --kubelet-service=kube-system/prometheus-kubelet
- --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
- --config-reloader-cpu-request=200m
- --config-reloader-cpu-limit=200m
- --config-reloader-memory-request=50Mi
- --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
- --web.enable-tls=true
- --web.cert-file=/cert/cert
- --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10252
protocol: TCP
---
# Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10251
protocol: TCP
---
# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
port: http-web
pathPrefix: "/"
apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
replicaExternalLabelName: "replica"
externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
paused: false
@@ -41126,11 +41368,11 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
@@ -41259,7 +41501,7 @@
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
@@ -41290,7 +41532,7 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
@@ -41608,7 +41850,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
@@ -41880,7 +42122,7 @@
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@@ -41953,9 +42195,9 @@
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
@@ -42031,10 +42273,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} README.md Taskfile.yml default docs k8s secrets 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} README.md Taskfile.yml default docs k8s secrets 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42045,10 +42287,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} README.md Taskfile.yml default docs k8s secrets 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} README.md Taskfile.yml default docs k8s secrets 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42156,15 +42398,15 @@
summary: Clock skew detected.
expr: |-
(
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
@@ -42175,9 +42417,9 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
@@ -42186,7 +42428,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
@@ -42195,7 +42437,7 @@
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
@@ -42620,6 +42862,15 @@
for: 5m
labels:
severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43009,12 @@
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: https
scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
@@ -42914,6 +43171,32 @@
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
relabelings:
- sourceLabels:
- __metrics_path__
@@ -43002,10 +43285,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
namespace: default
labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
spec:
containers:
- name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- create
@@ -43247,7 +43530,7 @@
spec:
containers:
- name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- patch
Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml
Version: 35.6.2 -> 39.1.0
@@ -158,6 +158,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -251,6 +252,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -364,6 +366,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -463,6 +466,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -11355,7 +11359,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11444,7 +11448,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11705,7 +11709,7 @@
],
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11714,7 +11718,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11723,7 +11727,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11732,7 +11736,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11741,7 +11745,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11750,7 +11754,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14120,7 +14124,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14209,7 +14213,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14470,7 +14474,7 @@
],
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14479,7 +14483,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14488,7 +14492,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14497,7 +14501,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14506,7 +14510,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14515,7 +14519,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17371,7 +17375,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17379,7 +17383,7 @@
"step": 10
},
{
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17468,7 +17472,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17476,7 +17480,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
],
"targets": [
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17937,7 +17941,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17946,7 +17950,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17955,7 +17959,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17964,7 +17968,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17973,7 +17977,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -30449,21 +30453,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -31396,21 +31512,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -39322,13 +39550,27 @@
containerPort: 9100
protocol: TCP
livenessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
readinessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
resources: {}
volumeMounts:
- name: proc
@@ -39400,7 +39642,7 @@
- --metric-labels-allowlist=persistentvolumeclaims=[*]
- --telemetry-port=8081
imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
ports:
- containerPort: 8080
name: "http"
@@ -39448,17 +39690,17 @@
spec:
containers:
- name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
imagePullPolicy: "IfNotPresent"
args:
- --kubelet-service=kube-system/prometheus-kubelet
- --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
- --config-reloader-cpu-request=200m
- --config-reloader-cpu-limit=200m
- --config-reloader-memory-request=50Mi
- --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
- --web.enable-tls=true
- --web.cert-file=/cert/cert
- --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10252
protocol: TCP
---
# Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10251
protocol: TCP
---
# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
port: http-web
pathPrefix: "/"
apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
replicaExternalLabelName: "replica"
externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
paused: false
@@ -41126,11 +41368,11 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
@@ -41259,7 +41501,7 @@
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
@@ -41290,7 +41532,7 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
@@ -41608,7 +41850,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
@@ -41880,7 +42122,7 @@
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@@ -41953,9 +42195,9 @@
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
@@ -42031,10 +42273,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42045,10 +42287,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42156,15 +42398,15 @@
summary: Clock skew detected.
expr: |-
(
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
@@ -42175,9 +42417,9 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
@@ -42186,7 +42428,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
@@ -42195,7 +42437,7 @@
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
@@ -42620,6 +42862,15 @@
for: 5m
labels:
severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43009,12 @@
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: https
scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
@@ -42914,6 +43171,32 @@
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
relabelings:
- sourceLabels:
- __metrics_path__
@@ -43002,10 +43285,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
namespace: default
labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
spec:
containers:
- name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- create
@@ -43247,7 +43530,7 @@
spec:
containers:
- name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- patch
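
Reviewer note (not part of the rendered chart output above): the change that recurs throughout these dashboards and recording rules is the disk device selector being widened from `mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+` to `(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)`, so device labels match whether exporters report bare kernel names (`sda`, `nvme0n1`) or a `/dev/`-prefixed form. A minimal sketch for sanity-checking the new pattern against sample label values, using only the Python standard library (the sample device names are illustrative, not taken from this cluster):

```python
import re

# The widened device regex used throughout the regenerated dashboards and rules.
DEVICE_RE = re.compile(r"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)")

# Hypothetical device label values as they might appear on node-exporter / cAdvisor metrics.
samples = ["sda", "/dev/sda1", "nvme0n1", "/dev/nvme0n1p2", "dm-0", "loop0", "tmpfs"]

for device in samples:
    # Prometheus anchors =~ label matchers, so emulate that with fullmatch() rather than search().
    matched = DEVICE_RE.fullmatch(device) is not None
    print(f"{device:20} matched={matched}")
```

Block devices such as `loop0` and pseudo-filesystems such as `tmpfs` stay excluded, which matches the intent of the original, unprefixed pattern.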
Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml
Version: 35.6.2 -> 39.2.0
@@ -158,6 +158,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -251,6 +252,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -364,6 +366,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -463,6 +466,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -11355,7 +11359,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11444,7 +11448,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11705,7 +11709,7 @@
],
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11714,7 +11718,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11723,7 +11727,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11732,7 +11736,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11741,7 +11745,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11750,7 +11754,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14120,7 +14124,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14209,7 +14213,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14470,7 +14474,7 @@
],
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14479,7 +14483,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14488,7 +14492,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14497,7 +14501,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14506,7 +14510,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14515,7 +14519,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17371,7 +17375,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17379,7 +17383,7 @@
"step": 10
},
{
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17468,7 +17472,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17476,7 +17480,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
],
"targets": [
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17937,7 +17941,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17946,7 +17950,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17955,7 +17959,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17964,7 +17968,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17973,7 +17977,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -30449,21 +30453,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -31396,21 +31512,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -39322,13 +39550,27 @@
containerPort: 9100
protocol: TCP
livenessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
readinessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
resources: {}
volumeMounts:
- name: proc
@@ -39400,7 +39642,7 @@
- --metric-labels-allowlist=persistentvolumeclaims=[*]
- --telemetry-port=8081
imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
ports:
- containerPort: 8080
name: "http"
@@ -39448,17 +39690,17 @@
spec:
containers:
- name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
imagePullPolicy: "IfNotPresent"
args:
- --kubelet-service=kube-system/prometheus-kubelet
- --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
- --config-reloader-cpu-request=200m
- --config-reloader-cpu-limit=200m
- --config-reloader-memory-request=50Mi
- --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
- --web.enable-tls=true
- --web.cert-file=/cert/cert
- --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10252
protocol: TCP
---
# Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10251
protocol: TCP
---
# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
port: http-web
pathPrefix: "/"
apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
replicaExternalLabelName: "replica"
externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
paused: false
@@ -41126,11 +41368,11 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
@@ -41259,7 +41501,7 @@
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
@@ -41290,7 +41532,7 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
@@ -41608,7 +41850,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
@@ -41880,7 +42122,7 @@
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@@ -41953,9 +42195,9 @@
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
@@ -42031,10 +42273,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42045,10 +42287,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42156,15 +42398,15 @@
summary: Clock skew detected.
expr: |-
(
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
@@ -42175,9 +42417,9 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
@@ -42186,7 +42428,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
@@ -42195,7 +42437,7 @@
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
@@ -42620,6 +42862,15 @@
for: 5m
labels:
severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
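Editor's note: the new PrometheusHighQueryLoad alert added above compares the average number of in-flight queries with the configured query concurrency limit and fires when the engine has been above 80% of capacity for 15 minutes. A rough sketch of the ratio it evaluates, with illustrative numbers:

```python
# Illustrative only: mirrors avg_over_time(prometheus_engine_queries[5m])
#                    / max_over_time(prometheus_engine_queries_concurrent_max[5m]) > 0.8
avg_inflight_queries = 17.0   # hypothetical 5m average of prometheus_engine_queries
max_concurrent_limit = 20.0   # hypothetical prometheus_engine_queries_concurrent_max

load = avg_inflight_queries / max_concurrent_limit
print(f"query engine load: {load:.0%}",
      "-> PrometheusHighQueryLoad fires" if load > 0.8 else "-> ok")
```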
@@ -42758,6 +43009,12 @@
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: https
scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
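Editor's note: the apiserver ServiceMonitor now drops a subset of `apiserver_request_duration_seconds_bucket` series. The listed `sourceLabels` (`__name__`, `le`) are joined with the default `;` separator and tested against the regex, so only the named buckets are discarded while the remaining buckets, `_sum` and `_count` survive. A small sketch of that relabeling decision (assuming the default separator; sample series are invented):

```python
import re

# Mirrors the drop rule: sourceLabels [__name__, le] joined by ";".
DROP_RE = re.compile(
    r"apiserver_request_duration_seconds_bucket;"
    r"(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)"
)

samples = [
    {"__name__": "apiserver_request_duration_seconds_bucket", "le": "0.2"},  # dropped
    {"__name__": "apiserver_request_duration_seconds_bucket", "le": "0.5"},  # kept
    {"__name__": "apiserver_request_duration_seconds_count", "le": ""},      # kept
]
for labels in samples:
    joined = ";".join([labels["__name__"], labels["le"]])
    action = "drop" if DROP_RE.fullmatch(joined) else "keep"
    print(f"{joined!r} -> {action}")
```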
@@ -42914,6 +43171,32 @@
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
relabelings:
- sourceLabels:
- __metrics_path__
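Editor's note: the kubelet/cAdvisor endpoint likewise gains metricRelabelings that drop several high-cardinality `container_*` families, plus any series that has an `id` label but no `pod` label — the final rule joins `id;pod` and matches `.+;`, i.e. a non-empty id followed by an empty pod, which catches cgroup-only series. A hedged sketch of that last rule (sample label sets are invented):

```python
import re

# Last cAdvisor drop rule: sourceLabels [id, pod], regex ".+;"
CGROUP_ONLY_RE = re.compile(r".+;")

samples = [
    {"id": "/kubepods/burstable/pod123/abc", "pod": "grafana-0"},  # kept: pod label present
    {"id": "/system.slice/containerd.service", "pod": ""},         # dropped: no pod label
]
for labels in samples:
    joined = ";".join([labels["id"], labels["pod"]])
    action = "drop" if CGROUP_ONLY_RE.fullmatch(joined) else "keep"
    print(f"{joined!r} -> {action}")
```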
@@ -43002,10 +43285,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
namespace: default
labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
spec:
containers:
- name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- create
@@ -43247,7 +43530,7 @@
spec:
containers:
- name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- patch
Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml
Version: 35.6.2 -> 39.2.1
@@ -158,6 +158,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -251,6 +252,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -364,6 +366,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -463,6 +466,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -11355,7 +11359,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11444,7 +11448,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11705,7 +11709,7 @@
],
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11714,7 +11718,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11723,7 +11727,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11732,7 +11736,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11741,7 +11745,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11750,7 +11754,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14120,7 +14124,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14209,7 +14213,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14470,7 +14474,7 @@
],
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14479,7 +14483,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14488,7 +14492,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14497,7 +14501,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14506,7 +14510,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14515,7 +14519,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17371,7 +17375,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17379,7 +17383,7 @@
"step": 10
},
{
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17468,7 +17472,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17476,7 +17480,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
],
"targets": [
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17937,7 +17941,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17946,7 +17950,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17955,7 +17959,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17964,7 +17968,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17973,7 +17977,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -30449,21 +30453,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -31396,21 +31512,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -39322,13 +39550,27 @@
containerPort: 9100
protocol: TCP
livenessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
readinessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
resources: {}
volumeMounts:
- name: proc
@@ -39400,7 +39642,7 @@
- --metric-labels-allowlist=persistentvolumeclaims=[*]
- --telemetry-port=8081
imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
ports:
- containerPort: 8080
name: "http"
@@ -39448,17 +39690,17 @@
spec:
containers:
- name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
imagePullPolicy: "IfNotPresent"
args:
- --kubelet-service=kube-system/prometheus-kubelet
- --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
- --config-reloader-cpu-request=200m
- --config-reloader-cpu-limit=200m
- --config-reloader-memory-request=50Mi
- --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
- --web.enable-tls=true
- --web.cert-file=/cert/cert
- --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10252
protocol: TCP
---
# Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10251
protocol: TCP
---
# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
port: http-web
pathPrefix: "/"
apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
replicaExternalLabelName: "replica"
externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
paused: false
@@ -41126,11 +41368,11 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
@@ -41259,7 +41501,7 @@
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
@@ -41290,7 +41532,7 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
@@ -41608,7 +41850,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
@@ -41880,7 +42122,7 @@
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@@ -41953,9 +42195,9 @@
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
@@ -42031,10 +42273,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42045,10 +42287,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42156,15 +42398,15 @@
summary: Clock skew detected.
expr: |-
(
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
@@ -42175,9 +42417,9 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
@@ -42186,7 +42428,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
@@ -42195,7 +42437,7 @@
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
@@ -42620,6 +42862,15 @@
for: 5m
labels:
severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43009,12 @@
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: https
scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
@@ -42914,6 +43171,32 @@
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
relabelings:
- sourceLabels:
- __metrics_path__
@@ -43002,10 +43285,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
namespace: default
labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
spec:
containers:
- name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- create
@@ -43247,7 +43530,7 @@
spec:
containers:
- name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- patch
Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml
Version: 35.6.2 -> 39.4.0
@@ -158,6 +158,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -251,6 +252,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -364,6 +366,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -463,6 +466,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -11355,7 +11359,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11444,7 +11448,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11705,7 +11709,7 @@
],
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11714,7 +11718,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11723,7 +11727,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11732,7 +11736,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11741,7 +11745,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11750,7 +11754,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14120,7 +14124,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14209,7 +14213,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14470,7 +14474,7 @@
],
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14479,7 +14483,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14488,7 +14492,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14497,7 +14501,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14506,7 +14510,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14515,7 +14519,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17371,7 +17375,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17379,7 +17383,7 @@
"step": 10
},
{
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17468,7 +17472,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17476,7 +17480,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17928,7 +17932,7 @@
],
"targets": [
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17937,7 +17941,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17946,7 +17950,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17955,7 +17959,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17964,7 +17968,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17973,7 +17977,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -30449,21 +30453,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -30511,111 +30515,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -31396,21 +31512,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -31458,111 +31574,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -39322,13 +39550,27 @@
containerPort: 9100
protocol: TCP
livenessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
readinessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
resources: {}
volumeMounts:
- name: proc
@@ -39400,7 +39642,7 @@
- --metric-labels-allowlist=persistentvolumeclaims=[*]
- --telemetry-port=8081
imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
ports:
- containerPort: 8080
name: "http"
@@ -39448,17 +39690,17 @@
spec:
containers:
- name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
imagePullPolicy: "IfNotPresent"
args:
- --kubelet-service=kube-system/prometheus-kubelet
- --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
- --config-reloader-cpu-request=200m
- --config-reloader-cpu-limit=200m
- --config-reloader-memory-request=50Mi
- --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
- --web.enable-tls=true
- --web.cert-file=/cert/cert
- --web.key-file=/cert/key
@@ -39631,7 +39873,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10252
protocol: TCP
---
# Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +39945,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10251
protocol: TCP
---
# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40002,8 @@
port: http-web
pathPrefix: "/"
apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
replicaExternalLabelName: "replica"
externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
paused: false
@@ -41126,11 +41368,11 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
@@ -41259,7 +41501,7 @@
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
@@ -41290,7 +41532,7 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
@@ -41608,7 +41850,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
@@ -41616,7 +41858,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
@@ -41880,7 +42122,7 @@
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@@ -41953,9 +42195,9 @@
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
@@ -42031,10 +42273,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42045,10 +42287,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42156,15 +42398,15 @@
summary: Clock skew detected.
expr: |-
(
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
@@ -42175,9 +42417,9 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
@@ -42186,7 +42428,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
@@ -42195,7 +42437,7 @@
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
@@ -42363,7 +42605,7 @@
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
@@ -42620,6 +42862,15 @@
for: 5m
labels:
severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43009,12 @@
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: https
scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
@@ -42914,6 +43171,32 @@
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
relabelings:
- sourceLabels:
- __metrics_path__
@@ -43002,10 +43285,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
namespace: default
labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43485,7 @@
spec:
containers:
- name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- create
@@ -43247,7 +43530,7 @@
spec:
containers:
- name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- patch
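
Note on the device-regex change that recurs throughout the diffs above and below: the cadvisor and node-exporter disk expressions now accept an optional `/dev/` prefix, so hosts that report device labels either way are matched. A minimal PromQL sketch of the widened matcher (the metric and label values here are illustrative only, not taken from this cluster):

```promql
# Matches device="sda1" as well as device="/dev/sda1"
# (Prometheus regex matchers are fully anchored).
rate(node_disk_read_bytes_total{
  job="node-exporter",
  device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"
}[5m])
```
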
Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml
Version: 35.6.2 -> 39.4.1
@@ -158,6 +158,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -251,6 +252,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -364,6 +366,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -463,6 +466,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -5685,7 +5689,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[5m]))",
+ "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "RPC Rate",
@@ -5694,7 +5698,7 @@
"step": 2
},
{
- "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))",
+ "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "RPC Failed Rate",
@@ -5941,7 +5945,7 @@
"steppedLine": true,
"targets": [
{
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{instance}} WAL fsync",
@@ -5950,7 +5954,7 @@
"step": 4
},
{
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
"intervalFactor": 2,
"legendFormat": "{{instance}} DB fsync",
"metric": "etcd_disk_backend_commit_duration_seconds_bucket",
@@ -6108,7 +6112,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[$__rate_interval])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic In",
"metric": "etcd_network_client_grpc_received_bytes_total",
@@ -6184,7 +6188,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic Out",
"metric": "etcd_network_client_grpc_sent_bytes_total",
@@ -6260,7 +6264,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
"intervalFactor": 2,
"legendFormat": "{{instance}} Peer Traffic In",
"metric": "etcd_network_peer_received_bytes_total",
@@ -6337,7 +6341,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
"hide": false,
"interval": "",
"intervalFactor": 2,
@@ -6421,7 +6425,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Failure Rate",
"metric": "etcd_server_proposals_failed_total",
@@ -6437,7 +6441,7 @@
"step": 2
},
{
- "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Commit Rate",
"metric": "etcd_server_proposals_committed_total",
@@ -6445,7 +6449,7 @@
"step": 2
},
{
- "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Apply Rate",
"refId": "D",
@@ -6566,6 +6570,115 @@
"show": true
}
]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.4.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{job=\"$cluster\"}[$__rate_interval])))",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} Peer round trip time",
+ "metric": "etcd_network_peer_round_trip_time_seconds_bucket",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Peer round trip time",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:925",
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:926",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
}
],
"title": "New row"
@@ -6574,7 +6687,9 @@
"schemaVersion": 13,
"sharedCrosshair": false,
"style": "dark",
- "tags": [],
+ "tags": [
+ "etcd-mixin"
+ ],
"templating": {
"list": [
{
@@ -6583,7 +6698,7 @@
"value": "Prometheus"
},
"hide": 0,
- "label": null,
+ "label": "Data Source",
"name": "datasource",
"options": [],
"query": "prometheus",
@@ -6605,7 +6720,7 @@
"name": "cluster",
"options": [],
"query": "label_values(etcd_server_has_leader, job)",
- "refresh": 1,
+ "refresh": 2,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
@@ -11355,7 +11470,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11444,7 +11559,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11705,7 +11820,7 @@
],
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11714,7 +11829,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11723,7 +11838,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11732,7 +11847,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11741,7 +11856,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11750,7 +11865,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14120,7 +14235,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14209,7 +14324,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14470,7 +14585,7 @@
],
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14479,7 +14594,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14488,7 +14603,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14497,7 +14612,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14506,7 +14621,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14515,7 +14630,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17371,7 +17486,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17379,7 +17494,7 @@
"step": 10
},
{
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17468,7 +17583,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17476,7 +17591,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17928,7 +18043,7 @@
],
"targets": [
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17937,7 +18052,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17946,7 +18061,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17955,7 +18070,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17964,7 +18079,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17973,7 +18088,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -30449,21 +30564,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -30511,111 +30626,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -31396,21 +31623,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -31458,111 +31685,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -39322,13 +39661,27 @@
containerPort: 9100
protocol: TCP
livenessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
readinessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
resources: {}
volumeMounts:
- name: proc
@@ -39400,7 +39753,7 @@
- --metric-labels-allowlist=persistentvolumeclaims=[*]
- --telemetry-port=8081
imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
ports:
- containerPort: 8080
name: "http"
@@ -39448,17 +39801,17 @@
spec:
containers:
- name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
imagePullPolicy: "IfNotPresent"
args:
- --kubelet-service=kube-system/prometheus-kubelet
- --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
- --config-reloader-cpu-request=200m
- --config-reloader-cpu-limit=200m
- --config-reloader-memory-request=50Mi
- --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
- --web.enable-tls=true
- --web.cert-file=/cert/cert
- --web.key-file=/cert/key
@@ -39631,7 +39984,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10252
protocol: TCP
---
# Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +40056,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10251
protocol: TCP
---
# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40113,8 @@
port: http-web
pathPrefix: "/"
apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
replicaExternalLabelName: "replica"
externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
paused: false
@@ -40001,61 +40354,84 @@
groups:
- name: etcd
rules:
+ - alert: etcdMembersDown
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
+ summary: etcd cluster members are down.
+ expr: |-
+ max without (endpoint) (
+ sum without (instance) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count without (To) (
+ sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
+ )
+ )
+ > 0
+ for: 10m
+ labels:
+ severity: critical
- alert: etcdInsufficientMembers
annotations:
- message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
- expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
+ summary: etcd cluster has insufficient number of members.
+ expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: 3m
labels:
severity: critical
- alert: etcdNoLeader
annotations:
- message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ summary: etcd cluster has no leader.
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
- message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.'
- expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
- for: 15m
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ summary: etcd cluster has high number of leader changes.
+ expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
+ for: 5m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 1
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 5
for: 5m
labels:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
- message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
+ summary: etcd grpc requests are slow
expr: |-
- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
- message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster member communication is slow.
expr: |-
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
@@ -40064,53 +40440,64 @@
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of proposal failures.
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 1
+ for: 10m
+ labels:
+ severity: critical
- alert: etcdHighCommitDurations
annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile commit durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ - alert: etcdDatabaseQuotaLowSpace
annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.01
+ description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+ summary: etcd cluster database is running full.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
for: 10m
labels:
- severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ severity: critical
+ - alert: etcdExcessiveDatabaseGrowth
annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.05
+ description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
+ summary: etcd cluster database growing very fast.
+ expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
for: 10m
labels:
- severity: critical
- - alert: etcdHTTPRequestsSlow
+ severity: warning
+ - alert: etcdDatabaseHighFragmentationRatio
annotations:
- message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
- expr: |-
- histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
- > 0.15
+ description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
+ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
+ summary: etcd database size in use is less than 50% of the actual allocated storage.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
for: 10m
labels:
severity: warning
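
Note on the etcd rule changes above: `etcdDatabaseHighFragmentationRatio` fires when the in-use database size drops below 50% of the allocated size. Below is a minimal sketch for spot-checking that ratio by hand before running `etcdctl defrag`; the Prometheus URL and port-forward command are assumptions for a local setup, not part of this chart:

```python
# Rough sketch (not part of the chart): evaluate the fragmentation-ratio
# expression against the Prometheus HTTP API. Assumes Prometheus is reachable
# on localhost:9090, e.g. via `kubectl port-forward svc/prometheus-operated 9090`.
import json
import urllib.parse
import urllib.request

PROM_URL = "http://localhost:9090"  # assumption: adjust to your environment
QUERY = (
    "(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m])"
    " / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]))"
)

resp = urllib.request.urlopen(
    f"{PROM_URL}/api/v1/query?" + urllib.parse.urlencode({"query": QUERY})
)
data = json.load(resp)
for series in data["data"]["result"]:
    ratio = float(series["value"][1])
    flag = "defrag candidate" if ratio < 0.5 else "ok"
    print(f'{series["metric"].get("instance", "?")}: in-use/allocated = {ratio:.2f} ({flag})')
```
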
@@ -41126,11 +41513,11 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
@@ -41259,7 +41646,7 @@
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
@@ -41290,7 +41677,7 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
@@ -41608,7 +41995,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
@@ -41616,7 +42003,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
@@ -41880,7 +42267,7 @@
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@@ -41953,9 +42340,9 @@
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
@@ -42031,10 +42418,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42045,10 +42432,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42156,15 +42543,15 @@
summary: Clock skew detected.
expr: |-
(
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
@@ -42175,9 +42562,9 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
@@ -42186,7 +42573,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
@@ -42195,7 +42582,7 @@
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
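
The node/cadvisor device selector used throughout this diff changes from `mmcblk.p.+|nvme.+|…` to `(/dev/)?(mmcblk.p.+|nvme.+|…)`, so exporters that report `/dev/`-prefixed device names keep matching. A small sketch of the effect (the device names below are illustrative); PromQL `=~` is anchored, so `re.fullmatch` is the closest Python analogue:

```python
# Rough sketch: compare device names against the widened regex used in the
# updated rules and dashboards. PromQL regex matchers are fully anchored.
import re

DEVICE_RE = re.compile(r"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)")

for device in ["sda1", "/dev/sda1", "nvme0n1", "/dev/mapper/vg0-root", "loop0"]:
    print(device, "matches" if DEVICE_RE.fullmatch(device) else "no match")
```
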
@@ -42363,7 +42750,7 @@
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
@@ -42620,6 +43007,15 @@
for: 5m
labels:
severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43154,12 @@
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: https
scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
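
The new `metricRelabelings` entry above drops a set of `apiserver_request_duration_seconds_bucket` histogram buckets to cut cardinality. Prometheus joins the `sourceLabels` with `;` and drops a sample when the anchored regex matches the joined value; a rough sketch of that decision follows (the sample label sets are made up):

```python
# Rough sketch of the drop metricRelabeling above: join sourceLabels
# (__name__, le) with ";" and drop when the anchored regex matches.
import re

DROP_RE = re.compile(
    "apiserver_request_duration_seconds_bucket;"
    "(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75"
    "|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)"
)

samples = [
    {"__name__": "apiserver_request_duration_seconds_bucket", "le": "0.3"},
    {"__name__": "apiserver_request_duration_seconds_bucket", "le": "0.5"},
    {"__name__": "apiserver_request_total", "le": ""},  # missing label joins as ""
]
for labels in samples:
    joined = ";".join([labels["__name__"], labels["le"]])
    action = "drop" if DROP_RE.fullmatch(joined) else "keep"
    print(f"{joined!r}: {action}")
```
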
@@ -42914,6 +43316,32 @@
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
relabelings:
- sourceLabels:
- __metrics_path__
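
The last cadvisor drop rule above (`regex: .+;` over `id` and `pod`) removes series that carry a cgroup `id` but an empty `pod` label, i.e. system-slice series not attributable to any pod. A tiny sketch of that check (label values are illustrative):

```python
# Rough sketch of the ".+;" drop rule: sourceLabels [id, pod] joined with ";"
# match only when id is present and pod is empty.
import re

DROP_RE = re.compile(r".+;")

for labels in [
    {"id": "/system.slice/containerd.service", "pod": ""},
    {"id": "/kubepods/burstable/pod123", "pod": "my-app-6d4cf56db6-abcde"},
]:
    joined = ";".join([labels["id"], labels["pod"]])
    print(f"{joined!r}: {'drop' if DROP_RE.fullmatch(joined) else 'keep'}")
```
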
@@ -43002,10 +43430,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
namespace: default
labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43630,7 @@
spec:
containers:
- name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- create
@@ -43247,7 +43675,7 @@
spec:
containers:
- name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- patch
Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml
Version: 35.6.2 -> 39.5.0
@@ -158,6 +158,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -251,6 +252,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -364,6 +366,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -463,6 +466,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -5685,7 +5689,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[5m]))",
+ "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "RPC Rate",
@@ -5694,7 +5698,7 @@
"step": 2
},
{
- "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))",
+ "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "RPC Failed Rate",
@@ -5941,7 +5945,7 @@
"steppedLine": true,
"targets": [
{
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{instance}} WAL fsync",
@@ -5950,7 +5954,7 @@
"step": 4
},
{
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
"intervalFactor": 2,
"legendFormat": "{{instance}} DB fsync",
"metric": "etcd_disk_backend_commit_duration_seconds_bucket",
@@ -6108,7 +6112,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[$__rate_interval])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic In",
"metric": "etcd_network_client_grpc_received_bytes_total",
@@ -6184,7 +6188,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic Out",
"metric": "etcd_network_client_grpc_sent_bytes_total",
@@ -6260,7 +6264,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
"intervalFactor": 2,
"legendFormat": "{{instance}} Peer Traffic In",
"metric": "etcd_network_peer_received_bytes_total",
@@ -6337,7 +6341,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
"hide": false,
"interval": "",
"intervalFactor": 2,
@@ -6421,7 +6425,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Failure Rate",
"metric": "etcd_server_proposals_failed_total",
@@ -6437,7 +6441,7 @@
"step": 2
},
{
- "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Commit Rate",
"metric": "etcd_server_proposals_committed_total",
@@ -6445,7 +6449,7 @@
"step": 2
},
{
- "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Apply Rate",
"refId": "D",
@@ -6566,6 +6570,115 @@
"show": true
}
]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.4.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{job=\"$cluster\"}[$__rate_interval])))",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} Peer round trip time",
+ "metric": "etcd_network_peer_round_trip_time_seconds_bucket",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Peer round trip time",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:925",
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:926",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
}
],
"title": "New row"
@@ -6574,7 +6687,9 @@
"schemaVersion": 13,
"sharedCrosshair": false,
"style": "dark",
- "tags": [],
+ "tags": [
+ "etcd-mixin"
+ ],
"templating": {
"list": [
{
@@ -6583,7 +6698,7 @@
"value": "Prometheus"
},
"hide": 0,
- "label": null,
+ "label": "Data Source",
"name": "datasource",
"options": [],
"query": "prometheus",
@@ -6605,7 +6720,7 @@
"name": "cluster",
"options": [],
"query": "label_values(etcd_server_has_leader, job)",
- "refresh": 1,
+ "refresh": 2,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
@@ -11355,7 +11470,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11444,7 +11559,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11705,7 +11820,7 @@
],
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11714,7 +11829,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11723,7 +11838,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11732,7 +11847,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11741,7 +11856,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11750,7 +11865,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14120,7 +14235,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14209,7 +14324,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14470,7 +14585,7 @@
],
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14479,7 +14594,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14488,7 +14603,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14497,7 +14612,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14506,7 +14621,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14515,7 +14630,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17371,7 +17486,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17379,7 +17494,7 @@
"step": 10
},
{
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17468,7 +17583,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17476,7 +17591,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17928,7 +18043,7 @@
],
"targets": [
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17937,7 +18052,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17946,7 +18061,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17955,7 +18070,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17964,7 +18079,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17973,7 +18088,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -30449,21 +30564,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -30511,111 +30626,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -31396,21 +31623,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -31458,111 +31685,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -39322,13 +39661,27 @@
containerPort: 9100
protocol: TCP
livenessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
readinessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
resources: {}
volumeMounts:
- name: proc
@@ -39400,7 +39753,7 @@
- --metric-labels-allowlist=persistentvolumeclaims=[*]
- --telemetry-port=8081
imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
ports:
- containerPort: 8080
name: "http"
@@ -39448,17 +39801,17 @@
spec:
containers:
- name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
imagePullPolicy: "IfNotPresent"
args:
- --kubelet-service=kube-system/prometheus-kubelet
- --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
- --config-reloader-cpu-request=200m
- --config-reloader-cpu-limit=200m
- --config-reloader-memory-request=50Mi
- --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
- --web.enable-tls=true
- --web.cert-file=/cert/cert
- --web.key-file=/cert/key
@@ -39631,7 +39984,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10252
protocol: TCP
---
# Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +40056,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10251
protocol: TCP
---
# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40113,8 @@
port: http-web
pathPrefix: "/"
apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
replicaExternalLabelName: "replica"
externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
paused: false
@@ -40001,61 +40354,84 @@
groups:
- name: etcd
rules:
+ - alert: etcdMembersDown
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
+ summary: etcd cluster members are down.
+ expr: |-
+ max without (endpoint) (
+ sum without (instance) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count without (To) (
+ sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
+ )
+ )
+ > 0
+ for: 10m
+ labels:
+ severity: critical
- alert: etcdInsufficientMembers
annotations:
- message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
- expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
+ summary: etcd cluster has insufficient number of members.
+ expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: 3m
labels:
severity: critical
- alert: etcdNoLeader
annotations:
- message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ summary: etcd cluster has no leader.
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
- message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.'
- expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
- for: 15m
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ summary: etcd cluster has high number of leader changes.
+ expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
+ for: 5m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 1
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 5
for: 5m
labels:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
- message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
+ summary: etcd grpc requests are slow
expr: |-
- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
- message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster member communication is slow.
expr: |-
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
@@ -40064,53 +40440,64 @@
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of proposal failures.
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 1
+ for: 10m
+ labels:
+ severity: critical
- alert: etcdHighCommitDurations
annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile commit durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ - alert: etcdDatabaseQuotaLowSpace
annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.01
+ description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+ summary: etcd cluster database is running full.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
for: 10m
labels:
- severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ severity: critical
+ - alert: etcdExcessiveDatabaseGrowth
annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.05
+ description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
+ summary: etcd cluster database growing very fast.
+ expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
for: 10m
labels:
- severity: critical
- - alert: etcdHTTPRequestsSlow
+ severity: warning
+ - alert: etcdDatabaseHighFragmentationRatio
annotations:
- message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
- expr: |-
- histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
- > 0.15
+ description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
+ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
+ summary: etcd database size in use is less than 50% of the actual allocated storage.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
for: 10m
labels:
severity: warning
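The etcd v2 HTTP alerts (etcdHighNumberOfFailedHTTPRequests, etcdHTTPRequestsSlow) are dropped here in favour of database-quota alerts. As a rough worked example of the new thresholds: with etcd's default 2 GiB backend quota, etcdDatabaseQuotaLowSpace fires once the database exceeds roughly 1.9 GiB (95% of quota), and etcdDatabaseHighFragmentationRatio fires when less than half of the allocated file is actually in use. If the old HTTP-based alerting is still wanted, one option is to re-add it via the chart's extra-rules hook; a hedged sketch, assuming `additionalPrometheusRulesMap` is that hook in this chart version:

```yaml
# Sketch only: re-adding one of the dropped etcd HTTP alerts through chart values
# (additionalPrometheusRulesMap is an assumed key; check values.yaml before use).
values:
  additionalPrometheusRulesMap:
    etcd-http-legacy:
      groups:
        - name: etcd-http-legacy
          rules:
            - alert: etcdHighNumberOfFailedHTTPRequests
              expr: |-
                sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method)
                /
                sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (method) > 0.01
              for: 10m
              labels:
                severity: warning
```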
@@ -41126,11 +41513,11 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
@@ -41259,7 +41646,7 @@
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
@@ -41290,7 +41677,7 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
@@ -41608,7 +41995,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
@@ -41616,7 +42003,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
@@ -41880,7 +42267,7 @@
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@@ -41953,9 +42340,9 @@
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
@@ -42031,10 +42418,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42045,10 +42432,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42156,15 +42543,15 @@
summary: Clock skew detected.
expr: |-
(
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
@@ -42175,9 +42562,9 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
@@ -42186,7 +42573,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
@@ -42195,7 +42582,7 @@
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
@@ -42363,7 +42750,7 @@
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
@@ -42620,6 +43007,15 @@
for: 5m
labels:
severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
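For context on the new PrometheusHighQueryLoad rule: its expression is the average number of running queries divided by the configured concurrency limit, so with Prometheus's default `--query.max-concurrency=20` the 0.8 threshold corresponds to roughly 16 queries executing at once, sustained for 15 minutes. The ceiling it measures against lives on the Prometheus custom resource; whether this chart version exposes it through `prometheusSpec` should be checked, and the object name below is illustrative:

```yaml
# Sketch: raising the concurrency ceiling the alert ratio is measured against.
# spec.query.maxConcurrency is the Prometheus CRD field; the metadata.name is an assumption.
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: kube-prometheus-stack-prometheus
spec:
  query:
    maxConcurrency: 40   # Prometheus default is 20 (--query.max-concurrency)
```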
@@ -42758,6 +43154,12 @@
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: https
scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
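The new `metricRelabelings` entry drops a set of `apiserver_request_duration_seconds_bucket` histogram buckets, a common cardinality-reduction measure. To extend or replace that list from the HelmRelease, a sketch along these lines should work, assuming `kubeApiServer.serviceMonitor.metricRelabelings` is the corresponding values key in this chart version:

```yaml
# Sketch: customising the apiserver metric drop list via chart values
# (kubeApiServer.serviceMonitor.metricRelabelings is an assumed key; check values.yaml).
values:
  kubeApiServer:
    serviceMonitor:
      metricRelabelings:
        - action: drop
          sourceLabels: [__name__, le]
          regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
```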
@@ -42914,6 +43316,32 @@
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
relabelings:
- sourceLabels:
- __metrics_path__
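Similarly, the kubelet/cAdvisor scrape now drops several high-cardinality container metric families (`container_spec.*`, per-task state, detailed fs I/O counters) plus series that carry an `id` but no `pod` label. If any of those series are still needed, the drop list can be overridden from the HelmRelease; a sketch assuming the `kubelet.serviceMonitor.cAdvisorMetricRelabelings` values key, to be confirmed against this chart version:

```yaml
# Sketch: a reduced cAdvisor drop list that keeps container_memory_swap
# (kubelet.serviceMonitor.cAdvisorMetricRelabelings is an assumed key; check values.yaml).
values:
  kubelet:
    serviceMonitor:
      cAdvisorMetricRelabelings:
        - action: drop
          sourceLabels: [__name__]
          regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
        - action: drop
          sourceLabels: [__name__]
          regex: container_memory_mapped_file
```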
@@ -43002,10 +43430,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
namespace: default
labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43630,7 @@
spec:
containers:
- name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- create
@@ -43247,7 +43675,7 @@
spec:
containers:
- name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- patch
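Both rendered diffs in this PR target the same `helm-release.yaml`, so the effective upgrade is whatever chart version the HelmRelease pins. A minimal sketch of the fragment being updated, assuming the Flux v2 `HelmRelease` API; the source name and namespaces here are illustrative, not taken from this repository:

```yaml
# Sketch of the chart pin this PR bumps (field names per Flux v2 HelmRelease;
# sourceRef name/namespace and metadata.namespace are assumptions).
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: kube-prometheus-stack
      version: 39.4.0   # pin to the target chart version proposed by this update
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
```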
Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml
Version: 35.6.2 -> 39.6.0
@@ -158,6 +158,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -251,6 +252,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -364,6 +366,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -463,6 +466,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -5685,7 +5689,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[5m]))",
+ "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "RPC Rate",
@@ -5694,7 +5698,7 @@
"step": 2
},
{
- "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))",
+ "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "RPC Failed Rate",
@@ -5941,7 +5945,7 @@
"steppedLine": true,
"targets": [
{
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{instance}} WAL fsync",
@@ -5950,7 +5954,7 @@
"step": 4
},
{
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
"intervalFactor": 2,
"legendFormat": "{{instance}} DB fsync",
"metric": "etcd_disk_backend_commit_duration_seconds_bucket",
@@ -6108,7 +6112,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[$__rate_interval])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic In",
"metric": "etcd_network_client_grpc_received_bytes_total",
@@ -6184,7 +6188,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic Out",
"metric": "etcd_network_client_grpc_sent_bytes_total",
@@ -6260,7 +6264,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
"intervalFactor": 2,
"legendFormat": "{{instance}} Peer Traffic In",
"metric": "etcd_network_peer_received_bytes_total",
@@ -6337,7 +6341,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
"hide": false,
"interval": "",
"intervalFactor": 2,
@@ -6421,7 +6425,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Failure Rate",
"metric": "etcd_server_proposals_failed_total",
@@ -6437,7 +6441,7 @@
"step": 2
},
{
- "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Commit Rate",
"metric": "etcd_server_proposals_committed_total",
@@ -6445,7 +6449,7 @@
"step": 2
},
{
- "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Apply Rate",
"refId": "D",
@@ -6566,6 +6570,115 @@
"show": true
}
]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.4.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{job=\"$cluster\"}[$__rate_interval])))",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} Peer round trip time",
+ "metric": "etcd_network_peer_round_trip_time_seconds_bucket",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Peer round trip time",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:925",
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:926",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
}
],
"title": "New row"
@@ -6574,7 +6687,9 @@
"schemaVersion": 13,
"sharedCrosshair": false,
"style": "dark",
- "tags": [],
+ "tags": [
+ "etcd-mixin"
+ ],
"templating": {
"list": [
{
@@ -6583,7 +6698,7 @@
"value": "Prometheus"
},
"hide": 0,
- "label": null,
+ "label": "Data Source",
"name": "datasource",
"options": [],
"query": "prometheus",
@@ -6605,7 +6720,7 @@
"name": "cluster",
"options": [],
"query": "label_values(etcd_server_has_leader, job)",
- "refresh": 1,
+ "refresh": 2,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
@@ -11355,7 +11470,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11444,7 +11559,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11705,7 +11820,7 @@
],
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11714,7 +11829,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11723,7 +11838,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11732,7 +11847,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11741,7 +11856,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11750,7 +11865,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14120,7 +14235,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14209,7 +14324,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14470,7 +14585,7 @@
],
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14479,7 +14594,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14488,7 +14603,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14497,7 +14612,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14506,7 +14621,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14515,7 +14630,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17371,7 +17486,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17379,7 +17494,7 @@
"step": 10
},
{
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17468,7 +17583,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17476,7 +17591,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17928,7 +18043,7 @@
],
"targets": [
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17937,7 +18052,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17946,7 +18061,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17955,7 +18070,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17964,7 +18079,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17973,7 +18088,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -30449,21 +30564,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -30511,111 +30626,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -31396,21 +31623,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -31458,111 +31685,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -39322,13 +39661,27 @@
containerPort: 9100
protocol: TCP
livenessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
readinessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
resources: {}
volumeMounts:
- name: proc
@@ -39400,7 +39753,7 @@
- --metric-labels-allowlist=persistentvolumeclaims=[*]
- --telemetry-port=8081
imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
ports:
- containerPort: 8080
name: "http"
@@ -39448,17 +39801,17 @@
spec:
containers:
- name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
imagePullPolicy: "IfNotPresent"
args:
- --kubelet-service=kube-system/prometheus-kubelet
- --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
- --config-reloader-cpu-request=200m
- --config-reloader-cpu-limit=200m
- --config-reloader-memory-request=50Mi
- --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
- --web.enable-tls=true
- --web.cert-file=/cert/cert
- --web.key-file=/cert/key
@@ -39631,7 +39984,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10252
protocol: TCP
---
# Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +40056,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10251
protocol: TCP
---
# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40113,8 @@
port: http-web
pathPrefix: "/"
apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
replicaExternalLabelName: "replica"
externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
paused: false
@@ -40001,61 +40354,84 @@
groups:
- name: etcd
rules:
+ - alert: etcdMembersDown
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
+ summary: etcd cluster members are down.
+ expr: |-
+ max without (endpoint) (
+ sum without (instance) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count without (To) (
+ sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
+ )
+ )
+ > 0
+ for: 10m
+ labels:
+ severity: critical
- alert: etcdInsufficientMembers
annotations:
- message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
- expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
+ summary: etcd cluster has insufficient number of members.
+ expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: 3m
labels:
severity: critical
- alert: etcdNoLeader
annotations:
- message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ summary: etcd cluster has no leader.
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
- message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.'
- expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
- for: 15m
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ summary: etcd cluster has high number of leader changes.
+ expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
+ for: 5m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 1
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 5
for: 5m
labels:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
- message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
+ summary: etcd grpc requests are slow
expr: |-
- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
- message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster member communication is slow.
expr: |-
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
@@ -40064,53 +40440,64 @@
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of proposal failures.
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 1
+ for: 10m
+ labels:
+ severity: critical
- alert: etcdHighCommitDurations
annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile commit durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ - alert: etcdDatabaseQuotaLowSpace
annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.01
+ description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+ summary: etcd cluster database is running full.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
for: 10m
labels:
- severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ severity: critical
+ - alert: etcdExcessiveDatabaseGrowth
annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.05
+ description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
+ summary: etcd cluster database growing very fast.
+ expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
for: 10m
labels:
- severity: critical
- - alert: etcdHTTPRequestsSlow
+ severity: warning
+ - alert: etcdDatabaseHighFragmentationRatio
annotations:
- message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
- expr: |-
- histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
- > 0.15
+ description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
+ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
+ summary: etcd database size in use is less than 50% of the actual allocated storage.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
for: 10m
labels:
severity: warning
@@ -41126,11 +41513,11 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
@@ -41259,7 +41646,7 @@
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
@@ -41290,7 +41677,7 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
@@ -41608,7 +41995,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
@@ -41616,7 +42003,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
@@ -41880,7 +42267,7 @@
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@@ -41953,9 +42340,9 @@
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
@@ -42031,10 +42418,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42045,10 +42432,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42156,15 +42543,15 @@
summary: Clock skew detected.
expr: |-
(
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
@@ -42175,9 +42562,9 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
@@ -42186,7 +42573,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
@@ -42195,7 +42582,7 @@
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
@@ -42363,7 +42750,7 @@
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
@@ -42620,6 +43007,15 @@
for: 5m
labels:
severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43154,12 @@
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: https
scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
@@ -42914,6 +43316,32 @@
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
relabelings:
- sourceLabels:
- __metrics_path__
@@ -43002,10 +43430,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
namespace: default
labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43630,7 @@
spec:
containers:
- name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- create
@@ -43247,7 +43675,7 @@
spec:
containers:
- name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- patch
Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml
Version: 35.6.2 -> 39.7.0
@@ -158,6 +158,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -251,6 +252,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -364,6 +366,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -463,6 +466,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -5685,7 +5689,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[5m]))",
+ "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "RPC Rate",
@@ -5694,7 +5698,7 @@
"step": 2
},
{
- "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))",
+ "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "RPC Failed Rate",
@@ -5941,7 +5945,7 @@
"steppedLine": true,
"targets": [
{
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{instance}} WAL fsync",
@@ -5950,7 +5954,7 @@
"step": 4
},
{
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
"intervalFactor": 2,
"legendFormat": "{{instance}} DB fsync",
"metric": "etcd_disk_backend_commit_duration_seconds_bucket",
@@ -6108,7 +6112,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[$__rate_interval])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic In",
"metric": "etcd_network_client_grpc_received_bytes_total",
@@ -6184,7 +6188,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic Out",
"metric": "etcd_network_client_grpc_sent_bytes_total",
@@ -6260,7 +6264,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
"intervalFactor": 2,
"legendFormat": "{{instance}} Peer Traffic In",
"metric": "etcd_network_peer_received_bytes_total",
@@ -6337,7 +6341,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
"hide": false,
"interval": "",
"intervalFactor": 2,
@@ -6421,7 +6425,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Failure Rate",
"metric": "etcd_server_proposals_failed_total",
@@ -6437,7 +6441,7 @@
"step": 2
},
{
- "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Commit Rate",
"metric": "etcd_server_proposals_committed_total",
@@ -6445,7 +6449,7 @@
"step": 2
},
{
- "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Apply Rate",
"refId": "D",
@@ -6566,6 +6570,115 @@
"show": true
}
]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.4.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{job=\"$cluster\"}[$__rate_interval])))",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} Peer round trip time",
+ "metric": "etcd_network_peer_round_trip_time_seconds_bucket",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Peer round trip time",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:925",
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:926",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
}
],
"title": "New row"
@@ -6574,7 +6687,9 @@
"schemaVersion": 13,
"sharedCrosshair": false,
"style": "dark",
- "tags": [],
+ "tags": [
+ "etcd-mixin"
+ ],
"templating": {
"list": [
{
@@ -6583,7 +6698,7 @@
"value": "Prometheus"
},
"hide": 0,
- "label": null,
+ "label": "Data Source",
"name": "datasource",
"options": [],
"query": "prometheus",
@@ -6605,7 +6720,7 @@
"name": "cluster",
"options": [],
"query": "label_values(etcd_server_has_leader, job)",
- "refresh": 1,
+ "refresh": 2,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
@@ -11355,7 +11470,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11444,7 +11559,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11705,7 +11820,7 @@
],
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11714,7 +11829,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11723,7 +11838,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11732,7 +11847,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11741,7 +11856,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11750,7 +11865,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14120,7 +14235,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14209,7 +14324,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14470,7 +14585,7 @@
],
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14479,7 +14594,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14488,7 +14603,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14497,7 +14612,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14506,7 +14621,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14515,7 +14630,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17371,7 +17486,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17379,7 +17494,7 @@
"step": 10
},
{
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17468,7 +17583,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17476,7 +17591,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17928,7 +18043,7 @@
],
"targets": [
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17937,7 +18052,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17946,7 +18061,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17955,7 +18070,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17964,7 +18079,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17973,7 +18088,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -30449,21 +30564,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -30511,111 +30626,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -31396,21 +31623,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -31458,111 +31685,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -39322,13 +39661,27 @@
containerPort: 9100
protocol: TCP
livenessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
readinessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
resources: {}
volumeMounts:
- name: proc
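Note: the probe change above only spells out what the Kubernetes API defaults anyway (failureThreshold 3, periodSeconds 10, successThreshold 1, timeoutSeconds 1, initialDelaySeconds 0); the empty `httpHeaders:` entry is simply how Helm renders an unset field. Assembled from the hunk above, the node-exporter container ends up with probes equivalent to this sketch:

```yaml
# Equivalent rendered probes for the node-exporter container (sketch assembled
# from the diff above; the empty httpHeaders list is omitted as it has no effect).
livenessProbe:
  failureThreshold: 3
  httpGet:
    path: /
    port: 9100
    scheme: HTTP
  initialDelaySeconds: 0
  periodSeconds: 10
  successThreshold: 1
  timeoutSeconds: 1
readinessProbe:
  failureThreshold: 3
  httpGet:
    path: /
    port: 9100
    scheme: HTTP
  initialDelaySeconds: 0
  periodSeconds: 10
  successThreshold: 1
  timeoutSeconds: 1
```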
@@ -39400,7 +39753,7 @@
- --metric-labels-allowlist=persistentvolumeclaims=[*]
- --telemetry-port=8081
imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
ports:
- containerPort: 8080
name: "http"
@@ -39448,17 +39801,17 @@
spec:
containers:
- name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
imagePullPolicy: "IfNotPresent"
args:
- --kubelet-service=kube-system/prometheus-kubelet
- --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
- --config-reloader-cpu-request=200m
- --config-reloader-cpu-limit=200m
- --config-reloader-memory-request=50Mi
- --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
- --web.enable-tls=true
- --web.cert-file=/cert/cert
- --web.key-file=/cert/key
@@ -39631,7 +39984,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10252
protocol: TCP
---
# Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +40056,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10251
protocol: TCP
---
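Note: the previously empty `port:` fields are now populated. 10252 and 10251 are the legacy plain-HTTP metrics ports of kube-controller-manager and kube-scheduler; on control planes that only expose the secured endpoints (10257 and 10259, HTTPS with token auth), these Endpoints objects and the matching scrape config would need adjusting instead, roughly along these lines (sketch, not part of the rendered output):

```yaml
# Hypothetical adjustment for a control plane that only serves the secure ports.
ports:
  - name: http-metrics
    port: 10257   # kube-controller-manager; kube-scheduler would use 10259
    protocol: TCP
```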
# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40113,8 @@
port: http-web
pathPrefix: "/"
apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
replicaExternalLabelName: "replica"
externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
paused: false
@@ -40001,61 +40354,84 @@
groups:
- name: etcd
rules:
+ - alert: etcdMembersDown
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
+ summary: etcd cluster members are down.
+ expr: |-
+ max without (endpoint) (
+ sum without (instance) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count without (To) (
+ sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
+ )
+ )
+ > 0
+ for: 10m
+ labels:
+ severity: critical
- alert: etcdInsufficientMembers
annotations:
- message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
- expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
+ summary: etcd cluster has insufficient number of members.
+ expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: 3m
labels:
severity: critical
- alert: etcdNoLeader
annotations:
- message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ summary: etcd cluster has no leader.
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
- message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.'
- expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
- for: 15m
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ summary: etcd cluster has high number of leader changes.
+ expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
+ for: 5m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 1
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 5
for: 5m
labels:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
- message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
+ summary: etcd grpc requests are slow
expr: |-
- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
- message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster member communication is slow.
expr: |-
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
@@ -40064,53 +40440,64 @@
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of proposal failures.
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 1
+ for: 10m
+ labels:
+ severity: critical
- alert: etcdHighCommitDurations
annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile commit durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ - alert: etcdDatabaseQuotaLowSpace
annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.01
+ description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+ summary: etcd cluster database is running full.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
for: 10m
labels:
- severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ severity: critical
+ - alert: etcdExcessiveDatabaseGrowth
annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.05
+ description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
+ summary: etcd cluster database growing very fast.
+ expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
for: 10m
labels:
- severity: critical
- - alert: etcdHTTPRequestsSlow
+ severity: warning
+ - alert: etcdDatabaseHighFragmentationRatio
annotations:
- message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
- expr: |-
- histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
- > 0.15
+ description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
+ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
+ summary: etcd database size in use is less than 50% of the actual allocated storage.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
for: 10m
labels:
severity: warning
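Note: the reworked etcd alerts above are driven directly by the MVCC database metrics. As a worked example for etcdDatabaseQuotaLowSpace: with etcd's default backend quota of 2 GiB, `(etcd_mvcc_db_total_size_in_bytes / etcd_server_quota_backend_bytes) * 100 > 95` fires once the database grows past roughly 1.9 GiB. If an earlier, lower-severity warning is wanted in addition to the rendered rule, it could be shipped as a separate PrometheusRule object, for example (sketch; name, namespace and threshold are illustrative, and the object must carry whatever labels the Prometheus ruleSelector of this stack expects):

```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: etcd-db-quota-early-warning   # illustrative
  namespace: default
spec:
  groups:
    - name: etcd-extra
      rules:
        - alert: EtcdDatabaseQuotaHighUsage
          expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m])) * 100 > 80
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: etcd database is above 80% of its backend quota.
```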
@@ -41126,11 +41513,11 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
@@ -41259,7 +41646,7 @@
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
@@ -41290,7 +41677,7 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
@@ -41608,7 +41995,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
@@ -41616,7 +42003,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
@@ -41880,7 +42267,7 @@
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@@ -41953,9 +42340,9 @@
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
@@ -42031,10 +42418,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42045,10 +42432,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42156,15 +42543,15 @@
summary: Clock skew detected.
expr: |-
(
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
@@ -42175,9 +42562,9 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
@@ -42186,7 +42573,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
@@ -42195,7 +42582,7 @@
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
@@ -42363,7 +42750,7 @@
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
@@ -42620,6 +43007,15 @@
for: 5m
labels:
severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
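Note: the new PrometheusHighQueryLoad alert compares the 5-minute average of in-flight queries with the engine's concurrency limit; with Prometheus' default `--query.max-concurrency=20` it fires once more than 16 queries are running on average for 15 minutes. If a legitimately busy instance keeps tripping it, the limit can be raised on the Prometheus custom resource (or via the corresponding chart values), for example (sketch; object name and value are illustrative):

```yaml
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: kube-prometheus-stack-prometheus   # illustrative; use the name rendered by this chart
  namespace: default
spec:
  query:
    maxConcurrency: 40   # raises prometheus_engine_queries_concurrent_max accordingly
```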
@@ -42758,6 +43154,12 @@
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: https
scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
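Note: the added metricRelabelings drop a subset of `apiserver_request_duration_seconds_bucket` buckets to reduce cardinality. Relabel rules match against the `sourceLabels` values joined with the default `;` separator, so the regex is evaluated against strings like those sketched in the comments below (bucket values illustrative):

```yaml
# The rule matches "<__name__>;<le>", e.g.
#   "apiserver_request_duration_seconds_bucket;0.15" -> dropped
#   "apiserver_request_duration_seconds_bucket;0.1"  -> kept
- action: drop
  regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
  sourceLabels:
    - __name__
    - le
```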
@@ -42914,6 +43316,32 @@
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
relabelings:
- sourceLabels:
- __metrics_path__
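Note: in the cadvisor relabelings above, the final drop rule joins `id` and `pod` with the same default `;` separator. Because relabel regexes are fully anchored, `.+;` only matches when `id` is non-empty and `pod` is empty, i.e. it drops cgroup-level series not attached to any pod while keeping per-container series. Sketch (label values illustrative):

```yaml
# Joined value the anchored regex ".+;" is matched against:
#   "/kubepods.slice/...;"         -> pod label empty -> dropped
#   "/kubepods.slice/...;mypod-0"  -> pod label set   -> kept
- action: drop
  regex: .+;
  sourceLabels:
    - id
    - pod
```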
@@ -43002,10 +43430,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
namespace: default
labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43630,7 @@
spec:
containers:
- name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- create
@@ -43247,7 +43675,7 @@
spec:
containers:
- name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- patch
Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml
Version: 35.6.2 -> 39.8.0
@@ -158,6 +158,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -251,6 +252,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -364,6 +366,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -463,6 +466,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -5685,7 +5689,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[5m]))",
+ "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "RPC Rate",
@@ -5694,7 +5698,7 @@
"step": 2
},
{
- "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))",
+ "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "RPC Failed Rate",
@@ -5941,7 +5945,7 @@
"steppedLine": true,
"targets": [
{
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{instance}} WAL fsync",
@@ -5950,7 +5954,7 @@
"step": 4
},
{
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
"intervalFactor": 2,
"legendFormat": "{{instance}} DB fsync",
"metric": "etcd_disk_backend_commit_duration_seconds_bucket",
@@ -6108,7 +6112,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[$__rate_interval])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic In",
"metric": "etcd_network_client_grpc_received_bytes_total",
@@ -6184,7 +6188,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic Out",
"metric": "etcd_network_client_grpc_sent_bytes_total",
@@ -6260,7 +6264,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
"intervalFactor": 2,
"legendFormat": "{{instance}} Peer Traffic In",
"metric": "etcd_network_peer_received_bytes_total",
@@ -6337,7 +6341,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
"hide": false,
"interval": "",
"intervalFactor": 2,
@@ -6421,7 +6425,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Failure Rate",
"metric": "etcd_server_proposals_failed_total",
@@ -6437,7 +6441,7 @@
"step": 2
},
{
- "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Commit Rate",
"metric": "etcd_server_proposals_committed_total",
@@ -6445,7 +6449,7 @@
"step": 2
},
{
- "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Apply Rate",
"refId": "D",
@@ -6566,6 +6570,115 @@
"show": true
}
]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.4.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{job=\"$cluster\"}[$__rate_interval])))",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} Peer round trip time",
+ "metric": "etcd_network_peer_round_trip_time_seconds_bucket",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Peer round trip time",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:925",
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:926",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
}
],
"title": "New row"
@@ -6574,7 +6687,9 @@
"schemaVersion": 13,
"sharedCrosshair": false,
"style": "dark",
- "tags": [],
+ "tags": [
+ "etcd-mixin"
+ ],
"templating": {
"list": [
{
@@ -6583,7 +6698,7 @@
"value": "Prometheus"
},
"hide": 0,
- "label": null,
+ "label": "Data Source",
"name": "datasource",
"options": [],
"query": "prometheus",
@@ -6605,7 +6720,7 @@
"name": "cluster",
"options": [],
"query": "label_values(etcd_server_has_leader, job)",
- "refresh": 1,
+ "refresh": 2,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
@@ -11355,7 +11470,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11444,7 +11559,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11705,7 +11820,7 @@
],
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11714,7 +11829,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11723,7 +11838,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11732,7 +11847,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11741,7 +11856,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11750,7 +11865,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14120,7 +14235,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14209,7 +14324,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14470,7 +14585,7 @@
],
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14479,7 +14594,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14488,7 +14603,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14497,7 +14612,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14506,7 +14621,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14515,7 +14630,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17371,7 +17486,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17379,7 +17494,7 @@
"step": 10
},
{
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17468,7 +17583,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17476,7 +17591,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17928,7 +18043,7 @@
],
"targets": [
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17937,7 +18052,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17946,7 +18061,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17955,7 +18070,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17964,7 +18079,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17973,7 +18088,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -30449,21 +30564,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -30511,111 +30626,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -31396,21 +31623,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -31458,111 +31685,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -39322,13 +39661,27 @@
containerPort: 9100
protocol: TCP
livenessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
readinessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
resources: {}
volumeMounts:
- name: proc
@@ -39400,7 +39753,7 @@
- --metric-labels-allowlist=persistentvolumeclaims=[*]
- --telemetry-port=8081
imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
ports:
- containerPort: 8080
name: "http"
@@ -39448,17 +39801,17 @@
spec:
containers:
- name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
imagePullPolicy: "IfNotPresent"
args:
- --kubelet-service=kube-system/prometheus-kubelet
- --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
- --config-reloader-cpu-request=200m
- --config-reloader-cpu-limit=200m
- --config-reloader-memory-request=50Mi
- --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
- --web.enable-tls=true
- --web.cert-file=/cert/cert
- --web.key-file=/cert/key
@@ -39631,7 +39984,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10252
protocol: TCP
---
# Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +40056,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10251
protocol: TCP
---
# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39760,8 +40113,8 @@
port: http-web
pathPrefix: "/"
apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
replicaExternalLabelName: "replica"
externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
paused: false
@@ -40001,61 +40354,84 @@
groups:
- name: etcd
rules:
+ - alert: etcdMembersDown
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
+ summary: etcd cluster members are down.
+ expr: |-
+ max without (endpoint) (
+ sum without (instance) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count without (To) (
+ sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
+ )
+ )
+ > 0
+ for: 10m
+ labels:
+ severity: critical
- alert: etcdInsufficientMembers
annotations:
- message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
- expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
+ summary: etcd cluster has insufficient number of members.
+ expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: 3m
labels:
severity: critical
- alert: etcdNoLeader
annotations:
- message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ summary: etcd cluster has no leader.
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
- message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.'
- expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
- for: 15m
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ summary: etcd cluster has high number of leader changes.
+ expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
+ for: 5m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 1
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
expr: |-
- 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 5
for: 5m
labels:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
- message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
+ summary: etcd grpc requests are slow
expr: |-
- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
- message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster member communication is slow.
expr: |-
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
@@ -40064,53 +40440,64 @@
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of proposal failures.
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 1
+ for: 10m
+ labels:
+ severity: critical
- alert: etcdHighCommitDurations
annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile commit durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ - alert: etcdDatabaseQuotaLowSpace
annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.01
+ description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+ summary: etcd cluster database is running full.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
for: 10m
labels:
- severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ severity: critical
+ - alert: etcdExcessiveDatabaseGrowth
annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.05
+ description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
+ summary: etcd cluster database growing very fast.
+ expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
for: 10m
labels:
- severity: critical
- - alert: etcdHTTPRequestsSlow
+ severity: warning
+ - alert: etcdDatabaseHighFragmentationRatio
annotations:
- message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
- expr: |-
- histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
- > 0.15
+ description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
+ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
+ summary: etcd database size in use is less than 50% of the actual allocated storage.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
for: 10m
labels:
severity: warning
@@ -41126,11 +41513,11 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
- ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
+ ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
@@ -41259,7 +41646,7 @@
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
@@ -41290,7 +41677,7 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
@@ -41608,7 +41995,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
@@ -41616,7 +42003,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
- expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+ expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
@@ -41880,7 +42267,7 @@
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@@ -41953,9 +42340,9 @@
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
@@ -42031,10 +42418,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42045,10 +42432,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
expr: |-
(
- node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+ node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42156,15 +42543,15 @@
summary: Clock skew detected.
expr: |-
(
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
@@ -42175,9 +42562,9 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
@@ -42186,7 +42573,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
@@ -42195,7 +42582,7 @@
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
@@ -42363,7 +42750,7 @@
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
@@ -42620,6 +43007,15 @@
for: 5m
labels:
severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43154,12 @@
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: https
scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
@@ -42914,6 +43316,32 @@
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
relabelings:
- sourceLabels:
- __metrics_path__
@@ -43002,10 +43430,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
namespace: default
labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43630,7 @@
spec:
containers:
- name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- create
@@ -43247,7 +43675,7 @@
spec:
containers:
- name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- patch
Path: k8s/clusters/cluster-0/manifests/monitoring/kube-prometheus-stack/helm-release.yaml
Version: 35.6.2 -> 39.9.0
@@ -158,6 +158,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "current set of alerts stored in the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -251,6 +252,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid alerts received by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -364,6 +366,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "rate of successful and invalid notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -463,6 +466,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
+ "description": "latency of notifications sent by the Alertmanager",
"fill": 1,
"fillGradient": 0,
"gridPos": {
@@ -5685,7 +5689,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[5m]))",
+ "expr": "sum(rate(grpc_server_started_total{job=\"$cluster\",grpc_type=\"unary\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "RPC Rate",
@@ -5694,7 +5698,7 @@
"step": 2
},
{
- "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))",
+ "expr": "sum(rate(grpc_server_handled_total{job=\"$cluster\",grpc_type=\"unary\",grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "RPC Failed Rate",
@@ -5941,7 +5945,7 @@
"steppedLine": true,
"targets": [
{
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{instance}} WAL fsync",
@@ -5950,7 +5954,7 @@
"step": 4
},
{
- "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[5m])) by (instance, le))",
+ "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job=\"$cluster\"}[$__rate_interval])) by (instance, le))",
"intervalFactor": 2,
"legendFormat": "{{instance}} DB fsync",
"metric": "etcd_disk_backend_commit_duration_seconds_bucket",
@@ -6108,7 +6112,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_received_bytes_total{job=\"$cluster\"}[$__rate_interval])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic In",
"metric": "etcd_network_client_grpc_received_bytes_total",
@@ -6184,7 +6188,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[5m])",
+ "expr": "rate(etcd_network_client_grpc_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])",
"intervalFactor": 2,
"legendFormat": "{{instance}} Client Traffic Out",
"metric": "etcd_network_client_grpc_sent_bytes_total",
@@ -6260,7 +6264,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_received_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
"intervalFactor": 2,
"legendFormat": "{{instance}} Peer Traffic In",
"metric": "etcd_network_peer_received_bytes_total",
@@ -6337,7 +6341,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[5m])) by (instance)",
+ "expr": "sum(rate(etcd_network_peer_sent_bytes_total{job=\"$cluster\"}[$__rate_interval])) by (instance)",
"hide": false,
"interval": "",
"intervalFactor": 2,
@@ -6421,7 +6425,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_failed_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Failure Rate",
"metric": "etcd_server_proposals_failed_total",
@@ -6437,7 +6441,7 @@
"step": 2
},
{
- "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_committed_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Commit Rate",
"metric": "etcd_server_proposals_committed_total",
@@ -6445,7 +6449,7 @@
"step": 2
},
{
- "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[5m]))",
+ "expr": "sum(rate(etcd_server_proposals_applied_total{job=\"$cluster\"}[$__rate_interval]))",
"intervalFactor": 2,
"legendFormat": "Proposal Apply Rate",
"refId": "D",
@@ -6566,6 +6570,115 @@
"show": true
}
]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "decimals": 0,
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": false,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.4.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{job=\"$cluster\"}[$__rate_interval])))",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "{{instance}} Peer round trip time",
+ "metric": "etcd_network_peer_round_trip_time_seconds_bucket",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Peer round trip time",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:925",
+ "decimals": null,
+ "format": "s",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:926",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
}
],
"title": "New row"
@@ -6574,7 +6687,9 @@
"schemaVersion": 13,
"sharedCrosshair": false,
"style": "dark",
- "tags": [],
+ "tags": [
+ "etcd-mixin"
+ ],
"templating": {
"list": [
{
@@ -6583,7 +6698,7 @@
"value": "Prometheus"
},
"hide": 0,
- "label": null,
+ "label": "Data Source",
"name": "datasource",
"options": [],
"query": "prometheus",
@@ -6605,7 +6720,7 @@
"name": "cluster",
"options": [],
"query": "label_values(etcd_server_has_leader, job)",
- "refresh": 1,
+ "refresh": 2,
"regex": "",
"sort": 2,
"tagValuesQuery": "",
@@ -11355,7 +11470,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11444,7 +11559,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@@ -11705,7 +11820,7 @@
],
"targets": [
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11714,7 +11829,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11723,7 +11838,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11732,7 +11847,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11741,7 +11856,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -11750,7 +11865,7 @@
"step": 10
},
{
- "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
+ "expr": "sum by(namespace) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace!=\"\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14120,7 +14235,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14209,7 +14324,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{container!=\"\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@@ -14470,7 +14585,7 @@
],
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14479,7 +14594,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14488,7 +14603,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14497,7 +14612,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14506,7 +14621,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -14515,7 +14630,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17371,7 +17486,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17379,7 +17494,7 @@
"step": 10
},
{
- "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
+ "expr": "ceil(sum by(pod) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\",namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17468,7 +17583,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Reads",
@@ -17476,7 +17591,7 @@
"step": 10
},
{
- "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(pod) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$pod\"}[$__rate_interval]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Writes",
@@ -17928,7 +18043,7 @@
],
"targets": [
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17937,7 +18052,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\",device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17946,7 +18061,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17955,7 +18070,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17964,7 +18079,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -17973,7 +18088,7 @@
"step": 10
},
{
- "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
+ "expr": "sum by(container) (rate(container_fs_reads_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]) + rate(container_fs_writes_bytes_total{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\", container!=\"\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"}[$__rate_interval]))",
"format": "table",
"instant": true,
"intervalFactor": 2,
@@ -30449,21 +30564,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -30511,111 +30626,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -31396,21 +31623,21 @@
"steppedLine": false,
"targets": [
{
- "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} read",
"refId": "A"
},
{
- "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} written",
"refId": "B"
},
{
- "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+\"}[$__rate_interval])",
+ "expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\", device=~\"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)\"}[$__rate_interval])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{device}} io time",
@@ -31458,111 +31685,223 @@
]
},
{
- "aliasColors": {
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ },
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "yellow",
+ "value": 0.8
+ },
+ {
+ "color": "red",
+ "value": 0.9
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Mounted on"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 260
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Size"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 93
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 72
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Available"
+ },
+ "properties": [
+ {
+ "id": "custom.width",
+ "value": 88
+ }
+ ]
+ },
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Used, %"
+ },
+ "properties": [
+ {
+ "id": "unit",
+ "value": "percentunit"
+ },
+ {
+ "id": "custom.displayMode",
+ "value": "gradient-gauge"
+ },
+ {
+ "id": "max",
+ "value": 1
+ },
+ {
+ "id": "min",
+ "value": 0
+ }
+ ]
+ }
+ ]
},
- "bars": false,
- "dashLength": 10,
- "dashes": false,
- "datasource": "$datasource",
- "fill": 1,
- "fillGradient": 0,
"gridPos": {
},
"id": 7,
- "legend": {
- "alignAsTable": false,
- "avg": false,
- "current": false,
- "max": false,
- "min": false,
- "rightSide": false,
- "show": true,
- "sideWidth": null,
- "total": false,
- "values": false
- },
- "lines": true,
- "linewidth": 1,
- "links": [
-
- ],
- "nullPointMode": "null",
- "percentage": false,
- "pointradius": 5,
- "points": false,
- "renderer": "flot",
- "repeat": null,
- "seriesOverrides": [
- {
- "alias": "used",
- "color": "#E0B400"
- },
- {
- "alias": "available",
- "color": "#73BF69"
- }
- ],
- "spaceLength": 10,
"span": 6,
- "stack": true,
- "steppedLine": false,
"targets": [
{
- "expr": "sum(\n max by (device) (\n node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n -\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_size_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "used",
- "refId": "A"
+ "legendFormat": ""
},
{
- "expr": "sum(\n max by (device) (\n node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"}\n )\n)\n",
- "format": "time_series",
+ "expr": "max by (mountpoint) (node_filesystem_avail_bytes{job=\"node-exporter\", instance=\"$instance\", fstype!=\"\"})\n",
+ "format": "table",
+ "instant": true,
"intervalFactor": 2,
- "legendFormat": "available",
- "refId": "B"
+ "legendFormat": ""
}
],
- "thresholds": [
-
- ],
- "timeFrom": null,
- "timeShift": null,
"title": "Disk Space Usage",
- "tooltip": {
- "shared": true,
- "sort": 0,
- "value_type": "individual"
- },
- "type": "graph",
- "xaxis": {
- "buckets": null,
- "mode": "time",
- "name": null,
- "show": true,
- "values": [
+ "transformations": [
+ {
+ "id": "groupBy",
+ "options": {
+ "fields": {
+ "Value #A": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "Value #B": {
+ "aggregations": [
+ "lastNotNull"
+ ],
+ "operation": "aggregate"
+ },
+ "mountpoint": {
+ "aggregations": [
- ]
- },
- "yaxes": [
+ ],
+ "operation": "groupby"
+ }
+ }
+ }
+ },
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "merge",
+ "options": {
+
+ }
},
{
- "format": "bytes",
- "label": null,
- "logBase": 1,
- "max": null,
- "min": 0,
- "show": true
+ "id": "calculateField",
+ "options": {
+ "alias": "Used",
+ "binary": {
+ "left": "Value #A (lastNotNull)",
+ "operator": "-",
+ "reducer": "sum",
+ "right": "Value #B (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "calculateField",
+ "options": {
+ "alias": "Used, %",
+ "binary": {
+ "left": "Used",
+ "operator": "/",
+ "reducer": "sum",
+ "right": "Value #A (lastNotNull)"
+ },
+ "mode": "binary",
+ "reduce": {
+ "reducer": "sum"
+ }
+ }
+ },
+ {
+ "id": "organize",
+ "options": {
+ "excludeByName": {
+
+ },
+ "indexByName": {
+
+ },
+ "renameByName": {
+ "Value #A (lastNotNull)": "Size",
+ "Value #B (lastNotNull)": "Available",
+ "mountpoint": "Mounted on"
+ }
+ }
+ },
+ {
+ "id": "sortBy",
+ "options": {
+ "fields": {
+
+ },
+ "sort": [
+ {
+ "field": "Mounted on"
+ }
+ ]
+ }
}
- ]
+ ],
+ "transparent": false,
+ "type": "table"
}
],
"repeat": null,
@@ -39322,13 +39661,27 @@
containerPort: 9100
protocol: TCP
livenessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
readinessProbe:
+ failureThreshold: 3
httpGet:
+ httpHeaders:
path: /
port: 9100
+ scheme: HTTP
+ initialDelaySeconds: 0
+ periodSeconds: 10
+ successThreshold: 1
+ timeoutSeconds: 1
resources: {}
volumeMounts:
- name: proc
@@ -39400,7 +39753,7 @@
- --metric-labels-allowlist=persistentvolumeclaims=[*]
- --telemetry-port=8081
imagePullPolicy: IfNotPresent
- image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.1"
+ image: "registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.5.0"
ports:
- containerPort: 8080
name: "http"
@@ -39448,17 +39801,17 @@
spec:
containers:
- name: kube-prometheus-stack
- image: "quay.io/prometheus-operator/prometheus-operator:v0.56.3"
+ image: "quay.io/prometheus-operator/prometheus-operator:v0.58.0"
imagePullPolicy: "IfNotPresent"
args:
- --kubelet-service=kube-system/prometheus-kubelet
- --localhost=127.0.0.1
- - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.56.3
+ - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.58.0
- --config-reloader-cpu-request=200m
- --config-reloader-cpu-limit=200m
- --config-reloader-memory-request=50Mi
- --config-reloader-memory-limit=50Mi
- - --thanos-default-base-image=quay.io/thanos/thanos:v0.25.2
+ - --thanos-default-base-image=quay.io/thanos/thanos:v0.27.0
- --web.enable-tls=true
- --web.cert-file=/cert/cert
- --web.key-file=/cert/key
@@ -39631,7 +39984,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10252
protocol: TCP
---
# Source: kube-prometheus-stack/templates/exporters/kube-etcd/endpoints.yaml
@@ -39703,7 +40056,7 @@
- ip: 10.20.0.22
ports:
- name: http-metrics
- port:
+ port: 10251
protocol: TCP
---
# Source: kube-prometheus-stack/templates/prometheus-operator/admission-webhooks/mutatingWebhookConfiguration.yaml
@@ -39736,6 +40089,7 @@
namespace: default
name: prometheus-operator
path: /admission-prometheusrules/mutate
+ timeoutSeconds: 10
admissionReviewVersions: ["v1", "v1beta1"]
sideEffects: None
---
@@ -39760,8 +40114,8 @@
port: http-web
pathPrefix: "/"
apiVersion: v2
- image: "quay.io/prometheus/prometheus:v2.35.0"
- version: v2.35.0
+ image: "quay.io/prometheus/prometheus:v2.37.0"
+ version: v2.37.0
replicaExternalLabelName: "replica"
externalUrl: "http://prometheus.${SECRET_DOMAIN}/"
paused: false
@@ -40001,61 +40355,84 @@
groups:
- name: etcd
rules:
+ - alert: etcdMembersDown
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
+ summary: etcd cluster members are down.
+ expr: |-
+ max without (endpoint) (
+ sum without (instance) (up{job=~".*etcd.*"} == bool 0)
+ or
+ count without (To) (
+ sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
+ )
+ )
+ > 0
+ for: 10m
+ labels:
+ severity: critical
- alert: etcdInsufficientMembers
annotations:
- message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
- expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
+ description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).'
+ summary: etcd cluster has insufficient number of members.
+ expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: 3m
labels:
severity: critical
- alert: etcdNoLeader
annotations:
- message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.'
+ summary: etcd cluster has no leader.
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
- message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.'
- expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
- for: 15m
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+ summary: etcd cluster has high number of leader changes.
+ expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
+ for: 5m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
expr: |-
      - 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
      + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 1
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of failed grpc requests.
expr: |-
      - 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
      + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
- sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
+ sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 5
for: 5m
labels:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
- message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
+ summary: etcd grpc requests are slow
expr: |-
- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
+ histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
- message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster member communication is slow.
expr: |-
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
@@ -40064,53 +40441,64 @@
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
- message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster has high number of proposal failures.
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
+ - alert: etcdHighFsyncDurations
+ annotations:
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile fsync durations are too high.
+ expr: |-
+ histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
+ > 1
+ for: 10m
+ labels:
+ severity: critical
- alert: etcdHighCommitDurations
annotations:
- message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.'
+ summary: etcd cluster 99th percentile commit durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ - alert: etcdDatabaseQuotaLowSpace
annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.01
+ description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
+ summary: etcd cluster database is running full.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
for: 10m
labels:
- severity: warning
- - alert: etcdHighNumberOfFailedHTTPRequests
+ severity: critical
+ - alert: etcdExcessiveDatabaseGrowth
annotations:
- message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.'
- expr: |-
- sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
- BY (method) > 0.05
+ description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
+ summary: etcd cluster database growing very fast.
+ expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
for: 10m
labels:
- severity: critical
- - alert: etcdHTTPRequestsSlow
+ severity: warning
+ - alert: etcdDatabaseHighFragmentationRatio
annotations:
- message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.
- expr: |-
- histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
- > 0.15
+ description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
+ runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
+ summary: etcd database size in use is less than 50% of the actual allocated storage.
+ expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
for: 10m
labels:
severity: warning
@@ -41126,11 +41514,11 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
- sum by (namespace, pod) (
- max by(namespace, pod) (
+ sum by (namespace, pod, cluster) (
+ max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}
      - ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
- 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
      + ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
+ 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
@@ -41259,7 +41647,7 @@
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
- expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
+ expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
@@ -41290,7 +41678,7 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
expr: |-
- time() - max by(namespace, job_name) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
+ time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
@@ -41608,7 +41996,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
- expr: sum by(name, namespace)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+ expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
@@ -41616,7 +42004,7 @@
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
      - expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
      + expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
@@ -41880,7 +42268,7 @@
description: There are {{ $value }} different semantic versions of Kubernetes components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
- expr: count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
+ expr: count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@@ -41953,9 +42341,9 @@
record: instance:node_memory_utilisation:ratio
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[5m])
+ - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |-
sum without (device) (
@@ -42031,10 +42419,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 3% space left.
+ summary: Filesystem has less than 5% space left.
expr: |-
(
      - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
      + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42045,10 +42433,10 @@
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
- summary: Filesystem has less than 5% space left.
+ summary: Filesystem has less than 3% space left.
expr: |-
(
      - node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
      + node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
@@ -42156,15 +42544,15 @@
summary: Clock skew detected.
expr: |-
(
- node_timex_offset_seconds > 0.05
+ node_timex_offset_seconds{job="node-exporter"} > 0.05
and
- deriv(node_timex_offset_seconds[5m]) >= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
- node_timex_offset_seconds < -0.05
+ node_timex_offset_seconds{job="node-exporter"} < -0.05
and
- deriv(node_timex_offset_seconds[5m]) <= 0
+ deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
@@ -42175,9 +42563,9 @@
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
- min_over_time(node_timex_sync_status[5m]) == 0
+ min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
- node_timex_maxerror_seconds >= 16
+ node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
@@ -42186,7 +42574,7 @@
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
- expr: node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
+ expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
@@ -42195,7 +42583,7 @@
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
- expr: node_md_disks{state="failed"} > 0
+ expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
@@ -42363,7 +42751,7 @@
description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
summary: Prometheus operator not ready
- expr: min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
+ expr: min by (controller,namespace) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
@@ -42620,6 +43008,15 @@
for: 5m
labels:
severity: critical
+ - alert: PrometheusHighQueryLoad
+ annotations:
+ description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
+ summary: Prometheus is reaching its maximum capacity serving concurrent requests.
+ expr: avg_over_time(prometheus_engine_queries{job="prometheus-prometheus",namespace="default"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-prometheus",namespace="default"}[5m]) > 0.8
+ for: 15m
+ labels:
+ severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
@@ -42758,6 +43155,12 @@
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: https
scheme: https
+ metricRelabelings:
+ - action: drop
+ regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
+ sourceLabels:
+ - __name__
+ - le
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
@@ -42914,6 +43317,32 @@
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+ metricRelabelings:
+ - action: drop
+ regex: container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_memory_(mapped_file|swap)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_(file_descriptors|tasks_state|threads_max)
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: container_spec.*
+ sourceLabels:
+ - __name__
+ - action: drop
+ regex: .+;
+ sourceLabels:
+ - id
+ - pod
relabelings:
- sourceLabels:
- __metrics_path__
@@ -43002,10 +43431,10 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
- name: prometheus-thanos-discovery
+ name: prometheus-thanos-sidecar
namespace: default
labels:
- app: kube-prometheus-stack-thanos-discovery
+ app: kube-prometheus-stack-thanos-sidecar
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/instance: kube-prometheus-stack
app.kubernetes.io/part-of: kube-prometheus-stack
@@ -43202,7 +43631,7 @@
spec:
containers:
- name: create
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- create
@@ -43247,7 +43676,7 @@
spec:
containers:
- name: patch
- image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.1.1
+ image: k8s.gcr.io/ingress-nginx/kube-webhook-certgen:v1.2.0
imagePullPolicy: IfNotPresent
args:
- patch