grafana-dashboards icon indicating copy to clipboard operation
grafana-dashboards copied to clipboard

Add a heatmap for event loop durations

Open yuvipanda opened this issue 4 months ago • 0 comments

We capture an event loop duration metric, which is extremely helpful for figuring out whether the hub process itself is getting 'stuck'. It is much more helpful than the 99th percentile of hub response latencies.

Image

This is what it looks like during a stress test. That's far too many >1s invocations!

Image

This is what it looks like when we have a few hundred users joining at the same time. Acceptable but still too many >1s hits.

Most of these are probably database queries that should either be made async or be optimized (like in https://github.com/jupyterhub/jupyterhub/pull/5109). Unlike the query addressed in that PR, these are probably being made on every request.

Here's the JSON model of this dashboard panel for future reference:

{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "grafana",
          "uid": "-- Grafana --"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
  "id": 44,
  "links": [],
  "panels": [
    {
      "datasource": {
        "default": false,
        "type": "prometheus",
        "uid": "B5M_zxhnz"
      },
      "fieldConfig": {
        "defaults": {
          "custom": {
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "scaleDistribution": {
              "type": "linear"
            }
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 0
      },
      "id": 1,
      "options": {
        "calculate": false,
        "cellGap": 1,
        "color": {
          "exponent": 0.5,
          "fill": "dark-orange",
          "mode": "scheme",
          "reverse": false,
          "scale": "exponential",
          "scheme": "Viridis",
          "steps": 64
        },
        "exemplars": {
          "color": "rgba(255,0,255,0.7)"
        },
        "filterValues": {
          "le": 1e-9
        },
        "legend": {
          "show": true
        },
        "rowsFrame": {
          "layout": "auto"
        },
        "tooltip": {
          "mode": "single",
          "showColorScale": false,
          "yHistogram": false
        },
        "yAxis": {
          "axisPlacement": "left",
          "reverse": false,
          "unit": "dtdurations"
        }
      },
      "pluginVersion": "12.0.1",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "B5M_zxhnz"
          },
          "editorMode": "code",
          "expr": "sum(increase(jupyterhub_event_loop_interval_seconds_bucket{namespace=\"workshop\"}[2m])) by (le)",
          "format": "heatmap",
          "hide": false,
          "instant": false,
          "interval": "2m",
          "legendFormat": "{{le}}",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Event Loop Duration distribution",
      "type": "heatmap"
    }
  ],
  "preload": false,
  "refresh": "",
  "schemaVersion": 41,
  "tags": [],
  "templating": {
    "list": []
  },
  "time": {
    "from": "now-1h",
    "to": "now"
  },
  "timepicker": {},
  "timezone": "browser",
  "title": "New dashboard",
  "uid": "aei94kc647u2of",
  "version": 4
}

yuvipanda avatar Jul 22 '25 18:07 yuvipanda