qryn
qryn copied to clipboard
Slow metrics requests
Version: 3.0.26-bun (dockerhub)
Promql request:
http_server_request_duration_seconds_bucket{job="Service", deployment_environment="Production", instance=~""}
request duration: 8 sec for 3 hours of metrics
Clickhouse request:
SELECT
count() AS series_n,
sum(length(values)) AS points_n
FROM
(
select
any(labels) as `stream`,
arraySort(groupArray((raw.timestamp_ms, raw.value))) as `values`
from
(
select
argMaxMerge(last) as `value`,
`fingerprint`,
intDiv(timestamp_ns, 15000000000) * 15000 as `timestamp_ms`,
min(time_series.labels) as `labels`
from
`metrics_15s_dist` as `metrics_15s` any
left join (
select
`fingerprint`,
arraySort(JSONExtractKeysAndValues(labels, 'String')) as `labels`
from
`qrynprod`.`time_series`
where
(
`fingerprint` in (
(
select
`fingerprint`
from
`qrynprod`.`time_series_gin` as `time_series_gin`
where
(
(
(
(`key` = 'job')
and (`val` = 'Service')
)
or (
(`key` = 'deployment_environment')
and (`val` = 'Production')
)
or (
(`key` = 'instance')
and (match(val, '') = 1)
)
or (
(`key` = '__name__')
and (
`val` = 'http_server_request_duration_seconds_bucket'
)
)
)
and (`date` >= toDate(fromUnixTimestamp(1703042340)))
and (`date` <= toDate(fromUnixTimestamp(1703063940)))
)
group by
`fingerprint`
having
(
groupBitOr(
bitShiftLeft(
(
(`key` = 'job')
and (`val` = 'Service')
) :: UInt64,
0
) + bitShiftLeft(
(
(`key` = 'deployment_environment')
and (`val` = 'Production')
) :: UInt64,
1
) + bitShiftLeft(
(
(`key` = 'instance')
and (match(val, '') = 1)
) :: UInt64,
2
) + bitShiftLeft(
(
(`key` = '__name__')
and (
`val` = 'http_server_request_duration_seconds_bucket'
)
) :: UInt64,
3
)
) = 15
)
)
)
)
) as time_series on `time_series`.`fingerprint` = metrics_15s.fingerprint
where
(
(
`fingerprint` in (
(
select
`fingerprint`
from
`qrynprod`.`time_series_gin` as `time_series_gin`
where
(
(
(
(`key` = 'job')
and (`val` = 'Service')
)
or (
(`key` = 'deployment_environment')
and (`val` = 'Production')
)
or (
(`key` = 'instance')
and (match(val, '') = 1)
)
or (
(`key` = '__name__')
and (
`val` = 'http_server_request_duration_seconds_bucket'
)
)
)
and (`date` >= toDate(fromUnixTimestamp(1703042340)))
and (`date` <= toDate(fromUnixTimestamp(1703063940)))
)
group by
`fingerprint`
having
(
groupBitOr(
bitShiftLeft(
(
(`key` = 'job')
and (`val` = 'Service')
) :: UInt64,
0
) + bitShiftLeft(
(
(`key` = 'deployment_environment')
and (`val` = 'Production')
) :: UInt64,
1
) + bitShiftLeft(
(
(`key` = 'instance')
and (match(val, '') = 1)
) :: UInt64,
2
) + bitShiftLeft(
(
(`key` = '__name__')
and (
`val` = 'http_server_request_duration_seconds_bucket'
)
) :: UInt64,
3
)
) = 15
)
)
)
)
and (`timestamp_ns` >= 1703042340000000000)
and (`timestamp_ns` <= 1703063940000000000)
)
group by
`fingerprint`,
`timestamp_ms`
order by
`fingerprint`,
`timestamp_ms`
) as `raw`
group by
`raw`.`fingerprint`
order by
`raw`.`fingerprint`
) AS a
1 rows in set. Elapsed: 0.992 sec. Processed 115.41 million rows, 6.01 GB (116.40 million rows/s., 6.06 GB/s.)
976 rows in set. Elapsed: 2.863 sec. Processed 114.24 million rows, 8.16 GB (39.90 million rows/s., 2.85 GB/s.) for 6 hours of metrics
here you can find my qryn settings:
qryn-read:
image: qxip/qryn:3.0.26-bun
restart: unless-stopped
environment:
CLICKHOUSE_SERVER: clickhouse
CLUSTER_NAME: cluster
CLICKHOUSE_PORT: 8123
CLICKHOUSE_DB: qrynprod
CLICKHOUSE_AUTH: "default:{{ clickhouse.password }}"
CLICKHOUSE_PROTO: http
LABELS_DAYS: 3
SAMPLES_DAYS: 3
HOST: 0.0.0.0
PORT: 3100
FASTIFY_METRICS: "false"
READONLY: "true"
NODE_OPTIONS: "--max-old-space-size=61440"
ADVANCED_PROMETHEUS_MAX_SAMPLES: 5000000
@deathalt Can you please specify the rewults of the request you mentioned earlier? There should be one row like:
┌─series_n─┬─points_n─┐
│ 3101 │ 21707 │
└──────────┴──────────┘
@akvlad
┌─series_n─┬─points_n─┐
│ 976 │ 351536│
└──────────┴──────────┘
Looks like I was able to almost completely get rid of the problem.
I organized access to clickhouse via chproxy and added caches there.
I added incremental requests and caches in grafana prometheus datasource.
Now everything looks pretty good and due to the fact that I don't have to make a bunch of big queries metrics are displayed almost in real time.
Maybe it will be useful for someone.
@deathalt feel free to share a config or PR so we can add a chproxy example to our docker demo 👍
@lmangani https://github.com/metrico/qryn-oss-demo/pull/7 done