helm-charts
loki-loki-distributed-ruler.loki.svc.cluster.local could not be resolved (3: Host not found)
Hello,
I'm getting an error on the Grafana dashboard: "Failed to load the data source configuration for Loki. Unable to fetch alert rules. Is the Loki data source properly configured?"
I checked the logs and found this error: "loki-loki-distributed-ruler.loki.svc.cluster.local could not be resolved (3: Host not found)".
/docker-entrypoint.sh: No files found in /docker-entrypoint.d/, skipping configuration
2023/09/04 14:20:51 [error] 10#10: *41 loki-loki-distributed-ruler.loki.svc.cluster.local could not be resolved (3: Host not found), client: 10.0.26.230, server: , request: "GET /prometheus/api/v1/rules HTTP/1.1", host: "loki-loki-distributed-gateway.loki"
10.0.26.230 - - [04/Sep/2023:14:20:51 +0000] 502 "GET /prometheus/api/v1/rules HTTP/1.1" 157 "-" "Grafana/10.0.3" "-"
2023/09/04 14:21:04 [error] 10#10: *41 loki-loki-distributed-ruler.loki.svc.cluster.local could not be resolved (3: Host not found), client: 10.0.26.230, server: , request: "GET /prometheus/api/v1/rules HTTP/1.1", host: "loki-loki-distributed-gateway.loki"
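For context: the gateway's nginx proxies /prometheus/api/v1/rules to the ruler Service, and it logs "Host not found" when that Service does not exist. The loki-distributed chart does not deploy the ruler by default, so a missing ruler is the usual cause. A quick check, assuming a release named loki in the loki namespace (adjust both to your install):

kubectl get svc -n loki | grep ruler
# No output means the ruler component was never deployed.
# Enable it in your override values and upgrade:
#   ruler:
#     enabled: true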
It's working for me. I'm using the loki-distributed Helm chart. If you are using the gateway instead of an ingress, you must set the Loki data source URL to the gateway service URL. If you are using an ingress instead of the gateway, you must point the Loki data source URL at your ingress host.
Example Helm override values.yaml configuration with AWS EKS + ELB:
loki:
  server:
    http_listen_port: 3100
  config: |
    auth_enabled: false
    server:
      {{- toYaml .Values.loki.server | nindent 6 }}
    common:
      compactor_address: http://{{ include "loki.compactorFullname" . }}:3100
    distributor:
      ring:
        kvstore:
          store: memberlist
    memberlist:
      join_members:
        - {{ include "loki.fullname" . }}-memberlist
    ingester:
      lifecycler:
        ring:
          kvstore:
            store: memberlist
          replication_factor: 1
      chunk_idle_period: 30m
      chunk_block_size: 262144
      chunk_encoding: snappy
      chunk_retain_period: 1m
      max_transfer_retries: 0
      wal:
        dir: /var/loki/wal
    limits_config:
      enforce_metric_name: false
      reject_old_samples: true
      reject_old_samples_max_age: 168h
      max_cache_freshness_per_query: 2m
      split_queries_by_interval: 2m
      max_entries_limit_per_query: 5000000
      retention_period: 2160h
      max_query_length: 2200h
    {{- if .Values.loki.schemaConfig}}
    schema_config:
      {{- toYaml .Values.loki.schemaConfig | nindent 2}}
    {{- end}}
    {{- if .Values.loki.storageConfig}}
    storage_config:
      {{- if .Values.indexGateway.enabled}}
      {{- $indexGatewayClient := dict "server_address" (printf "dns:///%s:9095" (include "loki.indexGatewayFullname" .)) }}
      {{- $_ := set .Values.loki.storageConfig.boltdb_shipper "index_gateway_client" $indexGatewayClient }}
      {{- end}}
      {{- toYaml .Values.loki.storageConfig | nindent 2}}
      {{- if .Values.memcachedIndexQueries.enabled }}
      index_queries_cache_config:
        memcached_client:
          addresses: dnssrv+_memcached-client._tcp.{{ include "loki.memcachedIndexQueriesFullname" . }}.{{ .Release.Namespace }}.svc.{{ .Values.global.clusterDomain }}
          consistent_hash: true
      {{- end}}
    {{- end}}
    runtime_config:
      file: /var/{{ include "loki.name" . }}-runtime/runtime.yaml
    chunk_store_config:
      max_look_back_period: 0s
      {{- if .Values.memcachedChunks.enabled }}
      chunk_cache_config:
        embedded_cache:
          enabled: false
        memcached_client:
          consistent_hash: true
          addresses: dnssrv+_memcached-client._tcp.{{ include "loki.memcachedChunksFullname" . }}.{{ .Release.Namespace }}.svc.{{ .Values.global.clusterDomain }}
      {{- end }}
      {{- if .Values.memcachedIndexWrites.enabled }}
      write_dedupe_cache_config:
        memcached_client:
          consistent_hash: true
          addresses: dnssrv+_memcached-client._tcp.{{ include "loki.memcachedIndexWritesFullname" . }}.{{ .Release.Namespace }}.svc.{{ .Values.global.clusterDomain }}
      {{- end }}
    table_manager:
      retention_deletes_enabled: false
      retention_period: 0s
    query_range:
      align_queries_with_step: true
      max_retries: 1
      cache_results: true
      results_cache:
        cache:
          {{- if .Values.memcachedFrontend.enabled }}
          memcached_client:
            addresses: dnssrv+_memcached-client._tcp.{{ include "loki.memcachedFrontendFullname" . }}.{{ .Release.Namespace }}.svc.{{ .Values.global.clusterDomain }}
            consistent_hash: true
          {{- else }}
          embedded_cache:
            enabled: true
            ttl: 24h
          {{- end }}
    frontend_worker:
      {{- if .Values.queryScheduler.enabled }}
      scheduler_address: {{ include "loki.querySchedulerFullname" . }}:9095
      {{- else }}
      frontend_address: {{ include "loki.queryFrontendFullname" . }}-headless:9095
      {{- end }}
    frontend:
      log_queries_longer_than: 5s
      compress_responses: true
      {{- if .Values.queryScheduler.enabled }}
      scheduler_address: {{ include "loki.querySchedulerFullname" . }}:9095
      {{- end }}
      tail_proxy_url: http://{{ include "loki.querierFullname" . }}:3100
    compactor:
      working_directory: /var/loki/compactor
      shared_store: s3
      compaction_interval: 10m
      retention_enabled: true
    ruler:
      storage:
        type: local
        local:
          directory: /etc/loki/rules
      ring:
        kvstore:
          store: memberlist
      rule_path: /tmp/loki/scratch
      enable_api: true
      enable_alertmanager_v2: true
      alertmanager_url: http://<YOUR_ALERTMANAGER_URL>
  schemaConfig:
    configs:
      - from: "2023-03-29"
        store: boltdb-shipper
        object_store: s3
        schema: v11
        index:
          period: 24h
          prefix: loki_index_
  storageConfig:
    filesystem: null
    aws:
      region: us-east-1
      bucketnames: BUCKET-NAME
      s3forcepathstyle: false
    boltdb_shipper:
      active_index_directory: /var/loki/boltdb-shipper-active
      shared_store: s3
      cache_location: /var/loki/boltdb-shipper-cache
      cache_ttl: 24h
serviceAccount:
  create: true
  name: loki
  imagePullSecrets: []
  annotations:
    eks.amazonaws.com/role-arn: Role-ARN
  automountServiceAccountToken: true
ingester:
  kind: StatefulSet
  nodeSelector: &node_selector
    role: logging
ruler:
  enabled: true
  nodeSelector: *node_selector
  directories:
    fake:
      rules.yml: |
        groups:
          - name: ProductionErrors
            interval: 1m
            rules:
              - alert: ProductionErrors
                expr: |
                  count_over_time({component!="", environment="prod"} |~ "[eE][rR][rR][oO][rR]" != "ERROR:root" | pattern `<message>` [1m])
                for: 0m
                labels:
                  severity: critical
                  receiver: slack_logs
                annotations:
                  summary: 'Errors count {{ $value }}.'
                  title: 'Production {{ $labels.app }} errors'
                  cluster: 'prod-eks'
                  description: 'The application/{{ $labels.app }} threw errors.'
                  message: '{{ $labels.message }}'
distributor:
  nodeSelector: *node_selector
querier:
  nodeSelector: *node_selector
queryFrontend:
  nodeSelector: *node_selector
compactor:
  nodeSelector: *node_selector
indexGateway:
  nodeSelector: *node_selector
memcachedExporter:
  enabled: true
memcachedChunks:
  enabled: true
  nodeSelector: *node_selector
memcachedFrontend:
  enabled: true
  nodeSelector: *node_selector
memcachedIndexQueries:
  enabled: true
  nodeSelector: *node_selector
memcachedIndexWrites:
  enabled: true
  nodeSelector: *node_selector
gateway:
  enabled: false
ingress:
  enabled: true
  ingressClassName: alb
  annotations:
    alb.ingress.kubernetes.io/certificate-arn: "CERT-ARN"
    alb.ingress.kubernetes.io/group.name: "LB-Group_Name"
    alb.ingress.kubernetes.io/healthcheck-path: "/ready"
    alb.ingress.kubernetes.io/healthcheck-port: "traffic-port"
    alb.ingress.kubernetes.io/load-balancer-name: "LB-Name"
    alb.ingress.kubernetes.io/scheme: "internal"
    alb.ingress.kubernetes.io/target-type: "ip"
    external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
  paths:
    distributor:
      - /api/prom/push
      - /loki/api/v1/push
    querier:
      - /api/prom/tail
      - /loki/api/v1/tail
    query-frontend:
      - /loki/api
    ruler:
      - /api/prom/rules
      - /loki/api/v1/rules
      - /prometheus/api/v1/rules
      - /prometheus/api/v1/alerts
  hosts:
    - LOKI-HOST
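To apply the override file, something like the following should work; the release name loki, the loki namespace, and the grafana chart repo are assumptions, adjust them to your setup:

helm repo add grafana https://grafana.github.io/helm-charts
helm upgrade --install loki grafana/loki-distributed \
  --namespace loki --create-namespace \
  -f values.yaml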
In Grafana, configure the data source like this:
URL: http(s)://<YOUR_INGRESS_HOST_NAME>
If you are using the gateway instead, disable the ingress, enable the gateway, and configure the data source like this:
URL: http://loki-distributed-gateway.<NAMESPACE>.svc.cluster.local
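If you provision the data source from a file instead of the UI, a minimal sketch looks like this; the data source name and the URL placeholder are assumptions, substitute your own:

apiVersion: 1
datasources:
  - name: Loki                # assumed data source name
    type: loki
    access: proxy
    # gateway service URL; use your ingress host here if the gateway is disabled
    url: http://loki-distributed-gateway.<NAMESPACE>.svc.cluster.local
    jsonData:
      manageAlerts: true      # lets Grafana fetch rules from the ruler API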
I suspect this is a bug. I have the same issue, and no one seems to have a solution for it.
This configuration has been working for me so far.
This was happening to me within the loki-gateway pod. Killing and restarting the pod solved it. I think it is caused by the DNS pod being restarted, as per this comment.