
Thanos Query not able to fetch data from Thanos Store

Open ramesh-kumarjha opened this issue 2 years ago • 13 comments

Only 2 hours of data is visible in the Grafana dashboard. I have checked Prometheus and it also only has 2 hours of data, yet the data is being pushed to the S3 bucket. My terragrunt.hcl file in the eks-addons folder is:

include {
  path = "${find_in_parent_folders()}"
}

terraform {
  source = "github.com/particuleio/terraform-kubernetes-addons.git//modules/aws?ref=v2.1.0"
}

dependency "eks" {
  config_path = "../eks"

  mock_outputs = {
    cluster_id              = "cluster-name"
    cluster_oidc_issuer_url = "https://oidc.eks.eu-west-3.amazonaws.com/id/0000000000000000"
  }
}

dependency "vpc" {
  config_path = "../vpc"

  mock_outputs = {
    private_subnets_cidr_blocks = [
      "privateip.cidr",
      "privateip.cidr"
    ]
  }
}

generate "provider" {
  path      = "provider.tf"
  if_exists = "overwrite"
  contents  = <<-EOF
    provider "aws" {
      region = "${local.aws_region}"
    }
    provider "kubectl" {
      host                   = data.aws_eks_cluster.cluster.endpoint
      cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
      token                  = data.aws_eks_cluster_auth.cluster.token
      load_config_file       = false
    }
    provider "kubernetes" {
      host                   = data.aws_eks_cluster.cluster.endpoint
      cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
      token                  = data.aws_eks_cluster_auth.cluster.token
    }
    provider "helm" {
      kubernetes {
        host                   = data.aws_eks_cluster.cluster.endpoint
        cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
        token                  = data.aws_eks_cluster_auth.cluster.token
      }
    }
    data "aws_eks_cluster" "cluster" {
      name = var.cluster-name
    }
    data "aws_eks_cluster_auth" "cluster" {
      name = var.cluster-name
    }
  EOF
}

locals {
  aws_region = yamldecode(file("${find_in_parent_folders("region_values.yaml")}"))["aws_region"]
  custom_tags = merge(
    yamldecode(file("${find_in_parent_folders("global_tags.yaml")}")),
    yamldecode(file("${find_in_parent_folders("env_tags.yaml")}"))
  )
  default_domain_name   = yamldecode(file("${find_in_parent_folders("global_values.yaml")}"))["default_domain_name"]
  default_domain_suffix = "${local.custom_tags["Env"]}.${local.custom_tags["Project"]}.${local.default_domain_name}"
}

inputs = {

  cluster-name = dependency.eks.outputs.cluster_id

  tags = merge(
    local.custom_tags
  )

  eks = {
    "cluster_oidc_issuer_url" = dependency.eks.outputs.cluster_oidc_issuer_url
  }

  aws-ebs-csi-driver = {
    enabled          = true
    is_default_class = true
  }

  aws-for-fluent-bit = {
    enabled = true
  }

  # test this with nginx controller
  aws-load-balancer-controller = {
    enabled = true
  }

  aws-node-termination-handler = {
    enabled = false
  }

  calico = {
    enabled = true
  }

  cert-manager = {
    enabled                   = false
    acme_email                = "[email protected]"
    acme_http01_enabled       = true
    acme_http01_ingress_class = "nginx"
    acme_dns01_enabled        = true
    allowed_cidrs             = dependency.vpc.outputs.private_subnets_cidr_blocks
    experimental_csi_driver   = true
  }

  cluster-autoscaler = {
    enabled = true
  }

  cni-metrics-helper = {
    enabled = false
  }

  external-dns = {
    external-dns = {
      enabled = true
    },
  }

  ingress-nginx = {
    enabled       = true
    use_l7        = true
    allowed_cidrs = dependency.vpc.outputs.private_subnets_cidr_blocks
  }

  istio-operator = {
    enabled = false
  }

  karma = {
    enabled = false
  }

  keycloak = {
    enabled = false
  }

  kong = {
    enabled = false
  }

  kube-prometheus-stack = {
    enabled                     = true
    allowed_cidrs               = dependency.vpc.outputs.private_subnets_cidr_blocks
    thanos_sidecar_enabled      = true
    thanos_bucket_force_destroy = true
    extra_values                = <<-EXTRA_VALUES
      grafana:
        deploymentStrategy:
          type: Recreate
        ingress:
          enabled: true
          #paths:
          #  - /grafana
          annotations:
            kubernetes.io/ingress.class: nginx
            cert-manager.io/cluster-issuer: "letsencrypt"
          hosts:
            - grafana.${local.default_domain_suffix}
          #tls:
          #  - secretName: grafana.${local.default_domain_suffix}
          #    hosts:
          #      - grafana.${local.default_domain_suffix}
        persistence:
          enabled: true
          storageClassName: ebs-sc
          accessModes:
            - ReadWriteOnce
          size: 1Gi
      prometheus:
        ingress:
          enabled: true
          #paths:
          #  - /prometheus
          annotations:
            kubernetes.io/ingress.class: nginx
            cert-manager.io/cluster-issuer: "letsencrypt"
          hosts:
            - prometheus.${local.default_domain_suffix}
          #tls:
          #  - secretName: prometheus.${local.default_domain_suffix}
          #    hosts:
          #      - prometheus.${local.default_domain_suffix}
        prometheusSpec:
          additionalScrapeConfigs:
            - job_name: 'divum'
              scrape_interval: 5s
              ec2_sd_configs:
                - region: ap-south-1
                  port: 9100
                  # This should not be here!
                  # check: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config, https://github.com/prometheus/prometheus/issues/5738, https://www.robustperception.io/automatically-monitoring-ec2-instances
                  access_key: xxxxxxxxxx
                  secret_key: xyz
              relabel_configs:
                - source_labels: [__meta_ec2_tag_Name]
                  action: keep
                - source_labels: [__meta_ec2_tag_Name]
                  target_label: instance
                - source_labels: [__meta_ec2_public_ip]
                  target_label: ip
                - source_labels: [__meta_ec2_tag_release_env,__meta_ec2_tag_service_name]
                  separator: ' | '
                  target_label: job
          replicas: 1
          retention: 2d
          retentionSize: "6GB"
          ruleSelectorNilUsesHelmValues: false
          serviceMonitorSelectorNilUsesHelmValues: false
          podMonitorSelectorNilUsesHelmValues: false
          storageSpec:
            volumeClaimTemplate:
              spec:
                storageClassName: ebs-sc
                accessModes: ["ReadWriteOnce"]
                resources:
                  requests:
                    storage: 10Gi
      alertmanager:
        ingress:
          enabled: true
          #paths:
          #  - /alert-manager
          annotations:
            kubernetes.io/ingress.class: nginx
            cert-manager.io/cluster-issuer: "letsencrypt"
          hosts:
            - alert-manager.${local.default_domain_suffix}
          #tls:
          #  - secretName: alert-manager.${local.default_domain_suffix}
          #    hosts:
          #      - alert-manager.${local.default_domain_suffix}
      EXTRA_VALUES
  }

  loki-stack = {
    enabled              = false
    bucket_force_destroy = true
  }

  metrics-server = {
    enabled       = true
    allowed_cidrs = dependency.vpc.outputs.private_subnets_cidr_blocks
  }

  npd = {
    enabled = false
  }

  sealed-secrets = {
    enabled = false
  }

  thanos = {
    enabled              = true
    generate_ca          = true
    bucket_force_destroy = true
  }

}
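
A quick sanity check that this config actually deployed every Thanos piece is to list the pods and containers. This is only a sketch, assuming everything runs in the monitoring namespace as the logs later in this thread suggest:

  # List the Prometheus/Thanos pods created by kube-prometheus-stack and the thanos module
  kubectl -n monitoring get pods | grep -E 'thanos|prometheus'

  # With thanos_sidecar_enabled = true the Prometheus pod should contain a
  # "thanos-sidecar" container next to the prometheus container
  kubectl -n monitoring get pod prometheus-kube-prometheus-stack-prometheus-0 \
    -o jsonpath='{.spec.containers[*].name}'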

thanos sidecar

  • --prometheus.url=http://127.0.0.1:9090/
  • --grpc-address=[$(POD_IP)]:10901
  • --http-address=[$(POD_IP)]:10902
  • --objstore.config=$(OBJSTORE_CONFIG)
  • --tsdb.path=/prometheus
  • --log.level=info
  • --log.format=logfmt

thanos query

  • --log.level=info
  • --log.format=logfmt
  • --grpc-address=0.0.0.0:10901
  • --http-address=0.0.0.0:10902
  • --query.replica-label=prometheus_replica
  • --store=dnssrv+_grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local
  • --store=dnssrv+_grpc._tcp.thanos-storegateway.monitoring.svc.cluster.local
  • --query.timeout=5m
  • --query.lookback-delta=15m
  • --query.replica-label=rule_replica
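
Since the query component discovers its stores through the dnssrv+ entries above, it is worth confirming that those SRV records actually resolve inside the cluster. A minimal check (the dnsutils image below is just one option, not something from this setup; any image that ships dig works):

  # Resolve the SRV records thanos query relies on for store discovery
  kubectl -n monitoring run dns-check --rm -it --restart=Never \
    --image=registry.k8s.io/e2e-test-images/jessie-dnsutils:1.3 -- \
    dig SRV _grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local

  kubectl -n monitoring run dns-check --rm -it --restart=Never \
    --image=registry.k8s.io/e2e-test-images/jessie-dnsutils:1.3 -- \
    dig SRV _grpc._tcp.thanos-storegateway.monitoring.svc.cluster.local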

thanos store

  • --log.level=info
  • --log.format=logfmt
  • --grpc-address=0.0.0.0:10901
  • --http-address=0.0.0.0:10902
  • --data-dir=/data
  • --objstore.config-file=/conf/objstore.yml
  • --ignore-deletion-marks-delay=24h
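
The store gateway reads its bucket credentials from the file passed with --objstore.config-file above. A minimal S3 objstore config normally has the shape sketched in the comments below (bucket, endpoint and region are placeholders, not values from this setup), and the file the running pod actually mounts can be dumped directly:

  # Typical shape of /conf/objstore.yml (placeholders only):
  #
  #   type: S3
  #   config:
  #     bucket: "my-thanos-bucket"
  #     endpoint: "s3.ap-south-1.amazonaws.com"
  #     region: "ap-south-1"
  #
  # Dump what the storegateway really mounts (works if the image ships a shell/cat):
  kubectl -n monitoring exec thanos-storegateway-0 -- cat /conf/objstore.yml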

Could you please help me out? This is running in a prod env on an EKS cluster. In Grafana the datasource is Prometheus and the URL is http://thanos-query-frontend:9090.

ramesh-kumarjha avatar Feb 16 '22 07:02 ramesh-kumarjha

@ramesh-kumarjha Could you post more readable logs please, with ``` like this:

thanos = {
  enabled = true
  generate_ca = true
  bucket_force_destroy = true
}

Also, do you have the logs of the thanos storegateway, and did you check the web UI of the thanos query component to see if it is correctly connected to all the other Thanos stores?

With k -n monitoring port-forward thanos-query-687f8bb88d-fjgh5 10902, for example, you should see the different stores that connect to thanos query. If the store is not there, check the logs of the storage pods first, then the target discovery on the thanos query web UI (see the sketch below).
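
For instance, a rough version of that check from a local shell (pod name taken from the command above; /api/v1/stores is the JSON counterpart of the web UI's Stores page):

  # Forward the query component's HTTP port locally
  kubectl -n monitoring port-forward thanos-query-687f8bb88d-fjgh5 10902 &

  # List every StoreAPI thanos query currently sees, with health and min/max time ranges;
  # both the sidecar and the storegateway should show up here
  curl -s http://localhost:10902/api/v1/stores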

ArchiFleKs avatar Feb 16 '22 08:02 ArchiFleKs

thanos store gateway log:

level=info ts=2022-02-17T05:56:17.43673342Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.236173081s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T05:56:17.540376268Z caller=bucket.go:513 msg="loaded new block" elapsed=92.706563ms id=01FW33ZXWA3ZJF7CH388D5AF5P
level=info ts=2022-02-17T05:59:17.508499603Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.307940491s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:02:17.461234825Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.260705933s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:05:18.038379147Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.837820605s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:08:17.455223976Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.254691351s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:11:17.612413666Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.41186025s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:14:17.429340455Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.228784226s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:17:17.431249292Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.230694789s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:20:17.42373825Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.223204792s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:23:17.34807254Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.147537653s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:26:17.397752675Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.197219232s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:29:17.542319718Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.341761633s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:32:17.677187671Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.476629429s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:35:17.519868831Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.319311406s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:38:17.565135446Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.3645836s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:41:17.621925009Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.421476717s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:44:17.603466758Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.402927503s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:47:17.489051528Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.288518499s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:50:17.689273179Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.488684121s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:53:17.484109896Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.283572883s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:56:17.463296518Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.262759675s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T06:59:17.768505986Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.567968554s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:02:17.568864154Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.368335433s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:05:17.521485233Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.320928251s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:08:17.519714232Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.319162799s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:11:17.376375946Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.175820356s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:14:17.491599933Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.291031802s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:17:17.406186544Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.205635847s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:20:17.550942515Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.350388626s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:23:17.506301026Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.305749563s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:26:17.705765217Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.505231531s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:29:17.441927977Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.241371459s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:32:17.763026145Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.562467439s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:35:17.576272397Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.375736872s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:38:17.536328301Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.33576825s cached=1436 returned=1436 partial=0
level=info ts=2022-02-17T07:41:17.428154034Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.227627146s cached=1436 returned=1436 partial=0

ramesh-kumarjha avatar Feb 17 '22 07:02 ramesh-kumarjha

Screenshot 2022-02-17 at 1 16 49 PM

ramesh-kumarjha avatar Feb 17 '22 08:02 ramesh-kumarjha

kubectl logs -f thanos-query-676554596f-2wpnb -n monitoring
level=error ts=2022-02-16T18:09:53.788310173Z caller=resolver.go:99 msg="failed to lookup SRV records" host=_grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local err="lookup _grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local on 172.20.0.10:53: no such host"
level=error ts=2022-02-16T18:10:23.787361183Z caller=resolver.go:99 msg="failed to lookup SRV records" host=_grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local err="lookup _grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local on 172.20.0.10:53: no such host"
level=error ts=2022-02-16T18:10:53.792948581Z caller=resolver.go:99 msg="failed to lookup SRV records" host=_grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local err="lookup _grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local on 172.20.0.10:53: no such host"
level=info ts=2022-02-16T18:11:28.778535607Z caller=storeset.go:408 component=storeset msg="adding new storeAPI to query storeset" address=10.32.28.123:10901 extLset="{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}"
level=warn ts=2022-02-16T18:33:48.635408868Z caller=proxy.go:279 component=proxy request="min_time:1645014829000 max_time:1645036429000 matchers:<name:\"cluster\" > matchers:<name:\"__name__\" value:\"es_jvm_uptime_seconds\" > max_resolution_window:9223372036854775807 aggregates:COUNT aggregates:SUM skip_chunks:true " err="No StoreAPIs matched for this query" stores="store Addr: 10.32.28.123:10901 LabelSets: {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"} Mint: 1645020000051 Maxt: 9223372036854775807 filtered out: external labels [{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}] does not match request label matchers: [cluster=\"\" __name__=\"es_jvm_uptime_seconds\"];store Addr: 172.20.119.125:10901 LabelSets: {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"},{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\"} Mint: 1627285712587 Maxt: 1645027200000 filtered out: external labels [{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\"} {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}] does not match request label matchers: [cluster=\"\" __name__=\"es_jvm_uptime_seconds\"]"
level=warn ts=2022-02-16T18:33:48.743646723Z caller=proxy.go:279 component=proxy request="min_time:1645013700000 max_time:1645036200000 matchers:<name:\"cluster\" > matchers:<name:\"__name__\" value:\"es_cluster_pending_tasks_number\" > aggregates:MAX " err="No StoreAPIs matched for this query" stores="store Addr: 10.32.28.123:10901 LabelSets: {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"} Mint: 1645020000051 Maxt: 9223372036854775807 filtered out: external labels [{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}] does not match request label matchers: [cluster=\"\" __name__=\"es_cluster_pending_tasks_number\"];store Addr: 172.20.119.125:10901 LabelSets: {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"},{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\"} Mint: 1627285712587 Maxt: 1645027200000 filtered out: external labels [{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\"} {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}] does not match request label matchers: [cluster=\"\" __name__=\"es_cluster_pending_tasks_number\"]"
level=warn ts=2022-02-16T18:33:48.747312189Z caller=proxy.go:279 component=proxy request="min_time:1645013700000 max_time:1645036200000 matchers:<name:\"cluster\" > matchers:<name:\"__name__\" value:\"es_cluster_datanodes_number\" > aggregates:MAX " err="No StoreAPIs matched for this query" stores="store Addr: 10.32.28.123:10901 LabelSets: {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"} Mint: 1645020000051 Maxt: 9223372036854775807 filtered out: external labels [{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}] does not match request label matchers: [cluster=\"\" __name__=\"es_cluster_datanodes_number\"];store Addr: 172.20.119.125:10901 LabelSets: {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"},{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\"} Mint: 1627285712587 Maxt: 1645027200000 filtered out: external labels [{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\"} {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}] does not match request label matchers: [cluster=\"\" __name__=\"es_cluster_datanodes_number\"]"
level=warn ts=2022-02-16T18:33:48.787842511Z caller=proxy.go:279 component=proxy request="min_time:1645013700000 max_time:1645036200000 matchers:<name:\"cluster\" > matchers:<name:\"type\" value:\"()\" > matchers:<name:\"__name__\" value:\"es_cluster_shards_number\" > aggregates:MAX " err="No StoreAPIs matched for this query" stores="store Addr: 10.32.28.123:10901 LabelSets: {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"} Mint: 1645020000051 Maxt: 9223372036854775807 filtered out: external labels [{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}] does not match request label matchers: [cluster=\"\" type=\"()\" __name__=\"es_cluster_shards_number\"];store Addr: 172.20.119.125:10901 LabelSets: {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"},{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\"} Mint: 1627285712587 Maxt: 1645027200000 filtered out: external labels [{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\"} {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}] does not match request label matchers: [cluster=\"\" type=\"()\" __name__=\"es_cluster_shards_number\"]"
level=warn ts=2022-02-16T18:33:48.794330003Z caller=proxy.go:279 component=proxy request="min_time:1645013820000 max_time:1645036320000 matchers:<name:\"cluster\" > matchers:<type:RE name:\"node\" value:\"()\" > matchers:<name:\"__name__\" value:\"es_os_cpu_percent\" > aggregates:COUNT aggregates:SUM " err="No StoreAPIs matched for this query" stores="store Addr: 172.20.119.125:10901 LabelSets: {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"},{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\"} Mint: 1627285712587 Maxt: 1645027200000 filtered out: external labels [{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\"} {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}] does not match request label matchers: [cluster=\"\" node=~\"()\" __name__=\"es_os_cpu_percent\"];store Addr: 10.32.28.123:10901 LabelSets: {cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"} Mint: 1645020000051 Maxt: 9223372036854775807 filtered out: external labels [{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}] does not match request label matchers: [cluster=\"\" node=~\"()\" __name__=\"es_os_cpu_percent\"]"
level=warn ts=2022-02-17T05:52:53.774512491Z caller=storeset.go:490 component=storeset msg="update of store node failed" err="getting metadata: fetching store info from 10.32.28.123:10901: rpc error: code = DeadlineExceeded desc = latest balancer error: connection error: desc = \"transport: Error while dialing dial tcp 10.32.28.123:10901: connect: no route to host\"" address=10.32.28.123:10901
level=info ts=2022-02-17T05:52:53.775126586Z caller=storeset.go:379 component=storeset msg="removing store because it's unhealthy or does not exist" address=10.32.28.123:10901 extLset="{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}"
level=error ts=2022-02-17T05:52:53.792607013Z caller=resolver.go:99 msg="failed to lookup SRV records" host=_grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local err="lookup _grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local on 172.20.0.10:53: no such host"
level=warn ts=2022-02-17T05:52:58.775687286Z caller=storeset.go:490 component=storeset msg="update of store node failed" err="getting metadata: fetching store info from 10.32.28.123:10901: rpc error: code = DeadlineExceeded desc = latest balancer error: connection error: desc = \"transport: Error while dialing dial tcp 10.32.28.123:10901: connect: no route to host\"" address=10.32.28.123:10901
level=error ts=2022-02-17T05:53:23.78259837Z caller=resolver.go:99 msg="failed to lookup SRV records" host=_grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local err="lookup _grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local on 172.20.0.10:53: no such host"
level=error ts=2022-02-17T05:53:53.777864813Z caller=resolver.go:99 msg="failed to lookup SRV records" host=_grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local err="lookup _grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local on 172.20.0.10:53: no such host"
level=error ts=2022-02-17T05:54:23.790957491Z caller=resolver.go:99 msg="failed to lookup SRV records" host=_grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local err="lookup _grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local on 172.20.0.10:53: no such host"
level=error ts=2022-02-17T05:54:53.779371431Z caller=resolver.go:99 msg="failed to lookup SRV records" host=_grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local err="lookup _grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local on 172.20.0.10:53: no such host"
level=error ts=2022-02-17T05:55:23.795386995Z caller=resolver.go:99 msg="failed to lookup SRV records" host=_grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local err="lookup _grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local on 172.20.0.10:53: no such host"
level=info ts=2022-02-17T05:55:58.787607673Z caller=storeset.go:408 component=storeset msg="adding new storeAPI to query storeset" address=10.32.29.91:10901 extLset="{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}"
 kubectl logs -f thanos-query-frontend-5d69565ff8-5lcdz -n monitoring
 level=info ts=2022-02-15T18:55:53.650782524Z caller=query_frontend.go:252 msg="starting query frontend"
level=info ts=2022-02-15T18:55:53.651102847Z caller=intrumentation.go:48 msg="changing probe status" status=ready
level=info ts=2022-02-15T18:55:53.651278728Z caller=intrumentation.go:60 msg="changing probe status" status=healthy
level=info ts=2022-02-15T18:55:53.651310029Z caller=http.go:62 service=http/server component=query-frontend msg="listening for requests and metrics" address=0.0.0.0:10902
level=info ts=2022-02-16T14:06:02.193099491Z caller=handler.go:159 org_id=anonymous msg="slow query detected" method=GET host=thanos-query-frontend:9090 path=/api/v1/query_range time_taken=10.555474322s param_start=1645016745 param_end=1645020345 param_step=15 param_query="sum(rate(node_cpu_seconds_total{alias=\"\"}[5s])) by (mode) * 100 / count(node_cpu_seconds_total{alias=\"\"}) by (mode) or sum(irate(node_cpu_seconds_total{alias=\"\"}[5m])) by (mode) * 100 / count(node_cpu_seconds_total{alias=\"\"}) by (mode)"
level=info ts=2022-02-16T14:06:34.784519089Z caller=handler.go:159 org_id=anonymous msg="slow query detected" method=GET host=thanos-query-frontend:9090 path=/api/v1/query_range time_taken=11.918820291s param_start=1645016775 param_end=1645020375 param_step=15 param_query="sum(rate(node_cpu_seconds_total{alias=\"\"}[5s])) by (mode) * 100 / count(node_cpu_seconds_total{alias=\"\"}) by (mode) or sum(irate(node_cpu_seconds_total{alias=\"\"}[5m])) by (mode) * 100 / count(node_cpu_seconds_total{alias=\"\"}) by (mode)"
level=error ts=2022-02-16T15:39:34.541119445Z caller=retry.go:73 org_id=anonymous msg="error processing request" try=0 err="context canceled"
level=error ts=2022-02-16T15:39:34.860531724Z caller=retry.go:73 org_id=anonymous msg="error processing request" try=0 err="context canceled"
level=info ts=2022-02-16T17:23:36.266233483Z caller=handler.go:159 org_id=anonymous msg="slow query detected" method=GET host=thanos-query-frontend:9090 path=/api/v1/query_range time_taken=11.023259094s param_query="sum(rate(node_cpu_seconds_total{alias=\"\"}[5s])) by (mode) * 100 / count(node_cpu_seconds_total{alias=\"\"}) by (mode) or sum(irate(node_cpu_seconds_total{alias=\"\"}[5m])) by (mode) * 100 / count(node_cpu_seconds_total{alias=\"\"}) by (mode)" param_start=1644988980 param_end=1645032180 param_step=30
kubectl logs -f prometheus-kube-prometheus-stack-prometheus-0 -c thanos-sidecar -n monitoring
level=warn ts=2022-02-17T05:54:12.111799Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:14.111637007Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:16.127220177Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:18.112409645Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:20.124106191Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:22.113806503Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:24.127050286Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:26.127147129Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:28.111677912Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:30.114762838Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:32.112611213Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:34.113451635Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:36.136597217Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:38.113216587Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:40.119072787Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:42.113207243Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:44.115243286Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:46.11386139Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:48.111752281Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:50.119067379Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:52.111034016Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:54.115042124Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:56.113678549Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:54:58.116092919Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:00.116344502Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:02.119102168Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:04.111057516Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:06.114915731Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:08.115185928Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:10.114382986Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:12.115034799Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:14.115092282Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:16.11322394Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:18.115105264Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:20.127078715Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:22.123927021Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:24.11936511Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:26.12719218Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=warn ts=2022-02-17T05:55:28.115113313Z caller=sidecar.go:305 msg="failed to get Prometheus flags. Is Prometheus running? Retrying" err="got non-200 response code: 503, response: Service Unavailable"
level=info ts=2022-02-17T05:55:30.21338556Z caller=sidecar.go:156 msg="successfully loaded prometheus external labels" external_labels="{cluster=\"bb-thanos-observer\", prometheus=\"monitoring/kube-prometheus-stack-prometheus\", prometheus_replica=\"prometheus-kube-prometheus-stack-prometheus-0\"}"
level=info ts=2022-02-17T05:55:30.213787227Z caller=intrumentation.go:48 msg="changing probe status" status=ready
level=info ts=2022-02-17T05:56:02.367076363Z caller=shipper.go:337 msg="upload new block" id=01FW33ZXWA3ZJF7CH388D5AF5P

ramesh-kumarjha avatar Feb 17 '22 08:02 ramesh-kumarjha

eks-addons terragrunt.hcl

include {
  path = "${find_in_parent_folders()}"
}

terraform {
  source = "github.com/particuleio/terraform-kubernetes-addons.git//modules/aws?ref=v2.1.0"
}

dependency "eks" {
  config_path = "../eks"

  mock_outputs = {
    cluster_id              = "cluster-name"
    cluster_oidc_issuer_url = "https://oidc.eks.eu-west-3.amazonaws.com/id/0000000000000000"
  }
}

generate "provider" {
  path      = "provider.tf"
  if_exists = "overwrite"
  contents  = <<-EOF
    provider "aws" {
      region = "${local.aws_region}"
    }
    provider "kubectl" {
      host                   = data.aws_eks_cluster.cluster.endpoint
      cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
      token                  = data.aws_eks_cluster_auth.cluster.token
      load_config_file       = false
    }
    provider "kubernetes" {
      host                   = data.aws_eks_cluster.cluster.endpoint
      cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
      token                  = data.aws_eks_cluster_auth.cluster.token
    }
    provider "helm" {
      kubernetes {
        host                   = data.aws_eks_cluster.cluster.endpoint
        cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
        token                  = data.aws_eks_cluster_auth.cluster.token
      }
    }
    data "aws_eks_cluster" "cluster" {
      name = var.cluster-name
    }
    data "aws_eks_cluster_auth" "cluster" {
      name = var.cluster-name
    }
  EOF
}


locals {
  aws_region = yamldecode(file("${find_in_parent_folders("region_values.yaml")}"))["aws_region"]
  custom_tags = merge(
    yamldecode(file("${find_in_parent_folders("global_tags.yaml")}")),
    yamldecode(file("${find_in_parent_folders("env_tags.yaml")}"))
  )
  default_domain_name   = yamldecode(file("${find_in_parent_folders("global_values.yaml")}"))["default_domain_name"]
  default_domain_suffix = "${local.custom_tags["Env"]}.${local.custom_tags["Project"]}.${local.default_domain_name}"
  public_subnets_cidr_blocks = ["10.32.32.0/20", "10.32.0.0/20", "10.32.16.0/20"]
}

inputs = {

  cluster-name = dependency.eks.outputs.cluster_id

  tags = merge(
    local.custom_tags
  )

  eks = {
    "cluster_oidc_issuer_url" = dependency.eks.outputs.cluster_oidc_issuer_url
  }

  aws-ebs-csi-driver = {
    enabled          = true
    is_default_class = true
  }

  aws-for-fluent-bit = {
    enabled = true
  }
  
  # test this with nginx controller 
  aws-load-balancer-controller = {
    enabled = true
  }

  aws-node-termination-handler = {
    enabled = false
  }

  calico = {
    enabled = true
  }

  cert-manager = {
    enabled                   = false
    acme_email                = "[email protected]"
    acme_http01_enabled       = true
    acme_http01_ingress_class = "nginx"
    acme_dns01_enabled        = true
    allowed_cidrs             = local.public_subnets_cidr_blocks
    experimental_csi_driver   = true
  }

  cluster-autoscaler = {
    enabled = true
  }

  cni-metrics-helper = {
    enabled = false
  }

  external-dns = {
    external-dns = {
      enabled = true
    },
  }

  ingress-nginx = {
    enabled       = true
    use_l7        = true
    allowed_cidrs = local.public_subnets_cidr_blocks
  }

  istio-operator = {
    enabled = false
  }

  karma = {
    enabled = false
  }

  keycloak = {
    enabled = false
  }

  kong = {
    enabled = false
  }

  kube-prometheus-stack = {
    enabled                     = true
    allowed_cidrs               = local.public_subnets_cidr_blocks
    thanos_sidecar_enabled      = true
    thanos_bucket_force_destroy = true
    extra_values                = <<-EXTRA_VALUES
      grafana:
        deploymentStrategy:
          type: Recreate
        ingress:
          enabled: true
          #paths: 
          #  - /grafana
          annotations:
            kubernetes.io/ingress.class: nginx
            cert-manager.io/cluster-issuer: "letsencrypt"
          hosts:
            - grafana.${local.default_domain_suffix}
          #tls:
          #  - secretName: grafana.${local.default_domain_suffix}
          #    hosts:
          #      - grafana.${local.default_domain_suffix}
        persistence:
          enabled: true
          storageClassName: ebs-sc
          accessModes:
            - ReadWriteOnce
          size: 1Gi
      prometheus:
        ingress:
          enabled: true
          #paths: 
          #  - /prometheus
          annotations:
            kubernetes.io/ingress.class: nginx
            cert-manager.io/cluster-issuer: "letsencrypt"
          hosts:
            - prometheus.${local.default_domain_suffix}
          #tls:
          #  - secretName: prometheus.${local.default_domain_suffix}
          #    hosts:
          #      - prometheus.${local.default_domain_suffix}
        prometheusSpec:
          additionalScrapeConfigs:
            - job_name: 'divum'
              scrape_interval: 5s
              ec2_sd_configs:
                - region: ap-south-1
                  port: 9100
                  # This should not be here! 
                  # check: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config, https://github.com/prometheus/prometheus/issues/5738, https://www.robustperception.io/automatically-monitoring-ec2-instances 
                  access_key: xyz
                  secret_key: abcd
                  # https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-PrometheusEC2.html
                  filters:
                    - name: tag:release_env
                      values:
                        - prod
              relabel_configs:
                - source_labels: [__meta_ec2_tag_Name]
                  action: keep
                - source_labels: [__meta_ec2_tag_Name]
                  target_label: instance
                - source_labels: [__meta_ec2_public_ip]
                  target_label: ip
                #- source_labels: [__meta_ec2_tag_release_env,__meta_ec2_tag_service_name]
                #  separator: ' | '
                #  target_label: job
                - source_labels: [__meta_ec2_tag_service_name]
                  target_label: job
                - source_labels: [__meta_ec2_tag_release_env]
                  target_label: release_env
          replicas: 1
          retention: 2d
          retentionSize: "6GB"
          ruleSelectorNilUsesHelmValues: false
          serviceMonitorSelectorNilUsesHelmValues: false
          podMonitorSelectorNilUsesHelmValues: false
          storageSpec:
            volumeClaimTemplate:
              spec:
                storageClassName: ebs-sc
                accessModes: ["ReadWriteOnce"]
                resources:
                  requests:
                    storage: 10Gi
      alertmanager:
        ingress:
          enabled: true
          #paths: 
          #  - /alert-manager
          annotations:
            kubernetes.io/ingress.class: nginx
            cert-manager.io/cluster-issuer: "letsencrypt"
          hosts:
            - alert-manager.${local.default_domain_suffix}
          #tls:
          #  - secretName: alert-manager.${local.default_domain_suffix}
          #    hosts:
          #      - alert-manager.${local.default_domain_suffix}
      EXTRA_VALUES
  }

  loki-stack = {
    enabled              = false
    bucket_force_destroy = true
  }

  metrics-server = {
    enabled       = true
    allowed_cidrs = local.public_subnets_cidr_blocks
  }

  npd = {
    enabled = false
  }

  sealed-secrets = {
    enabled = false
  }

  thanos = {
    enabled              = true
    generate_ca          = true
    bucket_force_destroy = true
  }

}

ramesh-kumarjha avatar Feb 17 '22 08:02 ramesh-kumarjha

Hmm, the query config seems OK and so does the rest. Did you try querying from the thanos query web UI to see if you get more than 2h of data?

Also, the Prometheus datasource in Grafana should be set to either http://thanos-query-frontend:9090/ or http://thanos-query:9090/.
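
One way to confirm that this datasource URL itself serves more than 2h of data is to hit its Prometheus-compatible range API directly from a pod inside the cluster (the service name is cluster-internal; the query and step below are arbitrary):

  # Ask thanos-query-frontend for the last 24h of the "up" series
  end=$(date +%s)
  start=$((end - 24 * 3600))
  curl -s "http://thanos-query-frontend:9090/api/v1/query_range?query=up&start=${start}&end=${end}&step=300"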

ArchiFleKs avatar Feb 17 '22 09:02 ArchiFleKs

Yes, I configured http://thanos-query-frontend:9090/ in the Grafana dashboard but am not getting results (Screenshot 2022-02-17 at 3 12 32 PM). Querying from the Thanos web UI also does not return more than 2 hours of data.

ramesh-kumarjha avatar Feb 17 '22 09:02 ramesh-kumarjha

@ArchiFleKs any update on this? Please help me out, as this is our prod setup.

ramesh-kumarjha avatar Feb 21 '22 08:02 ramesh-kumarjha

I don't see anything wrong with your config. Has it worked at some point? Are there any blocks inside the S3 bucket, i.e. does the Thanos sidecar upload blocks to S3?
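
Both questions can be answered with something along these lines (the bucket name is a placeholder; the pod and container names are taken from the logs above):

  # 1. Are there blocks in the bucket? Each block is a ULID-named prefix
  #    containing chunks/, index and meta.json
  aws s3 ls s3://<thanos-bucket-name>/

  # 2. Is the sidecar uploading? Look for "upload new block" lines
  kubectl -n monitoring logs prometheus-kube-prometheus-stack-prometheus-0 \
    -c thanos-sidecar | grep "upload new block"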

ArchiFleKs avatar Feb 21 '22 08:02 ArchiFleKs

@ArchiFleKs the Thanos sidecar is able to upload chunks and index to the S3 bucket every 2 hours. I applied our SSL at the ALB level, using a free SSL certificate from AWS.

ramesh-kumarjha avatar Feb 24 '22 04:02 ramesh-kumarjha

@ArchiFleKs any update or any solution to solve this? It runs in our prod setup; please let me know if there is anything to fix.

ramesh-kumarjha avatar Mar 02 '22 14:03 ramesh-kumarjha

@ramesh-kumarjha I can't do anything without more information. When you query directly from the thanos-query web UI, you only see 2 hours of data. Can you check the logs of these queries on the storage gateway component, for example?
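
For example, the periodic block-sync messages can be filtered out so that any per-query or error lines stand out (sketch, using the storegateway pod name from the reply below):

  # Hide the routine metadata-sync noise and keep everything else
  kubectl -n monitoring logs thanos-storegateway-0 \
    | grep -v "successfully synchronized block metadata"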

ArchiFleKs avatar Mar 09 '22 22:03 ArchiFleKs

What more information do you need from my end? Please let me know.


 kubectl logs -f thanos-storegateway-0 -n monitoring
level=info ts=2022-04-05T03:11:17.379985654Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.179457132s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:14:17.493719333Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.293166185s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:17:17.406868102Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.206336268s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:20:17.405161135Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.204600957s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:23:17.44959183Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.249036491s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:26:17.490507757Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.289976341s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:29:17.425374692Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.224842664s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:32:17.463904383Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.263374604s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:35:17.482427712Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.28187525s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:38:17.573946241Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.373392314s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:41:17.421739839Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.221196333s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:44:17.438620377Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.23806441s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:47:17.447683922Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.247126975s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:50:17.372182076Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.171632616s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:53:17.389607262Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.189074223s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:56:17.384127964Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.183573005s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T03:59:17.45253186Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.251972857s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T04:02:17.454901007Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.254353544s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T04:05:17.502114751Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.301563501s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T04:08:17.509445898Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.308892109s cached=1437 returned=1437 partial=0
level=info ts=2022-04-05T04:11:17.435166517Z caller=fetcher.go:476 component=block.BaseFetcher msg="successfully synchronized block metadata" duration=1.234612907s cached=1437 returned=1437 partial=0

ramesh-kumarjha avatar Apr 06 '22 05:04 ramesh-kumarjha