
Cannot disrupt NodeClaim: state node doesn't contain both a node and a nodeclaim

Open midestefanis opened this issue 1 year ago • 14 comments

Description

Observed Behavior:

Karpenter is not spinning up nodes: nodeclaims are created and launched, but they are terminated after the 15-minute registration TTL without ever joining the cluster.

Expected Behavior:

New nodes are launched, register with the cluster, and the pending pods are scheduled.

Reproduction Steps (Please include YAML):

Versions:

  • Chart Version: 1.0.2
  • Kubernetes Version (kubectl version): 1.30

Karpenter logs:

{"level":"DEBUG","time":"2024-09-20T17:51:59.015Z","logger":"controller","caller":"lifecycle/controller.go:104","message":"terminating due to registration ttl","commit":"b897114","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-qtpc4"},"namespace":"","name":"default-qtpc4","reconcileID":"de007d3b-ec7c-4c7a-9dbc-4f5fab5388b6","ttl":"15m0s"}
{"level":"DEBUG","time":"2024-09-20T17:51:59.015Z","logger":"controller","caller":"lifecycle/controller.go:104","message":"terminating due to registration ttl","commit":"b897114","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-qfk2r"},"namespace":"","name":"default-qfk2r","reconcileID":"db912214-e90f-421a-9250-634e78fdbefe","ttl":"15m0s"}
{"level":"INFO","time":"2024-09-20T17:52:06.099Z","logger":"controller","caller":"provisioning/provisioner.go:130","message":"found provisionable pod(s)","commit":"b897114","controller":"provisioner","namespace":"","name":"","reconcileID":"cc27657e-df3d-4bd7-b2c9-07aa63a98ae9","Pods":"default/inflate-96f968594-5h7wb, default/inflate-96f968594-n9xwh, default/inflate-96f968594-496xw, default/inflate-96f968594-q49vj, default/inflate-96f968594-n6m7q and 30 other(s)","duration":"166.537539ms"}
{"level":"INFO","time":"2024-09-20T17:52:06.099Z","logger":"controller","caller":"provisioning/provisioner.go:355","message":"computed new nodeclaim(s) to fit pod(s)","commit":"b897114","controller":"provisioner","namespace":"","name":"","reconcileID":"cc27657e-df3d-4bd7-b2c9-07aa63a98ae9","nodeclaims":2,"pods":35}
{"level":"INFO","time":"2024-09-20T17:52:06.111Z","logger":"controller","caller":"provisioning/provisioner.go:151","message":"created nodeclaim","commit":"b897114","controller":"provisioner","namespace":"","name":"","reconcileID":"cc27657e-df3d-4bd7-b2c9-07aa63a98ae9","NodePool":{"name":"default"},"NodeClaim":{"name":"default-qjlrv"},"requests":{"cpu":"31150m","pods":"34"},"instance-types":"c3.8xlarge, c5a.8xlarge, c5ad.8xlarge, c6a.8xlarge, c6i.8xlarge and 36 other(s)"}
{"level":"INFO","time":"2024-09-20T17:52:06.115Z","logger":"controller","caller":"provisioning/provisioner.go:151","message":"created nodeclaim","commit":"b897114","controller":"provisioner","namespace":"","name":"","reconcileID":"cc27657e-df3d-4bd7-b2c9-07aa63a98ae9","NodePool":{"name":"default"},"NodeClaim":{"name":"default-v5rjk"},"requests":{"cpu":"4150m","pods":"7"},"instance-types":"c3.2xlarge, c4.2xlarge, c5.2xlarge, c5.4xlarge, c5a.2xlarge and 55 other(s)"}
{"level":"DEBUG","time":"2024-09-20T17:52:06.299Z","logger":"controller","caller":"launchtemplate/launchtemplate.go:203","message":"created launch template","commit":"b897114","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-qjlrv"},"namespace":"","name":"default-qjlrv","reconcileID":"671e7479-a28f-4149-9018-02461be4e21a","launch-template-name":"karpenter.k8s.aws/13187031080724844461","id":"lt-0c4ce6ccf55858473"}
{"level":"DEBUG","time":"2024-09-20T17:52:06.448Z","logger":"controller","caller":"launchtemplate/launchtemplate.go:203","message":"created launch template","commit":"b897114","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-v5rjk"},"namespace":"","name":"default-v5rjk","reconcileID":"213487fa-0f07-4ff9-8fad-de45bdc09ff9","launch-template-name":"karpenter.k8s.aws/14925608515456331079","id":"lt-024f013ea96f3fd6a"}
{"level":"DEBUG","time":"2024-09-20T17:52:06.611Z","logger":"controller","caller":"launchtemplate/launchtemplate.go:203","message":"created launch template","commit":"b897114","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-v5rjk"},"namespace":"","name":"default-v5rjk","reconcileID":"213487fa-0f07-4ff9-8fad-de45bdc09ff9","launch-template-name":"karpenter.k8s.aws/15733972023742296453","id":"lt-00f868d7e3c423169"}
{"level":"DEBUG","time":"2024-09-20T17:52:07.693Z","logger":"controller","caller":"singleton/controller.go:26","message":"waiting on cluster sync","commit":"b897114","controller":"disruption","namespace":"","name":"","reconcileID":"b4532ab3-b66b-46e4-b045-9b55a87579a1"}
{"level":"INFO","time":"2024-09-20T17:52:08.120Z","logger":"controller","caller":"lifecycle/launch.go:61","message":"launched nodeclaim","commit":"b897114","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-qjlrv"},"namespace":"","name":"default-qjlrv","reconcileID":"671e7479-a28f-4149-9018-02461be4e21a","provider-id":"aws:///us-east-1b/i-02b580d8bab19db30","instance-type":"c6a.8xlarge","zone":"us-east-1b","capacity-type":"on-demand","allocatable":{"cpu":"31850m","ephemeral-storage":"17Gi","memory":"57691Mi","pods":"234","vpc.amazonaws.com/pod-eni":"84"}}
{"level":"INFO","time":"2024-09-20T17:52:08.428Z","logger":"controller","caller":"lifecycle/launch.go:61","message":"launched nodeclaim","commit":"b897114","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-v5rjk"},"namespace":"","name":"default-v5rjk","reconcileID":"213487fa-0f07-4ff9-8fad-de45bdc09ff9","provider-id":"aws:///us-east-1a/i-0541e23b7457a876d","instance-type":"c6a.2xlarge","zone":"us-east-1a","capacity-type":"on-demand","allocatable":{"cpu":"7910m","ephemeral-storage":"17Gi","memory":"14162Mi","pods":"58","vpc.amazonaws.com/pod-eni":"38"}}
{"level":"INFO","time":"2024-09-20T17:52:41.718Z","logger":"controller","caller":"termination/controller.go:79","message":"deleted nodeclaim","commit":"b897114","controller":"nodeclaim.termination","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-qfk2r"},"namespace":"","name":"default-qfk2r","reconcileID":"d3574ff2-51c8-471e-999b-fcf06a5b263e","Node":{"name":""},"provider-id":"aws:///us-east-1b/i-07794e8ad576593f9"}
{"level":"INFO","time":"2024-09-20T17:52:57.415Z","logger":"controller","caller":"termination/controller.go:79","message":"deleted nodeclaim","commit":"b897114","controller":"nodeclaim.termination","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"default-qtpc4"},"namespace":"","name":"default-qtpc4","reconcileID":"1648a78d-9ca4-4879-a0be-989210205585","Node":{"name":""},"provider-id":"aws:///us-east-1a/i-0ba084b06045af3f1"}
{"level":"DEBUG","time":"2024-09-20T17:53:15.547Z","logger":"controller","caller":"launchtemplate/launchtemplate.go:367","message":"deleted launch template","commit":"b897114","id":"lt-0c4ce6ccf55858473","name":"karpenter.k8s.aws/13187031080724844461"}
{"level":"DEBUG","time":"2024-09-20T17:53:15.650Z","logger":"controller","caller":"launchtemplate/launchtemplate.go:367","message":"deleted launch template","commit":"b897114","id":"lt-024f013ea96f3fd6a","name":"karpenter.k8s.aws/14925608515456331079"}
{"level":"DEBUG","time":"2024-09-20T17:53:15.748Z","logger":"controller","caller":"launchtemplate/launchtemplate.go:367","message":"deleted launch template","commit":"b897114","id":"lt-00f868d7e3c423169","name":"karpenter.k8s.aws/15733972023742296453"}

The NodeClaims are showing this event:

Cannot disrupt NodeClaim: state node doesn't contain both a node and a nodeclaim

midestefanis avatar Sep 20 '24 18:09 midestefanis

I have exactly the same bug

kuzmacska avatar Sep 23 '24 08:09 kuzmacska

Could you please share your nodeclass and nodepool configurations? As well as any other steps to reproduce?
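
For anyone unsure how to grab those, something like this should capture both (resource names assume the v1 APIs):

kubectl get nodepools.karpenter.sh -o yaml > nodepools.yaml
kubectl get ec2nodeclasses.karpenter.k8s.aws -o yaml > ec2nodeclasses.yaml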

rschalo avatar Sep 23 '24 20:09 rschalo

I have the same bug.

nitinjain999 avatar Sep 25 '24 14:09 nitinjain999

> Could you please share your nodeclass and nodepool configurations? As well as any other steps to reproduce?

Seems to be the same for me, too.

alekseikurepin avatar Sep 26 '24 07:09 alekseikurepin

@rschalo We've also experienced this issue. After upgrading to 1.0.1/1.0.2 (and patching the CRDs to enable conversion webhooks), everything was fine. But once we applied a new nodeclass/nodepool (or even updated an existing one), it stopped working: every nodeclaim showed a status of Unknown along with that error. We even tried applying both v1beta1 and v1 manifests, to no avail. We had to downgrade back to v0.33.0.

More information: We see the EC2 instances spinning up, and their system logs show that kubelet has started, but they can't join the cluster (we couldn't find a reason for that).

We started suspecting there's something wrong with the AmiSelectorTerms, but we couldn't figure it out. We tried using id (for an AL2 AMI), and then switched to alias (with AL2023), but it made no difference.
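
For reference, the conversion-webhook patch we applied to each Karpenter CRD was along these lines; treat it as a sketch rather than the documented command, since the service name, namespace, and port depend on how Karpenter is installed (the same patch goes on nodepools.karpenter.sh and ec2nodeclasses.karpenter.k8s.aws):

kubectl patch crd nodeclaims.karpenter.sh --type merge --patch \
  '{"spec":{"conversion":{"strategy":"Webhook","webhook":{"clientConfig":{"service":{"name":"karpenter","namespace":"kube-system","port":8443}},"conversionReviewVersions":["v1beta1","v1"]}}}}'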

roi-zentner avatar Sep 26 '24 07:09 roi-zentner

Can someone provide the kubelet logs from a node that fails to register?
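
If it helps, on an AL2/AL2023 node that fails to register they can usually be pulled over SSM, assuming the SSM agent is running and you have access (the instance id below is a placeholder):

aws ssm start-session --target i-0123456789abcdef0
# then, on the node:
journalctl -u kubelet --no-pager | tail -n 200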

GnatorX avatar Sep 27 '24 21:09 GnatorX

One thing to note (not sure if this is the issue): on Karpenter 0.37+ there is a new readiness check on the EC2NodeClass CRD. Was this updated?

Karpenter now adds a readiness status condition to the EC2NodeClass. Make sure to upgrade your Custom Resource Definitions before proceeding with the upgrade. Failure to do so will result in Karpenter being unable to provision new nodes.

https://karpenter.sh/v1.0/upgrading/upgrade-guide/#upgrading-to-0370
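
A quick way to check is to look at the EC2NodeClass status conditions; if the CRDs weren't upgraded, the Ready condition never appears (the nodeclass name default is assumed here):

kubectl get ec2nodeclass default -o jsonpath='{.status.conditions}'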

GnatorX avatar Sep 27 '24 21:09 GnatorX

I see this when I am trying to migrate from Cluster Autoscaler. (Almost a fresh installation)

Event Message for Node:

Events:
  Type    Reason             Age                     From       Message
  ----    ------             ----                    ----       -------
  Normal  DisruptionBlocked  4m32s (x211 over 7h4m)  karpenter  Cannot disrupt Node: state node doesn't contain both a node and a nodeclaim

The below is the log from the karpenter:

{"level":"ERROR","time":"2024-09-27T20:19:23.141Z","logger":"webhook.ConversionWebhook","message":"Reconcile error","commit":"688ea21","knative.dev/traceid":"b844441e-37e2-4c12-bdd3-8b3395383977","knative.dev/key":"nodeclaims.karpenter.sh","duration":"167.207687ms","error":"failed to update webhook: Operation cannot be fulfilled on customresourcedefinitions.apiextensions.k8s.io "nodeclaims.karpenter.sh": the object has been modified; please apply your changes to the latest version and try again"}

I see that the Karpenter pods are up and running without any issue! I tried to patch and update the CRDs, but no luck.

maheshsgithub avatar Sep 28 '24 04:09 maheshsgithub

In our case the problem was a missing toleration for a taint, as described here: https://github.com/kubernetes-sigs/aws-ebs-csi-driver/issues/2158
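
For anyone hitting the same thing: assuming you install the driver via its Helm chart, which exposes node.tolerations, the fix is roughly a values snippet like this (the taint key is hypothetical; match whatever your NodePool applies):

node:
  tolerations:
    - key: example.com/my-taint   # hypothetical; use your NodePool's taint key
      operator: Exists
      effect: NoSchedule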

oskarpilch avatar Sep 29 '24 11:09 oskarpilch

UPDATE: First of all, this message is not an error. It is part of the lifecycle and every nodeclaim will have this event until the node it is assigned to is registered to the cluster.

We figured out what went wrong for us: Our cluster is still using aws-auth ConfigMap and we missed updating the role name there. This is why Karpenter was able to create the EC2 instance, but the instance wasn't able to join the cluster. We renamed the role as part of some other task and forgot about it when we upgraded Karpenter.
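
For anyone checking the same thing, the aws-auth entry for the node role looks roughly like this (account id and role name are placeholders; the rolearn must match the role your EC2NodeClass assigns to nodes):

apiVersion: v1
kind: ConfigMap
metadata:
  name: aws-auth
  namespace: kube-system
data:
  mapRoles: |
    - rolearn: arn:aws:iam::111122223333:role/KarpenterNodeRole-my-cluster
      username: system:node:{{EC2PrivateDNSName}}
      groups:
        - system:bootstrappers
        - system:nodes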

roi-zentner avatar Sep 30 '24 10:09 roi-zentner

@midestefanis can you confirm if the issue that @roi-zentner ran into is relevant to your issue? If not then are you able to share other info about how to reproduce the issue you're seeing?

rschalo avatar Sep 30 '24 21:09 rschalo

I have the right aws-auth and still get "Normal DisruptionBlocked 2m37s (x716 over 23h) karpenter Cannot disrupt Node: state node doesn't contain both a node and a nodeclaim", and this node is not even managed by Karpenter.

nitinjain999 avatar Oct 03 '24 05:10 nitinjain999

I have the same issue.

jadiaheno avatar Oct 03 '24 22:10 jadiaheno

Same issue; I get this error on nodes NOT managed by Karpenter.

rsimiciuc avatar Oct 12 '24 12:10 rsimiciuc

Hi All, I've attempted to reproduce with a fresh install of v1.0.2 on K8s 1.30 and am not encountering this issue. For people who do see this, could you please share the Karpenter logs, nodepool, and ec2nodeclass definitions that resulted in this behavior? Without a reproduction it will be hard to determine whether this is a bug or part of the normal lifecycle of nodeclaims. In my quick test, I set consolidateAfter for a nodepool to 20s and saw:

  Normal  DisruptionBlocked  62s   karpenter  Cannot disrupt NodeClaim: state node doesn't contain both a node and a nodeclaim

on the nodeclaim while it was waiting for the node to spin up. Are Node objects and EC2 instances actually being created for the nodeclaims that show this?
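
For reference, the nodepool change for that test was essentially this (the consolidationPolicy is assumed for illustration; only consolidateAfter was mentioned above):

spec:
  disruption:
    consolidationPolicy: WhenEmptyOrUnderutilized   # assumed
    consolidateAfter: 20s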

rschalo avatar Oct 14 '24 17:10 rschalo

Also, we used to emit events for non-managed nodes but that was addressed in https://github.com/kubernetes-sigs/karpenter/pull/1644 which has been merged to main.

rschalo avatar Oct 14 '24 17:10 rschalo

I think the above log is a red herring. I agree we should change our eventing messages here to be more descriptive of what's actually happening, rather than describing internal Karpenter state.

https://github.com/kubernetes-sigs/karpenter/blob/main/pkg/controllers/state/statenode.go#L177-L185
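
Roughly, the check behind that message is the following (a paraphrased sketch of the linked statenode.go fragment, with approximate names; see the file for the real code):

// Sketch: a state node is only a disruption candidate once Karpenter has
// seen BOTH the Node object and its owning NodeClaim. Until the node
// registers (or for nodes Karpenter doesn't manage), one of the two is nil
// and the DisruptionBlocked event seen in this issue is emitted.
func (in *StateNode) validateDisruptable() error {
	if in.Node == nil || in.NodeClaim == nil {
		return fmt.Errorf("state node doesn't contain both a node and a nodeclaim")
	}
	return nil
}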

njtran avatar Oct 14 '24 21:10 njtran

I just ran into this. In my case the nodeclaim would appear and the instance would be provisioned, but it remained in Unknown status, never getting the node info or joining the cluster. The issue for me was the tag I picked for subnetSelectorTerms: I used kubernetes.io/cluster/clustername: shared, and as soon as I changed that selector to a different tag the node joined the cluster.
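
For comparison, a selector pinned to Karpenter's own discovery tag (instead of the shared kubernetes.io/cluster/* tag) looks like this; the tag key/value are placeholders and must actually exist on the intended subnets:

subnetSelectorTerms:
  - tags:
      karpenter.sh/discovery: my-cluster   # placeholder; use your own tag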

kylejep avatar Oct 18 '24 02:10 kylejep

In my case the node comes up and joins the cluster but the nodeclaim remains in unknown status. This happens quite often (not always, some nodes work fine). Relates to #6803.

I'm using loki to store the kubernetes event log, with these queries:

{app="eventrouter"} | json | line_format "{{.event_reason}}: {{.event_message}} ({{.event_metadata_name}})" |= "mynodepool-wz4vx" | verb = "ADDED"
{app="karpenter"}|="mynodepool-wz4vx"

I get:

2024-10-24 15:10:10.332	Nominated: Pod should schedule on: nodeclaim/mynodepool-wz4vx, node/i-0573b18a64d7a4ea5.eu-west-1.compute.internal (overprovisioning-755d56c54-4bkvw.18016592298fad4f)
2024-10-24 15:01:02.188	Nominated: Pod should schedule on: nodeclaim/mynodepool-wz4vx, node/i-0573b18a64d7a4ea5.eu-west-1.compute.internal (overprovisioning-755d56c54-578ct.18016512893be628)
2024-10-24 14:41:43.075	Nominated: Pod should schedule on: nodeclaim/mynodepool-wz4vx, node/i-0573b18a64d7a4ea5.eu-west-1.compute.internal (overprovisioning-755d56c54-4fp2p.18016404a8f8b6db)
2024-10-24 14:25:39.477	Nominated: Pod should schedule on: nodeclaim/mynodepool-wz4vx, node/i-0573b18a64d7a4ea5.eu-west-1.compute.internal (overprovisioning-755d56c54-669n7.180163244e3ed23d)
2024-10-24 13:57:25.965	Nominated: Pod should schedule on: nodeclaim/mynodepool-wz4vx, node/i-0573b18a64d7a4ea5.eu-west-1.compute.internal (overprovisioning-755d56c54-znbkz.1801619a01386d52)
2024-10-24 13:42:55.374	DisruptionBlocked: Cannot disrupt NodeClaim: state node isn't initialized (mynodepool-wz4vx.180160cf4dc0fc94)
2024-10-24 13:42:44.714	Nominated: Pod should schedule on: nodeclaim/mynodepool-wz4vx, node/i-0573b18a64d7a4ea5.eu-west-1.compute.internal (overprovisioning-755d56c54-vgw7j.180160ccd2866116)
2024-10-24 13:41:03.322	Registered: Status condition transitioned, Type: Registered, Status: Unknown -> True, Reason: Registered (mynodepool-wz4vx.180160b536a368e0)
2024-10-24 13:41:03.236	{"level":"INFO","time":"2024-10-24T11:41:03.235Z","logger":"controller","message":"registered nodeclaim","commit":"6174c75","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"mynodepool-wz4vx"},"namespace":"","name":"mynodepool-wz4vx","reconcileID":"8c5b8d81-72b3-4bf1-88d1-bf21d6d7ae68","provider-id":"aws:///eu-west-1c/i-0573b18a64d7a4ea5","Node":{"name":"i-0573b18a64d7a4ea5.eu-west-1.compute.internal"}}
2024-10-24 13:40:46.880	DisruptionBlocked: Cannot disrupt NodeClaim: state node doesn't contain both a node and a nodeclaim (mynodepool-wz4vx.180160b162ed10a5)
2024-10-24 13:40:37.654	Launched: Status condition transitioned, Type: Launched, Status: Unknown -> True, Reason: Launched (mynodepool-wz4vx.180160af3c5d8d0d)
2024-10-24 13:40:37.568	{"level":"INFO","time":"2024-10-24T11:40:37.568Z","logger":"controller","message":"launched nodeclaim","commit":"6174c75","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"mynodepool-wz4vx"},"namespace":"","name":"mynodepool-wz4vx","reconcileID":"8caaaf02-d0d1-4601-aef9-dbcaaf0dd99b","provider-id":"aws:///eu-west-1c/i-0573b18a64d7a4ea5","instance-type":"g5.2xlarge","zone":"eu-west-1c","capacity-type":"on-demand","allocatable":{"cpu":"7910m","ephemeral-storage":"403926258176","memory":"29317Mi","nvidia.com/gpu":"1","pods":"58","vpc.amazonaws.com/pod-eni":"17"}}
2024-10-24 13:40:35.873	Nominated: Pod should schedule on: nodeclaim/mynodepool-wz4vx (overprovisioning-755d56c54-vgw7j.180160aed18aa393)
2024-10-24 13:40:35.843	{"level":"INFO","time":"2024-10-24T11:40:35.843Z","logger":"controller","message":"created nodeclaim","commit":"6174c75","controller":"provisioner","namespace":"","name":"","reconcileID":"ad1171e5-67d1-4e54-929d-f9aa6ec5795d","NodePool":{"name":"mynodepool"},"NodeClaim":{"name":"mynodepool-wz4vx"},"requests":{"cpu":"210m","memory":"20720Mi","nvidia.com/gpu":"1","pods":"9"},"instance-types":"g5.2xlarge"}

Note that I had issues with the conversion webhook being broken, so I removed it from the CRDs, but now I get:

{"level":"ERROR","time":"2024-10-24T11:56:13.458Z","logger":"webhook.ConversionWebhook","message":"Reconcile error","commit":"6174c75","knative.dev/traceid":"44702ea0-bdce-48bc-93ab-8d2d8b1a4d02","knative.dev/key":"nodepools.karpenter.sh","duration":"260.002µs","error":"custom resource \"nodepools.karpenter.sh\" isn't configured for webhook conversion"}

EDIT: oh no, I might have found the issue in the nodeclaim:

status:
  conditions:
  - lastTransitionTime: "2024-10-24T11:41:18Z"
    message: Resource "nvidia.com/gpu" was requested but not registered
    reason: ResourceNotRegistered
    status: Unknown
    type: Initialized
  - lastTransitionTime: "2024-10-24T11:41:03Z"
    message: Initialized=Unknown
    reason: UnhealthyDependents
    status: Unknown
    type: Ready

I'm using Bottlerocket OS 1.25.0 (aws-k8s-1.31-nvidia)
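
To confirm whether the device plugin ever advertised the GPU resource on the node, something like this helps (node name taken from the nodeclaim status above; empty output means nvidia.com/gpu was never registered):

kubectl get node i-0573b18a64d7a4ea5.eu-west-1.compute.internal \
  -o jsonpath='{.status.allocatable.nvidia\.com/gpu}'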

nodepool.yaml
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  annotations:
    compatibility.karpenter.sh/v1beta1-nodeclass-reference: '{"kind":"EC2NodeClass","name":"large-disk","apiVersion":"karpenter.k8s.aws/v1beta1"}'
    karpenter.sh/nodepool-hash: "4010951020068392240"
    karpenter.sh/nodepool-hash-version: v3
    karpenter.sh/stored-version-migrated: "true"
  creationTimestamp: "2024-05-29T10:11:27Z"
  generation: 3
  name: mynodepool
  resourceVersion: "445358182"
  uid: 423a86e4-1596-4768-b0bd-c1bd4fbbd051
spec:
  disruption:
    budgets:
    - nodes: 10%
    consolidateAfter: 5m
    consolidationPolicy: WhenEmpty
  template:
    metadata:
      labels:
        nvidia.com/gpu: A10G
    spec:
      expireAfter: Never
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: large-disk
      requirements:
      - key: karpenter.k8s.aws/instance-family
        operator: In
        values:
        - g5
      - key: karpenter.k8s.aws/instance-memory
        operator: Gt
        values:
        - "20000"
      - key: karpenter.k8s.aws/instance-memory
        operator: Lt
        values:
        - "60000"
      - key: karpenter.sh/capacity-type
        operator: In
        values:
        - on-demand
      taints:
      - effect: NoSchedule
        key: nvidia.com/gpu
        value: "true"
  weight: 40
status:
  conditions:
  - lastTransitionTime: "2024-09-30T12:22:53Z"
    message: ""
    reason: NodeClassReady
    status: "True"
    type: NodeClassReady
  - lastTransitionTime: "2024-09-30T12:22:53Z"
    message: ""
    reason: Ready
    status: "True"
    type: Ready
  - lastTransitionTime: "2024-09-30T12:22:51Z"
    message: ""
    reason: ValidationSucceeded
    status: "True"
    type: ValidationSucceeded
  resources:
    cpu: "40"
    ephemeral-storage: 1023670Mi
    hugepages-1Gi: "0"
    hugepages-2Mi: "0"
    memory: 162288212Ki
    nodes: "5"
    nvidia.com/gpu: "4"
    pods: "290"
    vpc.amazonaws.com/pod-eni: "17"
nodeclaim.yaml
apiVersion: karpenter.sh/v1
kind: NodeClaim
metadata:
  annotations:
    compatibility.karpenter.k8s.aws/cluster-name-tagged: "true"
    compatibility.karpenter.k8s.aws/kubelet-drift-hash: "15379597991425564585"
    karpenter.k8s.aws/ec2nodeclass-hash: "6440581379273964080"
    karpenter.k8s.aws/ec2nodeclass-hash-version: v3
    karpenter.k8s.aws/tagged: "true"
    karpenter.sh/nodepool-hash: "4010951020068392240"
    karpenter.sh/nodepool-hash-version: v3
    karpenter.sh/stored-version-migrated: "true"
  creationTimestamp: "2024-10-24T11:40:35Z"
  finalizers:
  - karpenter.sh/termination
  generateName: mynodepool-
  generation: 1
  labels:
    karpenter.k8s.aws/instance-category: g
    karpenter.k8s.aws/instance-cpu: "8"
    karpenter.k8s.aws/instance-cpu-manufacturer: amd
    karpenter.k8s.aws/instance-ebs-bandwidth: "3500"
    karpenter.k8s.aws/instance-encryption-in-transit-supported: "true"
    karpenter.k8s.aws/instance-family: g5
    karpenter.k8s.aws/instance-generation: "5"
    karpenter.k8s.aws/instance-gpu-count: "1"
    karpenter.k8s.aws/instance-gpu-manufacturer: nvidia
    karpenter.k8s.aws/instance-gpu-memory: "24576"
    karpenter.k8s.aws/instance-gpu-name: a10g
    karpenter.k8s.aws/instance-hypervisor: nitro
    karpenter.k8s.aws/instance-local-nvme: "450"
    karpenter.k8s.aws/instance-memory: "32768"
    karpenter.k8s.aws/instance-network-bandwidth: "5000"
    karpenter.k8s.aws/instance-size: 2xlarge
    karpenter.sh/capacity-type: on-demand
    karpenter.sh/nodepool: mynodepool
    kubernetes.io/arch: amd64
    kubernetes.io/os: linux
    node-role: inference
    node.kubernetes.io/instance-type: g5.2xlarge
    nvidia.com/gpu: A10G
    topology.k8s.aws/zone-id: euw1-az3
    topology.kubernetes.io/region: eu-west-1
    topology.kubernetes.io/zone: eu-west-1c
  name: mynodepool-wz4vx
  ownerReferences:
  - apiVersion: karpenter.sh/v1
    blockOwnerDeletion: true
    kind: NodePool
    name: mynodepool
    uid: 423a86e4-1596-4768-b0bd-c1bd4fbbd051
  resourceVersion: "445301688"
  uid: e0b8e064-e3f5-46a7-a78c-c1c1e89907ad
spec:
  expireAfter: Never
  nodeClassRef:
    group: karpenter.k8s.aws
    kind: EC2NodeClass
    name: large-disk
  requirements:
  - key: karpenter.sh/nodepool
    operator: In
    values:
    - mynodepool
  - key: karpenter.k8s.aws/instance-family
    operator: In
    values:
    - g5
  - key: node.kubernetes.io/instance-type
    operator: In
    values:
    - g5.2xlarge
  - key: karpenter.k8s.aws/instance-memory
    operator: Gt
    values:
    - "20000"
  - key: karpenter.sh/capacity-type
    operator: In
    values:
    - on-demand
  - key: node-role
    operator: In
    values:
    - inference
  - key: nvidia.com/gpu
    operator: In
    values:
    - A10G
  resources:
    requests:
      cpu: 210m
      memory: 20720Mi
      nvidia.com/gpu: "1"
      pods: "9"
  taints:
  - effect: NoSchedule
    key: nvidia.com/gpu
    value: "true"
status:
  allocatable:
    cpu: 7910m
    ephemeral-storage: "403926258176"
    memory: 29317Mi
    nvidia.com/gpu: "1"
    pods: "58"
    vpc.amazonaws.com/pod-eni: "17"
  capacity:
    cpu: "8"
    ephemeral-storage: 450G
    memory: 30310Mi
    nvidia.com/gpu: "1"
    pods: "58"
    vpc.amazonaws.com/pod-eni: "17"
  conditions:
  - lastTransitionTime: "2024-10-24T11:50:37Z"
    message: ""
    reason: ConsistentStateFound
    status: "True"
    type: ConsistentStateFound
  - lastTransitionTime: "2024-10-24T11:41:18Z"
    message: Resource "nvidia.com/gpu" was requested but not registered
    reason: ResourceNotRegistered
    status: Unknown
    type: Initialized
  - lastTransitionTime: "2024-10-24T11:40:37Z"
    message: ""
    reason: Launched
    status: "True"
    type: Launched
  - lastTransitionTime: "2024-10-24T11:41:03Z"
    message: Initialized=Unknown
    reason: UnhealthyDependents
    status: Unknown
    type: Ready
  - lastTransitionTime: "2024-10-24T11:41:03Z"
    message: ""
    reason: Registered
    status: "True"
    type: Registered
  imageID: ami-0eae4d86f31ea2ae1
  nodeName: i-0573b18a64d7a4ea5.eu-west-1.compute.internal
  providerID: aws:///eu-west-1c/i-0573b18a64d7a4ea5

awoimbee avatar Oct 24 '24 13:10 awoimbee

Hi All,

This log line is part of the normal lifecycle of nodeclaim disruption; we've adjusted it to be clearer in https://github.com/kubernetes-sigs/karpenter/pull/1644 and https://github.com/kubernetes-sigs/karpenter/pull/1766. If there is other behavior being observed that may be incorrect, then please open a new issue.

rschalo avatar Oct 28 '24 17:10 rschalo

I have the same issue node is created but not joining the cluster

Message: Cannot disrupt Node: state node doesn't contain both a node and a nodeclaim

eahangari-8x8 avatar Nov 01 '24 19:11 eahangari-8x8

I have the same issue node is created but not joining the cluster

Message: Cannot disrupt Node: state node doesn't contain both a node and a nodeclaim

WamBamBoozle avatar Nov 01 '24 19:11 WamBamBoozle

I'm having the same issue: the node is created but never joins the cluster, and it is forever stuck in Unknown status.

Karpenter helm chart: 1.0.7
Kubernetes version: v1.31.0-eks-a737599

Logs:

{"level":"INFO","time":"2024-11-04T15:13:30.311Z","logger":"controller","message":"Starting workers","commit":"901a5dc","controller":"nodepool.readiness","controllerGroup":"karpenter.sh","controllerKind":"NodePool","worker count":10}
{"level":"INFO","time":"2024-11-04T15:13:30.311Z","logger":"controller","message":"Starting workers","commit":"901a5dc","controller":"nodeclaim.disruption","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","worker count":10}
{"level":"INFO","time":"2024-11-04T15:13:30.311Z","logger":"controller","message":"Starting workers","commit":"901a5dc","controller":"status","controllerGroup":"karpenter.k8s.aws","controllerKind":"EC2NodeClass","worker count":10}
{"level":"INFO","time":"2024-11-04T15:13:30.311Z","logger":"controller","message":"Starting workers","commit":"901a5dc","controller":"nodeclass.hash","controllerGroup":"karpenter.k8s.aws","controllerKind":"EC2NodeClass","worker count":10}
{"level":"INFO","time":"2024-11-04T15:13:30.610Z","logger":"controller","message":"discovered ssm parameter","commit":"901a5dc","controller":"nodeclass.status","controllerGroup":"karpenter.k8s.aws","controllerKind":"EC2NodeClass","EC2NodeClass":{"name":"default"},"namespace":"","name":"default","reconcileID":"95358f1e-4cad-4b64-bf6f-c89472c89cb7","parameter":"/aws/service/eks/optimized-ami/1.31/amazon-linux-2023/arm64/standard/recommended/image_id","value":"ami-0ae1e07e02f98b306"}
{"level":"INFO","time":"2024-11-04T15:13:30.636Z","logger":"controller","message":"discovered ssm parameter","commit":"901a5dc","controller":"nodeclass.status","controllerGroup":"karpenter.k8s.aws","controllerKind":"EC2NodeClass","EC2NodeClass":{"name":"default"},"namespace":"","name":"default","reconcileID":"95358f1e-4cad-4b64-bf6f-c89472c89cb7","parameter":"/aws/service/eks/optimized-ami/1.31/amazon-linux-2023/x86_64/standard/recommended/image_id","value":"ami-0be82d98bb3e7f36c"}
{"level":"INFO","time":"2024-11-04T15:13:30.670Z","logger":"controller","message":"discovered ssm parameter","commit":"901a5dc","controller":"nodeclass.status","controllerGroup":"karpenter.k8s.aws","controllerKind":"EC2NodeClass","EC2NodeClass":{"name":"default"},"namespace":"","name":"default","reconcileID":"95358f1e-4cad-4b64-bf6f-c89472c89cb7","parameter":"/aws/service/eks/optimized-ami/1.31/amazon-linux-2023/x86_64/nvidia/recommended/image_id","value":"ami-0ed2f679097182d7a"}
{"level":"INFO","time":"2024-11-04T15:13:30.694Z","logger":"controller","message":"discovered ssm parameter","commit":"901a5dc","controller":"nodeclass.status","controllerGroup":"karpenter.k8s.aws","controllerKind":"EC2NodeClass","EC2NodeClass":{"name":"default"},"namespace":"","name":"default","reconcileID":"95358f1e-4cad-4b64-bf6f-c89472c89cb7","parameter":"/aws/service/eks/optimized-ami/1.31/amazon-linux-2023/x86_64/neuron/recommended/image_id","value":"ami-0c42cc0277c8e37ac"}
{"level":"INFO","time":"2024-11-04T15:16:39.902Z","logger":"controller","message":"found provisionable pod(s)","commit":"901a5dc","controller":"provisioner","namespace":"","name":"","reconcileID":"b8f66e83-5fe0-403e-8a92-8ec5c7d2117d","Pods":"worker-port-scan-stage2/worker-port-scan-stage2-679564f457-5pqch","duration":"46.212393ms"}
{"level":"INFO","time":"2024-11-04T15:16:39.902Z","logger":"controller","message":"computed new nodeclaim(s) to fit pod(s)","commit":"901a5dc","controller":"provisioner","namespace":"","name":"","reconcileID":"b8f66e83-5fe0-403e-8a92-8ec5c7d2117d","nodeclaims":1,"pods":1}
{"level":"INFO","time":"2024-11-04T15:16:39.917Z","logger":"controller","message":"created nodeclaim","commit":"901a5dc","controller":"provisioner","namespace":"","name":"","reconcileID":"b8f66e83-5fe0-403e-8a92-8ec5c7d2117d","NodePool":{"name":"spot"},"NodeClaim":{"name":"spot-5q86d"},"requests":{"cpu":"680m","memory":"785Mi","pods":"8"},"instance-types":"c5.xlarge, c5a.2xlarge, c6a.2xlarge, c6i.xlarge, c7i-flex.xlarge and 7 other(s)"}
{"level":"INFO","time":"2024-11-04T15:16:43.397Z","logger":"controller","message":"launched nodeclaim","commit":"901a5dc","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"spot-5q86d"},"namespace":"","name":"spot-5q86d","reconcileID":"70b8ee7c-e93d-4d6f-989f-844ee57c3c73","provider-id":"aws:///ap-southeast-1c/i-0665d2caaa8e38f76","instance-type":"c5.xlarge","zone":"ap-southeast-1c","capacity-type":"spot","allocatable":{"cpu":"3920m","ephemeral-storage":"35Gi","memory":"6584Mi","pods":"58","vpc.amazonaws.com/pod-eni":"18"}}
{"level":"INFO","time":"2024-11-04T15:17:03.759Z","logger":"controller","message":"deleted nodeclaim","commit":"901a5dc","controller":"nodeclaim.termination","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"spot-kngcp"},"namespace":"","name":"spot-kngcp","reconcileID":"08090e8d-9157-4e71-a7a9-77a950fe68e1","Node":{"name":""},"provider-id":"aws:///ap-southeast-1b/i-0865258af1c615c6f"}
{"level":"INFO","time":"2024-11-04T15:17:44.539Z","logger":"controller","message":"deleted nodeclaim","commit":"901a5dc","controller":"nodeclaim.termination","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"spot-fkcd6"},"namespace":"","name":"spot-fkcd6","reconcileID":"8f517525-5166-4d6b-8fe4-cf9858a0dc6b","Node":{"name":""},"provider-id":"aws:///ap-southeast-1c/i-031a02558cc13b16a"}
{"level":"INFO","time":"2024-11-04T15:31:49.255Z","logger":"controller","message":"found provisionable pod(s)","commit":"901a5dc","controller":"provisioner","namespace":"","name":"","reconcileID":"08e5d957-23e7-4de8-b2cf-2108b51ac98b","Pods":"microservice-public-web-manage/microservice-public-web-manage-5549b5b56f-xrk9w, microservice-admin-dashboard/microservice-admin-dashboard-cbf6cb68d-h6zhm, worker-cyberbay-scan-stage-complete/worker-cyberbay-scan-stage-complete-7746f99f85-smk8v, worker-auto-unlock-bug-report/worker-auto-unlock-bug-report-c4477f77d-bbc4c, keda/keda-metrics-apiserver-c5b6b66c-lsgsx and 2 other(s)","duration":"26.295328ms"}
{"level":"INFO","time":"2024-11-04T15:31:49.255Z","logger":"controller","message":"computed new nodeclaim(s) to fit pod(s)","commit":"901a5dc","controller":"provisioner","namespace":"","name":"","reconcileID":"08e5d957-23e7-4de8-b2cf-2108b51ac98b","nodeclaims":1,"pods":7}
{"level":"INFO","time":"2024-11-04T15:31:49.268Z","logger":"controller","message":"created nodeclaim","commit":"901a5dc","controller":"provisioner","namespace":"","name":"","reconcileID":"08e5d957-23e7-4de8-b2cf-2108b51ac98b","NodePool":{"name":"spot"},"NodeClaim":{"name":"spot-2nm9f"},"requests":{"cpu":"1780m","memory":"1981Mi","pods":"14"},"instance-types":"c5.xlarge, c5a.2xlarge, c6a.2xlarge, c6i.xlarge, c7i-flex.xlarge and 6 other(s)"}
{"level":"INFO","time":"2024-11-04T15:31:52.384Z","logger":"controller","message":"launched nodeclaim","commit":"901a5dc","controller":"nodeclaim.lifecycle","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"spot-2nm9f"},"namespace":"","name":"spot-2nm9f","reconcileID":"797858ad-e717-4641-9673-ae97537038fa","provider-id":"aws:///ap-southeast-1c/i-0bc5178ae30d55bca","instance-type":"c5.xlarge","zone":"ap-southeast-1c","capacity-type":"spot","allocatable":{"cpu":"3920m","ephemeral-storage":"35Gi","memory":"6584Mi","pods":"58","vpc.amazonaws.com/pod-eni":"18"}}
{"level":"INFO","time":"2024-11-04T15:33:02.965Z","logger":"controller","message":"deleted nodeclaim","commit":"901a5dc","controller":"nodeclaim.termination","controllerGroup":"karpenter.sh","controllerKind":"NodeClaim","NodeClaim":{"name":"spot-5q86d"},"namespace":"","name":"spot-5q86d","reconcileID":"5ef88cce-50cd-4d4c-8014-ed005543fbba","Node":{"name":""},"provider-id":"aws:///ap-southeast-1c/i-0665d2caaa8e38f76"}

Nodeclaim

kg nodeclaim
NAME         TYPE        CAPACITY   ZONE              NODE   READY     AGE
spot-2nm9f   c5.xlarge   spot       ap-southeast-1c          Unknown   3m14s
k describe nodeclaim spot-2nm9f
Name:         spot-2nm9f
Namespace:
Labels:       karpenter.k8s.aws/instance-category=c
              karpenter.k8s.aws/instance-cpu=4
              karpenter.k8s.aws/instance-cpu-manufacturer=intel
              karpenter.k8s.aws/instance-ebs-bandwidth=4750
              karpenter.k8s.aws/instance-encryption-in-transit-supported=false
              karpenter.k8s.aws/instance-family=c5
              karpenter.k8s.aws/instance-generation=5
              karpenter.k8s.aws/instance-hypervisor=nitro
              karpenter.k8s.aws/instance-memory=8192
              karpenter.k8s.aws/instance-network-bandwidth=1250
              karpenter.k8s.aws/instance-size=xlarge
              karpenter.sh/capacity-type=spot
              karpenter.sh/nodepool=spot
              kubernetes.io/arch=amd64
              kubernetes.io/os=linux
              node.kubernetes.io/instance-type=c5.xlarge
              topology.k8s.aws/zone-id=apse1-az3
              topology.kubernetes.io/region=ap-southeast-1
              topology.kubernetes.io/zone=ap-southeast-1c
Annotations:  compatibility.karpenter.k8s.aws/kubelet-drift-hash: 15379597991425564585
              karpenter.k8s.aws/ec2nodeclass-hash: 17935570713262261599
              karpenter.k8s.aws/ec2nodeclass-hash-version: v3
              karpenter.sh/nodepool-hash: 6821555240594823858
              karpenter.sh/nodepool-hash-version: v3
              karpenter.sh/stored-version-migrated: true
API Version:  karpenter.sh/v1
Kind:         NodeClaim
Metadata:
  Creation Timestamp:  2024-11-04T15:31:49Z
  Finalizers:
    karpenter.sh/termination
  Generate Name:  spot-
  Generation:     1
  Owner References:
    API Version:           karpenter.sh/v1
    Block Owner Deletion:  true
    Kind:                  NodePool
    Name:                  spot
    UID:                   f7479647-2be2-4c33-88eb-0261218ad48f
  Resource Version:        60672567
  UID:                     84b5c89d-9192-475a-96b3-3a1931748888
Spec:
  Expire After:  720h
  Node Class Ref:
    Group:  karpenter.k8s.aws
    Kind:   EC2NodeClass
    Name:   default
  Requirements:
    Key:       kubernetes.io/os
    Operator:  In
    Values:
      linux
    Key:       node.kubernetes.io/instance-type
    Operator:  In
    Values:
      c5.xlarge
      c5a.2xlarge
      c6a.2xlarge
      c6i.xlarge
      c7i-flex.xlarge
      m5.xlarge
      m6a.xlarge
      r5.xlarge
      r6i.xlarge
      t2.2xlarge
      t3.2xlarge
    Key:       karpenter.sh/nodepool
    Operator:  In
    Values:
      spot
    Key:       karpenter.sh/capacity-type
    Operator:  In
    Values:
      spot
    Key:       kubernetes.io/arch
    Operator:  In
    Values:
      amd64
  Resources:
    Requests:
      Cpu:     1780m
      Memory:  1981Mi
      Pods:    14
Status:
  Allocatable:
    Cpu:                        3920m
    Ephemeral - Storage:        35Gi
    Memory:                     6584Mi
    Pods:                       58
    vpc.amazonaws.com/pod-eni:  18
  Capacity:
    Cpu:                        4
    Ephemeral - Storage:        40Gi
    Memory:                     7577Mi
    Pods:                       58
    vpc.amazonaws.com/pod-eni:  18
  Conditions:
    Last Transition Time:  2024-11-04T15:31:52Z
    Message:               Node not registered with cluster
    Reason:                NodeNotFound
    Status:                Unknown
    Type:                  Initialized
    Last Transition Time:  2024-11-04T15:31:52Z
    Message:
    Reason:                Launched
    Status:                True
    Type:                  Launched
    Last Transition Time:  2024-11-04T15:31:52Z
    Message:               Initialized=Unknown, Registered=Unknown
    Reason:                UnhealthyDependents
    Status:                Unknown
    Type:                  Ready
    Last Transition Time:  2024-11-04T15:31:52Z
    Message:               Node not registered with cluster
    Reason:                NodeNotFound
    Status:                Unknown
    Type:                  Registered
  Image ID:                ami-0be82d98bb3e7f36c
  Provider ID:             aws:///ap-southeast-1c/i-0bc5178ae30d55bca
Events:
  Type    Reason             Age                  From       Message
  ----    ------             ----                 ----       -------
  Normal  Launched           3m28s                karpenter  Status condition transitioned, Type: Launched, Status: Unknown -> True, Reason: Launched
  Normal  DisruptionBlocked  86s (x2 over 3m26s)  karpenter  Cannot disrupt NodeClaim: state node doesn't contain both a node and a nodeclaim

alexhungnguyen avatar Nov 04 '24 15:11 alexhungnguyen

I fixed it using EKS Pod Identity: I assigned the Karpenter role to the Karpenter service account.
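
For others trying this route, the association can be created with the AWS CLI (cluster name and role ARN are placeholders; the role needs the Karpenter controller policy attached):

aws eks create-pod-identity-association \
  --cluster-name my-cluster \
  --namespace kube-system \
  --service-account karpenter \
  --role-arn arn:aws:iam::111122223333:role/KarpenterControllerRole-my-cluster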

jadiaheno avatar Nov 04 '24 17:11 jadiaheno

Is there any update on this issue?

bshre12 avatar Nov 07 '24 15:11 bshre12

@bshre12 have you assigned a role to your karpenter service account?

jadiaheno avatar Nov 07 '24 16:11 jadiaheno

I'm getting this issue where nodes occasionally get stuck in Unknown status, but it resolves itself after 15 minutes or so. Anyone facing something similar?

timjaya avatar Nov 22 '24 01:11 timjaya

I have the issue too after upgrading to v1.0.8. I see a node that is ready and joins the EKS cluster for about 2 minutes, but then it gets killed and replaced with a new node. Anyone facing something similar, please help me.

tainnsre avatar Nov 22 '24 07:11 tainnsre

Facing the same issue. It happens when upgrading from v0.37 to v1 (helm chart 1.0.7).


snizs avatar Nov 22 '24 10:11 snizs

Hello @snizs, you can roll back your NodePool and EC2NodeClass configs to the karpenter.sh/v1beta1 API instead of karpenter.sh/v1. With this config I am using EKS v1.31 and Karpenter helm chart v1.0.8:

NodePool-EC2NodeClass.yaml

apiVersion: karpenter.sh/v1beta1
# apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: nodepool-large-spot
  # namespace: karpenter
spec:
  template:
    metadata:
      labels:
        ProjectCost: SRE
        environments: SRE-Dev
        billing-team: SRE
      annotations:
        teams/owner: "SRE"
    spec:
      requirements:
        - key: kubernetes.io/os
          operator: In
          values:
          - linux
        - key: kubernetes.io/arch
          operator: In
          values:
          - amd64
        - key: karpenter.sh/capacity-type
          operator: In
          values:
          - spot
        - key: karpenter.k8s.aws/instance-family
          operator: In
          values:
          - t3a
          - t3
        - key: karpenter.k8s.aws/instance-size
          operator: In
          values:
          - large
      nodeClassRef:
        # group: karpenter.k8s.aws
        # kind: EC2NodeClass
        name: nodeclass-large-spot
  limits:
    cpu: 24
  disruption:
    budgets:
    - nodes: 10%
    consolidationPolicy: WhenEmpty
    consolidateAfter: 24h
    expireAfter: 120h
---
# apiVersion: karpenter.k8s.aws/v1
apiVersion: karpenter.k8s.aws/v1beta1
kind: EC2NodeClass
metadata:
  name: nodeclass-large-spot
  # namespace: karpenter
spec:
  amiFamily: AL2 # Amazon Linux 2
  role: eksctl-KarpenterNodeRole-eks-sre-op
  blockDeviceMappings:
  - deviceName: /dev/xvda
    ebs:
      deleteOnTermination: true
      iops: 3000
      throughput: 125
      volumeSize: 30Gi
      volumeType: gp3
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: eks-sre-op
    - id: subnet-1234 #     zone: ap-southeast-1c
    - id: subnet-2456 #     zone: ap-southeast-1a
    - id: subnet-789 #     zone: ap-southeast-1b
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: eks-sre-op
    - id: sg-234
    - id: sg-45645
    - id: sg-7567
  amiSelectorTerms:
    - id: "ami-09ef1cc7d6d5336f8" # EKS v1.31 AMD_AMI_ID
    - id: "ami-0351d81156d749ea9" # EKS v1.31 ARM_AMI_ID
    - id: "ami-0a080add16b2ccab0" # EKS v1.31 GPU_AMI_ID
  tags:
    ProjectCost: SRE
    environments: SRE-Dev

tainnsre avatar Nov 23 '24 14:11 tainnsre