Unable to pull image from private ECR from master node
/kind bug
1. What kops version are you running? The command kops version will display this information.
Version 1.23.0 (git-a067cd7742a497a5c512762b9880664d865289f1)
root@xxxxxxx:~/.aws# docker-credential-ecr-login version 0.6.0
2. What Kubernetes version are you running? kubectl version will print the version if a cluster is running, or provide the Kubernetes version specified as a kops flag.
1.21
3. What cloud provider are you using?
aws
4. What commands did you run? What is the simplest way to reproduce this issue?
docker pull xxxxxx.dkr.ecr.xxx.amazonaws.com/xxx:1.1
5. What happened after the commands executed?
cat /root/.ecr/log/ecr-login.log
time="2022-04-15T20:43:19Z" level=debug msg="Calling ECR.GetAuthorizationToken" registry=xxxxxx
time="2022-04-15T20:43:19Z" level=error msg="Error retrieving credentials" error="ecr: Failed to get authorization token: EC2RoleRequestError: no EC2 instance role found\ncaused by: EC2MetadataError: failed to make EC2Metadata request\ncaused by: "
time="2022-04-15T20:47:17Z" level=debug msg="Could not fetch credentials for cache prefix, disabling cache" error="EC2RoleRequestError: no EC2 instance role found\ncaused by: EC2MetadataError: failed to make EC2Metadata request\ncaused by: "
time="2022-04-15T20:47:17Z" level=debug msg="Retrieving credentials" region=xxxxx registry=xxxxxx serverURL=xxxxxx.dkr.ecr.xxxxxx.amazonaws.com.cn
6. What did you expect to happen?
docker pull from the private ECR registry should work on the master node.
7. Please provide your cluster manifest. Execute kops get --name my.example.com -o yaml to display your cluster manifest. You may want to remove your cluster name and other sensitive information.
apiVersion: kops.k8s.io/v1alpha2
kind: Cluster
metadata:
  creationTimestamp: "2022-04-11T20:26:16Z"
  generation: 3
  name: xxxxx.k8s.local
spec:
  api:
    loadBalancer:
      class: Network
      type: Public
  assets:
    containerRegistry: xxxxxxx.dkr.ecr.xxxxxxx.amazonaws.com.cn
    fileRepository: https://s3.xxxx.amazonaws.com.cn/xxxxxxx-file-repository
  authorization:
    rbac: {}
  channel: stable
  cloudProvider: aws
  configBase: s3://xxxxxxxxx-state-store/xxxxxxxx.k8s.local
  etcdClusters:
  - cpuRequest: 200m
    etcdMembers:
    - encryptedVolume: true
      instanceGroup: master-xxxxxxxa
      name: a
    - encryptedVolume: true
      instanceGroup: master-xxxxxxxb
      name: b
    - encryptedVolume: true
      instanceGroup: master-xxxxxxxd
      name: d
    memoryRequest: 100Mi
    name: main
  - cpuRequest: 100m
    etcdMembers:
    - encryptedVolume: true
      instanceGroup: master-xxxxxxxa
      name: a
    - encryptedVolume: true
      instanceGroup: master-xxxxxxxb
      name: b
    - encryptedVolume: true
      instanceGroup: master-xxxxxxxd
      name: d
    memoryRequest: 100Mi
    name: events
  iam:
    allowContainerRegistry: true
    legacy: false
  kubelet:
    anonymousAuth: false
  kubernetesApiAccess:
  - 0.0.0.0/0
  - ::/0
  kubernetesVersion: 1.23.5
  masterInternalName: api.internal.xxxxxxx.k8s.local
  masterPublicName: api.xxxxxxx.k8s.local
  networkCIDR: 10.100.0.0/16
  networking:
    calico: {}
  nonMasqueradeCIDR: 100.64.0.0/10
  sshAccess:
  - 0.0.0.0/0
  - ::/0
  subnets:
  - cidr: 10.100.32.0/19
    name: xxxxxxxa
    type: Private
    zone: xxxxxxxa
  - cidr: 10.100.64.0/19
    name: xxxxxxxb
    type: Private
    zone: xxxxxxxb
  - cidr: 10.100.96.0/19
    name: xxxxxxxd
    type: Private
    zone: xxxxxxxd
  - cidr: 10.100.0.0/22
    name: utility-xxxxxxxa
    type: Utility
    zone: xxxxxxxa
  - cidr: 10.100.4.0/22
    name: utility-xxxxxxxb
    type: Utility
    zone: xxxxxxxb
  - cidr: 10.100.8.0/22
    name: utility-xxxxxxxd
    type: Utility
    zone: xxxxxxxd
  topology:
    dns:
      type: Public
    masters: private
    nodes: private
---
apiVersion: kops.k8s.io/v1alpha2
kind: InstanceGroup
metadata:
  creationTimestamp: "2022-04-11T20:26:18Z"
  labels:
    kops.k8s.io/cluster: xxxxxxx.k8s.local
  name: bastions
spec:
  image: ami-xxxxxxx
  instanceMetadata:
    httpPutResponseHopLimit: 1
    httpTokens: required
  machineType: t3.micro
  maxSize: 1
  minSize: 1
  nodeLabels:
    kops.k8s.io/instancegroup: bastions
  role: Bastion
  subnets:
  - xxxxxxxa
  - xxxxxxxb
  - xxxxxxxd
---
apiVersion: kops.k8s.io/v1alpha2
kind: InstanceGroup
metadata:
  creationTimestamp: "2022-04-11T20:26:16Z"
  labels:
    kops.k8s.io/cluster: xxxxxxx.k8s.local
  name: master-xxxxxxxa
spec:
  image: ami-xxxxxxx
  instanceMetadata:
    httpPutResponseHopLimit: 3
    httpTokens: required
  machineType: t3.small
  maxSize: 1
  minSize: 1
  nodeLabels:
    kops.k8s.io/instancegroup: master-xxxxxxxa
  role: Master
  subnets:
  - xxxxxxxa
---
apiVersion: kops.k8s.io/v1alpha2
kind: InstanceGroup
metadata:
  creationTimestamp: "2022-04-11T20:26:17Z"
  labels:
    kops.k8s.io/cluster: xxxxxxx.k8s.local
  name: master-xxxxxxxb
spec:
  image: ami-xxxxxxx
  instanceMetadata:
    httpPutResponseHopLimit: 3
    httpTokens: required
  machineType: t3.small
  maxSize: 1
  minSize: 1
  nodeLabels:
    kops.k8s.io/instancegroup: master-xxxxxxxb
  role: Master
  subnets:
  - xxxxxxxb
---
apiVersion: kops.k8s.io/v1alpha2
kind: InstanceGroup
metadata:
  creationTimestamp: "2022-04-11T20:26:17Z"
  labels:
    kops.k8s.io/cluster: xxxxxxx.k8s.local
  name: master-xxxxxxxd
spec:
  image: ami-xxxxxxx
  instanceMetadata:
    httpPutResponseHopLimit: 3
    httpTokens: required
  machineType: t3.small
  maxSize: 1
  minSize: 1
  nodeLabels:
    kops.k8s.io/instancegroup: master-xxxxxxxd
  role: Master
  subnets:
  - xxxxxxxd
---
apiVersion: kops.k8s.io/v1alpha2
kind: InstanceGroup
metadata:
  creationTimestamp: "2022-04-11T20:26:17Z"
  labels:
    kops.k8s.io/cluster: xxxxxxx.k8s.local
  name: nodes-xxxxxxxa
spec:
  image: ami-xxxxxxx
  instanceMetadata:
    httpPutResponseHopLimit: 1
    httpTokens: required
  machineType: c5d.large
  maxSize: 1
  minSize: 1
  nodeLabels:
    kops.k8s.io/instancegroup: nodes-xxxxxxxa
  role: Node
  subnets:
  - xxxxxxxa
---
apiVersion: kops.k8s.io/v1alpha2
kind: InstanceGroup
metadata:
  creationTimestamp: "2022-04-11T20:26:17Z"
  labels:
    kops.k8s.io/cluster: xxxxxxx.k8s.local
  name: nodes-xxxxxxxb
spec:
  image: ami-xxxxxxx
  instanceMetadata:
    httpPutResponseHopLimit: 1
    httpTokens: required
  machineType: c5d.large
  maxSize: 1
  minSize: 1
  nodeLabels:
    kops.k8s.io/instancegroup: nodes-xxxxxxxb
  role: Node
  subnets:
  - xxxxxxxb
---
apiVersion: kops.k8s.io/v1alpha2
kind: InstanceGroup
metadata:
  creationTimestamp: "2022-04-11T20:26:18Z"
  labels:
    kops.k8s.io/cluster: xxxxxxx.k8s.local
  name: nodes-xxxxxxxd
spec:
  image: ami-xxxxxxx
  instanceMetadata:
    httpPutResponseHopLimit: 1
    httpTokens: required
  machineType: c5d.large
  maxSize: 0
  minSize: 0
  nodeLabels:
    kops.k8s.io/instancegroup: nodes-xxxxxxxd
  role: Node
  subnets:
  - xxxxxxxd
8. Please run the commands with the most verbose logging by adding the -v 10 flag. Paste the logs into this report, or put them in a gist and provide the gist link here.
9. Anything else we need to know?
root@ip-xxxxxxxx:~/.aws# TOKEN=`curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600"` && curl -H "X-aws-ec2-metadata-token: $TOKEN" -v http://169.254.169.254/latest/meta-data/iam/security-credentials/xxxxxxxx
We can see the credentials properly:
{
  "Code" : "Success",
  "LastUpdated" : "2022-04-15T20:47:16Z",
  "Type" : "AWS-HMAC",
  "AccessKeyId" : "xxxxxxxxxxxx",
  "SecretAccessKey" : "xxxxxxxxxxxxxxxxxx",
  "Token" : "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
  "Expiration" : "2022-04-16T02:52:56Z"
}
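For context, a minimal sketch of the difference between the two metadata access modes on a node where httpTokens: required is in effect; the plain IMDSv1 request is rejected, while the IMDSv2 token flow shown above succeeds:

# With httpTokens: required (IMDSv2 enforced), an untokened IMDSv1 call is rejected.
curl -s -o /dev/null -w "IMDSv1 status: %{http_code}\n" \
  http://169.254.169.254/latest/meta-data/iam/security-credentials/
# Expected on an IMDSv2-only instance: IMDSv1 status: 401

# The IMDSv2 flow first obtains a session token via PUT, then presents it.
TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
  -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
  http://169.254.169.254/latest/meta-data/iam/security-credentials/
# Prints the instance role name; appending it to the URL returns the credentials JSON above.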
The root cause traces back to the inaccurate claim of AWS SDK v2 support in awslabs/amazon-ecr-credential-helper#285: the current amazon-ecr-credential-helper is unable to handle IMDSv2 credentials properly, but the latest kops enables IMDSv2 by default. Reverting to IMDSv1 resolves this issue.
aws ec2 modify-instance-metadata-options --instance-id i-xxxxxx --http-tokens optional
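A sketch of making the same change through kops rather than mutating the instance directly, using the placeholder names from the manifest above; note this relaxes IMDSv2 enforcement on the control-plane nodes, so it is a workaround rather than a fix:

# Export the control-plane instance group, relax the IMDS token requirement,
# and roll it out via kops so the setting survives instance replacement.
kops get instancegroup master-xxxxxxxa --name xxxxx.k8s.local -o yaml > ig.yaml
# In ig.yaml, change spec.instanceMetadata.httpTokens from "required" to "optional".
kops replace -f ig.yaml --name xxxxx.k8s.local
kops update cluster --name xxxxx.k8s.local --yes
kops rolling-update cluster --name xxxxx.k8s.local --yes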
If I understand this issue correctly, you are pulling images outside of k8s and this isn't working because of an issue with the AWS ECR credentials helper? You do not have issues with pulling images for Pods?
We may have to pull infra pods from the private ECR repository while building the control plane via kops create. We then found 401 error responses when pulling from ECR; during troubleshooting, setting http-tokens to optional resolved the 401 issues.
Further, we tried the ECR credentials helper on a freshly initialized EC2 instance with http-tokens: required for the EC2 IMDS, and pulling images from the private ECR failed in the same way. Therefore we can tell that the ECR credentials helper does not properly support http-tokens: required, which is enabled by default in the latest kops version.
How are you trying to build the cluster using images stored in ECR? Does the ECR credential helper support containerd?
The AMI used is Ubuntu 20.04 LTS. containerd is freshly installed by following https://docs.docker.com/engine/install/ubuntu/ and the ECR credentials helper is installed with apt install amazon-ecr-credential-helper as mentioned in https://github.com/awslabs/amazon-ecr-credential-helper. Then pulling a private image with a plain docker pull xxxx.dks..aws.com/image:tag fails with 401; after disabling http-tokens: required, everything works again.
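For anyone reproducing this standalone setup, a minimal sketch of the Docker-side wiring described in the helper's README (registry hostname is a placeholder); the helper then resolves credentials from the instance role via the EC2 metadata service, which is exactly where the failure above occurs when only IMDSv2 is allowed:

# Route Docker credential lookups for the private registry through the ECR helper.
mkdir -p ~/.docker
cat > ~/.docker/config.json <<'EOF'
{
  "credHelpers": {
    "xxxxxx.dkr.ecr.xxxxxx.amazonaws.com.cn": "ecr-login"
  }
}
EOF
# docker pull now asks docker-credential-ecr-login for credentials on demand.
docker pull xxxxxx.dkr.ecr.xxxxxx.amazonaws.com.cn/xxx:1.1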
If you are manually installing docker and ecr credentials helper, then I don't see where the kops bug is. This is far from a supported setup. Meaning that you have to do custom configurations to make things work.
Can you point to where kops is doing something incorrect?
I think I can. I just ran into this now as well. The context is that I am using kops get assets --copy to push all the container images to ECR. I've had ECR working for applications for a while, but when I enable this, nodes fail to come up with permission denied on the pulls for items like apiserver, etc. It's like the node itself is not authorised to the repository at that early stage. I can get logs if you need, but I just shut things down and it's late. :sleepy:
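For reference, a minimal sketch of that mirroring setup, assuming the assets.containerRegistry/fileRepository fields shown in the manifest above and placeholder cluster/state names:

# With spec.assets.containerRegistry (and optionally fileRepository) set in the
# cluster spec, kops can list and mirror the images and files it needs, so that
# nodes pull exclusively from your own ECR registry and S3 bucket.
kops get assets --name xxxxx.k8s.local --state s3://xxxxxxxxx-state-store          # list expected assets
kops get assets --copy --name xxxxx.k8s.local --state s3://xxxxxxxxx-state-store   # copy them into the mirror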
Private ECR repositories require something to authenticate to ECR. As long as spec.iam.allowContainerRegistry is set, the instance itself is allowed to authenticate, but something still needs to actually log in to ECR. kOps does not support any mechanism for authenticating before kubelet is able to do so, so any pulls that happen before kubelet starts pulling are not supported. If it is kubelet that fails to pull, it may be that kubelet does not use registry authentication plugins before the node has joined the cluster.
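To make concrete what "log in to ECR" involves, a minimal sketch of the manual flow on an instance whose role has the permissions granted by allowContainerRegistry (region and registry hostname are placeholders); note that the ECR authorization token it produces is only valid for about 12 hours:

# Exchange the instance role's AWS credentials for a short-lived ECR token and
# hand it to docker; kubelet later does the equivalent through its own registry
# credential plumbing once the node has joined the cluster.
aws ecr get-login-password --region xxxxxx \
  | docker login --username AWS --password-stdin xxxxxx.dkr.ecr.xxxxxx.amazonaws.com.cn
docker pull xxxxxx.dkr.ecr.xxxxxx.amazonaws.com.cn/xxx:1.1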
Is the statement that this way of setting up registry auth works with containerd still true? https://github.com/kubernetes/kops/blob/master/docs/cli/kops_create_secret_dockerconfig.md#synopsis
I didn't use exactly that since it wouldn't work well with ECR, but I tried using an execContainer to run an ECR login with the awscli and generate a /root/.docker/config.json on the node, and so far no luck there. Should that work?
I could try to get a login token at cluster setup time and either create a docker secret, if that works, or set up registryMirrors, but I'm worried about putting a short-lived token into the config for the latter. I only need it for bootstrap, and wouldn't want it to interfere with the actual way ECR auth gets set up (which I haven't traced down, but it looks like maybe the CCM dispatches these for the kubelets, and they are fed to containerd at pull time rather than being some global config?).
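A minimal sketch of that token-at-setup-time idea, assuming the dockerconfig secret path from the kops docs and placeholder names; the caveat stands that ECR tokens expire after about 12 hours, so this only covers the initial bootstrap window:

# Build a docker config.json containing a short-lived ECR auth entry and store
# it in the kops state store so nodeup writes it onto new nodes at bootstrap.
REGISTRY=xxxxxx.dkr.ecr.xxxxxx.amazonaws.com.cn
AUTH=$(echo -n "AWS:$(aws ecr get-login-password --region xxxxxx)" | base64 -w0)
cat > /tmp/ecr-config.json <<EOF
{ "auths": { "${REGISTRY}": { "auth": "${AUTH}" } } }
EOF
kops create secret dockerconfig -f /tmp/ecr-config.json \
  --name xxxxx.k8s.local --state s3://xxxxxxxxx-state-store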
The statement is true, but only for use by kubelet. Containerd will not use the docker config directly. Instead kubelet will read it and pass on the credentials to containerd using the CRI API.
When using ECR from kubelet, it is indeed the CCM that sets that up: https://github.com/kubernetes/cloud-provider-aws/tree/master/cmd/ecr-credential-provider
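For completeness, a rough sketch of what wiring that provider into kubelet generally looks like; this is not kops-generated configuration, the paths are arbitrary, and the apiVersion values are assumptions that vary by Kubernetes release (alpha/beta variants exist around 1.23/1.24), so treat it as orientation only:

# Hypothetical manual wiring of the out-of-tree ECR credential provider.
# Place the ecr-credential-provider binary (from cloud-provider-aws releases)
# in a directory kubelet can read, then describe it in a CredentialProviderConfig.
install -d /opt/credential-provider/bin /etc/kubernetes
cat > /etc/kubernetes/credential-provider-config.yaml <<'EOF'
apiVersion: kubelet.config.k8s.io/v1alpha1   # assumption: alpha API around k8s 1.23
kind: CredentialProviderConfig
providers:
  - name: ecr-credential-provider
    apiVersion: credentialprovider.kubelet.k8s.io/v1alpha1
    matchImages:
      - "*.dkr.ecr.*.amazonaws.com"
      - "*.dkr.ecr.*.amazonaws.com.cn"
    defaultCacheDuration: "12h"
EOF
# Kubelet must then be started with:
#   --image-credential-provider-bin-dir=/opt/credential-provider/bin
#   --image-credential-provider-config=/etc/kubernetes/credential-provider-config.yaml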
The Kubernetes project currently lacks enough contributors to adequately respond to all issues and PRs.
This bot triages issues and PRs according to the following rules:
- After 90d of inactivity, lifecycle/stale is applied
- After 30d of inactivity since lifecycle/stale was applied, lifecycle/rotten is applied
- After 30d of inactivity since lifecycle/rotten was applied, the issue is closed
You can:
- Mark this issue or PR as fresh with /remove-lifecycle stale
- Mark this issue or PR as rotten with /lifecycle rotten
- Close this issue or PR with /close
- Offer to help out with Issue Triage
Please send feedback to sig-contributor-experience at kubernetes/community.
/lifecycle stale
/remove-kind bug
The Kubernetes project currently lacks enough active contributors to adequately respond to all issues and PRs.
This bot triages issues and PRs according to the following rules:
- After 90d of inactivity, lifecycle/stale is applied
- After 30d of inactivity since lifecycle/stale was applied, lifecycle/rotten is applied
- After 30d of inactivity since lifecycle/rotten was applied, the issue is closed
You can:
- Mark this issue or PR as fresh with /remove-lifecycle rotten
- Close this issue or PR with /close
- Offer to help out with Issue Triage
Please send feedback to sig-contributor-experience at kubernetes/community.
/lifecycle rotten
The Kubernetes project currently lacks enough active contributors to adequately respond to all issues and PRs.
This bot triages issues according to the following rules:
- After 90d of inactivity, lifecycle/stale is applied
- After 30d of inactivity since lifecycle/stale was applied, lifecycle/rotten is applied
- After 30d of inactivity since lifecycle/rotten was applied, the issue is closed
You can:
- Reopen this issue with /reopen
- Mark this issue as fresh with /remove-lifecycle rotten
- Offer to help out with Issue Triage
Please send feedback to sig-contributor-experience at kubernetes/community.
/close not-planned
@k8s-triage-robot: Closing this issue, marking it as "Not Planned".
Instructions for interacting with me using PR comments are available here. If you have questions or suggestions related to my behavior, please file an issue against the kubernetes/test-infra repository.
Using kops version 1.23.4 and Kubernetes version 1.23.15. We've just switched from the docker runtime to containerd in the kops config and are hitting this issue now, since all images were being pulled from ECR (we have assets.containerProxy configured in our kops config and pointed at our ECR registry).
I added ECR login credentials to the kops state using 'kops create secret dockerconfig'. I replaced one of the nodes in the cluster and verified in its syslog that nodeup was creating the docker config file and that it contained the correct config. I installed the docker client and verified I could pull an image from ECR without first having to log in, which worked fine.
Unfortunately, logs in syslog showed that containerd was still getting the 401 Unauthorized error when trying to pull images from ECR, e.g.
Jan 25 10:55:51 ip-10-202-98-64 containerd[5937]: time="2023-01-25T10:55:51.649220752Z" level=error msg="RunPodSandbox for &PodSandboxMetadata{Name:kube-proxy-ip-10-202-98-64.eu-west-1.compute.internal,Uid:9052eed18345e582e40df80962c11ede,Namespace:kube-system,Attempt:0,} failed, error" error="failed to get sandbox image "REDACTED.dkr.ecr.eu-west-1.amazonaws.com/pause:3.6": failed to pull image "REDACTED.dkr.ecr.eu-west-1.amazonaws.com/pause:3.6": failed to pull and unpack image "REDACTED.dkr.ecr.eu-west-1.amazonaws.com/pause:3.6": failed to resolve reference "REDACTED.dkr.ecr.eu-west-1.amazonaws.com/pause:3.6": pulling from host REDACTED.dkr.ecr.eu-west-1.amazonaws.com failed with status code [manifests 3.6]: 401 Unauthorized"
This page: https://github.com/kubernetes/kops/blob/master/docs/cli/kops_create_secret_dockerconfig.md#synopsis does explicitly state that use of the dockerconfig file "......will also work when using containerd as the container runtime." So I'm a bit stumped at the moment. Does the ecr-credential-provider need to be installed and configured independently to get this working, I wonder?
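Not an answer, but a hedged diagnostic sketch that may help narrow it down: pulling through the CRI socket with explicit credentials mimics what kubelet hands to containerd, which separates "containerd cannot authenticate to ECR at all" from "nothing supplied credentials for this particular pull" (the sandbox/pause image in the log above being one such case):

# Pull via CRI with explicit credentials, the way kubelet would supply them.
PASS=$(aws ecr get-login-password --region eu-west-1)
crictl --runtime-endpoint unix:///run/containerd/containerd.sock \
  pull --creds "AWS:${PASS}" REDACTED.dkr.ecr.eu-west-1.amazonaws.com/pause:3.6
# If this works while kubelet-initiated pulls still return 401, credentials are
# simply not reaching containerd for that pull rather than being rejected by ECR.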
@wlawton please file a new issue. As long as kubelet pulls the image, this doesn't have anything to do with containerd vs dockerd.
No worries. I worked around my problem by creating public ECR repos and configuring them in assets.containerProxy - no authentication is required for pulling.
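A rough sketch of that workaround for anyone else landing here, with the public registry alias as a placeholder; the exact containerProxy semantics should be checked against the kops assets documentation:

# Point the kops container asset proxy at a public (unauthenticated) registry
# so bootstrap-time pulls need no ECR login.
kops get cluster --name xxxxx.k8s.local -o yaml > cluster.yaml
# In cluster.yaml, under spec.assets, set for example:
#   assets:
#     containerProxy: public.ecr.aws/your-alias
kops replace -f cluster.yaml
kops update cluster --name xxxxx.k8s.local --yes
kops rolling-update cluster --name xxxxx.k8s.local --yes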