talos
talos copied to clipboard
talosctl bootstrap fails on Azure Stack Hub
Bug Report
Description
I am following the Azure installation guide on Azure Stack Hub, and the talosctl bootstrap step fails with:
$ talosctl --talosconfig ./talosconfig bootstrap
error executing bootstrap: rpc error: code = Unavailable desc = connection error: desc = "transport: authentication handshake failed: tls: failed to verify certificate: x509: certificate is valid for 10.0.0.4, 127.0.0.1, ::1, not A.B.C.D"
where A.B.C.D is the talos-controlplane-public-ip-0 public IP address and 10.0.0.4 is the private IP address assigned to the nic.
Logs
talos-controlplane-0 VM boot logs:
talos-controlplane-0.log
create.sh script with az cli commands to install Talos:
#!/usr/bin/env bash
#
# Interactive walk-through that provisions a Talos cluster with the az CLI.
# Every step is confirmed through the prompt/run/prompted_run helpers below.

# Script name, used as the prefix of error messages.
program="$0"

# Terminal escape sequences for highlighted output (empty when tput fails).
start_bold=$(tput bold)$(tput setaf 3)
start_warn=$(tput bold)$(tput setaf 1)
start_cmd=$(tput setaf 2)
end_color=$(tput sgr 0)

# Non-empty => helpers echo each command before running it.
verbose=

# Talos release image to download.
TALOS_IMAGE="azure-amd64.vhd"
TALOS_VERSION="v1.6.7"
TALOS_IMAGE_URL="https://github.com/siderolabs/talos/releases/download/${TALOS_VERSION}/${TALOS_IMAGE}.xz"

# Naming inputs; DUMBPREFIX is the hyphen-free variant used for the storage
# account name.
PREFIX="eas-talos"
DUMBPREFIX="eastalos"
CLUSTER="scratch"
SUBSCRIPTION="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"

# Names derived from the inputs above.
GROUP="${PREFIX}-${CLUSTER}"
STORAGE_ACCOUNT="${DUMBPREFIX}${CLUSTER}"
STORAGE_CONTAINER="${PREFIX}-${CLUSTER}"
STORAGE_BASE_URL="blob.azurestackhub.example.com"
LOCATION="azurestackhub"

export TALOS_IMAGE TALOS_VERSION TALOS_IMAGE_URL
export PREFIX DUMBPREFIX CLUSTER SUBSCRIPTION
export GROUP STORAGE_ACCOUNT STORAGE_CONTAINER STORAGE_BASE_URL LOCATION
# error MESSAGE...
# Print "<program>: MESSAGE" to stderr, wrapped in the warning color, and
# exit the script with status 1.
# Globals: program, start_warn, end_color (read).
error () {
  # "$*" joins all arguments into one string; assigning "$@" to a scalar
  # is an array-to-string conversion that ShellCheck flags (SC2124).
  local msg="$*"
  : "${msg:?"$program: error: missing message"}"
  printf '%s\n' "${start_warn}$program: $msg${end_color}" >&2
  exit 1
}
# prompt MESSAGE...
# Ask "MESSAGE. Continue (Y/n/q)?" and act on the reply:
#   empty, y*, Y*  -> return 0
#   n*, N*         -> return 1
#   q*, Q*         -> exit the script with status 1
#   anything else  -> print a warning and ask again
# When stdin is not a terminal nothing is read, so the reply is empty and
# the function returns 0.
# Globals: start_bold, start_warn, end_color (read).
prompt () {
  local msg="$*"
  # Keep the reply local and start empty: a leaked or inherited global
  # "answer" would otherwise be reused when stdin is not a terminal, and
  # an unrecognised value there would loop forever.
  local answer=
  : "${msg:?"prompt: missing message"}"
  echo
  while true; do
    if [ -t 0 ]; then
      # -r keeps backslashes in the reply literal.
      read -r -u 0 -p "${start_bold}${msg}. Continue (Y/n/q)?${end_color} " answer
    fi
    case "$answer" in
      ''|y*|Y*)
        return 0
        ;;
      n*|N*)
        return 1
        ;;
      q*|Q*)
        exit 1
        ;;
      *)
        echo "${start_warn}prompt: unexpected response: $answer${end_color}"
        ;;
    esac
  done
}
# run COMMAND [ARG...]
# Execute COMMAND, echoing it first when $verbose is non-empty. If it
# fails, ask through prompt whether to carry on; a declined prompt exits
# the script with status 1.
run () {
  local -a cmd=("$@")
  : "${cmd:?"run: missing command"}"
  if [ "$verbose" ]; then
    echo "${cmd[@]}"
  fi
  if ! "${cmd[@]}"; then
    prompt "$start_warn" "run: command failed" || exit 1
  fi
}
# prompted_run MESSAGE COMMAND [ARG...]
# Ask for confirmation with MESSAGE. On "yes", run COMMAND (echoed first
# when $verbose is non-empty) and print two blank lines afterwards; on
# "no", do nothing. A failing COMMAND triggers a second prompt, and
# declining that one exits the script with status 1.
prompted_run () {
  local msg="$1"
  shift
  : "${msg:?"prompted_run: missing message"}"
  local -a cmd=("$@")
  : "${cmd:?"prompted_run: missing command"}"

  # Declined: skip the step without signalling an error.
  prompt "$msg" || return 0

  if [ "$verbose" ]; then
    echo "${cmd[@]}"
  fi
  "${cmd[@]}" || prompt "$start_warn" "prompted_run: command failed" || exit 1
  printf '\n\n'
}
# Set active subscription
prompted_run "Set active subscription" \
  az account set --subscription "$SUBSCRIPTION"

# (re)create talos resource group
# `az group exists` is expected to print "true" or "false"; anything else
# means the query itself did not produce an answer.
case "$(az group exists -n "$GROUP")" in
  true)
    if prompt "remove and recreate existing resource group"; then
      run az group delete --name "$GROUP" --yes
      run az group create --name "$GROUP" --location "$LOCATION"
    fi
    ;;
  false)
    run az group create --name "$GROUP" --location "$LOCATION"
    ;;
  *)
    # Without this arm a failed query fell through silently and no
    # resource group was created for the steps that follow.
    error "could not determine whether resource group $GROUP exists"
    ;;
esac
prompted_run "Create storage account" \
  az storage account create \
    --name "$STORAGE_ACCOUNT" \
    -g "$GROUP" \
    -l "$LOCATION" \
    --sku Standard_LRS

# Assign first and export afterwards: `export VAR=$(cmd)` always reports
# the status of export, so a failed lookup used to go unnoticed and left
# CONNECTION empty for the container and upload steps.
CONNECTION=$(az storage account show-connection-string \
  -n "$STORAGE_ACCOUNT" \
  -g "$GROUP" \
  -o tsv) ||
  error "could not read the connection string of storage account $STORAGE_ACCOUNT"
export CONNECTION

prompted_run "Create storage container" \
  az storage container create \
    --name "$STORAGE_CONTAINER" \
    --account-name "$STORAGE_ACCOUNT" \
    -g "$GROUP" \
    --connection-string "$CONNECTION"
# Download and unpack the Talos release image into ./image.
if prompt "Download talos release image"; then
  echo "$TALOS_IMAGE_URL"
  # curl -o does not create missing directories, so make sure image/ exists.
  run mkdir -p image
  run rm -f "image/$TALOS_IMAGE.xz" "image/$TALOS_IMAGE"
  # --fail makes curl exit non-zero on an HTTP error instead of saving the
  # error page as the image and reporting success.
  run curl --fail -L -o "image/$TALOS_IMAGE.xz" "$TALOS_IMAGE_URL"
  run xz -d "image/$TALOS_IMAGE.xz"
fi

# Upload the unpacked VHD to the storage container.
prompted_run "Upload talos image" \
  az storage blob upload \
    --connection-string "$CONNECTION" \
    --container-name "$STORAGE_CONTAINER" \
    -f "image/$TALOS_IMAGE" \
    -n "$TALOS_IMAGE"

# Register the uploaded blob as the "talos" image.
prompted_run "Register talos image" \
  az image create \
    --name talos \
    --source "https://$STORAGE_ACCOUNT.$STORAGE_BASE_URL/$STORAGE_CONTAINER/$TALOS_IMAGE" \
    --os-type linux \
    -g "$GROUP"
# Virtual network, security group and the inbound rules for ports 50000,
# 50001, 2379-2380 and 6443. Each step is confirmed separately.
if prompt "Set up talos networking"; then
  prompted_run "create vnet" \
    az network vnet create \
      --resource-group "$GROUP" \
      --location "$LOCATION" \
      --name talos-vnet \
      --subnet-name talos-subnet

  prompted_run "Create network security group" \
    az network nsg create -g "$GROUP" -n talos-sg

  prompted_run "client -> apid" \
    az network nsg rule create \
      -g "$GROUP" \
      --nsg-name talos-sg \
      -n apid \
      --priority 1001 \
      --destination-port-ranges 50000 \
      --direction inbound

  prompted_run "trustd" \
    az network nsg rule create \
      -g "$GROUP" \
      --nsg-name talos-sg \
      -n trustd \
      --priority 1002 \
      --destination-port-ranges 50001 \
      --direction inbound

  prompted_run "etcd" \
    az network nsg rule create \
      -g "$GROUP" \
      --nsg-name talos-sg \
      -n etcd \
      --priority 1003 \
      --destination-port-ranges 2379-2380 \
      --direction inbound

  prompted_run "Kubernetes API Server" \
    az network nsg rule create \
      -g "$GROUP" \
      --nsg-name talos-sg \
      -n kube \
      --priority 1004 \
      --destination-port-ranges 6443 \
      --direction inbound
fi
# Public IP, load balancer, TCP health probe and the 6443 forwarding rule.
if prompt "Set up public IP and load balancer"; then
  prompted_run "Create public ip" \
    az network public-ip create \
      --resource-group "$GROUP" \
      --name talos-public-ip \
      --allocation-method static

  prompted_run "Create lb" \
    az network lb create \
      --resource-group "$GROUP" \
      --name talos-lb \
      --public-ip-address talos-public-ip \
      --frontend-ip-name talos-fe \
      --backend-pool-name talos-be-pool

  prompted_run "Create health check" \
    az network lb probe create \
      --resource-group "$GROUP" \
      --lb-name talos-lb \
      --name talos-lb-health \
      --protocol tcp \
      --port 6443

  prompted_run "Create lb rule for 6443" \
    az network lb rule create \
      --resource-group "$GROUP" \
      --lb-name talos-lb \
      --name talos-6443 \
      --protocol tcp \
      --frontend-ip-name talos-fe \
      --frontend-port 6443 \
      --backend-pool-name talos-be-pool \
      --backend-port 6443 \
      --probe-name talos-lb-health
fi
# One public IP and one NIC per control-plane node (indices 0..2). Each az
# call goes through run so that a failure stops for confirmation instead
# of being ignored.
if prompt "Set up network interfaces"; then
  for i in 0 1 2; do
    # Create public IP for each nic
    run az network public-ip create \
      --resource-group "$GROUP" \
      --name "talos-controlplane-public-ip-$i" \
      --allocation-method static

    # Create nic
    # Every continuation backslash is preceded by a space: a backslash
    # glued to the previous word joins that word to the next line's first
    # word whenever the next line is not indented.
    run az network nic create \
      --resource-group "$GROUP" \
      --name "talos-controlplane-nic-$i" \
      --vnet-name talos-vnet \
      --subnet talos-subnet \
      --network-security-group talos-sg \
      --public-ip-address "talos-controlplane-public-ip-$i" \
      --lb-name talos-lb \
      --lb-address-pools talos-be-pool
  done
  # NOTES:
  # Talos can detect PublicIPs automatically if PublicIP SKU is Basic.
  # Use `--sku Basic` to set SKU to Basic.
fi
# Look up the addresses that the generated Talos config depends on.
LB_PUBLIC_IP=$(az network public-ip show \
  --resource-group "$GROUP" \
  --name talos-public-ip \
  --query "ipAddress" \
  --output tsv)
echo "LB Public IP: $LB_PUBLIC_IP"

CONTROL_PLANE_0_IP=$(az network public-ip show \
  --resource-group "$GROUP" \
  --name talos-controlplane-public-ip-0 \
  --query "ipAddress" \
  --output tsv)
echo "Control Plane 0 IP: $CONTROL_PLANE_0_IP"

# An empty lookup would otherwise turn the endpoint into "https://:6443"
# and pass an empty --additional-sans value, so stop here instead.
[[ -n "$LB_PUBLIC_IP" ]] ||
  error "could not determine the public IP of talos-public-ip"
[[ -n "$CONTROL_PLANE_0_IP" ]] ||
  error "could not determine the public IP of talos-controlplane-public-ip-0"

# Offer to clear the files left behind by a previous run.
if [[ -e controlplane.yaml || -e talosconfig || -e worker.yaml ]]; then
  prompted_run "Remove existing talos config files" \
    rm -f controlplane.yaml talosconfig worker.yaml
fi

prompted_run "Generate talos config" \
  talosctl gen config talos-k8s-azure-tutorial "https://${LB_PUBLIC_IP}:6443" \
    --additional-sans "$CONTROL_PLANE_0_IP"
prompted_run "Create availability set" \
  az vm availability-set create \
    --name talos-controlplane-av-set \
    -g "$GROUP"

# Three control-plane VMs (indices 0..2), each attached to the NIC with
# the same index. --no-wait returns before the VM has finished deploying.
# The call goes through run so that a rejected request stops for
# confirmation instead of being ignored.
if prompt "Create the controlplane nodes"; then
  for i in 0 1 2; do
    run az vm create \
      --name "talos-controlplane-$i" \
      --image talos \
      --custom-data ./controlplane.yaml \
      -g "$GROUP" \
      --admin-username talos \
      --generate-ssh-keys \
      --verbose \
      --boot-diagnostics-storage "$STORAGE_ACCOUNT" \
      --os-disk-size-gb 20 \
      --nics "talos-controlplane-nic-$i" \
      --availability-set talos-controlplane-av-set \
      --no-wait
  done
fi
# Single worker VM on talos-subnet, configured from ./worker.yaml.
prompted_run "Create worker node" \
  az vm create \
    --name talos-worker-0 \
    --image talos \
    --vnet-name talos-vnet \
    --subnet talos-subnet \
    --custom-data ./worker.yaml \
    -g "$GROUP" \
    --admin-username talos \
    --generate-ssh-keys \
    --verbose \
    --boot-diagnostics-storage "$STORAGE_ACCOUNT" \
    --nsg talos-sg \
    --os-disk-size-gb 20 \
    --no-wait
# Point talosctl at control plane node 0 (as both endpoint and node).
if prompt "Set endpoints and nodes"; then
  # Abort with a clear message rather than writing an empty endpoint into
  # talosconfig when the earlier IP lookup produced nothing.
  : "${CONTROL_PLANE_0_IP:?"public IP of talos-controlplane-0 is not set"}"
  run talosctl --talosconfig talosconfig config endpoint "$CONTROL_PLANE_0_IP"
  run talosctl --talosconfig talosconfig config node "$CONTROL_PLANE_0_IP"
fi

# The VMs were requested with --no-wait, so block until node 0 exists.
if prompt "Wait for talos-controlplane-0 VM"; then
  run az vm wait -n talos-controlplane-0 -g "$GROUP" --created
fi

prompted_run "Bootstrap etcd" \
  talosctl --talosconfig talosconfig bootstrap

prompted_run "Retrieve kubeconfig" \
  talosctl --talosconfig talosconfig kubeconfig .
create.sh output:
create.sh.log
Environment
- Talos version (output of `talosctl version --nodes <problematic nodes>`):
  $ talosctl version --talosconfig talosconfig --nodes 140.160.232.41
  Client:
      Tag: v1.6.7
      SHA: 46c8ac10
      Built:
      Go version: go1.21.8 X:loopvar
      OS/Arch: darwin/arm64
  Server:
  error getting version: rpc error: code = Unavailable desc = connection error: desc = "transport: authentication handshake failed: tls: failed to verify certificate: x509: certificate is valid for 10.0.0.4, 127.0.0.1, ::1, not 140.160.232.41"
- Kubernetes version: n/a
- Platform: Azure Stack Hub version 2206
The core issue, I believe, is that Talos fails to fetch the Azure instance metadata:
[ 188.818488] [talos] restarting platform network config {"component": "controller-runtime", "controller": "network.PlatformConfigController", "interval": "679.794804ms", "error": "error fetching metadata: 3 error(s) occurred:\n\tfailed to download config from \"http://169.254.169.254/metadata/instance/compute?api-version=2021-12-13&format=json\": Get \"http://169.254.169.254/metadata/instance/compute?api-version=2021-12-13&format=json\": dial tcp 169.254.169.254:80: connect: network is unreachable\n\tfailed to download config from \"http://169.254.169.254/metadata/instance/compute?api-version=2021-12-13&format=json\": failed to download config, status code 400, body \"{ \\\"error\\\": \\\"Bad request. api-ver\"\n\ttimeout"}
[ 189.589785] [talos] fetching azure instance config from: "http://169.254.169.254/metadata/instance/compute?api-version=2021-12-13&format=json"
[ 190.644138] [talos] retrying error: failed to download config from "http://169.2tadata/instance/compute?api-version=2021-12-13&format=json": failed to download config, status code 400, body "{ \"error\": \"Bad request. api-ver"
[ 369.608066] [talos] restarting platform network config {"component": "controller-runtime", "controller": "network.PlatformConfigController", "interval": "979.096533ms", "error": "error fetching metadata: 2 error(s) occurred:\n\tfailed to download config from \"http://169.254.169.254/metadata/instance/compute?api-version=2021-12-13&format=json\": failed to download config, status code 400, body \"{ \\\"error\\\": \\\"Bad request. api-ver\"\n\ttimeout"}
Yes, it appears that the requested api-version (2021-12-13) is not supported on Azure Stack Hub; the newest version its metadata service lists is 2021-05-01:
azureuser@k8s-master-19661870-0:~$ curl -s -H "Metadata: true" "http://169.254.169.254/metadata/versions" | jq .
{
"apiVersions": [
"2017-03-01",
"2017-04-02",
"2017-08-01",
"2017-10-01",
"2017-12-01",
"2018-02-01",
"2018-04-02",
"2018-10-01",
"2019-02-01",
"2019-03-11",
"2019-04-30",
"2019-06-01",
"2019-06-04",
"2019-08-01",
"2019-08-15",
"2019-11-01",
"2020-06-01",
"2020-07-15",
"2020-09-01",
"2020-10-01",
"2020-12-01",
"2021-01-01",
"2021-02-01",
"2021-03-01",
"2021-05-01"
]
}