talos icon indicating copy to clipboard operation
talos copied to clipboard

talosctl bootstrap fails on Azure Stack Hub

Open a01fe opened this issue 2 years ago • 2 comments

Bug Report

Description

I am following the Azure installation guide on Azure Stack Hub, and the talosctl bootstrap step fails with:

$  talosctl --talosconfig ./talosconfig bootstrap
error executing bootstrap: rpc error: code = Unavailable desc = connection error: desc = "transport: authentication handshake failed: tls: failed to verify certificate: x509: certificate is valid for 10.0.0.4, 127.0.0.1, ::1, not A.B.C.D"

where A.B.C.D is the talos-controlplane-public-ip-0 public IP address and 10.0.0.4 is the private IP address assigned to the nic.

Logs

talos-controlplane-0 VM boot logs: talos-controlplane-0.log

create.sh script with az cli commands to install Talos:

#!/usr/bin/env bash
#
# Interactively provisions a Talos cluster with the az CLI and talosctl.
# Each step is confirmed through the prompt/run/prompted_run helpers below.

program="$0"

# Terminal escape sequences used to colour prompts and warnings. tput's
# stderr is discarded so that an unset or unknown TERM (output captured to a
# log, no controlling terminal, ...) leaves these empty instead of printing
# one error per tput call.
start_bold="$(tput bold 2>/dev/null)$(tput setaf 3 2>/dev/null)"
start_warn="$(tput bold 2>/dev/null)$(tput setaf 1 2>/dev/null)"
start_cmd="$(tput setaf 2 2>/dev/null)"
end_color="$(tput sgr 0 2>/dev/null)"

# Set to any non-empty value to echo each command before it is executed.
verbose=

# Talos release image to download and upload.
export TALOS_IMAGE=azure-amd64.vhd
export TALOS_VERSION=v1.6.7
export TALOS_IMAGE_URL="https://github.com/siderolabs/talos/releases/download/$TALOS_VERSION/$TALOS_IMAGE.xz"

# Naming inputs. DUMBPREFIX is PREFIX without the hyphen and is only used for
# the storage account name (presumably because that name may not contain
# hyphens - TODO confirm).
export PREFIX=eas-talos
export DUMBPREFIX=eastalos
export CLUSTER=scratch
export SUBSCRIPTION="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"

# Derived resource names.
export GROUP="$PREFIX-$CLUSTER"
export STORAGE_ACCOUNT="$DUMBPREFIX$CLUSTER"

export STORAGE_CONTAINER="$PREFIX-$CLUSTER"
# Host suffix of the blob endpoint; combined with the storage account name to
# build the image source URL.
export STORAGE_BASE_URL="blob.azurestackhub.example.com"

export LOCATION="azurestackhub"

# error MESSAGE...
#
# Prints "<program>: MESSAGE" to stderr, wrapped in the warning colour, and
# exits the script with status 1. All arguments are joined with spaces into a
# single message.
# Globals: program, start_warn, end_color (read)
error () {
  # "$*" joins the arguments into one string, which is what the scalar
  # assignment of "$@" did implicitly.
  local msg="$*"
  : "${msg:?"$program: error: missing message"}"
  printf '%s\n' "${start_warn}$program: $msg${end_color}" >&2
  exit 1
}

# prompt MESSAGE...
#
# Asks "MESSAGE. Continue (Y/n/q)?" and loops until it gets a usable answer.
# Returns 0 for an empty answer or one starting with y/Y, returns 1 for n/N,
# and exits the script with status 1 for q/Q. When stdin is not a terminal no
# question is asked and the default (yes) is taken.
# Globals: start_bold, start_warn, end_color (read)
prompt () {
  local msg="$*"
  # Keep the answer local and reset it on every iteration: a stale value from
  # an earlier call or from the environment would otherwise be re-evaluated
  # forever whenever the read below is skipped.
  local answer
  : "${msg:?"prompt: missing message"}"
  echo
  while true; do
    answer=
    # Only ask when stdin is a terminal; -r keeps backslashes literal.
    [ -t 0 ] && read -r -u 0 -p "${start_bold}${msg}. Continue (Y/n/q)?${end_color} " answer
    case "$answer" in
      ''|y*|Y*)
        return 0
        ;;
      n*|N*)
        return 1
        ;;
      q*|Q*)
        exit 1
        ;;
      *)
        echo "${start_warn}prompt: unexpected response: $answer${end_color}"
        ;;
    esac
  done
}

run () {
  local cmd=("$@")
  : "${cmd:?"run: missing command"}"
  [ "$verbose" ] && echo "${cmd[@]}"
  "${cmd[@]}"
  [ "$?" -eq 0 ] || prompt "$start_warn" "run: command failed" || exit 1
}

# prompted_run MESSAGE COMMAND [ARG...]
#
# Asks for confirmation using MESSAGE; if confirmed, executes COMMAND (echoed
# first when $verbose is non-empty) and then prints two blank lines. If the
# command fails, the user is asked whether to carry on; declining exits the
# script with status 1. Declining the initial confirmation skips the command.
# Globals: verbose, start_warn (read)
prompted_run () {
  local msg="$1"
  shift
  : "${msg:?"prompted_run: missing message"}"
  local cmd
  cmd=("$@")
  : "${cmd:?"prompted_run: missing command"}"

  # Not confirmed: nothing to do.
  prompt "$msg" || return 0

  if [ -n "$verbose" ]; then
    echo "${cmd[@]}"
  fi

  if ! "${cmd[@]}"; then
    prompt "$start_warn" "prompted_run: command failed" || exit 1
  fi

  echo
  echo
}

# Set active subscription
prompted_run "Set active subscription" \
  az account set --subscription "$SUBSCRIPTION"

# (Re)create the talos resource group. The lookup is expected to print
# "true" or "false"; any other output (including none, when the az call
# itself fails) means the state is unknown, so ask before carrying on instead
# of silently skipping group creation.
case "$(az group exists -n "$GROUP")" in
  true)
    if prompt "remove and recreate existing resource group"; then
      run az group delete --name "$GROUP" --yes
      run az group create --name "$GROUP" --location "$LOCATION"
    fi
    ;;
  false)
    run az group create --name "$GROUP" --location "$LOCATION"
    ;;
  *)
    prompt "$start_warn" "az group exists: could not determine whether $GROUP exists" || exit 1
    ;;
esac

prompted_run "Create storage account" \
  az storage account create \
    --name "$STORAGE_ACCOUNT" \
    -g "$GROUP" \
    -l "$LOCATION" \
    --sku Standard_LRS

# Connection string used by the container and blob commands. Assign first and
# export afterwards: `export VAR=$(cmd)` reports the status of `export`, which
# would hide a failed lookup.
CONNECTION=$(az storage account show-connection-string \
              -n "$STORAGE_ACCOUNT" \
              -g "$GROUP" \
              -o tsv) \
  || prompt "$start_warn" "az storage account show-connection-string: command failed" \
  || exit 1
export CONNECTION

prompted_run "Create storage container" \
  az storage container create \
    --name "$STORAGE_CONTAINER" \
    --account-name "$STORAGE_ACCOUNT" \
    -g "$GROUP" \
    --connection-string "$CONNECTION"

# Fetch the compressed release image into ./image and unpack it there.
if prompt "Download talos release image"; then
  echo "$TALOS_IMAGE_URL"
  # curl -o does not create missing directories, so make sure image/ exists.
  run mkdir -p image
  run rm -f "image/$TALOS_IMAGE.xz" "image/$TALOS_IMAGE"
  # -f makes curl fail on an HTTP error instead of saving the error page as
  # the image, which would only surface later as an xz failure.
  run curl -fL -o "image/$TALOS_IMAGE.xz" "$TALOS_IMAGE_URL"
  run xz -d "image/$TALOS_IMAGE.xz"
fi

prompted_run "Upload talos image" \
  az storage blob upload \
    --connection-string "$CONNECTION" \
    --container-name "$STORAGE_CONTAINER" \
    -f "image/$TALOS_IMAGE" \
    -n "$TALOS_IMAGE"

# The image source is the blob uploaded above, addressed through the storage
# account's blob endpoint.
prompted_run "Register talos image" \
  az image create \
    --name talos \
    --source "https://$STORAGE_ACCOUNT.$STORAGE_BASE_URL/$STORAGE_CONTAINER/$TALOS_IMAGE" \
    --os-type linux \
    -g "$GROUP"

# Virtual network, network security group and its inbound rules.
if prompt "Set up talos networking"; then
  prompted_run "create vnet" \
    az network vnet create \
      --resource-group "$GROUP" \
      --location "$LOCATION" \
      --name talos-vnet \
      --subnet-name talos-subnet

  prompted_run "Create network security group" \
    az network nsg create -g "$GROUP" -n talos-sg

  # One inbound rule per entry: "prompt text|rule name|priority|port range".
  nsg_rules=(
    "client -> apid|apid|1001|50000"
    "trustd|trustd|1002|50001"
    "etcd|etcd|1003|2379-2380"
    "Kubernetes API Server|kube|1004|6443"
  )
  for nsg_rule in "${nsg_rules[@]}"; do
    # The here-string only feeds this read; stdin of the prompt is untouched.
    IFS='|' read -r rule_prompt rule_name rule_priority rule_ports <<<"$nsg_rule"
    prompted_run "$rule_prompt" \
      az network nsg rule create \
        -g "$GROUP" \
        --nsg-name talos-sg \
        -n "$rule_name" \
        --priority "$rule_priority" \
        --destination-port-ranges "$rule_ports" \
        --direction inbound
  done
fi

# Public IP, load balancer, health probe and forwarding rule for port 6443.
if prompt "Set up public IP and load balancer"; then
  # Leading arguments shared by the commands below.
  rg_scope=(--resource-group "$GROUP")
  lb_scope=("${rg_scope[@]}" --lb-name talos-lb)

  prompted_run "Create public ip" \
    az network public-ip create "${rg_scope[@]}" \
      --name talos-public-ip \
      --allocation-method static

  prompted_run "Create lb" \
    az network lb create "${rg_scope[@]}" \
      --name talos-lb \
      --public-ip-address talos-public-ip \
      --frontend-ip-name talos-fe \
      --backend-pool-name talos-be-pool

  prompted_run "Create health check" \
    az network lb probe create "${lb_scope[@]}" \
      --name talos-lb-health \
      --protocol tcp \
      --port 6443

  prompted_run "Create lb rule for 6443" \
    az network lb rule create "${lb_scope[@]}" \
      --name talos-6443 \
      --protocol tcp \
      --frontend-ip-name talos-fe \
      --frontend-port 6443 \
      --backend-pool-name talos-be-pool \
      --backend-port 6443 \
      --probe-name talos-lb-health
fi

# One public IP and one NIC per control plane node (indices 0-2). The calls
# go through `run` so a failure is reported and confirmed like every other
# step, instead of being silently ignored.
if prompt "Set up network interfaces"; then
  for i in 0 1 2; do
    # Create public IP for each nic
    run az network public-ip create \
      --resource-group "$GROUP" \
      --name "talos-controlplane-public-ip-$i" \
      --allocation-method static

    # Create nic
    run az network nic create \
      --resource-group "$GROUP" \
      --name "talos-controlplane-nic-$i" \
      --vnet-name talos-vnet \
      --subnet talos-subnet \
      --network-security-group talos-sg \
      --public-ip-address "talos-controlplane-public-ip-$i" \
      --lb-name talos-lb \
      --lb-address-pools talos-be-pool
  done
  # NOTES:
  # Talos can detect PublicIPs automatically if PublicIP SKU is Basic.
  # Use `--sku Basic` to set SKU to Basic.
fi

# Look up the public addresses that go into the generated Talos config.
LB_PUBLIC_IP=$(az network public-ip show \
                --resource-group "$GROUP" \
                --name talos-public-ip \
                --query "ipAddress" \
                --output tsv)
echo "LB Public IP: $LB_PUBLIC_IP"
# An empty address would turn the endpoint below into "https://:6443".
[ -n "$LB_PUBLIC_IP" ] \
  || prompt "$start_warn" "could not determine the load balancer public IP" \
  || exit 1

CONTROL_PLANE_0_IP=$(az network public-ip show \
                      --resource-group "$GROUP" \
                      --name talos-controlplane-public-ip-0 \
                      --query "ipAddress" \
                      --output tsv)
echo "Control Plane 0 IP: $CONTROL_PLANE_0_IP"
# This address is later used as the extra SAN and as the talosctl endpoint.
[ -n "$CONTROL_PLANE_0_IP" ] \
  || prompt "$start_warn" "could not determine the public IP of control plane 0" \
  || exit 1

# Offer to clear output left over from a previous run before regenerating.
if [ -e "controlplane.yaml" ] || [ -e "talosconfig" ] || [ -e "worker.yaml" ]; then
  prompted_run "Remove existing talos config files" \
    rm -f controlplane.yaml talosconfig worker.yaml
fi

prompted_run "Generate talos config" \
  talosctl gen config talos-k8s-azure-tutorial "https://${LB_PUBLIC_IP}:6443" --additional-sans "$CONTROL_PLANE_0_IP"

prompted_run "Create availability set" \
  az vm availability-set create \
    --name talos-controlplane-av-set \
    -g "$GROUP"

# Three control plane VMs (indices 0-2), each attached to its own NIC and
# given controlplane.yaml as custom data. The calls go through `run` so a
# failed submission is reported and confirmed like every other step.
if prompt "Create the controlplane nodes"; then
  for i in 0 1 2; do
    run az vm create \
      --name "talos-controlplane-$i" \
      --image talos \
      --custom-data ./controlplane.yaml \
      -g "$GROUP" \
      --admin-username talos \
      --generate-ssh-keys \
      --verbose \
      --boot-diagnostics-storage "$STORAGE_ACCOUNT" \
      --os-disk-size-gb 20 \
      --nics "talos-controlplane-nic-$i" \
      --availability-set talos-controlplane-av-set \
      --no-wait
  done
fi

# Single worker VM, given worker.yaml as custom data.
prompted_run "Create worker node" \
  az vm create \
    --name talos-worker-0 \
    --image talos \
    --vnet-name talos-vnet \
    --subnet talos-subnet \
    --custom-data ./worker.yaml \
    -g "$GROUP" \
    --admin-username talos \
    --generate-ssh-keys \
    --verbose \
    --boot-diagnostics-storage "$STORAGE_ACCOUNT" \
    --nsg talos-sg \
    --os-disk-size-gb 20 \
    --no-wait

# Point talosctl at the first control plane node. The calls go through `run`
# so a failure here is reported instead of only surfacing at bootstrap.
if prompt "Set endpoints and nodes"; then
  run talosctl --talosconfig talosconfig config endpoint "$CONTROL_PLANE_0_IP"
  run talosctl --talosconfig talosconfig config node "$CONTROL_PLANE_0_IP"
fi

# The VMs were created with --no-wait, so wait for the first control plane
# VM to be created before bootstrapping.
if prompt "Wait for talos-controlplane-0 VM"; then
  run az vm wait -n talos-controlplane-0 -g "$GROUP" --created
fi

prompted_run "Bootstrap etcd" \
  talosctl --talosconfig talosconfig bootstrap

prompted_run "Retrieve kubeconfig" \
  talosctl --talosconfig talosconfig kubeconfig .

create.sh output: create.sh.log

Environment

  • Talos version: [talosctl version --nodes <problematic nodes>]
    $ talosctl version --talosconfig talosconfig --nodes 140.160.232.41
    Client:
    	Tag:         v1.6.7
    	SHA:         46c8ac10
    	Built:       
    	Go version:  go1.21.8 X:loopvar
    	OS/Arch:     darwin/arm64
    Server:
    error getting version: rpc error: code = Unavailable desc = connection error: desc = "transport: authentication handshake failed: tls: failed to verify certificate: x509: certificate is valid for 10.0.0.4, 127.0.0.1, ::1, not 140.160.232.41"
    
  • Kubernetes version: n/a
  • Platform: Azure Stack Hub version 2206

a01fe avatar Apr 06 '24 01:04 a01fe

The core issue (I guess) is

[  188.818488] [talos] restarting platform network config {"component": "controller-runtime", "controller": "network.PlatformConfigController", "interval": "679.794804ms", "error": "error fetching metadata: 3 error(s) occurred:\n\tfailed to download config from \"http://169.254.169.254/metadata/instance/compute?api-version=2021-12-13&format=json\": Get \"http://169.254.169.254/metadata/instance/compute?api-version=2021-12-13&format=json\": dial tcp 169.254.169.254:80: connect: network is unreachable\n\tfailed to download config from \"http://169.254.169.254/metadata/instance/compute?api-version=2021-12-13&format=json\": failed to download config, status code 400, body \"{ \\\"error\\\": \\\"Bad request. api-ver\"\n\ttimeout"}
[  189.589785] [talos] fetching azure instance config from: "http://169.254.169.254/metadata/instance/compute?api-version=2021-12-13&format=json"
[  190.644138] [talos] retrying error: failed to download config from "http://169.254.169.254/metadata/instance/compute?api-version=2021-12-13&format=json": failed to download config, status code 400, body "{ \"error\": \"Bad request. api-ver"
[  369.608066] [talos] restarting platform network config {"component": "controller-runtime", "controller": "network.PlatformConfigController", "interval": "979.096533ms", "error": "error fetching metadata: 2 error(s) occurred:\n\tfailed to download config from \"http://169.254.169.254/metadata/instance/compute?api-version=2021-12-13&format=json\": failed to download config, status code 400, body \"{ \\\"error\\\": \\\"Bad request. api-ver\"\n\ttimeout"}

smira avatar Apr 08 '24 10:04 smira

Yes, it appears that the requested api-version (2021-12-13) is not supported on Azure Stack Hub; the newest version its metadata service lists is 2021-05-01:

azureuser@k8s-master-19661870-0:~$ curl -s -H "Metadata: true" "http://169.254.169.254/metadata/versions" | jq .
{
  "apiVersions": [
    "2017-03-01",
    "2017-04-02",
    "2017-08-01",
    "2017-10-01",
    "2017-12-01",
    "2018-02-01",
    "2018-04-02",
    "2018-10-01",
    "2019-02-01",
    "2019-03-11",
    "2019-04-30",
    "2019-06-01",
    "2019-06-04",
    "2019-08-01",
    "2019-08-15",
    "2019-11-01",
    "2020-06-01",
    "2020-07-15",
    "2020-09-01",
    "2020-10-01",
    "2020-12-01",
    "2021-01-01",
    "2021-02-01",
    "2021-03-01",
    "2021-05-01"
  ]
}

a01fe avatar Apr 08 '24 15:04 a01fe