Concurrently launching instances backed by `ceph` is unreliable
Please confirm
- [x] I have searched existing issues to check if an issue already exists for the bug I encountered.
Distribution
Ubuntu
Distribution version
24.04
Output of "snap list --all lxd core20 core22 core24 snapd"
# snap list --all lxd core20 core22 core24 snapd
Name Version Rev Tracking Publisher Notes
core22 20250210 1802 latest/stable canonical✓ base
core24 20241217 739 latest/stable canonical✓ base
lxd 5.21.3-c5ae129 33110 5.21/stable canonical✓ in-cohort
snapd 2.67.1 23771 latest/stable canonical✓ snapd
Output of "lxc info" or system info if it fails
root@micro1:~# lxc info
config:
cluster.https_address: 172.24.26.18:8443
core.https_address: '[::]:8443'
network.ovn.northbound_connection: ssl:172.24.26.18:6641,ssl:172.24.26.69:6641,ssl:172.24.26.210:6641
storage.backups_volume: local/backups
storage.images_volume: local/images
user.microcloud: 2.1.0
api_extensions:
- storage_zfs_remove_snapshots
- container_host_shutdown_timeout
- container_stop_priority
- container_syscall_filtering
- auth_pki
- container_last_used_at
- etag
- patch
- usb_devices
- https_allowed_credentials
- image_compression_algorithm
- directory_manipulation
- container_cpu_time
- storage_zfs_use_refquota
- storage_lvm_mount_options
- network
- profile_usedby
- container_push
- container_exec_recording
- certificate_update
- container_exec_signal_handling
- gpu_devices
- container_image_properties
- migration_progress
- id_map
- network_firewall_filtering
- network_routes
- storage
- file_delete
- file_append
- network_dhcp_expiry
- storage_lvm_vg_rename
- storage_lvm_thinpool_rename
- network_vlan
- image_create_aliases
- container_stateless_copy
- container_only_migration
- storage_zfs_clone_copy
- unix_device_rename
- storage_lvm_use_thinpool
- storage_rsync_bwlimit
- network_vxlan_interface
- storage_btrfs_mount_options
- entity_description
- image_force_refresh
- storage_lvm_lv_resizing
- id_map_base
- file_symlinks
- container_push_target
- network_vlan_physical
- storage_images_delete
- container_edit_metadata
- container_snapshot_stateful_migration
- storage_driver_ceph
- storage_ceph_user_name
- resource_limits
- storage_volatile_initial_source
- storage_ceph_force_osd_reuse
- storage_block_filesystem_btrfs
- resources
- kernel_limits
- storage_api_volume_rename
- network_sriov
- console
- restrict_devlxd
- migration_pre_copy
- infiniband
- maas_network
- devlxd_events
- proxy
- network_dhcp_gateway
- file_get_symlink
- network_leases
- unix_device_hotplug
- storage_api_local_volume_handling
- operation_description
- clustering
- event_lifecycle
- storage_api_remote_volume_handling
- nvidia_runtime
- container_mount_propagation
- container_backup
- devlxd_images
- container_local_cross_pool_handling
- proxy_unix
- proxy_udp
- clustering_join
- proxy_tcp_udp_multi_port_handling
- network_state
- proxy_unix_dac_properties
- container_protection_delete
- unix_priv_drop
- pprof_http
- proxy_haproxy_protocol
- network_hwaddr
- proxy_nat
- network_nat_order
- container_full
- backup_compression
- nvidia_runtime_config
- storage_api_volume_snapshots
- storage_unmapped
- projects
- network_vxlan_ttl
- container_incremental_copy
- usb_optional_vendorid
- snapshot_scheduling
- snapshot_schedule_aliases
- container_copy_project
- clustering_server_address
- clustering_image_replication
- container_protection_shift
- snapshot_expiry
- container_backup_override_pool
- snapshot_expiry_creation
- network_leases_location
- resources_cpu_socket
- resources_gpu
- resources_numa
- kernel_features
- id_map_current
- event_location
- storage_api_remote_volume_snapshots
- network_nat_address
- container_nic_routes
- cluster_internal_copy
- seccomp_notify
- lxc_features
- container_nic_ipvlan
- network_vlan_sriov
- storage_cephfs
- container_nic_ipfilter
- resources_v2
- container_exec_user_group_cwd
- container_syscall_intercept
- container_disk_shift
- storage_shifted
- resources_infiniband
- daemon_storage
- instances
- image_types
- resources_disk_sata
- clustering_roles
- images_expiry
- resources_network_firmware
- backup_compression_algorithm
- ceph_data_pool_name
- container_syscall_intercept_mount
- compression_squashfs
- container_raw_mount
- container_nic_routed
- container_syscall_intercept_mount_fuse
- container_disk_ceph
- virtual-machines
- image_profiles
- clustering_architecture
- resources_disk_id
- storage_lvm_stripes
- vm_boot_priority
- unix_hotplug_devices
- api_filtering
- instance_nic_network
- clustering_sizing
- firewall_driver
- projects_limits
- container_syscall_intercept_hugetlbfs
- limits_hugepages
- container_nic_routed_gateway
- projects_restrictions
- custom_volume_snapshot_expiry
- volume_snapshot_scheduling
- trust_ca_certificates
- snapshot_disk_usage
- clustering_edit_roles
- container_nic_routed_host_address
- container_nic_ipvlan_gateway
- resources_usb_pci
- resources_cpu_threads_numa
- resources_cpu_core_die
- api_os
- container_nic_routed_host_table
- container_nic_ipvlan_host_table
- container_nic_ipvlan_mode
- resources_system
- images_push_relay
- network_dns_search
- container_nic_routed_limits
- instance_nic_bridged_vlan
- network_state_bond_bridge
- usedby_consistency
- custom_block_volumes
- clustering_failure_domains
- resources_gpu_mdev
- console_vga_type
- projects_limits_disk
- network_type_macvlan
- network_type_sriov
- container_syscall_intercept_bpf_devices
- network_type_ovn
- projects_networks
- projects_networks_restricted_uplinks
- custom_volume_backup
- backup_override_name
- storage_rsync_compression
- network_type_physical
- network_ovn_external_subnets
- network_ovn_nat
- network_ovn_external_routes_remove
- tpm_device_type
- storage_zfs_clone_copy_rebase
- gpu_mdev
- resources_pci_iommu
- resources_network_usb
- resources_disk_address
- network_physical_ovn_ingress_mode
- network_ovn_dhcp
- network_physical_routes_anycast
- projects_limits_instances
- network_state_vlan
- instance_nic_bridged_port_isolation
- instance_bulk_state_change
- network_gvrp
- instance_pool_move
- gpu_sriov
- pci_device_type
- storage_volume_state
- network_acl
- migration_stateful
- disk_state_quota
- storage_ceph_features
- projects_compression
- projects_images_remote_cache_expiry
- certificate_project
- network_ovn_acl
- projects_images_auto_update
- projects_restricted_cluster_target
- images_default_architecture
- network_ovn_acl_defaults
- gpu_mig
- project_usage
- network_bridge_acl
- warnings
- projects_restricted_backups_and_snapshots
- clustering_join_token
- clustering_description
- server_trusted_proxy
- clustering_update_cert
- storage_api_project
- server_instance_driver_operational
- server_supported_storage_drivers
- event_lifecycle_requestor_address
- resources_gpu_usb
- clustering_evacuation
- network_ovn_nat_address
- network_bgp
- network_forward
- custom_volume_refresh
- network_counters_errors_dropped
- metrics
- image_source_project
- clustering_config
- network_peer
- linux_sysctl
- network_dns
- ovn_nic_acceleration
- certificate_self_renewal
- instance_project_move
- storage_volume_project_move
- cloud_init
- network_dns_nat
- database_leader
- instance_all_projects
- clustering_groups
- ceph_rbd_du
- instance_get_full
- qemu_metrics
- gpu_mig_uuid
- event_project
- clustering_evacuation_live
- instance_allow_inconsistent_copy
- network_state_ovn
- storage_volume_api_filtering
- image_restrictions
- storage_zfs_export
- network_dns_records
- storage_zfs_reserve_space
- network_acl_log
- storage_zfs_blocksize
- metrics_cpu_seconds
- instance_snapshot_never
- certificate_token
- instance_nic_routed_neighbor_probe
- event_hub
- agent_nic_config
- projects_restricted_intercept
- metrics_authentication
- images_target_project
- cluster_migration_inconsistent_copy
- cluster_ovn_chassis
- container_syscall_intercept_sched_setscheduler
- storage_lvm_thinpool_metadata_size
- storage_volume_state_total
- instance_file_head
- instances_nic_host_name
- image_copy_profile
- container_syscall_intercept_sysinfo
- clustering_evacuation_mode
- resources_pci_vpd
- qemu_raw_conf
- storage_cephfs_fscache
- network_load_balancer
- vsock_api
- instance_ready_state
- network_bgp_holdtime
- storage_volumes_all_projects
- metrics_memory_oom_total
- storage_buckets
- storage_buckets_create_credentials
- metrics_cpu_effective_total
- projects_networks_restricted_access
- storage_buckets_local
- loki
- acme
- internal_metrics
- cluster_join_token_expiry
- remote_token_expiry
- init_preseed
- storage_volumes_created_at
- cpu_hotplug
- projects_networks_zones
- network_txqueuelen
- cluster_member_state
- instances_placement_scriptlet
- storage_pool_source_wipe
- zfs_block_mode
- instance_generation_id
- disk_io_cache
- amd_sev
- storage_pool_loop_resize
- migration_vm_live
- ovn_nic_nesting
- oidc
- network_ovn_l3only
- ovn_nic_acceleration_vdpa
- cluster_healing
- instances_state_total
- auth_user
- security_csm
- instances_rebuild
- numa_cpu_placement
- custom_volume_iso
- network_allocations
- storage_api_remote_volume_snapshot_copy
- zfs_delegate
- operations_get_query_all_projects
- metadata_configuration
- syslog_socket
- event_lifecycle_name_and_project
- instances_nic_limits_priority
- disk_initial_volume_configuration
- operation_wait
- cluster_internal_custom_volume_copy
- disk_io_bus
- storage_cephfs_create_missing
- instance_move_config
- ovn_ssl_config
- init_preseed_storage_volumes
- metrics_instances_count
- server_instance_type_info
- resources_disk_mounted
- server_version_lts
- oidc_groups_claim
- loki_config_instance
- storage_volatile_uuid
- import_instance_devices
- instances_uefi_vars
- instances_migration_stateful
- container_syscall_filtering_allow_deny_syntax
- access_management
- vm_disk_io_limits
- storage_volumes_all
- instances_files_modify_permissions
- image_restriction_nesting
- container_syscall_intercept_finit_module
- device_usb_serial
- network_allocate_external_ips
- explicit_trust_token
- instance_import_conversion
- instance_create_start
- devlxd_images_vm
- instance_protection_start
- disk_io_bus_virtio_blk
- metadata_configuration_entity_types
- network_allocations_ovn_uplink
- network_ovn_uplink_vlan
- shared_custom_block_volumes
- metrics_api_requests
- projects_limits_disk_pool
- access_management_tls
- state_logical_cpus
- vm_limits_cpu_pin_strategy
- gpu_cdi
api_status: stable
api_version: "1.0"
auth: trusted
public: false
auth_methods:
- tls
auth_user_name: root
auth_user_method: unix
environment:
addresses:
- 172.24.26.18:8443
- '[2001:470:b1c3:7946:85f3:d3a8:d71e:7b05]:8443'
architectures:
- x86_64
- i686
certificate: |
-----BEGIN CERTIFICATE-----
MIIB4jCCAWmgAwIBAgIQcrXuVF6F3hyWCmwiW71yFTAKBggqhkjOPQQDAzAkMQww
CgYDVQQKEwNMWEQxFDASBgNVBAMMC3Jvb3RAbWljcm8xMB4XDTI1MDMwNjE4NTA1
N1oXDTM1MDMwNDE4NTA1N1owJDEMMAoGA1UEChMDTFhEMRQwEgYDVQQDDAtyb290
QG1pY3JvMTB2MBAGByqGSM49AgEGBSuBBAAiA2IABLyvcvC44GnZhQ05h41ayUMS
D2e3P79/npSv4YTzspbV+NT75MasWMRQls/FGeM41fD3oo0lk++DXy/LsFB9yNIJ
3P7EnjsFMnre1ckwEJ/nFItXs9JOnAdC4GF4+OniSKNgMF4wDgYDVR0PAQH/BAQD
AgWgMBMGA1UdJQQMMAoGCCsGAQUFBwMBMAwGA1UdEwEB/wQCMAAwKQYDVR0RBCIw
IIIGbWljcm8xhwR/AAABhxAAAAAAAAAAAAAAAAAAAAABMAoGCCqGSM49BAMDA2cA
MGQCMG/z7FT0RcYK26tgm24OetWW2hQhuJSwmf7+xsqsNknkaJh9y526fgHNdJXJ
NjmX2wIwANdzvQbHumoOf7HrzQUTdbhCeZFYMCcUSrtWSRqqXg2voZHXg2xuYLv6
6jMEAAVp
-----END CERTIFICATE-----
certificate_fingerprint: 27658ccbde6ddd7049b39b6f97051b31ed1d443691cdac1b16b580c6b871b6c6
driver: lxc | qemu
driver_version: 6.0.2 | 8.2.2
instance_types:
- container
- virtual-machine
firewall: nftables
kernel: Linux
kernel_architecture: x86_64
kernel_features:
idmapped_mounts: "true"
netnsid_getifaddrs: "true"
seccomp_listener: "true"
seccomp_listener_continue: "true"
uevent_injection: "true"
unpriv_binfmt: "false"
unpriv_fscaps: "true"
kernel_version: 5.15.0-1077-kvm
lxc_features:
cgroup2: "true"
core_scheduling: "true"
devpts_fd: "true"
idmapped_mounts_v2: "true"
mount_injection_file: "true"
network_gateway_device_route: "true"
network_ipvlan: "true"
network_l2proxy: "true"
network_phys_macvlan_mtu: "true"
network_veth_router: "true"
pidfd: "true"
seccomp_allow_deny_syntax: "true"
seccomp_notify: "true"
seccomp_proxy_send_notify_fd: "true"
os_name: Ubuntu
os_version: "22.04"
project: default
server: lxd
server_clustered: true
server_event_mode: full-mesh
server_name: micro1
server_pid: 1815
server_version: 5.21.3
server_lts: true
storage: zfs | ceph | cephfs
storage_version: 2.1.5-1ubuntu6~22.04.5 | 17.2.7 | 17.2.7
storage_supported_drivers:
- name: cephobject
version: 17.2.7
remote: true
- name: dir
version: "1"
remote: false
- name: lvm
version: 2.03.11(2) (2021-01-08) / 1.02.175 (2021-01-08) / 4.45.0
remote: false
- name: powerflex
version: 1.16 (nvme-cli)
remote: true
- name: zfs
version: 2.1.5-1ubuntu6~22.04.5
remote: false
- name: btrfs
version: 5.16.2
remote: false
- name: ceph
version: 17.2.7
remote: true
- name: cephfs
version: 17.2.7
remote: true
Issue description
In a LXD cluster (MicroCloud), trying to concurrently create instances backed by `ceph` will fail the first time because multiple cluster members will try to create the `readonly` snapshot on the shared ceph pool at the same time.
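For context, the first use of an image on a ceph pool roughly follows the sequence sketched below (simplified, and inferred from the `rbd clone` call visible in the error output further down; the exact commands and flags LXD runs may differ):

```sh
# Simplified sketch of the first-use flow on the shared pool
# (pool/volume names taken from the error output below; the size is illustrative).
POOL=lxd_remote
IMG=image_46942e5befec5812ca67d893456cf2e1d77b5a84d52854e9892d62e9d41c5d3a_ext4

rbd create "${POOL}/${IMG}" --size 4G                       # 1. create the image volume and unpack the image into it
rbd snap create "${POOL}/${IMG}@readonly"                   # 2. snapshot the populated volume
rbd snap protect "${POOL}/${IMG}@readonly"                  # 3. protect the snapshot so it can be cloned
rbd clone "${POOL}/${IMG}@readonly" "${POOL}/container_c1"  # 4. clone a per-instance volume from it

# If several cluster members race through steps 1-4 for the same image, one member can
# reach step 4 while another has done step 1 but not yet step 2, which matches:
#   librbd::image::RefreshRequest: failed to locate snapshot: readonly
```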
Steps to reproduce
Here's a minimal reproducer that assumes a MicroCloud made of 3 nodes: micro1, micro2 and micro3. This means the default profile is configured to put the rootfs of the instances onto ceph:
root@micro1:~# lxc profile show default
name: default
description: Default LXD profile
config: {}
devices:
eth0:
name: eth0
network: default
type: nic
root:
path: /
pool: remote
type: disk
used_by: []
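As a quicker check of the same thing, the pool backing the profile's root disk can be read directly (a small sketch; given the profile above it should print `remote`):

```sh
# Confirm the default profile's root disk is on the ceph-backed "remote" pool
lxc profile device get default root pool
```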
- Copy a public image:
root@micro1:~# lxc image copy ubuntu-minimal-daily:24.04 local:
Image copied successfully!
- Verify the downloaded image has not yet been turned into an image volume on the storage pool:
root@micro1:~# FINGERPRINT="$(lxc image info ubuntu-minimal-daily:24.04 | awk '/^Fingerprint:/ {print $2}')"
root@micro1:~# lxc image list -fcsv -cF | grep -xF "${FINGERPRINT}"
46942e5befec5812ca67d893456cf2e1d77b5a84d52854e9892d62e9d41c5d3a
root@micro1:~# lxc storage volume list remote | grep -F "${FINGERPRINT}" && echo "the image should NOT exist in the pool"
- Concurrently create instances using that downloaded image for the first time:
root@micro1:~# lxc init ubuntu-minimal-daily:24.04 c1 --target micro1 & lxc init ubuntu-minimal-daily:24.04 c2 --target micro2 & lxc init ubuntu-minimal-daily:24.04 c3 --target micro3
[1] 12916
[2] 12917
Creating c2
Creating c3
Creating c1
Error: Failed instance creation: Failed creating instance from image: Error inserting volume "46942e5befec5812ca67d893456cf2e1d77b5a84d52854e9892d62e9d41c5d3a" for project "default" in pool "remote" of type "images" into database "UNIQUE constraint failed: index 'storage_volumes_unique_storage_pool_id_node_id_project_id_name_type'"
Retrieving image: Unpacking image: 100% (642.75MB/s)Error: Failed instance creation: Failed creating instance from image: Failed to run: rbd --id admin --cluster ceph --image-feature layering --image-feature striping --image-feature exclusive-lock --image-feature object-map --image-feature fast-diff --image-feature deep-flatten clone lxd_remote/image_46942e5befec5812ca67d893456cf2e1d77b5a84d52854e9892d62e9d41c5d3a_ext4@readonly lxd_remote/container_c2: exit status 2 (2025-04-03T20:35:58.256+0000 7fc2ff7fe640 -1 librbd::image::RefreshRequest: failed to locate snapshot: readonly
2025-04-03T20:35:58.256+0000 7fc2ff7fe640 -1 librbd::image::OpenRequest: failed to find snapshot readonly
2025-04-03T20:35:58.256+0000 7fc2eeffd640 -1 librbd::image::CloneRequest: 0x557ca7f989b0 handle_open_parent: failed to open parent image: (2) No such file or directory
rbd: clone error: (2) No such file or directory)
[1]- Exit 1 lxc init ubuntu-minimal-daily:24.04 c1 --target micro1
[2]+ Exit 1 lxc init ubuntu-minimal-daily:24.04 c2 --target micro2
root@micro1:~#
root@micro1:~# lxc list
+------+---------+------+------+-----------+-----------+----------+
| NAME | STATE | IPV4 | IPV6 | TYPE | SNAPSHOTS | LOCATION |
+------+---------+------+------+-----------+-----------+----------+
| c3 | STOPPED | | | CONTAINER | 0 | micro3 |
+------+---------+------+------+-----------+-----------+----------+
The above errors show 2 issues. The `UNIQUE constraint failed` one is about another bug, but the `failed to open parent image` one is the one I'm reporting here.
At this point, it's possible to see that the remote pool now has the image, as micro3 was able to take the tarball/squashfs and turn it into an RBD volume with a `@readonly` snapshot:
root@micro1:~# lxc list
+------+---------+------+------+-----------+-----------+----------+
| NAME | STATE | IPV4 | IPV6 | TYPE | SNAPSHOTS | LOCATION |
+------+---------+------+------+-----------+-----------+----------+
| c3 | STOPPED | | | CONTAINER | 0 | micro3 |
+------+---------+------+------+-----------+-----------+----------+
root@micro1:~# lxc storage volume list remote | grep -F "${FINGERPRINT}"
| image | 46942e5befec5812ca67d893456cf2e1d77b5a84d52854e9892d62e9d41c5d3a | | filesystem | 1 | |
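For completeness, the image volume and its `readonly` snapshot can also be inspected directly with `rbd` (a sketch; the pool and volume names are taken from the error output above, and the `--id admin --cluster ceph` options mirror the ones LXD used there):

```sh
POOL=lxd_remote
IMG=image_46942e5befec5812ca67d893456cf2e1d77b5a84d52854e9892d62e9d41c5d3a_ext4

# List the snapshots of the image volume; once populated it should include "readonly"
rbd --id admin --cluster ceph snap ls "${POOL}/${IMG}"

# List the clones hanging off that snapshot (e.g. container_c3)
rbd --id admin --cluster ceph children "${POOL}/${IMG}@readonly"
```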
From here on, creating new instances concurrently no longer runs into the bug, as the needed image is already in the remote pool (a workaround based on this is sketched after the transcript below):
# Clear up any instance
root@micro1:~# for i in c1 c2 c3; do lxc rm "${i}"; done
Error: Failed checking instance exists "local:c1": Instance not found
Error: Failed checking instance exists "local:c2": Instance not found
# Create multiple instances concurrently
root@micro1:~# lxc init ubuntu-minimal-daily:24.04 c1 --target micro1 & lxc init ubuntu-minimal-daily:24.04 c2 --target micro2 & lxc init ubuntu-minimal-daily:24.04 c3 --target micro3
[1] 15026
[2] 15027
Creating c1
Creating c3
Creating c2
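Until this is fixed, a possible workaround that follows from the above is to seed the image volume with one serial creation before going concurrent (a sketch; the `seed` instance name is arbitrary):

```sh
# Seed the image volume on the remote pool once, serially
lxc init ubuntu-minimal-daily:24.04 seed --target micro1
lxc delete seed   # the image volume stays on the pool after the instance is gone

# Subsequent concurrent creations reuse the existing image volume
lxc init ubuntu-minimal-daily:24.04 c1 --target micro1 &
lxc init ubuntu-minimal-daily:24.04 c2 --target micro2 &
lxc init ubuntu-minimal-daily:24.04 c3 --target micro3
wait
```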
Information to attach
- [ ] Any relevant kernel output (`dmesg`)
- [ ] Instance log (`lxc info NAME --show-log`)
- [ ] Instance configuration (`lxc config show NAME --expanded`)
- [ ] Main daemon log (at `/var/log/lxd/lxd.log` or `/var/snap/lxd/common/lxd/logs/lxd.log`)
- [ ] Output of the client with `--debug`
- [ ] Output of the daemon with `--debug` (or use `lxc monitor` while reproducing the issue)
cc @MusicDin, that's the bug we kept running into with Terraform. Please add anything I might have missed.
Yes, this is similar to the other issue mentioned, except it's likely affecting all remote storage pools that try to use the downloaded image to create an image volume.
I suspect we need to create some `/internal` endpoints that allow us to perform cluster-wide operations for DB records and remote pool operations on the leader (which would then allow the use of `sync.Mutex`).
> Yes, this is similar to the other issue mentioned, except it's likely affecting all remote storage pools that try to use the downloaded image to create an image volume.

I can try with Pure next week to confirm this.

> I suspect we need to create some `/internal` endpoints that allow us to perform cluster-wide operations for DB records and remote pool operations on the leader (which would then allow the use of `sync.Mutex`).

That would be much appreciated :)
Similar to https://github.com/lxc/incus/issues/2314#issuecomment-3129918070 and https://github.com/canonical/lxd/issues/11636