Concurrently launching instances backed by `ceph` is unreliable
Please confirm
- [x] I have searched existing issues to check if an issue already exists for the bug I encountered.
Distribution
Ubuntu
Distribution version
24.04
Output of "snap list --all lxd core20 core22 core24 snapd"
# snap list --all lxd core20 core22 core24 snapd
Name Version Rev Tracking Publisher Notes
core22 20250210 1802 latest/stable canonical✓ base
core24 20241217 739 latest/stable canonical✓ base
lxd 5.21.3-c5ae129 33110 5.21/stable canonical✓ in-cohort
snapd 2.67.1 23771 latest/stable canonical✓ snapd
Output of "lxc info" or system info if it fails
root@micro1:~# lxc info
config:
cluster.https_address: 172.24.26.18:8443
core.https_address: '[::]:8443'
network.ovn.northbound_connection: ssl:172.24.26.18:6641,ssl:172.24.26.69:6641,ssl:172.24.26.210:6641
storage.backups_volume: local/backups
storage.images_volume: local/images
user.microcloud: 2.1.0
api_extensions:
- storage_zfs_remove_snapshots
- container_host_shutdown_timeout
- container_stop_priority
- container_syscall_filtering
- auth_pki
- container_last_used_at
- etag
- patch
- usb_devices
- https_allowed_credentials
- image_compression_algorithm
- directory_manipulation
- container_cpu_time
- storage_zfs_use_refquota
- storage_lvm_mount_options
- network
- profile_usedby
- container_push
- container_exec_recording
- certificate_update
- container_exec_signal_handling
- gpu_devices
- container_image_properties
- migration_progress
- id_map
- network_firewall_filtering
- network_routes
- storage
- file_delete
- file_append
- network_dhcp_expiry
- storage_lvm_vg_rename
- storage_lvm_thinpool_rename
- network_vlan
- image_create_aliases
- container_stateless_copy
- container_only_migration
- storage_zfs_clone_copy
- unix_device_rename
- storage_lvm_use_thinpool
- storage_rsync_bwlimit
- network_vxlan_interface
- storage_btrfs_mount_options
- entity_description
- image_force_refresh
- storage_lvm_lv_resizing
- id_map_base
- file_symlinks
- container_push_target
- network_vlan_physical
- storage_images_delete
- container_edit_metadata
- container_snapshot_stateful_migration
- storage_driver_ceph
- storage_ceph_user_name
- resource_limits
- storage_volatile_initial_source
- storage_ceph_force_osd_reuse
- storage_block_filesystem_btrfs
- resources
- kernel_limits
- storage_api_volume_rename
- network_sriov
- console
- restrict_devlxd
- migration_pre_copy
- infiniband
- maas_network
- devlxd_events
- proxy
- network_dhcp_gateway
- file_get_symlink
- network_leases
- unix_device_hotplug
- storage_api_local_volume_handling
- operation_description
- clustering
- event_lifecycle
- storage_api_remote_volume_handling
- nvidia_runtime
- container_mount_propagation
- container_backup
- devlxd_images
- container_local_cross_pool_handling
- proxy_unix
- proxy_udp
- clustering_join
- proxy_tcp_udp_multi_port_handling
- network_state
- proxy_unix_dac_properties
- container_protection_delete
- unix_priv_drop
- pprof_http
- proxy_haproxy_protocol
- network_hwaddr
- proxy_nat
- network_nat_order
- container_full
- backup_compression
- nvidia_runtime_config
- storage_api_volume_snapshots
- storage_unmapped
- projects
- network_vxlan_ttl
- container_incremental_copy
- usb_optional_vendorid
- snapshot_scheduling
- snapshot_schedule_aliases
- container_copy_project
- clustering_server_address
- clustering_image_replication
- container_protection_shift
- snapshot_expiry
- container_backup_override_pool
- snapshot_expiry_creation
- network_leases_location
- resources_cpu_socket
- resources_gpu
- resources_numa
- kernel_features
- id_map_current
- event_location
- storage_api_remote_volume_snapshots
- network_nat_address
- container_nic_routes
- cluster_internal_copy
- seccomp_notify
- lxc_features
- container_nic_ipvlan
- network_vlan_sriov
- storage_cephfs
- container_nic_ipfilter
- resources_v2
- container_exec_user_group_cwd
- container_syscall_intercept
- container_disk_shift
- storage_shifted
- resources_infiniband
- daemon_storage
- instances
- image_types
- resources_disk_sata
- clustering_roles
- images_expiry
- resources_network_firmware
- backup_compression_algorithm
- ceph_data_pool_name
- container_syscall_intercept_mount
- compression_squashfs
- container_raw_mount
- container_nic_routed
- container_syscall_intercept_mount_fuse
- container_disk_ceph
- virtual-machines
- image_profiles
- clustering_architecture
- resources_disk_id
- storage_lvm_stripes
- vm_boot_priority
- unix_hotplug_devices
- api_filtering
- instance_nic_network
- clustering_sizing
- firewall_driver
- projects_limits
- container_syscall_intercept_hugetlbfs
- limits_hugepages
- container_nic_routed_gateway
- projects_restrictions
- custom_volume_snapshot_expiry
- volume_snapshot_scheduling
- trust_ca_certificates
- snapshot_disk_usage
- clustering_edit_roles
- container_nic_routed_host_address
- container_nic_ipvlan_gateway
- resources_usb_pci
- resources_cpu_threads_numa
- resources_cpu_core_die
- api_os
- container_nic_routed_host_table
- container_nic_ipvlan_host_table
- container_nic_ipvlan_mode
- resources_system
- images_push_relay
- network_dns_search
- container_nic_routed_limits
- instance_nic_bridged_vlan
- network_state_bond_bridge
- usedby_consistency
- custom_block_volumes
- clustering_failure_domains
- resources_gpu_mdev
- console_vga_type
- projects_limits_disk
- network_type_macvlan
- network_type_sriov
- container_syscall_intercept_bpf_devices
- network_type_ovn
- projects_networks
- projects_networks_restricted_uplinks
- custom_volume_backup
- backup_override_name
- storage_rsync_compression
- network_type_physical
- network_ovn_external_subnets
- network_ovn_nat
- network_ovn_external_routes_remove
- tpm_device_type
- storage_zfs_clone_copy_rebase
- gpu_mdev
- resources_pci_iommu
- resources_network_usb
- resources_disk_address
- network_physical_ovn_ingress_mode
- network_ovn_dhcp
- network_physical_routes_anycast
- projects_limits_instances
- network_state_vlan
- instance_nic_bridged_port_isolation
- instance_bulk_state_change
- network_gvrp
- instance_pool_move
- gpu_sriov
- pci_device_type
- storage_volume_state
- network_acl
- migration_stateful
- disk_state_quota
- storage_ceph_features
- projects_compression
- projects_images_remote_cache_expiry
- certificate_project
- network_ovn_acl
- projects_images_auto_update
- projects_restricted_cluster_target
- images_default_architecture
- network_ovn_acl_defaults
- gpu_mig
- project_usage
- network_bridge_acl
- warnings
- projects_restricted_backups_and_snapshots
- clustering_join_token
- clustering_description
- server_trusted_proxy
- clustering_update_cert
- storage_api_project
- server_instance_driver_operational
- server_supported_storage_drivers
- event_lifecycle_requestor_address
- resources_gpu_usb
- clustering_evacuation
- network_ovn_nat_address
- network_bgp
- network_forward
- custom_volume_refresh
- network_counters_errors_dropped
- metrics
- image_source_project
- clustering_config
- network_peer
- linux_sysctl
- network_dns
- ovn_nic_acceleration
- certificate_self_renewal
- instance_project_move
- storage_volume_project_move
- cloud_init
- network_dns_nat
- database_leader
- instance_all_projects
- clustering_groups
- ceph_rbd_du
- instance_get_full
- qemu_metrics
- gpu_mig_uuid
- event_project
- clustering_evacuation_live
- instance_allow_inconsistent_copy
- network_state_ovn
- storage_volume_api_filtering
- image_restrictions
- storage_zfs_export
- network_dns_records
- storage_zfs_reserve_space
- network_acl_log
- storage_zfs_blocksize
- metrics_cpu_seconds
- instance_snapshot_never
- certificate_token
- instance_nic_routed_neighbor_probe
- event_hub
- agent_nic_config
- projects_restricted_intercept
- metrics_authentication
- images_target_project
- cluster_migration_inconsistent_copy
- cluster_ovn_chassis
- container_syscall_intercept_sched_setscheduler
- storage_lvm_thinpool_metadata_size
- storage_volume_state_total
- instance_file_head
- instances_nic_host_name
- image_copy_profile
- container_syscall_intercept_sysinfo
- clustering_evacuation_mode
- resources_pci_vpd
- qemu_raw_conf
- storage_cephfs_fscache
- network_load_balancer
- vsock_api
- instance_ready_state
- network_bgp_holdtime
- storage_volumes_all_projects
- metrics_memory_oom_total
- storage_buckets
- storage_buckets_create_credentials
- metrics_cpu_effective_total
- projects_networks_restricted_access
- storage_buckets_local
- loki
- acme
- internal_metrics
- cluster_join_token_expiry
- remote_token_expiry
- init_preseed
- storage_volumes_created_at
- cpu_hotplug
- projects_networks_zones
- network_txqueuelen
- cluster_member_state
- instances_placement_scriptlet
- storage_pool_source_wipe
- zfs_block_mode
- instance_generation_id
- disk_io_cache
- amd_sev
- storage_pool_loop_resize
- migration_vm_live
- ovn_nic_nesting
- oidc
- network_ovn_l3only
- ovn_nic_acceleration_vdpa
- cluster_healing
- instances_state_total
- auth_user
- security_csm
- instances_rebuild
- numa_cpu_placement
- custom_volume_iso
- network_allocations
- storage_api_remote_volume_snapshot_copy
- zfs_delegate
- operations_get_query_all_projects
- metadata_configuration
- syslog_socket
- event_lifecycle_name_and_project
- instances_nic_limits_priority
- disk_initial_volume_configuration
- operation_wait
- cluster_internal_custom_volume_copy
- disk_io_bus
- storage_cephfs_create_missing
- instance_move_config
- ovn_ssl_config
- init_preseed_storage_volumes
- metrics_instances_count
- server_instance_type_info
- resources_disk_mounted
- server_version_lts
- oidc_groups_claim
- loki_config_instance
- storage_volatile_uuid
- import_instance_devices
- instances_uefi_vars
- instances_migration_stateful
- container_syscall_filtering_allow_deny_syntax
- access_management
- vm_disk_io_limits
- storage_volumes_all
- instances_files_modify_permissions
- image_restriction_nesting
- container_syscall_intercept_finit_module
- device_usb_serial
- network_allocate_external_ips
- explicit_trust_token
- instance_import_conversion
- instance_create_start
- devlxd_images_vm
- instance_protection_start
- disk_io_bus_virtio_blk
- metadata_configuration_entity_types
- network_allocations_ovn_uplink
- network_ovn_uplink_vlan
- shared_custom_block_volumes
- metrics_api_requests
- projects_limits_disk_pool
- access_management_tls
- state_logical_cpus
- vm_limits_cpu_pin_strategy
- gpu_cdi
api_status: stable
api_version: "1.0"
auth: trusted
public: false
auth_methods:
- tls
auth_user_name: root
auth_user_method: unix
environment:
addresses:
- 172.24.26.18:8443
- '[2001:470:b1c3:7946:85f3:d3a8:d71e:7b05]:8443'
architectures:
- x86_64
- i686
certificate: |
-----BEGIN CERTIFICATE-----
MIIB4jCCAWmgAwIBAgIQcrXuVF6F3hyWCmwiW71yFTAKBggqhkjOPQQDAzAkMQww
CgYDVQQKEwNMWEQxFDASBgNVBAMMC3Jvb3RAbWljcm8xMB4XDTI1MDMwNjE4NTA1
N1oXDTM1MDMwNDE4NTA1N1owJDEMMAoGA1UEChMDTFhEMRQwEgYDVQQDDAtyb290
QG1pY3JvMTB2MBAGByqGSM49AgEGBSuBBAAiA2IABLyvcvC44GnZhQ05h41ayUMS
D2e3P79/npSv4YTzspbV+NT75MasWMRQls/FGeM41fD3oo0lk++DXy/LsFB9yNIJ
3P7EnjsFMnre1ckwEJ/nFItXs9JOnAdC4GF4+OniSKNgMF4wDgYDVR0PAQH/BAQD
AgWgMBMGA1UdJQQMMAoGCCsGAQUFBwMBMAwGA1UdEwEB/wQCMAAwKQYDVR0RBCIw
IIIGbWljcm8xhwR/AAABhxAAAAAAAAAAAAAAAAAAAAABMAoGCCqGSM49BAMDA2cA
MGQCMG/z7FT0RcYK26tgm24OetWW2hQhuJSwmf7+xsqsNknkaJh9y526fgHNdJXJ
NjmX2wIwANdzvQbHumoOf7HrzQUTdbhCeZFYMCcUSrtWSRqqXg2voZHXg2xuYLv6
6jMEAAVp
-----END CERTIFICATE-----
certificate_fingerprint: 27658ccbde6ddd7049b39b6f97051b31ed1d443691cdac1b16b580c6b871b6c6
driver: lxc | qemu
driver_version: 6.0.2 | 8.2.2
instance_types:
- container
- virtual-machine
firewall: nftables
kernel: Linux
kernel_architecture: x86_64
kernel_features:
idmapped_mounts: "true"
netnsid_getifaddrs: "true"
seccomp_listener: "true"
seccomp_listener_continue: "true"
uevent_injection: "true"
unpriv_binfmt: "false"
unpriv_fscaps: "true"
kernel_version: 5.15.0-1077-kvm
lxc_features:
cgroup2: "true"
core_scheduling: "true"
devpts_fd: "true"
idmapped_mounts_v2: "true"
mount_injection_file: "true"
network_gateway_device_route: "true"
network_ipvlan: "true"
network_l2proxy: "true"
network_phys_macvlan_mtu: "true"
network_veth_router: "true"
pidfd: "true"
seccomp_allow_deny_syntax: "true"
seccomp_notify: "true"
seccomp_proxy_send_notify_fd: "true"
os_name: Ubuntu
os_version: "22.04"
project: default
server: lxd
server_clustered: true
server_event_mode: full-mesh
server_name: micro1
server_pid: 1815
server_version: 5.21.3
server_lts: true
storage: zfs | ceph | cephfs
storage_version: 2.1.5-1ubuntu6~22.04.5 | 17.2.7 | 17.2.7
storage_supported_drivers:
- name: cephobject
version: 17.2.7
remote: true
- name: dir
version: "1"
remote: false
- name: lvm
version: 2.03.11(2) (2021-01-08) / 1.02.175 (2021-01-08) / 4.45.0
remote: false
- name: powerflex
version: 1.16 (nvme-cli)
remote: true
- name: zfs
version: 2.1.5-1ubuntu6~22.04.5
remote: false
- name: btrfs
version: 5.16.2
remote: false
- name: ceph
version: 17.2.7
remote: true
- name: cephfs
version: 17.2.7
remote: true
Issue description
In a LXD cluster (MicroCloud), trying to concurrently create instances backed by `ceph` will fail the first time because multiple cluster members will try to create the `readonly` snapshot on the shared ceph pool at the same time.
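For context, the first use of an image on a ceph pool roughly follows the sequence sketched below (simplified, and inferred from the `rbd clone` call visible in the error output further down; the exact commands and flags LXD runs may differ):

```sh
# Simplified sketch of the first-use flow on the shared pool
# (pool/volume names taken from the error output below; the size is illustrative).
POOL=lxd_remote
IMG=image_46942e5befec5812ca67d893456cf2e1d77b5a84d52854e9892d62e9d41c5d3a_ext4

rbd create "${POOL}/${IMG}" --size 4G                       # 1. create the image volume and unpack the image into it
rbd snap create "${POOL}/${IMG}@readonly"                   # 2. snapshot the populated volume
rbd snap protect "${POOL}/${IMG}@readonly"                  # 3. protect the snapshot so it can be cloned
rbd clone "${POOL}/${IMG}@readonly" "${POOL}/container_c1"  # 4. clone a per-instance volume from it

# If several cluster members race through steps 1-4 for the same image, one member can
# reach step 4 while another has done step 1 but not yet step 2, which matches:
#   librbd::image::RefreshRequest: failed to locate snapshot: readonly
```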
Steps to reproduce
Here's a minimal reproducer that assumes a MicroCloud made of 3 nodes: micro1, micro2 and micro3. This means the default profile is configured to put the rootfs of the instances onto ceph:
root@micro1:~# lxc profile show default
name: default
description: Default LXD profile
config: {}
devices:
eth0:
name: eth0
network: default
type: nic
root:
path: /
pool: remote
type: disk
used_by: []
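As a quicker check of the same thing, the pool backing the profile's root disk can be read directly (a small sketch; given the profile above it should print `remote`):

```sh
# Confirm the default profile's root disk is on the ceph-backed "remote" pool
lxc profile device get default root pool
```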
- Copy a public image:
root@micro1:~# lxc image copy ubuntu-minimal-daily:24.04 local:
Image copied successfully!
- Verify the downloaded image has not yet been turned into an image volume on the storage pool:
root@micro1:~# FINGERPRINT="$(lxc image info ubuntu-minimal-daily:24.04 | awk '/^Fingerprint:/ {print $2}')"
root@micro1:~# lxc image list -fcsv -cF | grep -xF "${FINGERPRINT}"
46942e5befec5812ca67d893456cf2e1d77b5a84d52854e9892d62e9d41c5d3a
root@micro1:~# lxc storage volume list remote | grep -F "${FINGERPRINT}" && echo "the image should NOT exist in the pool"
- Concurrently create instances using that downloaded image for the first time:
root@micro1:~# lxc init ubuntu-minimal-daily:24.04 c1 --target micro1 & lxc init ubuntu-minimal-daily:24.04 c2 --target micro2 & lxc init ubuntu-minimal-daily:24.04 c3 --target micro3
[1] 12916
[2] 12917
Creating c2
Creating c3
Creating c1
Error: Failed instance creation: Failed creating instance from image: Error inserting volume "46942e5befec5812ca67d893456cf2e1d77b5a84d52854e9892d62e9d41c5d3a" for project "default" in pool "remote" of type "images" into database "UNIQUE constraint failed: index 'storage_volumes_unique_storage_pool_id_node_id_project_id_name_type'"
Retrieving image: Unpacking image: 100% (642.75MB/s)Error: Failed instance creation: Failed creating instance from image: Failed to run: rbd --id admin --cluster ceph --image-feature layering --image-feature striping --image-feature exclusive-lock --image-feature object-map --image-feature fast-diff --image-feature deep-flatten clone lxd_remote/image_46942e5befec5812ca67d893456cf2e1d77b5a84d52854e9892d62e9d41c5d3a_ext4@readonly lxd_remote/container_c2: exit status 2 (2025-04-03T20:35:58.256+0000 7fc2ff7fe640 -1 librbd::image::RefreshRequest: failed to locate snapshot: readonly
2025-04-03T20:35:58.256+0000 7fc2ff7fe640 -1 librbd::image::OpenRequest: failed to find snapshot readonly
2025-04-03T20:35:58.256+0000 7fc2eeffd640 -1 librbd::image::CloneRequest: 0x557ca7f989b0 handle_open_parent: failed to open parent image: (2) No such file or directory
rbd: clone error: (2) No such file or directory)
[1]- Exit 1 lxc init ubuntu-minimal-daily:24.04 c1 --target micro1
[2]+ Exit 1 lxc init ubuntu-minimal-daily:24.04 c2 --target micro2
root@micro1:~#
root@micro1:~# lxc list
+------+---------+------+------+-----------+-----------+----------+
| NAME | STATE | IPV4 | IPV6 | TYPE | SNAPSHOTS | LOCATION |
+------+---------+------+------+-----------+-----------+----------+
| c3 | STOPPED | | | CONTAINER | 0 | micro3 |
+------+---------+------+------+-----------+-----------+----------+
The above errors show 2 issues. The `UNIQUE constraint failed` one is about another bug, but the `failed to open parent image` one is the one I'm reporting here.
At this point, it's possible to see that the remote pool now has the image, as micro3 was able to take the tarball/squashfs and turn it into an RBD volume with a `@readonly` snapshot:
root@micro1:~# lxc list
+------+---------+------+------+-----------+-----------+----------+
| NAME | STATE | IPV4 | IPV6 | TYPE | SNAPSHOTS | LOCATION |
+------+---------+------+------+-----------+-----------+----------+
| c3 | STOPPED | | | CONTAINER | 0 | micro3 |
+------+---------+------+------+-----------+-----------+----------+
root@micro1:~# lxc storage volume list remote | grep -F "${FINGERPRINT}"
| image | 46942e5befec5812ca67d893456cf2e1d77b5a84d52854e9892d62e9d41c5d3a | | filesystem | 1 | |
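For completeness, the image volume and its `readonly` snapshot can also be inspected directly with `rbd` (a sketch; the pool and volume names are taken from the error output above, and the `--id admin --cluster ceph` options mirror the ones LXD used there):

```sh
POOL=lxd_remote
IMG=image_46942e5befec5812ca67d893456cf2e1d77b5a84d52854e9892d62e9d41c5d3a_ext4

# List the snapshots of the image volume; once populated it should include "readonly"
rbd --id admin --cluster ceph snap ls "${POOL}/${IMG}"

# List the clones hanging off that snapshot (e.g. container_c3)
rbd --id admin --cluster ceph children "${POOL}/${IMG}@readonly"
```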
From here on, creating new instances concurrently no longer runs into the bug, as the needed image is already in the remote pool (a workaround based on this is sketched after the transcript below):
# Clear up any instance
root@micro1:~# for i in c1 c2 c3; do lxc rm "${i}"; done
Error: Failed checking instance exists "local:c1": Instance not found
Error: Failed checking instance exists "local:c2": Instance not found
# Create multiple instances concurrently
root@micro1:~# lxc init ubuntu-minimal-daily:24.04 c1 --target micro1 & lxc init ubuntu-minimal-daily:24.04 c2 --target micro2 & lxc init ubuntu-minimal-daily:24.04 c3 --target micro3
[1] 15026
[2] 15027
Creating c1
Creating c3
Creating c2
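Until this is fixed, a possible workaround that follows from the above is to seed the image volume with one serial creation before going concurrent (a sketch; the `seed` instance name is arbitrary):

```sh
# Seed the image volume on the remote pool once, serially
lxc init ubuntu-minimal-daily:24.04 seed --target micro1
lxc delete seed   # the image volume stays on the pool after the instance is gone

# Subsequent concurrent creations reuse the existing image volume
lxc init ubuntu-minimal-daily:24.04 c1 --target micro1 &
lxc init ubuntu-minimal-daily:24.04 c2 --target micro2 &
lxc init ubuntu-minimal-daily:24.04 c3 --target micro3
wait
```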
Information to attach
- [ ] Any relevant kernel output (`dmesg`)
- [ ] Instance log (`lxc info NAME --show-log`)
- [ ] Instance configuration (`lxc config show NAME --expanded`)
- [ ] Main daemon log (at `/var/log/lxd/lxd.log` or `/var/snap/lxd/common/lxd/logs/lxd.log`)
- [ ] Output of the client with `--debug`
- [ ] Output of the daemon with `--debug` (or use `lxc monitor` while reproducing the issue)
cc @MusicDin, that's the bug we kept running into with Terraform. Please add anything I might have missed.
Yes, this is similar to the other issue mentioned, except it's likely affecting all remote storage pools that try to use the downloaded image to create an image volume.
I suspect we need to create some `/internal` endpoints that allow us to perform cluster-wide operations for DB records and remote pool operations on the leader (which would then allow the use of `sync.Mutex`).
> Yes, this is similar to the other issue mentioned, except it's likely affecting all remote storage pools that try to use the downloaded image to create an image volume.

I can try with Pure next week to confirm this.

> I suspect we need to create some `/internal` endpoints that allow us to perform cluster-wide operations for DB records and remote pool operations on the leader (which would then allow the use of `sync.Mutex`).

That would be much appreciated :)
Similar to https://github.com/lxc/incus/issues/2314#issuecomment-3129918070 and https://github.com/canonical/lxd/issues/11636