Load balancing group of a namespace gets assigned to a Gateway that is not part of the group in which the namespace was added
Seeing a strange yet major issue where the nvme-gw show command lists a Gateway that is not part of the group specified in the command.
Below, node7 is part of group2, but the show command lists it under group1, and namespaces added on a GW in group1 take the load balancing group of node7, which actually belongs to group2.
# ceph nvme-gw show nvmeof_pool group1
{
"epoch": 362,
"pool": "nvmeof_pool",
"group": "group1",
"features": "LB",
"rebalance_ana_group": 5,
"num gws": 5,
"Anagrp list": "[ 1 2 3 4 5 ]",
"num-namespaces": 6,
"Created Gateways:": [
{
"gw-id": "client.nvmeof.nvmeof_pool.group1.ceph-rlepaksh-8-1-ZKVGK0-node3.eubjtw",
"anagrp-id": 1,
"num-namespaces": 1,
"performed-full-startup": 1,
"Availability": "AVAILABLE",
"num-listeners": 2,
"ana states": " 1: ACTIVE , 2: ACTIVE , 3: STANDBY , 4: STANDBY , 5: ACTIVE "
},
{
"gw-id": "client.nvmeof.nvmeof_pool.group1.ceph-rlepaksh-8-1-ZKVGK0-node4.krmbra",
"anagrp-id": 2,
"num-namespaces": 1,
"performed-full-startup": 0,
"Availability": "UNAVAILABLE",
"ana states": " 1: STANDBY , 2: STANDBY , 3: STANDBY , 4: STANDBY , 5: STANDBY "
},
{
"gw-id": "client.nvmeof.nvmeof_pool.group1.ceph-rlepaksh-8-1-ZKVGK0-node5.javqon",
"anagrp-id": 3,
"num-namespaces": 1,
"performed-full-startup": 1,
"Availability": "AVAILABLE",
"num-listeners": 2,
"ana states": " 1: STANDBY , 2: STANDBY , 3: ACTIVE , 4: STANDBY , 5: STANDBY "
},
{
"gw-id": "client.nvmeof.nvmeof_pool.group1.ceph-rlepaksh-8-1-ZKVGK0-node6.gspnwl",
"anagrp-id": 4,
"num-namespaces": 1,
"performed-full-startup": 1,
"Availability": "AVAILABLE",
"num-listeners": 2,
"ana states": " 1: STANDBY , 2: STANDBY , 3: STANDBY , 4: ACTIVE , 5: STANDBY "
},
{
"gw-id": "client.nvmeof.nvmeof_pool.group2.ceph-rlepaksh-8-1-ZKVGK0-node7.uuwgjy",
"anagrp-id": 5,
"num-namespaces": 2,
"performed-full-startup": 1,
"Availability": "CREATED",
"ana states": " 1: STANDBY , 2: STANDBY , 3: STANDBY , 4: STANDBY , 5: STANDBY "
}
]
}
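The stray gateway is already visible in the output above: the last entry's gw-id names group2 even though group1 was queried. Assuming the show output parses as valid JSON and jq is available on the host (both assumptions on my part), a quick check like the following flags any gateway whose id does not match the queried group:

# list gw-ids in the group1 map whose id does not contain ".group1."
ceph nvme-gw show nvmeof_pool group1 | \
    jq -r '."Created Gateways:"[] | ."gw-id" | select(contains(".group1.") | not)'

Against the output above, this would print only the node7 gateway (client.nvmeof.nvmeof_pool.group2.ceph-rlepaksh-8-1-ZKVGK0-node7.uuwgjy).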
# ceph orch ps | grep nvmeof
nvmeof.nvmeof_pool.group1.ceph-rlepaksh-8-1-ZKVGK0-node3.eubjtw ceph-rlepaksh-8-1-ZKVGK0-node3 *:5500,4420,8009 running (8d) 7m ago 8d 182M - 1.4.2 a09f7e144ac2 ddf9ee88cbd6
nvmeof.nvmeof_pool.group1.ceph-rlepaksh-8-1-ZKVGK0-node4.krmbra ceph-rlepaksh-8-1-ZKVGK0-node4 *:5500,4420,8009 running (4d) 72s ago 8d 170M - 1.4.2 a09f7e144ac2 cbcd128e26a1
nvmeof.nvmeof_pool.group1.ceph-rlepaksh-8-1-ZKVGK0-node5.javqon ceph-rlepaksh-8-1-ZKVGK0-node5 *:5500,4420,8009 running (9h) 6m ago 8d 162M - 1.4.2 a09f7e144ac2 1719b34901b2
nvmeof.nvmeof_pool.group1.ceph-rlepaksh-8-1-ZKVGK0-node6.gspnwl ceph-rlepaksh-8-1-ZKVGK0-node6 *:5500,4420,8009 running (23h) 71s ago 8d 169M - 1.4.2 a09f7e144ac2 540178485ce0
nvmeof.nvmeof_pool.group2.ceph-rlepaksh-8-1-ZKVGK0-node7.uuwgjy ceph-rlepaksh-8-1-ZKVGK0-node7 *:5500,4420,8009 running (16h) 72s ago 16h 219M - 1.4.2 a09f7e144ac2 c7ba1dbf96c4
[ceph: root@ceph-rlepaksh-8-1-zkvgk0-node1-installer /]# ceph orch ls | grep nvmeof
nvmeof.nvmeof_pool.group1 ?:4420,5500,8009 4/4 7m ago 8d ceph-rlepaksh-8-1-ZKVGK0-node3;ceph-rlepaksh-8-1-ZKVGK0-node4;ceph-rlepaksh-8-1-ZKVGK0-node5;ceph-rlepaksh-8-1-ZKVGK0-node6
nvmeof.nvmeof_pool.group2 ?:4420,5500,8009 1/1 77s ago 16h ceph-rlepaksh-8-1-ZKVGK0-node7
# podman run -it quay.io/ceph/nvmeof-cli:latest --server-address 10.0.64.80 namespace list -n nqn.2016-06.io.spdk:cnode1.group1
Namespaces in subsystem nqn.2016-06.io.spdk:cnode1.group1:
╒════════╤════════════════════════╤════════════╤═════════╤═══════════╤═════════════════════╤═════════════╤══════════════╤═══════════╤═══════════╤════════════╤═════════════╕
│ NSID │ Bdev │ RBD │ Image │ Block │ UUID │ Load │ Visibility │ R/W IOs │ R/W MBs │ Read MBs │ Write MBs │
│ │ Name │ Image │ Size │ Size │ │ Balancing │ │ per │ per │ per │ per │
│ │ │ │ │ │ │ Group │ │ second │ second │ second │ second │
╞════════╪════════════════════════╪════════════╪═════════╪═══════════╪═════════════════════╪═════════════╪══════════════╪═══════════╪═══════════╪════════════╪═════════════╡
│ 1 │ bdev_b58e4188-9070- │ rbd/image0 │ 1 TiB │ 512 Bytes │ b58e4188-9070-40d4- │ 1 │ All Hosts │ unset │ unset │ unset │ unset │
│ │ 40d4-8c4a-d37056f1e6d9 │ │ │ │ 8c4a-d37056f1e6d9 │ │ │ │ │ │ │
├────────┼────────────────────────┼────────────┼─────────┼───────────┼─────────────────────┼─────────────┼──────────────┼───────────┼───────────┼────────────┼─────────────┤
│ 2 │ bdev_c376f74c-6924- │ rbd/image1 │ 1 TiB │ 512 Bytes │ c376f74c-6924-4b78- │ 2 │ All Hosts │ unset │ unset │ unset │ unset │
│ │ 4b78-bb32-13b16ee0cb10 │ │ │ │ bb32-13b16ee0cb10 │ │ │ │ │ │ │
├────────┼────────────────────────┼────────────┼─────────┼───────────┼─────────────────────┼─────────────┼──────────────┼───────────┼───────────┼────────────┼─────────────┤
│ 3 │ bdev_b9a64e8a-2dae- │ rbd/image2 │ 1 TiB │ 512 Bytes │ b9a64e8a-2dae-487e- │ 3 │ All Hosts │ unset │ unset │ unset │ unset │
│ │ 487e-96c7-112534761544 │ │ │ │ 96c7-112534761544 │ │ │ │ │ │ │
├────────┼────────────────────────┼────────────┼─────────┼───────────┼─────────────────────┼─────────────┼──────────────┼───────────┼───────────┼────────────┼─────────────┤
│ 4 │ bdev_395abf33-097f- │ rbd/image3 │ 1 TiB │ 512 Bytes │ 395abf33-097f-4138- │ 4 │ All Hosts │ unset │ unset │ unset │ unset │
│ │ 4138-a0a8-ec0eda9af506 │ │ │ │ a0a8-ec0eda9af506 │ │ │ │ │ │ │
├────────┼────────────────────────┼────────────┼─────────┼───────────┼─────────────────────┼─────────────┼──────────────┼───────────┼───────────┼────────────┼─────────────┤
│ 5 │ bdev_cb68c737-b180- │ rbd/image4 │ 1 TiB │ 512 Bytes │ cb68c737-b180-4642- │ 5 │ All Hosts │ unset │ unset │ unset │ unset │
│ │ 4642-868c-337e65b80e1b │ │ │ │ 868c-337e65b80e1b │ │ │ │ │ │ │
├────────┼────────────────────────┼────────────┼─────────┼───────────┼─────────────────────┼─────────────┼──────────────┼───────────┼───────────┼────────────┼─────────────┤
│ 6 │ bdev_24939ab7-138f- │ rbd/image5 │ 1 TiB │ 512 Bytes │ 24939ab7-138f-4d41- │ 5 │ All Hosts │ unset │ unset │ unset │ unset │
│ │ 4d41-877c-ca8ad3f002a6 │ │ │ │ 877c-ca8ad3f002a6 │ │ │ │ │ │ │
╘════════╧════════════════════════╧════════════╧═════════╧═══════════╧═════════════════════╧═════════════╧══════════════╧═══════════╧═══════════╧════════════╧═════════════╛
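Note that namespaces 5 and 6 above both received Load Balancing Group 5, which is the ANA group of the node7 gateway, i.e. a gateway belonging to group2. As a possible interim workaround sketch (the subcommand and flag names below are from my reading of the nvmeof-cli help and should be treated as assumptions; verify with --help before use), a misplaced namespace could be moved back to an ANA group owned by a group1 gateway:

# hypothetical workaround: reassign NSID 6 to ANA group 1
# verify the exact flags with: ... namespace change_load_balancing_group --help
podman run -it quay.io/ceph/nvmeof-cli:latest --server-address 10.0.64.80 \
    namespace change_load_balancing_group -n nqn.2016-06.io.spdk:cnode1.group1 \
    --nsid 6 --load-balancing-group 1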
Spec file
# ceph orch ls nvmeof --export
service_type: nvmeof
service_id: nvmeof_pool.group1
service_name: nvmeof.nvmeof_pool.group1
placement:
hosts:
- ceph-rlepaksh-8-1-ZKVGK0-node3
- ceph-rlepaksh-8-1-ZKVGK0-node4
- ceph-rlepaksh-8-1-ZKVGK0-node5
- ceph-rlepaksh-8-1-ZKVGK0-node6
spec:
allowed_consecutive_spdk_ping_failures: 1
bdevs_per_cluster: 32
conn_retries: 10
discovery_port: 8009
enable_key_encryption: true
enable_monitor_client: true
enable_prometheus_exporter: true
group: group1
log_directory: /var/log/ceph/
log_files_enabled: true
log_files_rotation_enabled: true
log_level: INFO
max_gws_in_grp: 16
max_hosts_per_namespace: 8
max_hosts_per_subsystem: 32
max_log_directory_backups: 10
max_log_file_size_in_mb: 10
max_log_files_count: 20
max_namespaces: 1024
max_namespaces_per_subsystem: 256
max_namespaces_with_netmask: 1000
max_ns_to_change_lb_grp: 8
max_subsystems: 128
monitor_timeout: 1.0
omap_file_lock_duration: 20
omap_file_lock_retries: 30
omap_file_lock_retry_sleep_interval: 1.0
omap_file_update_reloads: 10
pool: nvmeof_pool
port: 5500
prometheus_port: 10008
prometheus_stats_interval: 10
rebalance_period_sec: 7
rpc_socket_dir: /var/tmp/
rpc_socket_name: spdk.sock
spdk_path: /usr/local/bin/nvmf_tgt
spdk_ping_interval_in_seconds: 2.0
spdk_protocol_log_level: WARNING
spdk_timeout: 60.0
state_update_interval_sec: 5
state_update_notify: true
tgt_path: /usr/local/bin/nvmf_tgt
transport_tcp_options:
in_capsule_data_size: 8192
max_io_qpairs_per_ctrlr: 7
transports: tcp
verbose_log_messages: true
verify_keys: true
verify_nqns: true
---
service_type: nvmeof
service_id: nvmeof_pool.group2
service_name: nvmeof.nvmeof_pool.group2
placement:
hosts:
- ceph-rlepaksh-8-1-ZKVGK0-node7
spec:
allowed_consecutive_spdk_ping_failures: 1
bdevs_per_cluster: 32
conn_retries: 10
discovery_port: 8009
enable_key_encryption: true
enable_monitor_client: true
enable_prometheus_exporter: true
group: group1
log_directory: /var/log/ceph/
log_files_enabled: true
log_files_rotation_enabled: true
log_level: INFO
max_gws_in_grp: 16
max_hosts_per_namespace: 8
max_hosts_per_subsystem: 32
max_log_directory_backups: 10
max_log_file_size_in_mb: 10
max_log_files_count: 20
max_namespaces: 1024
max_namespaces_per_subsystem: 256
max_namespaces_with_netmask: 1000
max_ns_to_change_lb_grp: 8
max_subsystems: 128
monitor_timeout: 1.0
omap_file_lock_duration: 20
omap_file_lock_retries: 30
omap_file_lock_retry_sleep_interval: 1.0
omap_file_update_reloads: 10
pool: nvmeof_pool
port: 5500
prometheus_port: 10008
prometheus_stats_interval: 10
rebalance_period_sec: 7
rpc_socket_dir: /var/tmp/
rpc_socket_name: spdk.sock
spdk_mem_size: 4096
spdk_path: /usr/local/bin/nvmf_tgt
spdk_ping_interval_in_seconds: 2.0
spdk_protocol_log_level: WARNING
spdk_timeout: 60.0
state_update_interval_sec: 5
state_update_notify: true
tgt_path: /usr/local/bin/nvmf_tgt
transport_tcp_options:
in_capsule_data_size: 8192
max_io_qpairs_per_ctrlr: 7
transports: tcp
verbose_log_messages: true
verify_keys: true
verify_nqns: true
@rahullepakshi Hi, I’d like to work on this issue.
From the spec you shared, the nvmeof_pool.group2 service is still configured with group: group1, which seems to cause the gateway in group2 (node7) to be associated with the load-balancing group of group1. I’ll investigate how the group is being applied in the NVMe-oF orchestrator logic and work on ensuring namespaces/gateways are assigned only within their correct group.
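For reference, a minimal sketch of the corrected group2 spec fragment, assuming node7 is indeed meant to serve group2 (only the identifying keys and the changed field are shown; everything else stays as exported):

service_type: nvmeof
service_id: nvmeof_pool.group2
service_name: nvmeof.nvmeof_pool.group2
placement:
  hosts:
  - ceph-rlepaksh-8-1-ZKVGK0-node7
spec:
  pool: nvmeof_pool
  group: group2  # was "group: group1" in the exported spec

Re-applying the edited spec with ceph orch apply -i <file> should reconfigure the service, though I still need to verify whether the monitor's gateway map is corrected automatically or whether the node7 gateway has to be redeployed.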
Could you please assign this issue to me? Any additional context or known constraints around group handling would be helpful.