ceph-nvmeof icon indicating copy to clipboard operation
ceph-nvmeof copied to clipboard

get_io_stats fails to display IO statistics.

Open sunilkumarn417 opened this issue 10 months ago • 3 comments

get_io_stats displays the IO metrics correctly when queried with the IP of the actively connected gateway; when queried from other gateways, many metrics (such as bytes_written) are "0".

Steps-to-follow:

  • Deploy service with gateways (GW1 : LB-grp-id: 1, GW2: LB-grp-id: 2)
  • Configure 2 subsystems
  • Add both gateway listeners on each subsystem.
  • Setup host, Add namespaces like below
    • Subsystem-1 Namespaces tagged with LB-grp-id-1
    • Subsystem-2 Namespaces tagged with LB-grp-id-2
  • Start IO.
  • Get IO stats for subsystem-1 namespaces from GW2; we can then observe that "bytes_written" is "0".
[root@ceph-sunilkumar-00-7v6gmf-node6 ~]# podman run --quiet --rm quay.io/barakda1/nvmeof-cli:1.2.1  --format json --server-address 10.0.208.84 --server-port 5500 get_subsystems
Get subsystems:
{
    "subsystems": [
        {
            "nqn": "nqn.2016-06.io.spdk:cnode1",
            "subtype": "NVMe",
            "listen_addresses": [
                {
                    "trtype": "TCP",
                    "adrfam": "IPv4",
                    "traddr": "10.0.208.84",
                    "trsvcid": "4420",
                    "transport": "TCP"
                }
            ],
            "allow_any_host": true,
            "serial_number": "Ceph67140833472324",
            "model_number": "Ceph bdev Controller",
            "max_namespaces": 32,
            "min_cntlid": 2041,
            "max_cntlid": 4080,
            "namespaces": [
                {
                    "nsid": 1,
                    "name": "bdev_e42aafab-8f9b-4911-b326-4308cc962112",
                    "bdev_name": "bdev_e42aafab-8f9b-4911-b326-4308cc962112",
                    "nguid": "E42AAFAB8F9B4911B3264308CC962112",
                    "uuid": "e42aafab-8f9b-4911-b326-4308cc962112",
                    "anagrpid": 2,
                    "nonce": "10.0.208.84:0/3145406409"
                },
                {
                    "nsid": 2,
                    "name": "bdev_b0f9bf47-925a-4bcb-8e93-3749a98d6e2f",
                    "bdev_name": "bdev_b0f9bf47-925a-4bcb-8e93-3749a98d6e2f",
                    "nguid": "B0F9BF47925A4BCB8E933749A98D6E2F",
                    "uuid": "b0f9bf47-925a-4bcb-8e93-3749a98d6e2f",
                    "anagrpid": 2,
                    "nonce": "10.0.208.84:0/3145406409"
                }
            ],
            "hosts": []
        },
        {
            "nqn": "nqn.2016-06.io.spdk:cnode2",
            "subtype": "NVMe",
            "listen_addresses": [
                {
                    "trtype": "TCP",
                    "adrfam": "IPv4",
                    "traddr": "10.0.208.84",
                    "trsvcid": "4420",
                    "transport": "TCP"
                }
            ],
            "allow_any_host": true,
            "serial_number": "Ceph5822363922143",
            "model_number": "Ceph bdev Controller",
            "max_namespaces": 32,
            "min_cntlid": 2041,
            "max_cntlid": 4080,
            "namespaces": [
                {
                    "nsid": 1,
                    "name": "bdev_e7721ff0-5d0e-4c9c-b703-6553aafc945a",
                    "bdev_name": "bdev_e7721ff0-5d0e-4c9c-b703-6553aafc945a",
                    "nguid": "E7721FF05D0E4C9CB7036553AAFC945A",
                    "uuid": "e7721ff0-5d0e-4c9c-b703-6553aafc945a",
                    "anagrpid": 1,
                    "nonce": "10.0.208.84:0/3359539278"
                },
                {
                    "nsid": 2,
                    "name": "bdev_35c56671-2204-42a9-9913-f51f77b65f60",
                    "bdev_name": "bdev_35c56671-2204-42a9-9913-f51f77b65f60",
                    "nguid": "35C56671220442A99913F51F77B65F60",
                    "uuid": "35c56671-2204-42a9-9913-f51f77b65f60",
                    "anagrpid": 1,
                    "nonce": "10.0.208.84:0/3359539278"
                }
            ],
            "hosts": []
        }
    ]
}

Listeners are added

[root@ceph-sunilkumar-00-7v6gmf-node6 ~]# podman run --quiet --rm quay.io/barakda1/nvmeof-cli:1.2.1  --format json --server-address 10.0.208.84 --server-port 5500 listener list -n nqn.2016-06.io.spdk:cnode2
{
    "error_message": "Success",
    "listeners": [
        {
            "host_name": "ceph-sunilkumar-00-7v6gmf-node6",
            "trtype": "TCP",
            "traddr": "10.0.208.84",
            "trsvcid": 4420,
            "adrfam": "ipv4"
        },
        {
            "host_name": "ceph-sunilkumar-00-7v6gmf-node7",
            "trtype": "TCP",
            "traddr": "10.0.209.23",
            "trsvcid": 4420,
            "adrfam": "ipv4"
        }
    ],
    "status": 0
}
[root@ceph-sunilkumar-00-7v6gmf-node6 ~]# podman run --quiet --rm quay.io/barakda1/nvmeof-cli:1.2.1  --format json --server-address 10.0.208.84 --server-port 5500 listener list -n nqn.2016-06.io.spdk:cnode1
{
    "error_message": "Success",
    "listeners": [
        {
            "host_name": "ceph-sunilkumar-00-7v6gmf-node6",
            "trtype": "TCP",
            "traddr": "10.0.208.84",
            "trsvcid": 4420,
            "adrfam": "ipv4"
        },
        {
            "host_name": "ceph-sunilkumar-00-7v6gmf-node7",
            "trtype": "TCP",
            "traddr": "10.0.209.23",
            "trsvcid": 4420,
            "adrfam": "ipv4"
        }
    ],
    "status": 0
}

Get IO stats

[root@ceph-sunilkumar-00-7v6gmf-node6 ~]# podman run --quiet --rm quay.io/barakda1/nvmeof-cli:1.2.1  --format json --server-address 10.0.208.84 --server-port 5500 namespace get_io_stats  --subsystem nqn.2016-06.io.spdk:cnode1 --nsid 1
{
    "error_message": "Success",
    "subsystem_nqn": "nqn.2016-06.io.spdk:cnode1",
    "nsid": 1,
    "uuid": "e42aafab-8f9b-4911-b326-4308cc962112",
    "bdev_name": "bdev_e42aafab-8f9b-4911-b326-4308cc962112",
    "tick_rate": "2290000000",
    "ticks": "2621647473989225",
    "bytes_read": "2183168",
    "num_read_ops": "102",
    "bytes_written": "113770496",
    "num_write_ops": "868",
    "read_latency_ticks": "127828488",
    "max_read_latency_ticks": "5130384",
    "min_read_latency_ticks": "570238",
    "write_latency_ticks": "238402928",
    "max_write_latency_ticks": "65301792",
    "min_write_latency_ticks": "79444",
    "status": 0,
    "bytes_unmapped": "0",
    "num_unmap_ops": "0",
    "unmap_latency_ticks": "0",
    "max_unmap_latency_ticks": "0",
    "min_unmap_latency_ticks": "0",
    "copy_latency_ticks": "0",
    "max_copy_latency_ticks": "0",
    "min_copy_latency_ticks": "0",
    "io_error": []
}

[root@ceph-sunilkumar-00-7v6gmf-node6 ~]# podman run --quiet --rm quay.io/barakda1/nvmeof-cli:1.2.1  --format json --server-address 10.0.209.23 --server-port 5500 namespace get_io_stats  --subsystem nqn.2016-06.io.spdk:cnode1 --nsid 1
{
    "error_message": "Success",
    "subsystem_nqn": "nqn.2016-06.io.spdk:cnode1",
    "nsid": 1,
    "uuid": "e42aafab-8f9b-4911-b326-4308cc962112",
    "bdev_name": "bdev_e42aafab-8f9b-4911-b326-4308cc962112",
    "tick_rate": "2190000000",
    "ticks": "2507761566245463",
    "bytes_read": "36864",
    "num_read_ops": "2",
    "read_latency_ticks": "8553886",
    "max_read_latency_ticks": "4332226",
    "min_read_latency_ticks": "4221660",
    "status": 0,
    "bytes_written": "0",
    "num_write_ops": "0",
    "bytes_unmapped": "0",
    "num_unmap_ops": "0",
    "write_latency_ticks": "0",
    "max_write_latency_ticks": "0",
    "min_write_latency_ticks": "0",
    "unmap_latency_ticks": "0",
    "max_unmap_latency_ticks": "0",
    "min_unmap_latency_ticks": "0",
    "copy_latency_ticks": "0",
    "max_copy_latency_ticks": "0",
    "min_copy_latency_ticks": "0",
    "io_error": []
}

sunilkumarn417 avatar Apr 18 '24 17:04 sunilkumarn417

Why is this an issue? The stats come from the SPDK of the gateway, so if the namespace is not active, or has never been active, on that gateway (SPDK), then I would expect them to be 0s.

@sunilkumarn417 if this makes sense, please close.

pcuzner avatar Apr 18 '24 20:04 pcuzner

@pcuzner Thanks, I understand that the SPDK of the gateway provides this.

But the question is: as a user, which command should I use to look at these metrics, with or without failover?

sunilkumarn417 avatar Apr 19 '24 03:04 sunilkumarn417

@sunilkumarn417 as a user, you need to check that the IO continues when one of the nodes is not available. There are means on the host to check whether IO is reaching the namespaces.

caroav avatar Apr 20 '24 17:04 caroav