UCX ERROR ibv_reg_mr failed: Bad address

cgorac opened this issue 3 years ago · 0 comments

Describe the bug

(I think this issue is relevant to my problem, in particular the part about handling GPUs with limited BAR size.)

So: I have code that passes GPU pointers to MPI calls. The details of the setup are provided below; in general, most of the machines on the network have V100 GPUs, and in that case everything works fine. However, when I include a machine with, say, a GeForce RTX 3080 into the mix, the code crashes with the error message given in the subject.

I've narrowed the code down to a minimal test program that reproduces the issue; details are provided below. I believe the crash is caused by the fact that the BAR1 size on the GeForce RTX 3080 is only 256 MB, while on the V100 it is equal to the size of GPU memory. So my question is: has anything changed in UCX with respect to GPUs with small BAR1 size since the above-mentioned issue was filed? And if not, is it at least possible to detect that UCX won't be able to handle such a GPU, so that the case can be handled differently? (The code I'm dealing with is able to do staging through CPU memory itself; it just needs to know that it's necessary.)
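
For what it's worth, here is a minimal sketch of how my code could detect a small BAR1 at runtime on its own. This uses NVML and a heuristic threshold of my own choosing, not any UCX API, so treat it as an assumption rather than a recommended interface (link with -lnvidia-ml):

#include <stdio.h>
#include <nvml.h>

int
main(void)
{
    /* Look at device 0 only; a real version would loop over all
       devices a rank might use. */
    if (nvmlInit() != NVML_SUCCESS) {
	return 1;
    }

    nvmlDevice_t    device;
    if (nvmlDeviceGetHandleByIndex(0, &device) == NVML_SUCCESS) {
	nvmlBAR1Memory_t bar1;
	nvmlMemory_t    mem;
	if (nvmlDeviceGetBAR1MemoryInfo(device, &bar1) == NVML_SUCCESS
	    && nvmlDeviceGetMemoryInfo(device, &mem) == NVML_SUCCESS) {
	    printf("BAR1 total: %llu MiB, GPU memory: %llu MiB\n",
		   bar1.bar1Total / (1024 * 1024),
		   mem.total / (1024 * 1024));
	    /* Heuristic: if BAR1 is much smaller than GPU memory,
	       assume GPUDirect RDMA registration may fail and stage
	       transfers through CPU memory instead. */
	    if (bar1.bar1Total < mem.total / 2) {
		printf("Small BAR1 - would fall back to staging.\n");
	    }
	}
    }

    nvmlShutdown();
    return 0;
}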

Steps to Reproduce

Minimal reproducible example

Here is a small test program that reproduces the problem; it just exchanges messages between memory buffers on two GPUs:

#include <stdio.h>
#include <stdlib.h>

#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <mpi.h>
#include <mpi-ext.h>

#include <cuda_runtime.h>

#define N 1024

static void
check_cuda_error(cudaError_t status)
{
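    /* Abort the whole job on any CUDA error; the barrier after a
       successful call keeps the ranks loosely in step. */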
    if (status != cudaSuccess) {
	fprintf(stderr, "CUDA error: \"%s\"\n",
		cudaGetErrorString(status));
	MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    MPI_Barrier(MPI_COMM_WORLD);
}

int
main(int argc, char *argv[])
{
    cudaError_t     status;

    int             ndevices;
    status = cudaGetDeviceCount(&ndevices);
    if (status != cudaSuccess) {
	return EXIT_FAILURE;
    }
    if (ndevices == 0) {
	return EXIT_FAILURE;
    }

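    /* Select the GPU before MPI_Init, based on Open MPI's local-rank
       environment variable, so that each local rank gets its own device. */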
    char           *local_rank_env = getenv("OMPI_COMM_WORLD_LOCAL_RANK");
    if (local_rank_env == NULL) {
	return EXIT_FAILURE;
    }
    int             device = atoi(local_rank_env) % ndevices;
    status = cudaSetDevice(device);
    if (status != cudaSuccess) {
	return EXIT_FAILURE;
    }

    printf("%d: %d/%d\n", atoi(local_rank_env), device, ndevices);

    MPI_Init(&argc, &argv);

    int             size;
    int             rank;
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (size != 2) {
	if (rank == 0) {
	    fprintf(stderr, "Run program with 2 MPI ranks!\n");
	    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
	}
	MPI_Barrier(MPI_COMM_WORLD);
    }

    int             mpi_cuda_support = 0;
    char           *local_size_env = getenv("OMPI_COMM_WORLD_LOCAL_SIZE");
    if (local_size_env != NULL) {
#if defined(OPEN_MPI) && defined(MPIX_CUDA_AWARE_SUPPORT)
	if (MPIX_Query_cuda_support() != 0) {
	    if (atoi(local_size_env) == size) {
		mpi_cuda_support = 1;
	    } else {
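		/* Multi-node case: sending GPU buffers directly over IB
		   requires a GPUDirect RDMA peer-memory kernel module;
		   this sysfs file exists when one is loaded. */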
		const char     *PATH =
		    "/sys/kernel/mm/memory_peers/nv_mem/version";
		struct stat     buf;
		if (stat(PATH, &buf) == 0) {
		    if ((buf.st_mode & S_IRUSR) == S_IRUSR) {
			mpi_cuda_support = 1;
		    }
		}
	    }
	}
#endif
    }
    if (mpi_cuda_support == 0) {
	if (rank == 0) {
	    fprintf(stderr, "No CUDA support in MPI library!\n");
	    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
	}
	MPI_Barrier(MPI_COMM_WORLD);
    }

    int            *buff_send_h;
    int            *buff_recv_h;

    buff_send_h = (int *) malloc(N * sizeof(int));
    buff_recv_h = (int *) malloc(N * sizeof(int));

    for (int i = 0; i < N; ++i) {
	buff_send_h[i] = rank;
	buff_recv_h[i] = -1;
    }

    int            *buff_send_d;
    int            *buff_recv_d;
    check_cuda_error(cudaMalloc
		     ((void **) &buff_send_d, N * sizeof(int)));
    check_cuda_error(cudaMalloc
		     ((void **) &buff_recv_d, N * sizeof(int)));

    check_cuda_error(cudaMemcpy
		     (buff_send_d, buff_send_h, N * sizeof(int),
		      cudaMemcpyDefault));

    int             peer_rank = rank == 0 ? 1 : 0;

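    /* Exchange one message in each direction directly between the GPU
       buffers; this MPI_Isend is where the failure shows up below. */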
    MPI_Request     requests[2];
    MPI_Isend(buff_send_d, N, MPI_INT, peer_rank, 0, MPI_COMM_WORLD,
	      &requests[0]);
    MPI_Irecv(buff_recv_d, N, MPI_INT, peer_rank, 0, MPI_COMM_WORLD,
	      &requests[1]);

    MPI_Status      statuses[2];
    MPI_Waitall(2, requests, statuses);
    for (int i = 0; i < 2; ++i)
	if (statuses[i].MPI_ERROR != MPI_SUCCESS) {
	    fprintf(stderr, "MPI recieve error!\n");
	    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
	}
    MPI_Barrier(MPI_COMM_WORLD);

    check_cuda_error(cudaMemcpy
		     (buff_recv_h, buff_recv_d, N * sizeof(int),
		      cudaMemcpyDefault));

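    /* Check that every received element carries the peer's rank. */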
    for (int i = 0; i < N; ++i)
	if (buff_recv_h[i] != peer_rank) {
	    fprintf(stderr, "Data received wrong!\n");
	    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
	}
    MPI_Barrier(MPI_COMM_WORLD);

    fprintf(stderr, "Rank %d: success!\n", rank);

    free(buff_send_h);
    free(buff_recv_h);
    cudaFree(buff_send_d);
    cudaFree(buff_recv_d);

    MPI_Finalize();

    return 0;
}

Command line

I built the program above using the following command (the code above is saved in the file mpi-cuda-check.c):

/opt/openmpi/bin/mpicc -std=c99 -I/usr/local/cuda-11.6/include -o mpi-cuda-check mpi-cuda-check.c -L/usr/local/cuda-11.6/lib64 -lcudart

I ran the program using the following command (here, node-v100 is the machine with the V100 GPU, and node-rtx3080 is the machine with the GeForce RTX 3080 GPU):

/opt/openmpi/bin/mpirun -np 2 --host node-v100,node-rtx3080 -x UCX_TLS=sm,cuda,gdr_copy,rc ./mpi-cuda-check

The output of the program run (launched from the node-rtx3080 machine) is as follows:

0: 0/1
0: 0/2
[1647516906.396315] [node-rtx3080:82049:0]           ib_md.c:349  UCX  ERROR ibv_reg_mr(address=0x7f724fa00000, length=4096, access=0xf) failed: Bad address
[1647516906.396356] [node-rtx3080:82049:0]          ucp_mm.c:164  UCX  ERROR failed to register address 0x7f724fa00000 mem_type bit 0x2 length 4096 on md[5]=mlx5_0: Input/output error (md reg_mem_types 0x3)
[1647516906.396366] [node-rtx3080:82049:0]     ucp_request.c:526  UCX  ERROR failed to register user buffer datatype 0x8 address 0x7f724fa00000 len 4096: Input/output error
[node-rtx3080.local:82049] pml_ucx.c:860  Error: ucx send failed: Input/output error
[node-rtx3080:82049] *** An error occurred in MPI_Isend
[node-rtx3080:82049] *** reported by process [2227830785,0]
[node-rtx3080:82049] *** on communicator MPI_COMM_WORLD
[node-rtx3080:82049] *** MPI_ERR_OTHER: known error not in list
[node-rtx3080:82049] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[node-rtx3080:82049] ***    and potentially your MPI job)

UCX version used

The ucx_info -v output is as follows:

# Version 1.13.0
# Git branch 'master', revision 49cb8fd
# Configured with: --prefix=/opt/ucx --disable-logging --disable-debug --disable-assertions --disable-params-check --enable-mt --with-cuda=/usr/local/cuda-11.6 --with-gdrcopy --without-go

UCX environment variables used

As mentioned above, the only UCX environment variable set is UCX_TLS=sm,cuda,gdr_copy,rc.

Setup and versions

Except for the different GPUs, the setup is the same on both machines mentioned.

OS version (e.g Linux distro) + CPU architecture (x86_64/aarch64/ppc64le/...)

Here is the output from cat /etc/redhat-release:

CentOS Linux release 7.9.2009 (Core)

and here is one from uname -a:

Linux node-rtx3080.local 3.10.0-1160.59.1.el7.x86_64 #1 SMP Wed Feb 23 16:47:03 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux

For RDMA/IB/RoCE related issues:

Here is the output from ofed_info -s:

MLNX_OFED_LINUX-5.5-1.0.3.2

and here is the output from ibstat:

CA 'mlx5_0'
        CA type: MT4119
        Number of ports: 1
        Firmware version: 16.32.1010
        Hardware version: 0
        Node GUID: 0x506b4b0300ddfb2a
        System image GUID: 0x506b4b0300ddfb2a
        Port 1:
                State: Active
                Physical state: LinkUp
                Rate: 56
                Base lid: 8
                LMC: 0
                SM lid: 1
                Capability mask: 0x2651e848
                Port GUID: 0x506b4b0300ddfb2a
                Link layer: InfiniBand

For GPU related issues:

  • GPU type: as mentioned above, a V100 on one machine, and a GeForce RTX 3080 on the other (see the note on checking BAR1 sizes after this list).
  • CUDA: the CUDA version is 11.6, and the driver version is 510.47.03. The nvidia-peermem module appears in the lsmod output.
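
For reference, the BAR1 sizes mentioned above can be verified with nvidia-smi -q -d MEMORY, which prints a "BAR1 Memory Usage" section for each GPU.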

Additional information (depending on the issue)

The OpenMPI version is 4.1.2, built from source. The configure line for the OpenMPI build, as reported by ompi_info, was:

Configure command line: '--prefix=/opt/openmpi' '--with-platform=contrib/platform/mellanox/optimized' '--enable-static=yes' '--enable-shared=yes' '--with-tm=/opt/pbs' '--with-cuda=/usr/local/cuda-11.6' '--with-gdrcopy' '--with-ucx=/opt/ucx'

The ucx_info -d output is as follows:

#
# Memory domain: posix
#     Component: posix
#             allocate: <= 48717032K
#           remote key: 24 bytes
#           rkey_ptr is supported
#
#      Transport: posix
#         Device: memory
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 12179.00 MB/sec
#              latency: 80 nsec
#             overhead: 10 nsec
#            put_short: <= 4294967295
#            put_bcopy: unlimited
#            get_bcopy: unlimited
#             am_short: <= 100
#             am_bcopy: <= 8256
#               domain: cpu
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 8 bytes
#       error handling: ep_check
#
#
# Memory domain: sysv
#     Component: sysv
#             allocate: unlimited
#           remote key: 12 bytes
#           rkey_ptr is supported
#
#      Transport: sysv
#         Device: memory
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 12179.00 MB/sec
#              latency: 80 nsec
#             overhead: 10 nsec
#            put_short: <= 4294967295
#            put_bcopy: unlimited
#            get_bcopy: unlimited
#             am_short: <= 100
#             am_bcopy: <= 8256
#               domain: cpu
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 8 bytes
#       error handling: ep_check
#
#
# Memory domain: self
#     Component: self
#             register: unlimited, cost: 0 nsec
#           remote key: 0 bytes
#
#      Transport: self
#         Device: memory0
#           Type: loopback
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 6911.00 MB/sec
#              latency: 0 nsec
#             overhead: 10 nsec
#            put_short: <= 4294967295
#            put_bcopy: unlimited
#            get_bcopy: unlimited
#             am_short: <= 8K
#             am_bcopy: <= 8K
#               domain: cpu
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 0 bytes
#        iface address: 8 bytes
#       error handling: ep_check
#
#
# Memory domain: tcp
#     Component: tcp
#             register: unlimited, cost: 0 nsec
#           remote key: 0 bytes
#
#      Transport: tcp
#         Device: enp97s0f0
#           Type: network
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 113.16/ppn + 0.00 MB/sec
#              latency: 5776 nsec
#             overhead: 50000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to ep, to iface
#      device priority: 0
#     device num paths: 1
#              max eps: 256
#       device address: 6 bytes
#        iface address: 2 bytes
#           ep address: 10 bytes
#       error handling: peer failure, ep_check, keepalive
#
#      Transport: tcp
#         Device: lo
#           Type: network
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 11.91/ppn + 0.00 MB/sec
#              latency: 10960 nsec
#             overhead: 50000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to ep, to iface
#      device priority: 1
#     device num paths: 1
#              max eps: 256
#       device address: 18 bytes
#        iface address: 2 bytes
#           ep address: 10 bytes
#       error handling: peer failure, ep_check, keepalive
#
#      Transport: tcp
#         Device: ib0
#           Type: network
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 6239.81/ppn + 0.00 MB/sec
#              latency: 5210 nsec
#             overhead: 50000 nsec
#            put_zcopy: <= 18446744073709551590, up to 6 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 0
#             am_short: <= 8K
#             am_bcopy: <= 8K
#             am_zcopy: <= 64K, up to 6 iov
#   am_opt_zcopy_align: <= 1
#         am_align_mtu: <= 0
#            am header: <= 8037
#           connection: to ep, to iface
#      device priority: 1
#     device num paths: 1
#              max eps: 256
#       device address: 6 bytes
#        iface address: 2 bytes
#           ep address: 10 bytes
#       error handling: peer failure, ep_check, keepalive
#
#
# Connection manager: tcp
#      max_conn_priv: 2064 bytes
#
# Memory domain: cuda_cpy
#     Component: cuda_cpy
#             allocate: unlimited
#             register: unlimited, cost: 0 nsec
#
#      Transport: cuda_copy
#         Device: cuda
#           Type: accelerator
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 10000.00/ppn + 0.00 MB/sec
#              latency: 8000 nsec
#             overhead: 0 nsec
#            put_short: <= 4294967295
#            put_zcopy: unlimited, up to 1 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 1
#            get_short: <= 4294967295
#            get_zcopy: unlimited, up to 1 iov
#  get_opt_zcopy_align: <= 1
#        get_align_mtu: <= 1
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 0 bytes
#        iface address: 8 bytes
#       error handling: none
#
#
# Memory domain: cuda_ipc
#     Component: cuda_ipc
#             register: unlimited, cost: 0 nsec
#           remote key: 112 bytes
#           memory invalidation is supported
#
#      Transport: cuda_ipc
#         Device: cuda
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 300000.00/ppn + 0.00 MB/sec
#              latency: 1 nsec
#             overhead: 0 nsec
#            put_zcopy: unlimited, up to 1 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 1
#            get_zcopy: <= 0, up to 1 iov
#  get_opt_zcopy_align: <= 1
#        get_align_mtu: <= 1
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 4 bytes
#       error handling: peer failure, ep_check
#
#
# Memory domain: gdr_copy
#     Component: gdr_copy
#             register: unlimited, cost: 0 nsec
#           remote key: 24 bytes
#
#      Transport: gdr_copy
#         Device: cuda
#           Type: accelerator
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 6911.00/ppn + 0.00 MB/sec
#              latency: 1000 nsec
#             overhead: 0 nsec
#            put_short: <= 4294967295
#            get_short: <= 4294967295
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 0 bytes
#        iface address: 8 bytes
#       error handling: none
#
#
# Memory domain: mlx5_0
#     Component: ib
#             register: unlimited, cost: 180 nsec
#           remote key: 8 bytes
#           local memory handle is required for zcopy
#           memory invalidation is supported
#
#      Transport: rc_verbs
#         Device: mlx5_0:1
#           Type: network
#  System device: mlx5_0 (1)
#
#      capabilities:
#            bandwidth: 6433.22/ppn + 0.00 MB/sec
#              latency: 700 + 1.000 * N nsec
#             overhead: 75 nsec
#            put_short: <= 124
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 5 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 4K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 5 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 4K
#             am_short: <= 123
#             am_bcopy: <= 8255
#             am_zcopy: <= 8255, up to 4 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 127
#               domain: device
#           atomic_add: 64 bit
#          atomic_fadd: 64 bit
#         atomic_cswap: 64 bit
#           connection: to ep
#      device priority: 38
#     device num paths: 1
#              max eps: 256
#       device address: 3 bytes
#           ep address: 5 bytes
#       error handling: peer failure, ep_check
#
#
#      Transport: rc_mlx5
#         Device: mlx5_0:1
#           Type: network
#  System device: mlx5_0 (1)
#
#      capabilities:
#            bandwidth: 6433.22/ppn + 0.00 MB/sec
#              latency: 700 + 1.000 * N nsec
#             overhead: 40 nsec
#            put_short: <= 2K
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 14 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 4K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 14 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 4K
#             am_short: <= 2046
#             am_bcopy: <= 8254
#             am_zcopy: <= 8254, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 186
#               domain: device
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to ep
#      device priority: 38
#     device num paths: 1
#              max eps: 256
#       device address: 3 bytes
#           ep address: 7 bytes
#       error handling: buffer (zcopy), remote access, peer failure, ep_check
#
#
#      Transport: dc_mlx5
#         Device: mlx5_0:1
#           Type: network
#  System device: mlx5_0 (1)
#
#      capabilities:
#            bandwidth: 6433.22/ppn + 0.00 MB/sec
#              latency: 760 nsec
#             overhead: 40 nsec
#            put_short: <= 2K
#            put_bcopy: <= 8256
#            put_zcopy: <= 1G, up to 11 iov
#  put_opt_zcopy_align: <= 512
#        put_align_mtu: <= 4K
#            get_bcopy: <= 8256
#            get_zcopy: 65..1G, up to 11 iov
#  get_opt_zcopy_align: <= 512
#        get_align_mtu: <= 4K
#             am_short: <= 2046
#             am_bcopy: <= 8254
#             am_zcopy: <= 8254, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 138
#               domain: device
#           atomic_add: 32, 64 bit
#           atomic_and: 32, 64 bit
#            atomic_or: 32, 64 bit
#           atomic_xor: 32, 64 bit
#          atomic_fadd: 32, 64 bit
#          atomic_fand: 32, 64 bit
#           atomic_for: 32, 64 bit
#          atomic_fxor: 32, 64 bit
#          atomic_swap: 32, 64 bit
#         atomic_cswap: 32, 64 bit
#           connection: to iface
#      device priority: 38
#     device num paths: 1
#              max eps: inf
#       device address: 3 bytes
#        iface address: 5 bytes
#       error handling: buffer (zcopy), remote access, peer failure, ep_check
#
#
#      Transport: ud_verbs
#         Device: mlx5_0:1
#           Type: network
#  System device: mlx5_0 (1)
#
#      capabilities:
#            bandwidth: 6433.22/ppn + 0.00 MB/sec
#              latency: 730 nsec
#             overhead: 105 nsec
#             am_short: <= 116
#             am_bcopy: <= 4088
#             am_zcopy: <= 4088, up to 5 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 3952
#           connection: to ep, to iface
#      device priority: 38
#     device num paths: 1
#              max eps: inf
#       device address: 3 bytes
#        iface address: 3 bytes
#           ep address: 6 bytes
#       error handling: peer failure, ep_check
#
#
#      Transport: ud_mlx5
#         Device: mlx5_0:1
#           Type: network
#  System device: mlx5_0 (1)
#
#      capabilities:
#            bandwidth: 6433.22/ppn + 0.00 MB/sec
#              latency: 730 nsec
#             overhead: 80 nsec
#             am_short: <= 180
#             am_bcopy: <= 4088
#             am_zcopy: <= 4088, up to 3 iov
#   am_opt_zcopy_align: <= 512
#         am_align_mtu: <= 4K
#            am header: <= 132
#           connection: to ep, to iface
#      device priority: 38
#     device num paths: 1
#              max eps: inf
#       device address: 3 bytes
#        iface address: 3 bytes
#           ep address: 6 bytes
#       error handling: peer failure, ep_check
#
#
# Connection manager: rdmacm
#      max_conn_priv: 54 bytes
#
# Memory domain: cma
#     Component: cma
#             register: unlimited, cost: 9 nsec
#
#      Transport: cma
#         Device: memory
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 0.00/ppn + 11145.00 MB/sec
#              latency: 80 nsec
#             overhead: 2000 nsec
#            put_zcopy: unlimited, up to 16 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 1
#            get_zcopy: unlimited, up to 16 iov
#  get_opt_zcopy_align: <= 1
#        get_align_mtu: <= 1
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 4 bytes
#       error handling: peer failure, ep_check
#
#
# Memory domain: knem
#     Component: knem
#             register: unlimited, cost: 180 nsec
#           remote key: 16 bytes
#
#      Transport: knem
#         Device: memory
#           Type: intra-node
#  System device: <unknown>
#
#      capabilities:
#            bandwidth: 13862.00/ppn + 0.00 MB/sec
#              latency: 80 nsec
#             overhead: 2000 nsec
#            put_zcopy: unlimited, up to 16 iov
#  put_opt_zcopy_align: <= 1
#        put_align_mtu: <= 1
#            get_zcopy: unlimited, up to 16 iov
#  get_opt_zcopy_align: <= 1
#        get_align_mtu: <= 1
#           connection: to iface
#      device priority: 0
#     device num paths: 1
#              max eps: inf
#       device address: 8 bytes
#        iface address: 0 bytes
#       error handling: none
#

cgorac · Mar 17 '22 11:03