amzn-drivers
amzn-drivers copied to clipboard
[Bug]: EFA: ibv_open_device() eventually fails when running in loop
Preliminary Actions
- [X] I have searched the existing issues and didn't find a duplicate.
- [X] I have followed the AWS official troubleshoot documentation.
- [X] I have followed the driver readme and best practices.
Driver Type
Linux kernel driver for Elastic Fabric Adapter (EFA)
Driver Tag/Commit
2.8.0g
Custom Code
No
OS Platform and Distribution
5.15.0-1055-aws #60~20.04.1-Ubuntu SMP
$ cat /sys/class/infiniband/rdmap79s0/device/driver/module/version
2.8.0g
$ cat /sys/class/infiniband/rdmap79s0/device/device
0xefa1
Bug description
The program below eventually fails with ENOMEM after a few loops. It can be reproduced at will by restarting the program. When the ibv_create_comp_channel() call is removed, the failure no longer seems to reproduce.
Is CQ creation with a completion channel (comp_channel) supported on EFA?
Reproduction steps
The source for ibv.c is at the end of the description:
$ gcc ./ibv.c -libverbs && ./a.out
Using rdmap79s0:
................................................................
................................................................
................................................................
................................................................
ibv_open_device(rdmap79s0) failed: Cannot allocate memory (12)
Expected Behavior
If CQ creation with a completion channel is not supported: maybe an ibv call failure.
If CQ creation with a completion channel is supported: no failure, even when running in a loop.
Actual Behavior
The call ibv_open_device(rdmap79s0) eventually fails with ENOMEM.
Additional Data
No response
Relevant log output
$ strace ./a.out
openat(AT_FDCWD, "/dev/infiniband/uverbs0", O_RDWR|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFCHR|0666, st_rdev=makedev(0xe7, 0xc0), ...}) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde70) = -1 ENOSPC (No space left on device)
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffddc0) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffd8f0) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdf50) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde30) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdc30) = 0
mmap(NULL, 4096, PROT_READ, MAP_SHARED, 3, 0) = 0x155555552000
mmap(NULL, 4096, PROT_WRITE, MAP_SHARED, 3, 0x1000) = 0x155555008000
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdf00) = 0
munmap(0x155555008004, 4096) = -1 EINVAL (Invalid argument)
munmap(0x155555552000, 4096) = 0
close(5) = 0
close(3) = 0
close(4) = 0
openat(AT_FDCWD, "/dev/infiniband/uverbs0", O_RDWR|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFCHR|0666, st_rdev=makedev(0xe7, 0xc0), ...}) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde70) = -1 ENOSPC (No space left on device)
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffddc0) = 0 <============================ DID NOT FAIL
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffd8f0) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdf50) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde30) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdc30) = 0
mmap(NULL, 4096, PROT_READ, MAP_SHARED, 3, 0) = 0x155555552000
mmap(NULL, 4096, PROT_WRITE, MAP_SHARED, 3, 0x1000) = 0x155555007000
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffdf00) = 0
munmap(0x155555007004, 4096) = -1 EINVAL (Invalid argument)
munmap(0x155555552000, 4096) = 0
close(5) = 0
close(3) = 0
close(4) = 0
openat(AT_FDCWD, "/dev/infiniband/uverbs0", O_RDWR|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFCHR|0666, st_rdev=makedev(0xe7, 0xc0), ...}) = 0
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffde70) = -1 ENOSPC (No space left on device)
ioctl(3, RDMA_VERBS_IOCTL, 0x7fffffffddc0) = -1 ENOMEM (Cannot allocate memory) <====================== FAILS
close(3) = 0
write(1, "Using rdmap79s0:\n..............."..., 340Using rdmap79s0:
................................................................
................................................................
................................................................
................................................................
ibv_open_device(rdmap79s0) failed: Cannot allocate memory (12)
) = 340
exit_group(1) = ?
+++ exited with 1 +++
$ cat ibv.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <infiniband/verbs.h>
/*
 * Report a failed libibverbs call on stdout and terminate with exit status 1.
 *
 * name: name of the failing call, printed verbatim.
 * dev:  device handle handed to ibv_get_device_name() for the message.
 *
 * errno is captured first so that evaluating the remaining printf()
 * arguments cannot overwrite the error being reported. The body is wrapped
 * in do { } while (0) so that `fatal(...);` behaves as one statement, and it
 * uses its own `dev` argument rather than a caller-scope variable.
 */
#define fatal(name, dev) do { \
	int fatal_errno_ = errno; \
	printf("\n%s(%s) failed: %s (%d)\n", \
	       (name), ibv_get_device_name(dev), \
	       strerror(fatal_errno_), fatal_errno_); \
	exit(1); \
} while (0)
/*
 * Run one complete open/create/destroy/close pass against `device`.
 *
 * Steps, in order: open the device, create a completion channel (while the
 * #if below is enabled), create a CQ using that channel, destroy the CQ,
 * destroy the channel if one was created, and close the device. Any step
 * that fails is reported through fatal().
 */
static void cycle(struct ibv_device *device)
{
	struct ibv_comp_channel *channel = NULL;
	struct ibv_context *ctx;
	struct ibv_cq *queue;

	ctx = ibv_open_device(device);
	if (ctx == NULL) {
		fatal("ibv_open_device", device);
	}

#if 1	/* switch to 0 to create the CQ without a completion channel */
	channel = ibv_create_comp_channel(ctx);
	if (channel == NULL) {
		fatal("ibv_create_comp_channel", device);
	}
#endif

	queue = ibv_create_cq(ctx, 100, NULL, channel, 0);
	if (queue == NULL) {
		fatal("ibv_create_cq", device);
	}

	if (ibv_destroy_cq(queue) != 0) {
		fatal("ibv_destroy_cq", device);
	}

	/* channel stays NULL when the #if block above is disabled. */
	if (channel != NULL) {
		if (ibv_destroy_comp_channel(channel) != 0) {
			fatal("ibv_destroy_comp_channel", device);
		}
	}

	if (ibv_close_device(ctx) != 0) {
		fatal("ibv_close_device", device);
	}
}
/*
 * Select the first device returned by ibv_get_device_list() and call
 * cycle() on it in an endless loop, printing one '.' per iteration and a
 * line break before every 64th one.
 *
 * Returns -1 when no device list (or an empty one) is available. The loop
 * itself has no exit condition.
 */
int main(void)
{
	int count;
	struct ibv_device **devices = ibv_get_device_list(&count);

	if (devices == NULL || count <= 0) {
		printf("Cannot get device list\n");
		return -1;
	}

	printf("Using %s:\n", ibv_get_device_name(devices[0]));

	unsigned iteration = 0;
	for (;;) {
		/* Start a new row of dots every 64 iterations, except the first. */
		const char *prefix =
			(iteration != 0 && iteration % 64 == 0) ? "\n" : "";

		printf("%s.", prefix);
		cycle(devices[0]);
		iteration++;
	}

	/* Not reached: the loop above never breaks. */
	ibv_free_device_list(devices);
	return 0;
}
Contact Details
No response