InfiniBand does not work; please help.
I ran into the same problem.
// Builds an RDMA buffer over the caller-supplied memory at `addr` (`size` bytes).
//
// Steps, in order:
//   1. Register the memory with the channel adapter's protection domain,
//      requesting local-write and remote-write access; abort via CHECK if
//      registration returns null.
//   2. Use the current length of channel_->buffers_ as this buffer's id and
//      append `this` to that list.
//   3. Pass the local memory region and id to channel_->SendMR(), then take the
//      next entry from channel_->memory_regions_queue_ as the peer's region.
RDMABuffer::RDMABuffer(RDMAChannel* channel, uint8_t* addr, size_t size)
    : channel_(channel),
      addr_(addr),
      size_(size) {
  //*******************************************************
  // Reporter's experiments (kept for reference; both lines stay disabled).
  //
  // Case 1: host (CPU) memory. ibv_reg_mr() succeeds, but other code then fails.
  //   addr_ = reinterpret_cast<uint8_t*>(malloc(size));
  //
  //   http://server01:8042/node/containerlogs/container_1512543960414_0001_01_000003/root/stderr/?start=0
  //   F1206 02:14:43.892500 18704 math_functions.cu:79] Check failed: error == cudaSuccess (77 vs. 0) an illegal memory access was encountered
  //   *** Check failure stack trace: ***
  //
  // Case 2: device (GPU) memory. ibv_reg_mr() fails.
  //   CUDA_CHECK(cudaMalloc(&addr_, size));
  //
  //   http://server01:8042/node/containerlogs/container_1512543960414_0001_01_000003/root/stderr/?start=0
  //   F1205 17:02:12.639581 7160 rdma.cpp:327] Check failed: self_ Failed to register memory region.
  //*******************************************************

  // Step 1: register the memory region; a null result is fatal.
  self_ = ibv_reg_mr(channel_->adapter_.pd_,
                     addr_,
                     size,
                     IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
  CHECK(self_) << "Failed to register memory region";

  // Step 2: the id is this buffer's index in the channel's buffer list.
  id_ = channel_->buffers_.size();
  channel_->buffers_.push_back(this);

  // Step 3: hand our region to the channel, then fetch the peer's region.
  channel_->SendMR(self_, id_);
  peer_ = channel_->memory_regions_queue_.pop();
}
//******************************************************* root@5ec610095991:~/CaffeOnSpark/caffe-public# more Makefile.config
Refer to http://caffe.berkeleyvision.org/installation.html Parallelization over InfiniBand or RoCE INFINIBAND := 1
//******************************************************* root@server01:/rt/data/alexNet2# ibv_devices device node GUID ------ ---------------- mlx5_0 ec0d9a0300397dd2
//******************************************************* root@server01:/rt/data/alexNet2# ibv_devinfo hca_id: mlx5_0 transport: InfiniBand (0) fw_ver: 12.21.1000 node_guid: ec0d:9a03:0039:7dd2 sys_image_guid: ec0d:9a03:0039:7dd2 vendor_id: 0x02c9 vendor_part_id: 4115 hw_ver: 0x0 board_id: MT_2180110032 phys_port_cnt: 1 Device ports: port: 1 state: PORT_ACTIVE (4) max_mtu: 4096 (5) active_mtu: 4096 (5) sm_lid: 1 port_lid: 2 port_lmc: 0x00 link_layer: InfiniBand
//*******************************************************
root@5ec610095991:~/CaffeOnSpark/caffe-public# nvidia-smi
Wed Dec 6 07:34:09 2017
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.69 Driver Version: 384.69 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GTX 108... Off | 00000000:04:00.0 Off | N/A |
| 20% 33C P8 16W / 250W | 10MiB / 11172MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 1 GeForce GTX 108... Off | 00000000:06:00.0 Off | N/A |
| 20% 36C P8 17W / 250W | 10MiB / 11172MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 2 GeForce GTX 108... Off | 00000000:07:00.0 Off | N/A |
| 20% 33C P8 8W / 250W | 10MiB / 11172MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 3 GeForce GTX 108... Off | 00000000:08:00.0 Off | N/A |
| 20% 34C P8 8W / 250W | 10MiB / 11172MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 4 GeForce GTX 108... Off | 00000000:0C:00.0 Off | N/A |
| 20% 28C P8 9W / 250W | 10MiB / 11172MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 5 GeForce GTX 108... Off | 00000000:0D:00.0 Off | N/A |
| 20% 27C P8 9W / 250W | 10MiB / 11172MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 6 GeForce GTX 108... Off | 00000000:0E:00.0 Off | N/A |
| 20% 31C P8 9W / 250W | 10MiB / 11172MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 7 GeForce GTX 108... Off | 00000000:0F:00.0 Off | N/A |
| 20% 31C P8 9W / 250W | 10MiB / 11172MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+ | Processes: GPU Memory | | GPU PID Type Process name Usage | |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+
//******************************************************* [root@server00 01_basic-client-server]# docker images REPOSITORY TAG IMAGE ID CREATED SIZE docker.io/nvidia/cuda 8.0-devel 7e0c5ccdc1eb 2 weeks ago 1.681 GB
//******************************************************* Installation Mellanox OFED for Ubuntu on a Host MLNX_OFED_LINUX-4.2-1.0.0.0-ubuntu16.04-x86_64.tgz
//******************************************************* [root@server01 ~]# systemctl status nv_peer_mem ● nv_peer_mem.service - LSB: Activates/Deactivates nv_peer_mem module to start at boot time. Loaded: loaded (/etc/rc.d/init.d/nv_peer_mem; bad; vendor preset: disabled) Active: active (exited) since Wed 2017-12-06 05:16:08 EST; 1min 32s ago Docs: man:systemd-sysv-generator(8) Process: 2055 ExecStart=/etc/rc.d/init.d/nv_peer_mem start (code=exited, status=0/SUCCESS)
Dec 06 05:16:08 server01 systemd[1]: Starting LSB: Activates/Deactivates nv_peer_mem module to start at boot time.... Dec 06 05:16:08 server01 nv_peer_mem[2055]: starting... OK Dec 06 05:16:08 server01 systemd[1]: Started LSB: Activates/Deactivates nv_peer_mem module to start at boot time.