ucx icon indicating copy to clipboard operation
ucx copied to clipboard

UCT/CUDA: set default max_reg_ratio to 1.0

Open Akshay-Venkatesh opened this issue 2 years ago • 0 comments

What

Set the default `max_reg_ratio` to 1.0, which means that CUDA pinned allocations of any size will be registered in full by IB.

Why ?

Pinned device memory cannot be swapped out in any case, so registering whole allocations with IB doesn't add any memory pressure. A user would have to free memory in order to allocate more than what is physically available, at which point the IB registrations also go away (assuming correct interception). For devices with low BAR1 capacity (currently only T4 is detected), the following pieces of code disable whole-allocation registration to avoid BAR1 exhaustion:

 /*
  * Returns the total memory size, in bytes, of the given CUDA device as
  * reported by cuDeviceTotalMem(). The value is cached per device in a static
  * array; the cache lookup and fill are done while holding
  * uct_cuda_base_lock.
  *
  * Devices whose name starts with "T4" get a cached value of 1 instead of the
  * real size, so that whole-allocation registration is not used for them.
  *
  * If a CUDA call fails, the error is logged and 1 is returned so that the
  * result is never zero (avoids division by zero).
  *
  * cuda_device must be smaller than UCT_CUDA_MAX_DEVICES (checked with
  * ucs_assert).
  *
  * NOTE(review): cuDeviceTotalMem() writes straight into the cache entry, so
  * if cuDeviceGetName() fails afterwards the entry stays populated and later
  * calls return the real size without repeating the name check - confirm
  * whether that is intended.
  */
 66 static size_t
 67 uct_cuda_base_get_total_device_mem(CUdevice cuda_device)
 68 {
 69     static size_t total_bytes[UCT_CUDA_MAX_DEVICES];
 70     char dev_name[UCT_CUDA_DEV_NAME_MAX_LEN];
 71     CUresult cu_err;
 72     const char *cu_err_str;
 73 
 74     ucs_assert(cuda_device < UCT_CUDA_MAX_DEVICES);
 75 
 76     ucs_spin_lock(&uct_cuda_base_lock);
 77 
        /* A zero entry means this device has not been queried yet */
 78     if (!total_bytes[cuda_device]) {
 79         cu_err = cuDeviceTotalMem(&total_bytes[cuda_device], cuda_device);
 80         if (cu_err != CUDA_SUCCESS) {
 81             cuGetErrorString(cu_err, &cu_err_str);
 82             ucs_error("cuDeviceTotalMem error: %s", cu_err_str);
 83             goto err;
 84         }
 85 
 86         cu_err = cuDeviceGetName(dev_name, sizeof(dev_name), cuda_device);
 87         if (cu_err != CUDA_SUCCESS) {
 88             cuGetErrorString(cu_err, &cu_err_str);
 89             ucs_error("cuDeviceGetName error: %s", cu_err_str);
 90             goto err;
 91         }
 92 
            /* Only the first two characters of the device name are compared */
 93         if (!strncmp(dev_name, "T4", 2)) {
 94             total_bytes[cuda_device] = 1; /* should ensure that whole alloc
 95                                              registration is not used for t4 */
 96         }
 97     }
 98 
        /* The cached value is read for the return after the lock is released */
 99     ucs_spin_unlock(&uct_cuda_base_lock);
100     return total_bytes[cuda_device];
101 
102 err:
103     ucs_spin_unlock(&uct_cuda_base_lock);
104     return 1; /* return 1 byte to avoid division by zero */
105 }


168     if (md->config.alloc_whole_reg == UCS_CONFIG_AUTO) {
169         total_bytes = uct_cuda_base_get_total_device_mem(cuda_device);
170         if (alloc_length > (total_bytes * md->config.max_reg_ratio)) {
171             goto out_default_range;
172         }
173     } else {
174         ucs_assert(md->config.alloc_whole_reg == UCS_CONFIG_ON);
175     }

Akshay-Venkatesh avatar Sep 06 '22 21:09 Akshay-Venkatesh