neural_kernels_code
RuntimeError: Could not compile function
I used your Docker image to run run_kernel_myrtle5.sh on an Ubuntu 18.04 workstation with 4x RTX 2080 Ti GPUs and eventually hit the error below. Could you give me a hint on how to resolve this?
Current Count Is: 1084000
Current Count Is: 1085000
Data_q size start 1085764
0%| | 0/2500000000 [00:00<?, ?it/s]Context already set..
Context already set..
Context already set..
Context already set..
STARTING KERNEL GEN HELP
STARTING KERNEL GEN HELP
STARTING KERNEL GEN HELP
Layer KWARGS: [{}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {'precision': 'float64'}, {'store_norm': False, 'precision': 'float64'}, {'precision': 'float64', 'store_norm': True}, {'store_norm': False, 'precision': 'float64'}]
STARTING KERNEL GEN HELP
Layer KWARGS: [{}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {'precision': 'float64'}, {'store_norm': False, 'precision': 'float64'}, {'precision': 'float64', 'store_norm': True}, {'store_norm': False, 'precision': 'float64'}]
Layer KWARGS: [{}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {'precision': 'float64'}, {'store_norm': False, 'precision': 'float64'}, {'precision': 'float64', 'store_norm': True}, {'store_norm': False, 'precision': 'float64'}]
Layer KWARGS: [{}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {'precision': 'float64'}, {'store_norm': False, 'precision': 'float64'}, {'precision': 'float64', 'store_norm': True}, {'store_norm': False, 'precision': 'float64'}]
TC "conv3_input" was not explicitly compiled for inputs of sizes:
torch.Size([8, 30, 30, 1]) torch.Size([8, 30, 30, 1])
....Generate implicit MappingOptions
TC "conv3_input" was not explicitly compiled for inputs of sizes:
torch.Size([8, 30, 30, 1]) torch.Size([8, 30, 30, 1])
....Generate implicit MappingOptions
TC "conv3_input" was not explicitly compiled for inputs of sizes:
torch.Size([8, 30, 30, 1]) torch.Size([8, 30, 30, 1])
....Generate implicit MappingOptions
TC "conv3_input" was not explicitly compiled for inputs of sizes:
torch.Size([8, 30, 30, 1]) torch.Size([8, 30, 30, 1])
....Generate implicit MappingOptions
E0618 20:39:27.390336 39 cuda_rtc.cc:251] Compilation failure for nvrtc(NVRTC_ERROR_INVALID_OPTION):
nvrtc: error: invalid value for --gpu-architecture (-arch)
source:
template<typename T> inline __device__ T floord(T n, T d) {
return n < 0 ? - (-n + d - 1)/d : n / d;
}
#define if_then_else(cond,a,b) ((cond) ? (a) : (b))
#ifndef __CUDACC_RTC__
// Can't include system dependencies with NVRTC
// Can't include cuda_fp16.h with NVRTC due to transitive system dependencies
#include <cuda_fp16.h>
#endif
#define inff __int_as_float(0x7f800000)
#define inf __longlong_as_double(0x7ff0000000000000LL)
// Before CUDA 9, syncwarp is a noop since warps are always synchronized.
#if (!defined(__clang__) && __CUDACC_VER_MAJOR__ < 9) || \
( defined(__clang__) && CUDA_VERSION < 9000)
inline __device__ void __syncwarp(unsigned mask = 0xFFFFFFFF) {}
#endif
extern "C" {
__global__ void conv3_input_8_30_1_30(int B, int H, int P, int W, float* pconv_output, const float* pX, const float* pY) {
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
float (*conv_output)[8][(30 + -2)][(30 + -2)][(30 + -2)][(30 + -2)] = reinterpret_cast<float (*)[8][(30 + -2)][(30 + -2)][(30 + -2)][(30 + -2)]>(pconv_output);
const float (*X)[30][30][1] = reinterpret_cast<const float (*)[30][30][1]>(pX);
const float (*Y)[30][30][1] = reinterpret_cast<const float (*)[30][30][1]>(pY);
for (int c7 = 0; c7 <= 7; c7 += 1) {
for (int c8 = 0; c8 <= 7; c8 += 1) {
for (int c9 = 0; c9 <= 27; c9 += 1) {
for (int c10 = 0; c10 <= 27; c10 += 1) {
for (int c11 = t1; c11 <= 27; c11 += 8) {
conv_output[c7][c8][c9][c10][c11][t0] = (float)0.000000;
for (int c13 = 0; c13 <= 2; c13 += 1) {
for (int c14 = 0; c14 <= 2; c14 += 1) {
conv_output[c7][c8][c9][c10][c11][t0] = (conv_output[c7][c8][c9][c10][c11][t0] + (X[c7][(c9 + c13)][(c10 + c14)][0]*Y[c8][(c11 + c13)][(t0 + c14)][0]));
}
}
}
}
}
}
}
}
}
Process Process-4:
Traceback (most recent call last):
File "/root/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/root/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/neural_kernels_code/kernel_gen.py", line 351, in _kernel_gen_help
kx = dnet.forward(x_b, y_b, gpu=gpu_idx, pp_net=net).cpu().numpy().squeeze()
File "/neural_kernels_code/kernel_gen.py", line 127, in forward
prev_norm = self.layers[0](x_b, x_b, **self.kwargs_list[0])
File "/neural_kernels_code/tc_kernels.py", line 434, in conv3zp_input
return res.conv3_input(x,y)/(3*3)
File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/__init__.py", line 348, in fun
tc_def_name, *inputs, outputs=outputs, unchecked=unchecked)
File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/__init__.py", line 398, in __call__
implicit_compile(self, entry_point, *inputs)
File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/__init__.py", line 392, in implicit_compile
entry_point, inputs, mapping_options)
RuntimeError: Could not compile function
(The same nvrtc "invalid value for --gpu-architecture (-arch)" error, generated kernel source, and "RuntimeError: Could not compile function" traceback are then repeated for Process-2, Process-1, and Process-3.)
After I interrupt it with Ctrl+C, the following traceback is printed:
^CTraceback (most recent call last):
File "run_train_eval_exp.py", line 156, in <module>
main()
File "run_train_eval_exp.py", line 51, in main
K_train, K_test = generate_kernels(cfg, X_train, X_test)
File "run_train_eval_exp.py", line 101, in generate_kernels
K_train = kernel_gen.generate_kernel_parallel(cfg.KERNEL, X_train, X_train, num_gpus=cfg.SYSTEM.NUM_GPUS, symmetric=True, batch_size=cfg.SYSTEM.BATCH_SIZE, cache_path=cfg.SYSTEM.CACHE_PATH, float32=cfg.SYSTEM.FLOAT_32, extra_info={"kernel_type": "Train"})
File "/neural_kernels_code/kernel_gen.py", line 438, in generate_kernel_parallel
progress = done_q.get()
File "/root/conda/lib/python3.6/multiprocessing/queues.py", line 94, in get
res = self._recv_bytes()
File "/root/conda/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/root/conda/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/root/conda/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
KeyboardInterrupt
^CError in atexit._run_exitfuncs:
Traceback (most recent call last):
File "/root/conda/lib/python3.6/multiprocessing/util.py", line 262, in _run_finalizers
finalizer()
File "/root/conda/lib/python3.6/multiprocessing/util.py", line 186, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/root/conda/lib/python3.6/multiprocessing/queues.py", line 191, in _finalize_join
thread.join()
File "/root/conda/lib/python3.6/threading.py", line 1056, in join
self._wait_for_tstate_lock()
File "/root/conda/lib/python3.6/threading.py", line 1072, in _wait_for_tstate_lock
elif lock.acquire(block, timeout):
KeyboardInterrupt
^C
Ah, I am pretty sure the code only works on V100s or Titan Vs right now.
I am working on a JAX-based implementation that supports more GPU types, but alas it is not out yet.
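As a quick sanity check, you can confirm what compute capability your GPUs report; the nvrtc "--gpu-architecture" failure above is consistent with the bundled Tensor Comprehensions / NVRTC stack only accepting Volta (sm_70). This is just a minimal sketch using plain PyTorch, not something that ships with the repo:

import torch

# List each visible GPU and its compute capability. The Docker image's
# Tensor Comprehensions build JIT-compiles kernels with NVRTC, and the log
# above shows it rejecting the architecture flag derived for the 2080 Ti
# (Turing, sm_75); V100 / Titan V are Volta, sm_70.
for i in range(torch.cuda.device_count()):
    major, minor = torch.cuda.get_device_capability(i)
    name = torch.cuda.get_device_name(i)
    status = "expected to work (Volta)" if (major, minor) == (7, 0) \
        else "likely rejected by the bundled NVRTC"
    print(f"GPU {i}: {name} -- sm_{major}{minor}, {status}")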
@Vaishaal Yes, the code does run on a V100. But I found that your generate_kernel_parallel function breaks on a new MNIST-style dataset when run on a single GPU, while the single-threaded generate_kernel works fine, so I suggest making the single-threaded path the default in the single-GPU case (see the sketch below).
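Roughly what I have in mind (just a sketch; I am assuming the single-threaded kernel_gen.generate_kernel accepts the same keyword arguments as generate_kernel_parallel minus num_gpus, which may not match its actual signature):

import kernel_gen  # repo-local module, as used in run_train_eval_exp.py

def build_train_kernel(cfg, X_train):
    # Hypothetical helper: shared keyword arguments copied from the existing
    # generate_kernel_parallel call in generate_kernels().
    common = dict(
        symmetric=True,
        batch_size=cfg.SYSTEM.BATCH_SIZE,
        cache_path=cfg.SYSTEM.CACHE_PATH,
        float32=cfg.SYSTEM.FLOAT_32,
        extra_info={"kernel_type": "Train"},
    )
    if cfg.SYSTEM.NUM_GPUS <= 1:
        # Single-threaded path: avoids the multiprocessing machinery on one GPU.
        # NOTE: generate_kernel's signature is assumed to mirror the parallel one.
        return kernel_gen.generate_kernel(cfg.KERNEL, X_train, X_train, **common)
    return kernel_gen.generate_kernel_parallel(
        cfg.KERNEL, X_train, X_train, num_gpus=cfg.SYSTEM.NUM_GPUS, **common
    )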
Noted! What was the exception? That should not happen; I can try to fix it.