CINN
CINN copied to clipboard
Example of conv2d generating 2 kernels
此 PR 通过举例说明:在不使用 Schedule 时,初始的 Conv2d Compute 在 CUDA 端会生成两个 kernel。
单测结果:
I0601 11:14:21.862107 52538 codegen_cuda_dev_test.cc:673] func number is : 2
I0601 11:14:21.862145 52538 codegen_cuda_dev_test.cc:676] func is :
function schedule_conv2d_0 (_X, _input_pad)
{
for (j, 0, 128)
{
for (k, 0, 28)
{
for (a, 0, 28)
{
input_pad[0, j, k, a] = X[0, j, k, a]
}
}
}
}
I0601 11:14:21.862172 52538 codegen_cuda_dev_test.cc:676] func is :
function schedule_conv2d_0_1 (_X, _Y, _input_pad, _COD)
{
for (j, 0, 256)
{
for (k, 0, 14)
{
for (a, 0, 14)
{
COD__reduce_init[0, j, k, a] = 0
for (rc, 0, 128)
{
COD[0, j, k, a] = (COD[0, j, k, a] + (input_pad[0, rc, (2 * k), (2 * a)] * Y[j, rc, 0, 0]))
}
}
}
}
}
I0601 11:14:21.882473 52538 codegen_cuda_dev_test.cc:682] compiled test_no_schedule_conv2d code:
// Generated CUDA source for the unscheduled conv2d test: two separate kernels,
// schedule_conv2d_0 (copy into input_pad) and schedule_conv2d_0_1 (reduction).
// Neither kernel reads threadIdx/blockIdx, so the loop nests below are not
// partitioned across threads in this code.
extern "C" {
#include "cinn_cuda_runtime_source.cuh"
// Fixed-width integer aliases, defined only when __CUDACC_RTC__ is set
// (presumably because the standard integer headers are unavailable there — TODO confirm).
#ifdef __CUDACC_RTC__
typedef int int32_t;
typedef char int8_t;
#endif
// Copies X into input_pad element by element over a 128 x 28 x 28 index space.
//   X         : read-only source buffer.
//   input_pad : destination buffer, written at the same flat offset as X.
// Flat offset is 784*j + 28*k + a (784 = 28*28), so at least 128*784 floats
// are accessed in each buffer.
__global__
void schedule_conv2d_0(const float* __restrict__ X, float* __restrict__ input_pad)
{
for (int32_t j = 0; j < 128; j += 1) {
for (int32_t k = 0; k < 28; k += 1) {
for (int32_t a = 0; a < 28; a += 1) {
input_pad[((784 * j) + ((28 * k) + a))] = X[((784 * j) + ((28 * k) + a))];
};
};
};
// NOTE: the generator emits the next kernel's __global__ qualifier on the same
// line as this kernel's closing brace.
}__global__
// Computes COD over a 256 x 14 x 14 index space as a sum over rc in [0, 128):
//   COD[196*j + 14*k + a] = sum_rc input_pad[784*rc + 56*k + 2*a] * Y[128*j + rc]
//   X         : declared but never read in this kernel.
//   Y         : read-only, indexed as 128*j + rc.
//   input_pad : read-only, sampled with a step of 2 in both k (56 = 2*28) and a.
//   COD       : output buffer; each element is zeroed before accumulation.
void schedule_conv2d_0_1(const float* __restrict__ X, const float* __restrict__ Y, const float* __restrict__ input_pad, float* __restrict__ COD)
{
// The reduce-init tensor is an alias of the output buffer, not separate storage.
float* COD__reduce_init = COD;
for (int32_t j = 0; j < 256; j += 1) {
for (int32_t k = 0; k < 14; k += 1) {
for (int32_t a = 0; a < 14; a += 1) {
// Reset the accumulator for this output element (writes through the alias of COD).
COD__reduce_init[((196 * j) + ((14 * k) + a))] = 0;
// Accumulate directly in COD: one read-modify-write of the output element per rc.
for (int32_t rc = 0; rc < 128; rc += 1) {
COD[((196 * j) + ((14 * k) + a))] = (COD[((196 * j) + ((14 * k) + a))] + (input_pad[((2 * a) + ((56 * k) + (784 * rc)))] * Y[((128 * j) + rc)]));
};
};
};
};
}
}
Thanks for your contribution!