GPU Gen QFunction Assembly
We can save a lot of memory and probably get better performance if we make a stripped version of the Gen operators that assembles the QFunction. We could theoretically do this at the whole operator/diagonal level, but that would block reuse of the QF values, so I don't think that is a good idea.
cc @zatkins-dev I think this is much more immediately tractable if you want a GPU code generation project that I can coach you on. This would also be something we could adapt to the logic in #1824 once that is sorted out.
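For context, here is roughly what "assembling the QFunction" means, as a minimal host-side sketch; everything in it (the type aliases, qf_apply, the array layout) is an illustrative stand-in rather than libCEED API. For a QFunction that is linear in its active inputs, probing each active input component with a unit vector at every quadrature point and recording the outputs recovers the pointwise Jacobian blocks D_q, one column per probe.

// Minimal host-side sketch of QFunction assembly by unit-vector probing.
// All names here are illustrative stand-ins, not libCEED API.
#include <cstdio>

using CeedScalar = double;
using CeedInt    = int;

const CeedInt Q        = 6; // quadrature points handled per QFunction call
const CeedInt NUM_COMP = 2; // active input/output components

// Stand-in pointwise QFunction, linear in u:
//   v[0][q] = qd[0][q]*u[0][q] + qd[1][q]*u[1][q]
//   v[1][q] = qd[2][q]*u[0][q] + qd[3][q]*u[1][q]
static void qf_apply(const CeedScalar qd[4][Q], const CeedScalar u[NUM_COMP][Q], CeedScalar v[NUM_COMP][Q]) {
  for (CeedInt q = 0; q < Q; q++) {
    v[0][q] = qd[0][q] * u[0][q] + qd[1][q] * u[1][q];
    v[1][q] = qd[2][q] * u[0][q] + qd[3][q] * u[1][q];
  }
}

int main() {
  CeedScalar qd[4][Q], u[NUM_COMP][Q], v[NUM_COMP][Q];
  CeedScalar D[Q][NUM_COMP][NUM_COMP]; // assembled pointwise Jacobian blocks D_q

  for (CeedInt k = 0; k < 4; k++)
    for (CeedInt q = 0; q < Q; q++) qd[k][q] = 0.1 * (k + 1) + q;

  // Probe one active input component at a time with a unit vector set at every
  // quadrature point; since the QFunction acts pointwise, each pass yields
  // column j of D_q for all q simultaneously.
  for (CeedInt j = 0; j < NUM_COMP; j++) {
    for (CeedInt c = 0; c < NUM_COMP; c++)
      for (CeedInt q = 0; q < Q; q++) u[c][q] = (c == j) ? 1.0 : 0.0;
    qf_apply(qd, u, v);
    for (CeedInt q = 0; q < Q; q++)
      for (CeedInt c = 0; c < NUM_COMP; c++) D[q][c][j] = v[c][q];
  }

  printf("D_0 = [[%g, %g], [%g, %g]]\n", D[0][0][0], D[0][0][1], D[0][1][0], D[0][1][1]);
  return 0;
}

The kernel change sketched below does the same thing, but with the unit vector set in the quadrature-space registers and the resulting column written straight back out with a strided restriction, so the L-vector read and tensor basis action for the active input drop out entirely.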
Ok, for overhauling the current FEM code, I think we need to turn
#define T_1D 6
#include <ceed/jit-source/cuda/cuda-jit.h>
// Tensor basis source
#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>
// CodeGen operator source
#include <ceed/jit-source/cuda/cuda-gen-templates.h>
#undef CEED_Q_VLA
#define CEED_Q_VLA 6
// User QFunction source
#include "/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume.h"
// -----------------------------------------------------------------------------
// Operator Kernel
//
// d_[in,out]_i: CeedVector device array
// r_[in,out]_e_i: Element vector register
// r_[in,out]_q_i: Quadrature space vector register
// r_[in,out]_c_i: AtPoints Chebyshev coefficients register
// r_[in,out]_s_i: Quadrature space slice vector register
//
// s_B_[in,out]_i: Interpolation matrix, shared memory
// s_G_[in,out]_i: Gradient matrix, shared memory
// -----------------------------------------------------------------------------
extern "C" __global__ void CeedKernelCudaGenOperator_apply_mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda points) {
const CeedScalar *d_in_0 = fields.inputs[0];
const CeedScalar *d_in_1 = fields.inputs[1];
CeedScalar *d_out_0 = fields.outputs[0];
const CeedInt dim = 3;
const CeedInt Q_1d = 6;
extern __shared__ CeedScalar slice[];
SharedData_Cuda data;
data.t_id_x = threadIdx.x;
data.t_id_y = threadIdx.y;
data.t_id_z = threadIdx.z;
data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
data.slice = slice + data.t_id_z*T_1D*T_1D;
// Input field constants and basis data
// -- Input field 0: u
const CeedInt P_1d_in_0 = 5;
const CeedInt num_comp_in_0 = 1;
// EvalMode: interpolation
__shared__ CeedScalar s_B_in_0[P_1d_in_0*Q_1d];
LoadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);
// -- Input field 1: qdata
const CeedInt P_1d_in_1 = 6;
const CeedInt num_comp_in_1 = 1;
// EvalMode: none
// Output field constants and basis data
// -- Output field 0: v
const CeedInt P_1d_out_0 = 5;
const CeedInt num_comp_out_0 = 1;
// EvalMode: interpolation
CeedScalar *s_B_out_0 = s_B_in_0;
// Element loop
__syncthreads();
for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
// Scratch restriction buffer space
CeedScalar r_e_scratch[216];
// -- Input field restrictions and basis actions
// ---- Input field 0: u
CeedScalar *r_e_in_0 = r_e_scratch;
const CeedInt l_size_in_0 = 274625;
// CompStride: 274625
ReadLVecStandard3d<num_comp_in_0, 274625, P_1d_in_0>(data, l_size_in_0, elem, indices.inputs[0], d_in_0, r_e_in_0);
// EvalMode: interpolation
CeedScalar r_q_in_0[num_comp_in_0*Q_1d];
InterpTensor3d<num_comp_in_0, P_1d_in_0, Q_1d>(data, r_e_in_0, s_B_in_0, r_q_in_0);
// ---- Input field 1: qdata
CeedScalar r_e_in_1[num_comp_in_1*P_1d_in_1];
// Strides: {1, 884736, 216}
ReadLVecStrided3d<num_comp_in_1, P_1d_in_1, 1, 884736, 216>(data, elem, d_in_1, r_e_in_1);
// EvalMode: none
CeedScalar *r_q_in_1 = r_e_in_1;
// -- Output field setup
// ---- Output field 0: v
CeedScalar r_q_out_0[num_comp_out_0*Q_1d];
// Note: Using full elements
{
// -- Input fields
// ---- Input field 0: u
CeedScalar *r_s_in_0 = r_q_in_0;
// ---- Input field 1: qdata
CeedScalar *r_s_in_1 = r_q_in_1;
// -- Output fields
// ---- Output field 0: v
CeedScalar *r_s_out_0 = r_q_out_0;
// -- QFunction inputs and outputs
// ---- Inputs
CeedScalar *inputs[2];
// ------ Input field 0: u
inputs[0] = r_s_in_0;
// ------ Input field 1: qdata
inputs[1] = r_s_in_1;
// ---- Outputs
CeedScalar *outputs[1];
// ------ Output field 0: v
outputs[0] = r_s_out_0;
// -- Apply QFunction
apply_mass(ctx, Q_1d, inputs, outputs);
}
// -- Output field basis action and restrictions
// ---- Output field 0: v
// EvalMode: interpolation
CeedScalar *r_e_out_0 = r_e_scratch;
InterpTransposeTensor3d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_q_out_0, s_B_out_0, r_e_out_0);
const CeedInt l_size_out_0 = 274625;
// CompStride: 274625
WriteLVecStandard3d<num_comp_out_0, 274625, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
}
}
// -----------------------------------------------------------------------------
into something like
#define T_1D 6
#include <ceed/jit-source/cuda/cuda-jit.h>
// Tensor basis source
#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>
// CodeGen operator source
#include <ceed/jit-source/cuda/cuda-gen-templates.h>
#undef CEED_Q_VLA
#define CEED_Q_VLA 6
// User QFunction source
#include "/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume.h"
// -----------------------------------------------------------------------------
// Operator Kernel
//
// d_[in,out]_i: CeedVector device array
// r_[in,out]_e_i: Element vector register
// r_[in,out]_q_i: Quadrature space vector register
// r_[in,out]_c_i: AtPoints Chebyshev coefficients register
// r_[in,out]_s_i: Quadrature space slice vector register
//
// s_B_[in,out]_i: Interpolation matrix, shared memory
// s_G_[in,out]_i: Gradient matrix, shared memory
// -----------------------------------------------------------------------------
extern "C" __global__ void CeedKernelCudaGenOperator_apply_mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda points) {
const CeedScalar *d_in_0 = fields.inputs[0];
const CeedScalar *d_in_1 = fields.inputs[1];
CeedScalar *d_out_0 = fields.outputs[0];
const CeedInt dim = 3;
const CeedInt Q_1d = 6;
extern __shared__ CeedScalar slice[];
SharedData_Cuda data;
data.t_id_x = threadIdx.x;
data.t_id_y = threadIdx.y;
data.t_id_z = threadIdx.z;
data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
data.slice = slice + data.t_id_z*T_1D*T_1D;
// Input field constants and basis data
// -- Input field 0: u
const CeedInt P_1d_in_0 = 5;
const CeedInt num_comp_in_0 = 1;
// EvalMode: interpolation
__shared__ CeedScalar s_B_in_0[P_1d_in_0*Q_1d];
LoadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);
// -- Input field 1: qdata
const CeedInt P_1d_in_1 = 6;
const CeedInt num_comp_in_1 = 1;
// EvalMode: none
// Output field constants and basis data
// -- Output field 0: v
const CeedInt P_1d_out_0 = 5;
const CeedInt num_comp_out_0 = 1;
// EvalMode: interpolation
CeedScalar *s_B_out_0 = s_B_in_0;
// Element loop
__syncthreads();
for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
// Scratch restriction buffer space
CeedScalar r_e_scratch[216];
// -- Input field restrictions and basis actions
// ---- Input field 0: u
// Skip: no L-vector read or basis action for the active input (probed with unit vectors below)
// EvalMode: interpolation
CeedScalar r_q_in_0[num_comp_in_0*Q_1d];
for (CeedInt i = 0; i < num_comp_in_0*Q_1d; i++) r_q_in_0[i] = 0.0;
// ---- Input field 1: qdata
CeedScalar r_e_in_1[num_comp_in_1*P_1d_in_1];
// Strides: {1, 884736, 216}
ReadLVecStrided3d<num_comp_in_1, P_1d_in_1, 1, 884736, 216>(data, elem, d_in_1, r_e_in_1);
// EvalMode: none
CeedScalar *r_q_in_1 = r_e_in_1;
// -- Output field setup
// ---- Output field 0: v
CeedScalar r_q_out_0[num_comp_out_0*Q_1d];
// Note: Using full elements
for (CeedInt i = 0; i < num_active_in; i++) { // num_active_in: number of active input fields, known to the code generator
const CeedInt num_comp = num_comp_in_0 * 1; // num_comp for active input i times the eval mode size (dim for grad, 1 for interp)
for (CeedInt j = 0; j < num_comp; j++) {
// -- Set unit vector for component j at every quadrature point
for (CeedInt q = 0; q < Q_1d; q++) r_q_in_0[j*Q_1d + q] = 1.0; // Need to grab correct input per i
// -- Input fields
// ---- Input field 0: u
CeedScalar *r_s_in_0 = r_q_in_0;
// ---- Input field 1: qdata
CeedScalar *r_s_in_1 = r_q_in_1;
// -- Output fields
// ---- Output field 0: v
CeedScalar *r_s_out_0 = r_q_out_0;
// -- QFunction inputs and outputs
// ---- Inputs
CeedScalar *inputs[2];
// ------ Input field 0: u
inputs[0] = r_s_in_0;
// ------ Input field 1: qdata
inputs[1] = r_s_in_1;
// ---- Outputs
CeedScalar *outputs[1];
// ------ Output field 0: v
outputs[0] = r_s_out_0;
// -- Apply QFunction
apply_mass(ctx, Q_1d, inputs, outputs);
// -- Output field restrictions
// ---- Output field 0: v
// Strides: {1, 884736, 216}
// Note: write target must also be offset by the assembled QFunction column for (i, j)
WriteLVecStrided3d<num_comp_out_0, Q_1d, 1, 884736, 216>(data, elem, r_s_out_0, d_out_0);
// -- Reset unit vector
for (CeedInt q = 0; q < Q_1d; q++) r_q_in_0[j*Q_1d + q] = 0.0; // Need to grab correct input per i
}
}
}
}
// -----------------------------------------------------------------------------
Still tidying the details. Need to address the 3D slab approach.
I don't know the exact application of this, but wouldn't this require a way to label a QFunction as linear? At least for doing a CeedOperatorApply with a single assembled QFunction.
This is specifically for assembling the QFunction to build the diagonal or the full matrix, so it is for Jacobian preconditioner support.
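For reference, here is informally how the assembled values get used afterward (notation here is mine, not from the code above): with \mathcal{B}^{(m)} the basis evaluation for eval mode m (interp or one gradient direction) and D_q the pointwise blocks assembled by the kernel sketch, the element matrix and its diagonal are

A_e = \mathcal{B}^T \, \mathrm{blockdiag}(D_q) \, \mathcal{B}, \qquad (\operatorname{diag} A_e)_i = \sum_{q} \sum_{m,n} \mathcal{B}^{(m)}_{q i} \, (D_q)_{m n} \, \mathcal{B}^{(n)}_{q i},

so the same QF values D_q can feed both the diagonal and the full matrix assembly, which is the reuse mentioned at the top.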