GPU Gen QFunction Assembly
We can save a lot of memory and probably get better performance if we make a stripped version of the Gen operators that assembles the QFunction. We could theoretically do this at the whole operator/diagonal level, but that would block reuse of the QF values, so I don't think that is a good idea.
cc @zatkins-dev I think this is much more immediately tractable if you want a GPU code generation project that I can coach you on. This would also be something we could adapt to the logic in #1824 once that is sorted out.
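For context, here is roughly what "assembling the QFunction" means, as a minimal host-side sketch; everything in it (the type aliases, qf_apply, the array layout) is an illustrative stand-in rather than libCEED API. For a QFunction that is linear in its active inputs, probing each active input component with a unit vector at every quadrature point and recording the outputs recovers the pointwise Jacobian blocks D_q, one column per probe.

// Minimal host-side sketch of QFunction assembly by unit-vector probing.
// All names here are illustrative stand-ins, not libCEED API.
#include <cstdio>

using CeedScalar = double;
using CeedInt    = int;

const CeedInt Q        = 6; // quadrature points handled per QFunction call
const CeedInt NUM_COMP = 2; // active input/output components

// Stand-in pointwise QFunction, linear in u:
//   v[0][q] = qd[0][q]*u[0][q] + qd[1][q]*u[1][q]
//   v[1][q] = qd[2][q]*u[0][q] + qd[3][q]*u[1][q]
static void qf_apply(const CeedScalar qd[4][Q], const CeedScalar u[NUM_COMP][Q], CeedScalar v[NUM_COMP][Q]) {
  for (CeedInt q = 0; q < Q; q++) {
    v[0][q] = qd[0][q] * u[0][q] + qd[1][q] * u[1][q];
    v[1][q] = qd[2][q] * u[0][q] + qd[3][q] * u[1][q];
  }
}

int main() {
  CeedScalar qd[4][Q], u[NUM_COMP][Q], v[NUM_COMP][Q];
  CeedScalar D[Q][NUM_COMP][NUM_COMP]; // assembled pointwise Jacobian blocks D_q

  for (CeedInt k = 0; k < 4; k++)
    for (CeedInt q = 0; q < Q; q++) qd[k][q] = 0.1 * (k + 1) + q;

  // Probe one active input component at a time with a unit vector set at every
  // quadrature point; since the QFunction acts pointwise, each pass yields
  // column j of D_q for all q simultaneously.
  for (CeedInt j = 0; j < NUM_COMP; j++) {
    for (CeedInt c = 0; c < NUM_COMP; c++)
      for (CeedInt q = 0; q < Q; q++) u[c][q] = (c == j) ? 1.0 : 0.0;
    qf_apply(qd, u, v);
    for (CeedInt q = 0; q < Q; q++)
      for (CeedInt c = 0; c < NUM_COMP; c++) D[q][c][j] = v[c][q];
  }

  printf("D_0 = [[%g, %g], [%g, %g]]\n", D[0][0][0], D[0][0][1], D[0][1][0], D[0][1][1]);
  return 0;
}

The kernel change sketched below does the same thing, but with the unit vector set in the quadrature-space registers and the resulting column written straight back out with a strided restriction, so the L-vector read and tensor basis action for the active input drop out entirely.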
Ok, for overhauling the current FEM code, I think we need to turn
#define T_1D 6
#include <ceed/jit-source/cuda/cuda-jit.h>
// Tensor basis source
#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>
// CodeGen operator source
#include <ceed/jit-source/cuda/cuda-gen-templates.h>
#undef CEED_Q_VLA
#define CEED_Q_VLA 6
// User QFunction source
#include "/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume.h"
// -----------------------------------------------------------------------------
// Operator Kernel
//
// d_[in,out]_i: CeedVector device array
// r_[in,out]_e_i: Element vector register
// r_[in,out]_q_i: Quadrature space vector register
// r_[in,out]_c_i: AtPoints Chebyshev coefficients register
// r_[in,out]_s_i: Quadrature space slice vector register
//
// s_B_[in,out]_i: Interpolation matrix, shared memory
// s_G_[in,out]_i: Gradient matrix, shared memory
// -----------------------------------------------------------------------------
extern "C" __global__ void CeedKernelCudaGenOperator_apply_mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda points) {
const CeedScalar *d_in_0 = fields.inputs[0];
const CeedScalar *d_in_1 = fields.inputs[1];
CeedScalar *d_out_0 = fields.outputs[0];
const CeedInt dim = 3;
const CeedInt Q_1d = 6;
extern __shared__ CeedScalar slice[];
SharedData_Cuda data;
data.t_id_x = threadIdx.x;
data.t_id_y = threadIdx.y;
data.t_id_z = threadIdx.z;
data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
data.slice = slice + data.t_id_z*T_1D*T_1D;
// Input field constants and basis data
// -- Input field 0: u
const CeedInt P_1d_in_0 = 5;
const CeedInt num_comp_in_0 = 1;
// EvalMode: interpolation
__shared__ CeedScalar s_B_in_0[P_1d_in_0*Q_1d];
LoadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);
// -- Input field 1: qdata
const CeedInt P_1d_in_1 = 6;
const CeedInt num_comp_in_1 = 1;
// EvalMode: none
// Output field constants and basis data
// -- Output field 0: v
const CeedInt P_1d_out_0 = 5;
const CeedInt num_comp_out_0 = 1;
// EvalMode: interpolation
CeedScalar *s_B_out_0 = s_B_in_0;
// Element loop
__syncthreads();
for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
// Scratch restriction buffer space
CeedScalar r_e_scratch[216];
// -- Input field restrictions and basis actions
// ---- Input field 0: u
CeedScalar *r_e_in_0 = r_e_scratch;
const CeedInt l_size_in_0 = 274625;
// CompStride: 274625
ReadLVecStandard3d<num_comp_in_0, 274625, P_1d_in_0>(data, l_size_in_0, elem, indices.inputs[0], d_in_0, r_e_in_0);
// EvalMode: interpolation
CeedScalar r_q_in_0[num_comp_in_0*Q_1d];
InterpTensor3d<num_comp_in_0, P_1d_in_0, Q_1d>(data, r_e_in_0, s_B_in_0, r_q_in_0);
// ---- Input field 1: qdata
CeedScalar r_e_in_1[num_comp_in_1*P_1d_in_1];
// Strides: {1, 884736, 216}
ReadLVecStrided3d<num_comp_in_1, P_1d_in_1, 1, 884736, 216>(data, elem, d_in_1, r_e_in_1);
// EvalMode: none
CeedScalar *r_q_in_1 = r_e_in_1;
// -- Output field setup
// ---- Output field 0: v
CeedScalar r_q_out_0[num_comp_out_0*Q_1d];
// Note: Using full elements
{
// -- Input fields
// ---- Input field 0: u
CeedScalar *r_s_in_0 = r_q_in_0;
// ---- Input field 1: qdata
CeedScalar *r_s_in_1 = r_q_in_1;
// -- Output fields
// ---- Output field 0: v
CeedScalar *r_s_out_0 = r_q_out_0;
// -- QFunction inputs and outputs
// ---- Inputs
CeedScalar *inputs[2];
// ------ Input field 0: u
inputs[0] = r_s_in_0;
// ------ Input field 1: qdata
inputs[1] = r_s_in_1;
// ---- Outputs
CeedScalar *outputs[1];
// ------ Output field 0: v
outputs[0] = r_s_out_0;
// -- Apply QFunction
apply_mass(ctx, Q_1d, inputs, outputs);
}
// -- Output field basis action and restrictions
// ---- Output field 0: v
// EvalMode: interpolation
CeedScalar *r_e_out_0 = r_e_scratch;
InterpTransposeTensor3d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_q_out_0, s_B_out_0, r_e_out_0);
const CeedInt l_size_out_0 = 274625;
// CompStride: 274625
WriteLVecStandard3d<num_comp_out_0, 274625, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
}
}
// -----------------------------------------------------------------------------
into something like
#define T_1D 6
#include <ceed/jit-source/cuda/cuda-jit.h>
// Tensor basis source
#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>
// CodeGen operator source
#include <ceed/jit-source/cuda/cuda-gen-templates.h>
#undef CEED_Q_VLA
#define CEED_Q_VLA 6
// User QFunction source
#include "/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume.h"
// -----------------------------------------------------------------------------
// Operator Kernel
//
// d_[in,out]_i: CeedVector device array
// r_[in,out]_e_i: Element vector register
// r_[in,out]_q_i: Quadrature space vector register
// r_[in,out]_c_i: AtPoints Chebyshev coefficients register
// r_[in,out]_s_i: Quadrature space slice vector register
//
// s_B_[in,out]_i: Interpolation matrix, shared memory
// s_G_[in,out]_i: Gradient matrix, shared memory
// -----------------------------------------------------------------------------
extern "C" __global__ void CeedKernelCudaGenOperator_apply_mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda points) {
const CeedScalar *d_in_0 = fields.inputs[0];
const CeedScalar *d_in_1 = fields.inputs[1];
CeedScalar *d_out_0 = fields.outputs[0];
const CeedInt dim = 3;
const CeedInt Q_1d = 6;
extern __shared__ CeedScalar slice[];
SharedData_Cuda data;
data.t_id_x = threadIdx.x;
data.t_id_y = threadIdx.y;
data.t_id_z = threadIdx.z;
data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
data.slice = slice + data.t_id_z*T_1D*T_1D;
// Input field constants and basis data
// -- Input field 0: u
const CeedInt P_1d_in_0 = 5;
const CeedInt num_comp_in_0 = 1;
// EvalMode: interpolation
__shared__ CeedScalar s_B_in_0[P_1d_in_0*Q_1d];
LoadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);
// -- Input field 1: qdata
const CeedInt P_1d_in_1 = 6;
const CeedInt num_comp_in_1 = 1;
// EvalMode: none
// Output field constants and basis data
// -- Output field 0: v
const CeedInt P_1d_out_0 = 5;
const CeedInt num_comp_out_0 = 1;
// EvalMode: interpolation
CeedScalar *s_B_out_0 = s_B_in_0;
// Element loop
__syncthreads();
for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
// Scratch restriction buffer space
CeedScalar r_e_scratch[216];
// -- Input field restrictions and basis actions
// ---- Input field 0: u
// Skip: no L-vector read or basis action for the active input (probed with unit vectors below)
// EvalMode: interpolation
CeedScalar r_q_in_0[num_comp_in_0*Q_1d];
for (CeedInt i = 0; i < num_comp_in_0*Q_1d; i++) r_q_in_0[i] = 0.0;
// ---- Input field 1: qdata
CeedScalar r_e_in_1[num_comp_in_1*P_1d_in_1];
// Strides: {1, 884736, 216}
ReadLVecStrided3d<num_comp_in_1, P_1d_in_1, 1, 884736, 216>(data, elem, d_in_1, r_e_in_1);
// EvalMode: none
CeedScalar *r_q_in_1 = r_e_in_1;
// -- Output field setup
// ---- Output field 0: v
CeedScalar r_q_out_0[num_comp_out_0*Q_1d];
// Note: Using full elements
for (CeedInt i = 0; i < num_active_in; i++) { // num_active_in: number of active input fields, known to the code generator
const CeedInt num_comp = num_comp_in_0 * 1; // num_comp for active input i times the eval mode size (dim for grad, 1 for interp)
for (CeedInt j = 0; j < num_comp; j++) {
// -- Set unit vector for component j at every quadrature point
for (CeedInt q = 0; q < Q_1d; q++) r_q_in_0[j*Q_1d + q] = 1.0; // Need to grab correct input per i
// -- Input fields
// ---- Input field 0: u
CeedScalar *r_s_in_0 = r_q_in_0;
// ---- Input field 1: qdata
CeedScalar *r_s_in_1 = r_q_in_1;
// -- Output fields
// ---- Output field 0: v
CeedScalar *r_s_out_0 = r_q_out_0;
// -- QFunction inputs and outputs
// ---- Inputs
CeedScalar *inputs[2];
// ------ Input field 0: u
inputs[0] = r_s_in_0;
// ------ Input field 1: qdata
inputs[1] = r_s_in_1;
// ---- Outputs
CeedScalar *outputs[1];
// ------ Output field 0: v
outputs[0] = r_s_out_0;
// -- Apply QFunction
apply_mass(ctx, Q_1d, inputs, outputs);
// -- Output field restrictions
// ---- Output field 0: v
// Strides: {1, 884736, 216}
// Note: write target must also be offset by the assembled QFunction column for (i, j)
WriteLVecStrided3d<num_comp_out_0, Q_1d, 1, 884736, 216>(data, elem, r_s_out_0, d_out_0);
// -- Reset unit vector
for (CeedInt q = 0; q < Q_1d; q++) r_q_in_0[j*Q_1d + q] = 0.0; // Need to grab correct input per i
}
}
}
}
// -----------------------------------------------------------------------------
Still tidying the details. Need to address the 3D slab approach.
I don't know the exact application of this, but wouldn't this require a way to label a QFunction as linear? At least for doing a CeedOperatorApply with a single assembled QFunction.
This is specifically for assembling the QFunction to build the diagonal or the full matrix, so it is for Jacobian preconditioner support.
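For reference, here is informally how the assembled values get used afterward (notation here is mine, not from the code above): with \mathcal{B}^{(m)} the basis evaluation for eval mode m (interp or one gradient direction) and D_q the pointwise blocks assembled by the kernel sketch, the element matrix and its diagonal are

A_e = \mathcal{B}^T \, \mathrm{blockdiag}(D_q) \, \mathcal{B}, \qquad (\operatorname{diag} A_e)_i = \sum_{q} \sum_{m,n} \mathcal{B}^{(m)}_{q i} \, (D_q)_{m n} \, \mathcal{B}^{(n)}_{q i},

so the same QF values D_q can feed both the diagonal and the full matrix assembly, which is the reuse mentioned at the top.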