kokkos-kernels icon indicating copy to clipboard operation
kokkos-kernels copied to clipboard

spgemm gives zero nnz in output on CUDA backend

Open Char-Aznable opened this issue 3 years ago • 0 comments

Consider this code which generates two random CrsMatrix and call spgemm to multiple them:

#include <iostream>
#include <string>
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>
#include <KokkosKernels_Sorting.hpp>
#include <KokkosSparse_CrsMatrix.hpp>
#include <KokkosSparse_spgemm.hpp>

using ExecutionSpace = Kokkos::DefaultExecutionSpace;
using MemorySpace = ExecutionSpace::memory_space;
using Device = Kokkos::Device<ExecutionSpace,MemorySpace>;
using ScratchSpace = ExecutionSpace::scratch_memory_space;
using ScratchDevice = Kokkos::Device<ExecutionSpace, ScratchSpace>;
using HostExecutionSpace = Kokkos::DefaultHostExecutionSpace;
using HostMemorySpace = HostExecutionSpace::memory_space;
using HostDevice = Kokkos::Device<HostExecutionSpace, HostMemorySpace>;

using M = typename KokkosSparse::CrsMatrix<float, int, Device, void, std::size_t>;
using RowMap = typename M::row_map_type::non_const_type;
using ColIdx = typename M::index_type;
using Values  = typename M::values_type;

using O = typename RowMap::value_type;
using I = typename ColIdx::value_type;
using V = typename Values::value_type;

using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
  O, I, V, ExecutionSpace, MemorySpace, MemorySpace>;

using UnmanagedTrait = Kokkos::MemoryTraits<Kokkos::Unmanaged>;

template < class T > using ScratchView = Kokkos::View<T, ScratchDevice, UnmanagedTrait>;


template < class I = int, class O = typename Kokkos::View<float>::size_type,
  class ... Ts >
KokkosSparse::CrsMatrix<
  typename Kokkos::View<Ts...>::value_type,
  I,
  typename Kokkos::View<Ts...>::device_type,
  typename Kokkos::View<Ts...>::memory_traits,
  O>
dense2CrsMatrix(const Kokkos::View<Ts...>& v, const bool sortColIdx = true) {
  using View = std::decay_t<decltype(v)>;
  static_assert(View::rank == 2, "dense2CrsMatrix: input view is not 2-dimensional");
  using V = typename View::value_type;
  using Device = typename View::device_type;
  using E = typename Device::execution_space;
  using MemoryTraits = typename View::memory_traits;
  using M = KokkosSparse::CrsMatrix<V, I, Device, MemoryTraits, O>;
  using Policy = Kokkos::TeamPolicy<E, Kokkos::Schedule<Kokkos::Dynamic>, I>; 
  using Team = typename Policy::member_type;
  using SM = typename E::scratch_memory_space;
  using SD = Kokkos::Device<E, SM>;
  using ScratchViewCounts = Kokkos::View<O, SD, Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
  const I nRows = v.extent(0);
  const I nCols = v.extent(1);
  Kokkos::View<O*, Device> rowMap("rowMap", nRows + 1);
  //Count nnz for each row
  Policy policyNNZ(static_cast<int>(nRows), Kokkos::AUTO, Kokkos::AUTO);
  Kokkos::parallel_for(policyNNZ, KOKKOS_LAMBDA (const Team& team) {
    const auto iRow = team.league_rank();
    Kokkos::parallel_reduce(Kokkos::TeamVectorRange(team, nCols),
      [&] (const auto jCol, O& update) {
      if(v(iRow, jCol) != V{0}) { update += 1; }
    }, rowMap(iRow + 1));
  });
  O nnz;
  Kokkos::parallel_scan("scanRowMap",
    Kokkos::RangePolicy<E, I>{E{}, I{1}, nRows + 1},
    KOKKOS_LAMBDA (const I iRow, O& update, const bool isFinal) {
    update += rowMap(iRow);
    if(isFinal) {
      rowMap(iRow) = update;
    }
  }, nnz);
  Kokkos::View<I*, Device> colIdx("colIdx", nnz);
  Kokkos::View<V*, Device> values("values", nnz);

  Policy policyValues = Policy{static_cast<int>(nRows), Kokkos::AUTO, Kokkos::AUTO}.
    set_scratch_size(0, Kokkos::PerTeam(ScratchViewCounts::shmem_size()));
  Kokkos::parallel_for(policyValues, KOKKOS_LAMBDA (const Team& team) {
    const auto iRow = team.league_rank();
    ScratchViewCounts iNNZ(team.team_scratch(0));
    Kokkos::single(Kokkos::PerTeam(team), [&] () { iNNZ() = rowMap(iRow); });
    team.team_barrier();
    Kokkos::parallel_for(Kokkos::TeamVectorRange(team, nCols),
      [&] (const auto jCol) {
      if(v(iRow, jCol) != V{0}) {
        const auto idx = Kokkos::atomic_fetch_add(&iNNZ(), 1);
        values(idx) = v(iRow, jCol);
        colIdx(idx) = jCol;
      }
    });
  });
  M ans{"dense2CrsMatrix", nRows, nCols, nnz, values, rowMap, colIdx};
  if(sortColIdx) {
    KokkosKernels::sort_crs_matrix(ans);
  }
  return ans;

}

M randomCrsMatrix(const I nRows, const I nCols, const float fractionNNZ,
  Kokkos::Random_XorShift64_Pool<Device>& rndPool) {
  Kokkos::View<V**, Device> mDense("mDense", nRows, nCols);
  Kokkos::parallel_for("randomCrsMatrix::randomDense",
    Kokkos::MDRangePolicy<ExecutionSpace, Kokkos::Rank<2>, int>(
      {0, 0}, {mDense.extent_int(0), mDense.extent_int(1)}),
    KOKKOS_LAMBDA (const int iRow, const int jCol) {
    auto rndEngine = rndPool.get_state();
    if(rndEngine.drand() <= fractionNNZ) {
      mDense(iRow, jCol) = rndEngine.drand();
    }
    rndPool.free_state(rndEngine);
  });
  return dense2CrsMatrix<I, O>(mDense);
}

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc,argv); {

  const I nRows = 1024;
  const I nCtr = nRows - 1;
  const I nCols = nCtr * 500;
  const float fracitonNNZ = 0.05;
  Kokkos::Random_XorShift64_Pool<Device> rndPool(823982);
  auto m = randomCrsMatrix(nRows, nCtr, fracitonNNZ, rndPool);
  auto n = randomCrsMatrix(nCtr, nCols, fracitonNNZ, rndPool);

  KernelHandle kh;
  KokkosSparse::SPGEMMAlgorithm spgemm_algorithm{};
  kh.create_spgemm_handle(spgemm_algorithm);

  M c;
  KokkosSparse::spgemm_symbolic(kh, m, false, n, false, c);
  KokkosSparse::spgemm_numeric(kh, m, false, n, false, c);
  
  Kokkos::fence();

  std::cout << "Result: " << c.numRows() << " x " << c.numCols() << " with " << c.nnz() << " elements\n";

  } Kokkos::finalize(); }

On CUDA backend and AMPERE86, I got the output matrix's nnz() to be zero, which is unexpected given that two input matrices are randomly generated. Also, if I change nRows to 100, the output nnz() is not zero but some reasonable number. This is using the develop branch 5b781a1d8

Char-Aznable avatar Nov 12 '21 22:11 Char-Aznable