cub
cub copied to clipboard
cub::BlockHistogram - Where are the BIN edges?
cub::BlockHistogram is not behaving as I expected so I boiled it down to a simple static data set.
I have attached the file file.
- 128 data points histogram.zip
- 255 data points histogram.zip
My input data set has values distributed from the max possible for char (255) down to the minimum possible (0). So I should see multiple bins in the histogram filled, however, the output (smem_histogram) always has the total count in the lowest bin.
My Kernel is here:
__global__ void hist_char(const unsigned char* d_data_in, unsigned int* d_hist) {
typedef cub::BlockHistogram<unsigned char, warp_size, SAMPLES_PER_THREAD, N_BINS> BlockHistogram;
__shared__ typename BlockHistogram::TempStorage temp_storage;
__shared__ unsigned int smem_histogram[N_BINS];
unsigned char data[SAMPLES_PER_THREAD] = { 0 };
for (short i = 0; i < SAMPLES_PER_THREAD; i++) {
data[threadIdx.x + i * warp_size] = d_data_in[threadIdx.x+ i * warp_size];
printf("data[%d] = %d (tid = %d)\n", threadIdx.x + i * warp_size, data[threadIdx.x + i * warp_size], threadIdx.x);
}
BlockHistogram(temp_storage).Histogram(data, smem_histogram);
__syncthreads();
const unsigned char hist_stride = (N_BINS / warp_size);
for (unsigned short i = 0; i < hist_stride; i++) {
d_hist[i + hist_stride*threadIdx.x] = smem_histogram[i + hist_stride*threadIdx.x];
printf("hist[%d] = %d (tid = %d)\n", i + hist_stride * threadIdx.x, smem_histogram[i + hist_stride * threadIdx.x], threadIdx.x);
}
}