miopenReduceTensor MIOPEN_REDUCE_TENSOR_AVG is failing when using f16 datatype

Open kala855 opened this issue 1 year ago • 0 comments

When trying to apply an average reduction on a tensor filled with float16 elements, we encounter overflow issues. We configure the operation to use float32 as the compute datatype, ensuring that the accumulation occurs in float32, which prevents overflow. Below, I am providing a reproducer of the behavior described.

#define __HIP_PLATFORM_AMD__
#include <half.hpp>
#include <iostream>
#include <limits>
#include <vector>
#include <miopen/miopen.h>
using half_float::half;
using half_float::half_cast;
int main(int argc, char *argv[]) {
    miopenHandle_t handle;
    miopenCreate(&handle);
    miopenNanPropagation_t Nanprop
            = miopenNanPropagation_t::MIOPEN_PROPAGATE_NAN;
    miopenReduceTensorIndices_t Indices
            = miopenReduceTensorIndices_t::MIOPEN_REDUCE_TENSOR_NO_INDICES;
    miopenIndicesType_t IndicesType = miopenIndicesType_t::MIOPEN_32BIT_INDICES;
    // If compute datatype is float the accumulation when the algorithm is
    // MIOPEN_REDUCE_TENSOR_AVG should happen in single precision avoiding
    // any overflow issue.
    auto compute_type = miopenDataType_t::miopenFloat;
    auto dt_src = miopenDataType_t::miopenHalf;
    auto dt_dst = miopenDataType_t::miopenFloat;
    auto dt_src_size = sizeof(half);
    auto dt_dst_size = sizeof(float);
    constexpr int NUM_IO = 2;
    constexpr int NUM_ELEMENTS = 2080769; // 2080768
    miopenDataType_t data_types[NUM_IO] = {dt_src, dt_dst};
    miopenTensorDescriptor_t tensor_descs[NUM_IO] = {};
    miopenReduceTensorDescriptor_t reduce_desc;
    miopenReduceTensorOp_t alg_kind
            = miopenReduceTensorOp_t::MIOPEN_REDUCE_TENSOR_AVG;
    miopenCreateReduceTensorDescriptor(&reduce_desc);
    miopenSetReduceTensorDescriptor(
            reduce_desc, alg_kind, compute_type, Nanprop, Indices, IndicesType);
    miopenCreateTensorDescriptor(&tensor_descs[0]);
    miopenCreateTensorDescriptor(&tensor_descs[1]);
    int src_dims[4] = {NUM_ELEMENTS, 2, 1, 1};
    int dst_dims[4] = {1, 2, 1, 1};
    int src_strides[4] = {2, 1, 1, 1};
    int dst_strides[4] = {2, 1, 1, 1};
    int ndims = 4;
    miopenSetTensorDescriptor(
            tensor_descs[0], data_types[0], ndims, src_dims, src_strides);
    miopenSetTensorDescriptor(
            tensor_descs[1], data_types[1], ndims, dst_dims, dst_strides);
    const float alpha = 1.f, beta = 0.f;
    void *a = nullptr;
    void *c = nullptr;
    void *scratch = nullptr;
    hipMalloc(&a,
            src_dims[0] * src_dims[1] * src_dims[2] * src_dims[3]
                    * dt_src_size);
    hipMalloc(&c,
            dst_dims[0] * dst_dims[1] * dst_dims[2] * dst_dims[3]
                    * dt_dst_size);
    std::vector<half> src(src_dims[0] * src_dims[1] * src_dims[2] * src_dims[3],
            static_cast<half>(1.f));
    hipMemcpy(a, src.data(), 2 * NUM_ELEMENTS * dt_src_size,
            hipMemcpyHostToDevice);
    int workSpaceSize = 4224;
    hipMalloc(&scratch, workSpaceSize * dt_src_size);
    miopenReduceTensor(handle, reduce_desc, nullptr, 0, scratch, workSpaceSize,
            &alpha, tensor_descs[0], a, &beta, tensor_descs[1], c);
    hipDeviceSynchronize();
    std::vector<float> dst(2);
    hipMemcpy(dst.data(), c, 2 * dt_dst_size, hipMemcpyDeviceToHost);
    for (float f : dst) {
        std::cout << f << ", ";
    }
    std::cout << std::endl;
    miopenDestroyReduceTensorDescriptor(reduce_desc);
    miopenDestroyTensorDescriptor(tensor_descs[0]);
    miopenDestroyTensorDescriptor(tensor_descs[1]);
    hipFree(a);
    hipFree(c);
    hipFree(scratch);
}

The previous code was tested on mi210 hardware with rocm5.6.1 and using the half library suggested in MIOpen Readme.

Thanks for your help. If there is any workaround or current solution to this, please let me know.

Aug 01 '24 07:08 kala855