llvm icon indicating copy to clipboard operation
llvm copied to clipboard

ptxas fatality Unresolved extern function '__muldc3'

Open e404044 opened this issue 3 years ago • 1 comments

Greetings, I'm getting the below compilation error when I target my gpu. I don't get the same issue when i target the intel fpga simulator or intel cpu. code is below as well. do you have any recommendations?

[22:10]ec2-user$ clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda fft-sycl.cpp -o fft-sycl-gpu
clang-16: warning: CUDA version is newer than the latest supported version 11.5 [-Wunknown-cuda-version]
warning: linking module '/data_disk01/users/archboldw/sycl_workspace/llvm/build/lib/clang/16.0.0/../../clc/remangled-l64-signed_char.libspirv-nvptx64--nvidiacl.bc': Linking two modules of different target triple
s: '/data_disk01/users/archboldw/sycl_workspace/llvm/build/lib/clang/16.0.0/../../clc/remangled-l64-signed_char.libspirv-nvptx64--nvidiacl.bc' is 'nvptx64-unknown-nvidiacl' whereas 'fft-sycl.cpp' is 'nvptx64-nvi
dia-cuda'
 [-Wlinker-warnings]
1 warning generated.
ptxas fatal   : Unresolved extern function '__muldc3'
llvm-foreach:
clang-16: error: ptxas command failed with exit code 255 (use -v to see invocation)
clang version 16.0.0 (https://github.com/intel/llvm 3dc891f9d08b9fccacf550655f2b35978a9c29ec)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /var/home/ec2-user/projects/sycl_workspace/llvm/build/bin
clang-16: note: diagnostic msg: Error generating preprocessed source(s).
#include <math.h>

#include <iostream>

#include <complex>
#include <vector>
#include <numeric>

#include <sycl/sycl.hpp>


namespace signals
{

/**
 * @brief Generate a tone
 * @param sample_count The number of samples to generate
 * @param frequency The angular frequency of the signal. Generally bounded by
 * [0, pi].
 */
auto tone(unsigned int sample_count
		 ,double frequency
		 )
-> std::vector<std::complex<double>>
{
    using cplx = std::complex<double>;

    // delta = e^{jw}
    // x[1] = e^{jw(n=1)} = e^{jw1} = delta
    // x[2] = e^{jw(n=2)} = e^{jw2} = x[1] * delta
    const cplx delta(std::polar(static_cast<double>(1.0), frequency));

    std::vector<cplx> ret(sample_count, delta);
    ret[0] = {1, 0};

    std::partial_sum(ret.cbegin(), ret.cend(), ret.begin(), std::multiplies<cplx>());

    return ret;
}

}

namespace fft
{

/**
 * @brief Calculate the fft
 * @param samples The samples to fft
 */
auto fft(const std::vector<std::complex<double>> & samples)
-> std::vector<std::complex<double>>
{
    using cplx = std::complex<double>;

    // M = [tone(0); tone(1 * PI / N)]
    // ret = M \cdot samples

    std::vector<cplx> ret;
    ret.reserve(samples.size());
    for (unsigned int idx(0); idx < samples.size(); ++idx)
    {
        auto kernel = signals::tone(samples.size(), -2.0 * M_PI * idx / samples.size());
        ret.push_back(std::inner_product(kernel.cbegin()
                                        ,kernel.cend()
                                        ,samples.cbegin()
                                        ,cplx(0.0, 0.0)
                                        ));

    }

    return ret;
}

class Fft
{
public:
    Fft(std::size_t fft_length)
    : m_filters(fft_length * fft_length)
    {
        auto begin(m_filters.begin());
        for (unsigned int idx{0}; idx < fft_length; ++idx)
        {
            auto kernel = signals::tone(fft_length, -2.0 * M_PI * idx / fft_length);
            begin = std::copy(kernel.begin(), kernel.end(), begin);
        }
    }

    friend std::ostream & operator<<(std::ostream & out, const Fft & obj)
    {
        std::size_t fft_length{obj.get_fft_length()};

        bool print_sep;
        for (auto begin_filt(obj.m_filters.begin())
            ;begin_filt < obj.m_filters.end()
            ;begin_filt += fft_length)
        {
            print_sep = false;
            for (auto begin_elem(begin_filt)
                ;begin_elem < begin_filt + fft_length
                ;++begin_elem)
            {
                if (print_sep)
                {
                    out << "; ";
                }
                out << *begin_elem;
                print_sep = true;
            }
            out << "\n";
        }

        return out;
    }

    std::vector<std::complex<double>>
        operator()(sycl::queue & q, std::vector<std::complex<double>> samples) const
        {
            std::size_t fft_length{get_fft_length()};

            using cplx = std::complex<double>;
            std::vector<cplx> ret(fft_length, 0.0);

            sycl::buffer buf_samp(samples);
            sycl::buffer buf_filt(m_filters);
            sycl::buffer buf_ret(ret);

            q.submit([&](auto & h) {
                sycl::accessor acc_samp(buf_samp, h, sycl::read_only);
                sycl::accessor acc_filt(buf_filt, h, sycl::read_only);
                sycl::accessor acc_ret(buf_ret, h, sycl::write_only);

                h.parallel_for(fft_length, [=](auto idx){
                    acc_ret[idx] = cplx(0.0, 0.0);
                    for (std::size_t jdx{0}; jdx < fft_length; ++jdx)
                    {
                        acc_ret[idx] += acc_filt[idx * fft_length + jdx] * acc_samp[jdx];
                    }

                });
            });

            return ret;

        }

private:
    std::vector<std::complex<double>> m_filters;
    std::size_t get_fft_length() const
    {
        return std::round(std::sqrt(m_filters.size()));
    }
};

}

template <typename Iter>
void print(Iter begin, Iter end, char sep = ';')
{
    bool print_sep(false);
    for (Iter it(begin); it < end; ++it)
    {
        if (print_sep)
            std::cout << sep << " ";
        std::cout << *it;
        print_sep = true;
    }
    std::cout << std::endl;

}

int main(int argc, char ** argv)
{
    const unsigned int sample_count{8};
    const unsigned int channel{7};

    auto samples = signals::tone(sample_count, 2 * M_PI * channel / sample_count);

    auto fft_val1 = fft::fft(samples);

    sycl::queue q;
    std::cout << "Running on "
              << q.get_device().get_info<sycl::info::device::name>()
              << "\n";
    fft::Fft fft(8);
    auto fft_val2 = fft(q, samples);

    std::cout << "=== SIGNAL ===" << std::endl;
    print(samples.begin(), samples.end());
    std::cout << "=== FFT ===" << std::endl;
    print(fft_val1.begin(), fft_val1.end());
    //std::cout << "=== === ===" << std::endl;
    //std::cout << fft << std::endl;
    std::cout << "=== FFT ===" << std::endl;
    print(fft_val2.begin(), fft_val2.end());
}

e404044 avatar Sep 21 '22 22:09 e404044

It looks like a missing symbol from compiler-rt, I think this is something we likely should support.

But in the meantime it may be possible to fix it by using the experimental sycl::complex extension rather than std::complex:

  • https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/proposed/sycl_ext_oneapi_complex.asciidoc#specification

npmiller avatar Sep 22 '22 13:09 npmiller

@npmiller thanks i tried replacing std::complex everywhere with sycl::complex, but didn't work. also tried the format your link provided (sycl::oneapi::ext::complex) and the compiler's suggested format (sycl::ext::oneapi::complex). Is there something i need to do to properly add the extension?

Can you also elaborate more on a possible missing symbol from compiler-rt? Is that something I can correct?

[20:50]ec2-user$ clang++ -fsycl -fsycl-targets=spir64_fpga fft-sycl_GPU.cpp -o fft-sycl-acc
fft-sycl_GPU.cpp:27:16: error: no template named 'complex' in namespace 'sycl'; did you mean 'std::complex'?
-> std::vector<sycl::complex<double>>
               ^~~~~~~~~~~~~
               std::complex
/usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/complex:122:12: note: 'std::complex' declared here
    struct complex
           ^
[20:59]ec2-user$ clang++ -fsycl -fsycl-targets=spir64_fpga fft-sycl_GPU.cpp -o fft-sycl-gpu
fft-sycl_GPU.cpp:27:16: error: no member named 'oneapi' in namespace 'sycl'; did you mean 'sycl::ext::oneapi'?
-> std::vector<sycl::oneapi::ext::complex<double>>
               ^~~~~~~~~~~~
               sycl::ext::oneapi
/var/home/ec2-user/projects/sycl_workspace/llvm/build/bin/../include/sycl/ext/oneapi/sub_group_mask.hpp:24:11: note: 'sycl::ext::oneapi' declared here
namespace oneapi {
          ^
[21:03]ec2-user$ clang++ -fsycl -fsycl-targets=spir64_fpga fft-sycl_GPU.cpp -o fft-sycl-gpu
fft-sycl_GPU.cpp:27:35: error: no member named 'complex' in namespace 'sycl::ext::oneapi'
-> std::vector<sycl::ext::oneapi::complex<double>>
               ~~~~~~~~~~~~~~~~~~~^

e404044 avatar Sep 22 '22 21:09 e404044

Can you also elaborate more on a possible missing symbol from compiler-rt? Is that something I can correct?

Not really, to clarify on this we actually don't currently support std::complex at all with the CUDA backend, the idea is that sycl::complex should be used instead.

Is there something i need to do to properly add the extension?

You need to define the right macro and include the extension header:

#define SYCL_EXT_ONEAPI_COMPLEX       
                                           
#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
#include <sycl/sycl.hpp>   

See the complex test here:

  • https://github.com/intel/llvm/blob/sycl/sycl/test/extensions/test_complex.cpp

npmiller avatar Sep 23 '22 08:09 npmiller

yup std::vector<sycl::ext::oneapi::experimental::complex<double>> worked. thanks!

e404044 avatar Sep 23 '22 17:09 e404044