thrust
thrust copied to clipboard
Custom memory resource/allocator is not used by `thrust::async` algorithms for temporary allocations
I'm trying to implement a multi-buffered sort (i.e. overlapping h2d copy, sort of a sub-range and d2h copy in three different streams) using the async
API. For this I would like to use a custom memory resource (inspired by the cached_allocator
from the examples, but always giving a pointer to the same block when asked for the same size, one resource per stream) to avoid the unnecessary allocations for temporary storage in thrust::async::sort
. When using the thrust::device(alloc).after(h2d)
execution policy, my allocator (I tried both thrust::mr::allocator
and thrust::mr::polymorphic_allocator
with the adaptor resource as in mr_basic.cu
) seems to get ignored as debug print statements inside the resource are not executed and I can see the many calls to cudaMalloc
in the nsys
profile.
Are custom allocators for async
algorithms always ignored? Or does my allocator/memory resource just need to fulfill some requirement to be used? When used for a thrust::device_vector
it seems to work fine.
#include <thrust/memory.h>
#include <thrust/mr/memory_resource.h>
#include <algorithm>
#include <cstdlib>
#ifndef NDEBUG
#include <iostream>
#endif
#include <sstream>
#include <string>
#include <tuple>
#include <vector>
struct not_my_pointer {
not_my_pointer(void* p) : message() {
std::stringstream s;
s << "Pointer `" << p << "` was not allocated by this allocator.";
message = s.str();
}
virtual ~not_my_pointer() {}
virtual const char* what() const { return message.c_str(); }
private:
std::string message;
};
template <class Upstream>
class unsafe_cached_resource final
: public thrust::mr::memory_resource<typename Upstream::pointer> {
public:
unsafe_cached_resource(Upstream* upstream) : m_upstream(upstream) {}
unsafe_cached_resource()
: m_upstream(thrust::mr::get_global_resource<Upstream>()) {}
~unsafe_cached_resource() { release(); }
private:
typedef typename Upstream::pointer void_ptr;
typedef std::tuple<std::ptrdiff_t, std::size_t, void_ptr> block_t;
std::vector<block_t> blocks{};
Upstream* m_upstream;
public:
void release() {
#ifndef NDEBUG
std::cout << "unsafe_cached_resource::release()" << std::endl;
#endif
std::for_each(blocks.begin(), blocks.end(), [this](block_t& block) {
auto& [bytes, alignment, ptr] = block;
m_upstream->do_deallocate(ptr, bytes, alignment);
});
}
void_ptr do_allocate(std::size_t bytes, std::size_t alignment) override {
#ifndef NDEBUG
std::cout << "unsafe_cached_resource::do_allocate(): num_bytes == "
<< bytes << std::endl;
#endif
void_ptr result = nullptr;
auto const fitting_block = std::find_if(
blocks.cbegin(), blocks.cend(),
[bytes, alignment](block_t const& block) {
auto& [b_bytes, b_alignment, _] = block;
return b_bytes == bytes && b_alignment == alignment;
});
if (fitting_block != blocks.end()) {
#ifndef NDEBUG
std::cout
<< "unsafe_cached_resource::do_allocate(): found a free block"
<< std::endl;
#endif
result = std::get<2>(*fitting_block);
} else {
#ifndef NDEBUG
std::cout
<< "unsafe_cached_resource::do_allocate(): allocating new block"
<< std::endl;
#endif
result = m_upstream->do_allocate(bytes, alignment);
blocks.emplace_back(bytes, alignment, result);
}
return result;
}
void do_deallocate(void_ptr ptr, std::size_t bytes,
std::size_t alignment) override {
#ifndef NDEBUG
std::cout << "single_block_cached_allocator::do_deallocate(): ptr == "
<< reinterpret_cast<void*>(ptr.get()) << std::endl;
#endif
auto const fitting_block = std::find_if(
blocks.cbegin(), blocks.cend(),
[ptr](block_t const& block) { return std::get<2>(block) == ptr; });
if (fitting_block == blocks.end()) {
throw not_my_pointer(thrust::raw_pointer_cast(ptr));
}
}
};
This bug (?) can be reproed easily by putting above code into e.g. the thrust::async
example in the readme and adding an allocator to the reduction:
#include <thrust/async/copy.h>
#include <thrust/async/reduce.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/mr/allocator.h>
#include <thrust/mr/device_memory_resource.h>
#include <thrust/random.h>
#include <iostream>
#include <numeric>
#include "unsafe_cached_resource.h"
int main() {
thrust::default_random_engine rng(123456);
thrust::uniform_real_distribution<double> dist(-50.0, 50.0);
thrust::host_vector<double> h_vec(32 << 20);
thrust::generate(h_vec.begin(), h_vec.end(), [&] { return dist(rng); });
thrust::device_vector<double> d_vec(h_vec.size());
thrust::device_event e =
thrust::async::copy(h_vec.begin(), h_vec.end(), d_vec.begin());
using MemRes = thrust::device_ptr_memory_resource<
unsafe_cached_resource<thrust::device_memory_resource>>;
using Alloc = thrust::mr::allocator<double, MemRes>;
MemRes mem_res{};
Alloc alloc{&mem_res};
auto f0 =
thrust::async::reduce(thrust::device(alloc).after(e), d_vec.begin(),
d_vec.end(), 0.0, thrust::plus<double>());
double f1 = std::accumulate(h_vec.begin(), h_vec.end(), 0.0,
thrust::plus<double>());
std::cout << f0.get() << ' ' << f1 << '\n';
}
Where again one will only see unsafe_cached_resource::release()
being printed, but no allocations. When using the non-async
reduction it allocates/prints as expected. I used “Cuda compilation tools, release 11.6, V11.6.112” with “gcc (Ubuntu 11.2.0-19ubuntu1) 11.2.0”:
nvcc -O3 -std=c++17 -gencode arch=compute_61,code=sm_61 -o async_example async_example.cu
and executed on a GTX 1070.
This sounds like the same root cause as #1195. Since the async algorithms are no longer actively developed it's unlikely that this will get fixed in the current thrust::async
interfaces. We're working on an alternative async API that would avoid these issues, but this is still in the early stages of development.
The Thrust async algorithms are not going to be supported going forward, prefer to use the thrust::cuda::par_nosync
execution policy where supported instead.