initialize universal vector on the host
We use thrust::universal_vector to store data that might be accessed from either the CPU or the GPU: we do the processing on the CPU for smaller buffers and use the GPU when the buffer is larger. However, when we construct the universal vector, the constructor appears to call thrust::uninitialized_fill, which performs the fill on the GPU, migrates the memory to the device, and causes page faults for subsequent accesses on the CPU. Prefetching mitigates the performance issue somewhat, but it is still suboptimal.
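For reference, the mitigation we currently use looks roughly like the following minimal sketch (the vector size n is just illustrative; the full benchmark is below):

// Minimal sketch of the current mitigation: the constructor still touches the
// GPU, so we prefetch the pages back to the host before filling on the CPU.
thrust::universal_vector<int> v(n);
cudaMemPrefetchAsync(v.data().get(), n * sizeof(int), cudaCpuDeviceId);
cudaDeviceSynchronize(); // make sure the prefetch finished before the host writes
for (int j = 0; j < n; ++j)
  v[j] = j + 1;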
I wonder if it would be possible to provide an API that allows us to construct the universal vector on the CPU, without touching the GPU at all. Below is an example that shows how much faster we can get with such an API:
#include <chrono>
#include <iostream>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/uninitialized_fill.h>
#include <thrust/universal_vector.h>

constexpr bool prefetch = false;
// Time construction, host-side fill, and host-side reduction of a universal_vector.
int universal(int vec_length) {
  auto t0 = std::chrono::high_resolution_clock::now();
  thrust::universal_vector<int> test_vector(vec_length);
  if (prefetch)
    cudaMemPrefetchAsync(test_vector.data().get(), vec_length * sizeof(int),
                         cudaCpuDeviceId);
  for (int j = 0; j < vec_length; j++)
    test_vector[j] = j + 1;
  thrust::reduce(thrust::host, test_vector.begin(), test_vector.end(), 0,
                 thrust::plus<int>{});
  auto t1 = std::chrono::high_resolution_clock::now();
  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}
// Same measurement with a raw cudaMallocManaged allocation that is only ever
// touched from the host.
int raw_managed_memory(int vec_length) {
  auto t0 = std::chrono::high_resolution_clock::now();
  int *managed_ptr;
  cudaMallocManaged(&managed_ptr, sizeof(int) * vec_length);
  thrust::uninitialized_fill_n(thrust::host, managed_ptr, vec_length, 0);
  for (int j = 0; j < vec_length; j++)
    managed_ptr[j] = j + 1;
  thrust::reduce(thrust::host, managed_ptr, managed_ptr + vec_length, 0,
                 thrust::plus<int>{});
  auto t1 = std::chrono::high_resolution_clock::now();
  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}
int main() {
  // warm up
  for (int i = 0; i < 10; i++) {
    raw_managed_memory(100);
    universal(100);
  }

  constexpr int repeat = 1000;
  constexpr int vec_length = 10000;

  // interleaved: alternate between the two variants on every iteration
  int results[2] = {0};
  for (int i = 0; i < repeat; i++) {
    results[0] += raw_managed_memory(vec_length);
    results[1] += universal(vec_length);
  }
  for (int &r : results)
    r /= repeat;
  std::cout << "interleaved:" << std::endl;
  std::cout << "raw managed memory: " << results[0] << "us" << std::endl;
  std::cout << "universal vector: " << results[1] << "us" << std::endl;

  // grouped: run all iterations of one variant before the other
  results[0] = 0;
  results[1] = 0;
  for (int i = 0; i < repeat; i++)
    results[0] += raw_managed_memory(vec_length);
  for (int i = 0; i < repeat; i++)
    results[1] += universal(vec_length);
  for (int &r : results)
    r /= repeat;
  std::cout << std::endl;
  std::cout << "grouped:" << std::endl;
  std::cout << "raw managed memory: " << results[0] << "us" << std::endl;
  std::cout << "universal vector: " << results[1] << "us" << std::endl;
  std::cout << std::endl;
}
With prefetching disabled:
interleaved:
raw managed memory: 14us
universal vector: 199us
grouped:
raw managed memory: 14us
universal vector: 70us
With prefetching enabled:
interleaved:
raw managed memory: 8us
universal vector: 63us
grouped:
raw managed memory: 15us
universal vector: 65us
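To make the request concrete, something along the following lines would cover our use case. Both overloads are purely hypothetical and do not exist in Thrust today:

// Hypothetical sketch only -- neither of these constructors exists in Thrust.
// Option 1: accept an execution policy so the initial fill runs on the host.
thrust::universal_vector<int> a(thrust::host, vec_length);
// Option 2: a tag that skips value-initialization entirely, leaving the first
// touch (and therefore the initial page placement) to the caller.
thrust::universal_vector<int> b(vec_length, thrust::no_init);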
As a workaround, one can adapt the uninitialized_allocator from examples/uninitialized_vector.cu to thrust::cuda::universal_allocator:
#include <chrono>
#include <iostream>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/system/cuda/memory.h>
#include <thrust/uninitialized_fill.h>
#include <thrust/universal_vector.h>
// Allocator that allocates managed (universal) memory like
// thrust::cuda::universal_allocator but makes element construction a no-op,
// so constructing the vector never launches a fill on the GPU.
template <typename T>
struct uninitialized_allocator : thrust::cuda::universal_allocator<T> {
  __host__ uninitialized_allocator() {}
  __host__ uninitialized_allocator(const uninitialized_allocator &other)
      : thrust::cuda::universal_allocator<T>(other) {}
  __host__ ~uninitialized_allocator() {}

  uninitialized_allocator &operator=(const uninitialized_allocator &) = default;

  template <typename U>
  struct rebind {
    typedef uninitialized_allocator<U> other;
  };

  __host__ __device__ void construct(T *) {
    // no-op: leave the elements uninitialized
  }
};
template <typename T>
using uninitialized_vector = thrust::universal_vector<T, uninitialized_allocator<T>>;
// universal_vector with prefetching back to the host right after construction.
auto universal(int vec_length) {
  auto t0 = std::chrono::steady_clock::now();
  thrust::universal_vector<int> test_vector(vec_length);
  cudaMemPrefetchAsync(test_vector.data().get(), vec_length * sizeof(int),
                       cudaCpuDeviceId);
  for (int j = 0; j < vec_length; j++)
    test_vector[j] = j + 1;
  thrust::reduce(thrust::host, test_vector.begin(), test_vector.end(), 0,
                 thrust::plus<int>{});
  auto t1 = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}

// universal_vector with the uninitialized_allocator: the initial fill runs on the host.
auto universal_uninit(int vec_length) {
  auto t0 = std::chrono::steady_clock::now();
  uninitialized_vector<int> test_vector(vec_length);
  thrust::uninitialized_fill(thrust::host, test_vector.begin(), test_vector.end(), 0);
  for (int j = 0; j < vec_length; j++)
    test_vector[j] = j + 1;
  thrust::reduce(thrust::host, test_vector.begin(), test_vector.end(), 0,
                 thrust::plus<int>{});
  auto t1 = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}

// Raw cudaMallocManaged allocation that is only ever touched from the host.
auto raw_managed_memory(int vec_length) {
  auto t0 = std::chrono::steady_clock::now();
  int *managed_ptr;
  cudaMallocManaged(&managed_ptr, sizeof(int) * vec_length);
  thrust::uninitialized_fill_n(thrust::host, managed_ptr, vec_length, 0);
  for (int j = 0; j < vec_length; j++)
    managed_ptr[j] = j + 1;
  thrust::reduce(thrust::host, managed_ptr, managed_ptr + vec_length, 0,
                 thrust::plus<int>{});
  auto t1 = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}
int main() {
  // warm up
  for (int i = 0; i < 10; i++) {
    raw_managed_memory(100);
    universal(100);
  }

  constexpr int repeat = 1000;
  constexpr int vec_length = 10000;

  {
    std::chrono::microseconds::rep results[2] = {0};
    for (int i = 0; i < repeat; i++) {
      results[0] += raw_managed_memory(vec_length);
      results[1] += universal(vec_length);
    }
    for (auto &r : results)
      r /= repeat;
    std::cout << "interleaved:\n"
                 "raw managed memory: " << results[0] << "us\n"
                 "universal vector w/ prefetch: " << results[1] << "us\n\n";
  }

  {
    std::chrono::microseconds::rep results[2] = {0};
    for (int i = 0; i < repeat; i++)
      results[0] += raw_managed_memory(vec_length);
    for (int i = 0; i < repeat; i++)
      results[1] += universal(vec_length);
    for (auto &r : results)
      r /= repeat;
    std::cout << "grouped:\n"
                 "raw managed memory: " << results[0] << "us\n"
                 "universal vector w/ prefetch: " << results[1] << "us\n\n";
  }

  // warm up
  for (int i = 0; i < 10; i++) {
    raw_managed_memory(100);
    universal_uninit(100);
  }

  {
    std::chrono::microseconds::rep results[2] = {0};
    for (int i = 0; i < repeat; i++) {
      results[0] += raw_managed_memory(vec_length);
      results[1] += universal_uninit(vec_length);
    }
    for (auto &r : results)
      r /= repeat;
    std::cout << "interleaved:\n"
                 "raw managed memory: " << results[0] << "us\n"
                 "universal uninit vector w/o prefetch: " << results[1] << "us\n\n";
  }

  {
    std::chrono::microseconds::rep results[2] = {0};
    for (int i = 0; i < repeat; i++)
      results[0] += raw_managed_memory(vec_length);
    for (int i = 0; i < repeat; i++)
      results[1] += universal_uninit(vec_length);
    for (auto &r : results)
      r /= repeat;
    std::cout << "grouped:\n"
                 "raw managed memory: " << results[0] << "us\n"
                 "universal uninit vector w/o prefetch: " << results[1] << "us\n\n";
  }
}
For me this gives:
interleaved:
raw managed memory: 10us
universal vector w/ prefetch: 155us
grouped:
raw managed memory: 30us
universal vector w/ prefetch: 202us
interleaved:
raw managed memory: 10us
universal uninit vector w/o prefetch: 37us
grouped:
raw managed memory: 30us
universal uninit vector w/o prefetch: 14us
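Note that with uninitialized_allocator the elements are not value-initialized by the constructor, so every element must be written (for example via the host-side thrust::uninitialized_fill in universal_uninit above, or simply by the first host write) before it is read.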