about the performance of C++ AMP mode
Is there any difference in performance between C++ AMP mode and HCC mode? I want to keep compatibility with the Windows platform, so the C++ AMP compatible mode is important to me. When I ran a program on both Windows and Linux (Ubuntu) with C++ AMP in 2015, the Windows build ran about 2 times faster than the Linux one.
I compared them using a simple diffusion-equation code.
I tested both the single-precision and the double-precision versions. The common conditions are array size = 1048576 and loop count nT = 128. "init" is the time spent on initialization; "calc" is the time spent on calculation.
The compile command is:
hcc `clamp-config --cxxflags --ldflags` amp_std_comp.cpp -O3 -o dd.
ubuntu 16.04.3
the result with single precision.
i5-4670 CPU @ 3.40GHz with normal loop and std::vector
init = 0.08781[s]
calc = 1.66423[s] x1.00
RX 580 with parallel for and array
init = 0.210147[s]
calc = 0.029218[s] x56.9
W8100
init = 0.209308[s]
calc = 0.017998[s] x92.5
the result of double precision,
i5-4670 CPU @ 3.40GHz with normal loop and std::vector
init = 0.076328[s]
calc = 0.959732[s] x1.00
RX 580 with parallel for and array
init = 0.183225[s]
calc = 0.033789[s] x28.4
W8100
init = 0.094943[s]
calc = 0.024426[s] x39.2
single precision on Windows 10 ver 1703 with VS 2017, Release build and AVX2.
Core i7 6800K @3.6GHz
init = 0.040035[s]
calc = 3.55448[s] x0.46
RX 480
init = 0.06945[s]
calc = 0.054498[s] x30.5
GTX 1080
init = 0.064302[s]
calc = 0.052711[s] x31.6
Haswell GT2 mobile
init = 0.065724[s]
calc = 0.433497[s] x3.84
double precision on Windows 10
Intel Core i7 6800K
init = 0.051084[s]
calc = 1.77646[s] x0.54
RX 480
init = 0.075427[s]
calc = 0.081349[s] x11.8
GTX 1080
init = 0.072133[s]
calc = 0.122203[s] x7.85
R9 280X
init = 0.080332[s]
calc = 0.112282[s] x8.55
Haswell GT2 mobile
init = 0.049524[s]
calc = 0.792195[s] x1.21
Compared with the Windows environment, the Linux one is very fast. Also, the double-precision data format seems more suitable for modern CPU architectures.
When I compared similar conditions in 2015, the Windows environment ran the AMP code faster, so I can say that the hcc project has improved very much, and its C++ AMP mode is fast enough. I think this performance of HCC is sufficient for HPC usage. Also, if C++ AMP mode is compatible with OpenCL 1.2, we could use it for NVIDIA GPUs as well. Because there should be no SPIR in the code produced by HCC, I think it would work.
here is the whole code for this test.
#include <chrono>
#include <cstdint>
#include <exception>
#include <iostream>
#include <limits>
#include <random>
#include <string>
#include <thread>
#include <vector>

#include <amp.h>
#include <amp_math.h>
/// Prints every accelerator reported by the C++ AMP runtime to stdout and
/// asks the user, via stdin, to pick one by its listed index.
///
/// The prompt repeats until a valid index is entered. Non-numeric input is
/// discarded up to the end of the line before asking again.
///
/// @return The accelerator the user selected.
/// @throws int (-1) when no accelerator is available, or when stdin reaches
///         end-of-file / becomes unreadable before a valid index was entered.
concurrency::accelerator ChoseAccelerator()
{
    using std::cin;
    using std::cout;
    using concurrency::accelerator;

    // No filtering is applied: every enumerated accelerator is offered.
    const std::vector<accelerator> validAccl = accelerator::get_all();
    if (validAccl.empty())
    {
        cout << "there is no GPU suitable for GPGPU\n";
        throw - 1;
    }

    cout << "select the GPU" << "\n\n";
    for (std::size_t i = 0; i < validAccl.size(); ++i)
    {
        const accelerator& device = validAccl[i];

        // The description is a wide string; narrow it character by character
        // so it can be written to the narrow std::cout stream.
        const std::wstring wideName = device.get_description();
        std::string name;
        name.reserve(wideName.size());
        for (const wchar_t ch : wideName)
        {
            name.push_back(static_cast<char>(ch));
        }

        cout << i << " : " << name << "\n";
        // get_dedicated_memory() is documented to report kilobytes, so two
        // divisions by 1024 are needed (the previous 1024 / 1000 mix
        // overstated the size by about 2.4 %).
        cout << " memory : "
             << static_cast<double>(device.get_dedicated_memory()) / 1024 / 1024 << " [GB]\n";
        cout << " full double precision feature : "
             << (device.get_supports_double_precision() ? "true\n" : "false\n");
        cout << " whether be used to display : "
             << (device.get_has_display() ? "true\n" : "false\n");
    }

    cout << "\nput a number to specify which GPU to use\n\n";
    while (true)
    {
        cout << "the number = ";
        int selAcs = -1;
        if (!(cin >> selAcs))
        {
            // Nothing more can ever be read: give up instead of spinning.
            if (cin.eof() || cin.bad())
            {
                cout << "\n";
                throw - 1;
            }
            cout << "please input a number\n";
            // Reset the error state and drop the rest of the offending line;
            // otherwise the same bad token would be re-read forever.
            cin.clear();
            cin.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
            continue;
        }
        cout << "\n";
        if (0 <= selAcs && static_cast<std::size_t>(selAcs) < validAccl.size())
        {
            cout << selAcs << " is selected\n";
            return validAccl[static_cast<std::size_t>(selAcs)];
        }
        cout << "plsese set a number from 0 to " << validAccl.size() - 1 << "."
             << "there is no GPU its number is" << selAcs << "\n";
    }
}
// Names used unqualified by the benchmark code below.
using std::cout;
using concurrency::accelerator;
// Declaration of the interactive accelerator picker (its definition appears
// earlier in this file).
accelerator ChoseAccelerator();
// Accelerator selected in main(); Acc() allocates its array on, and launches
// its kernels on, this accelerator's default view.
accelerator acc;
// Number of elements in the benchmark array. main() replaces the default with
// the first command-line argument when one is given.
int32_t N = 1024*1024;
// Number of outer iterations of the update loop. main() replaces the default
// with the second command-line argument when one is given.
int32_t nT = 128;
// Element type used by both benchmarks.
using realp = float;
// Benchmark entry points, defined further down: accelerator version and
// plain CPU version.
void Acc();
void cpustd();
namespace {

/// Converts one command-line argument to an integer setting.
///
/// @param text      Argument text as received from the command line.
/// @param fallback  Value returned when `text` cannot be used.
/// @param minimum   Smallest value that is accepted.
/// @param label     Setting name shown in the warning written to stderr.
/// @return The parsed value, or `fallback` when `text` does not start with an
///         integer that fits in `int` or when the value is below `minimum`.
int32_t ParseArgOr(const char* text, int32_t fallback, int32_t minimum, const char* label)
{
    try {
        const int parsed = std::stoi(text);
        if (parsed >= minimum) {
            return parsed;
        }
        std::cerr << "ignoring " << label << " = " << parsed
                  << " (must be >= " << minimum << "), using " << fallback << "\n";
    }
    catch (const std::exception&) {
        // std::stoi reports unusable text via std::invalid_argument or
        // std::out_of_range; keep the default but tell the user why.
        std::cerr << "ignoring " << label << " argument \"" << text
                  << "\" (not a valid integer), using " << fallback << "\n";
    }
    return fallback;
}

} // namespace

/// Program entry point.
///
/// Usage: <program> [size [nT]]
///   size  number of array elements (default: current value of N)
///   nT    number of outer iterations (default: current value of nT)
///
/// Lets the user choose an accelerator, applies the optional arguments,
/// prints the effective settings, then runs the accelerator benchmark
/// followed by the CPU benchmark.
int main(int carg, char* varg[])
{
    acc = ChoseAccelerator();

    if (carg > 1)
    {
        // Both benchmarks write index N - 1 and process (N - 2) / 2 cells per
        // pass, so anything below 4 gives invalid indices or an empty pass.
        N = ParseArgOr(varg[1], N, 4, "size");
    }
    if (carg > 2)
    {
        nT = ParseArgOr(varg[2], nT, 0, "nT");
    }

    std::cout << "carg = " << carg << "\n";
    std::cout << "size = " << N << "\n";
    std::cout << "nT = " << nT << "\n";

    Acc();
    cpustd();
    return 0;
}
void Acc()
{
cout << "by amp with array\n";
concurrency::array<realp> *pvA;
pvA = new concurrency::array<realp>(N, acc.get_default_view());
auto t0 = std::chrono::high_resolution_clock::now();
concurrency::array_view<realp> aA = *pvA;
//initialization.
std::mt19937 mt(19);
std::uniform_real_distribution<realp> dist(-1, 1);
for (int32_t i = 0; i < N; i++)
{
aA[i] = dist(mt);
}
aA[0] = 0;
aA[N - 1] = 1;
aA.synchronize();
auto t1 = std::chrono::high_resolution_clock::now();
for (int32_t cT = 0; cT < nT; cT++)
{
for (int32_t r = 0; r < 2; r++)
{
concurrency::parallel_for_each(
acc.get_default_view(),
concurrency::extent<1>((N - 2) / 2),
[=](concurrency::index<1> iR) restrict(amp) {
int32_t iC = iR[0] * 2 + r + 1;
int32_t iE = iC + 1;
int32_t iW = iC - 1;
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
});
}
}
auto t2 = std::chrono::high_resolution_clock::now();
aA.synchronize();
realp res = aA[N / 4];
auto t4 = std::chrono::high_resolution_clock::now();
double dt1 = (double)std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count() / 1000000;
double dt2 = (double)std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() / 1000000;
double dt4 = (double)std::chrono::duration_cast<std::chrono::microseconds>(t4 - t1).count() / 1000000;
cout << "Result = " << res << "\n";
cout << "init = " << dt1 << "[s]\n";
cout << "dt2 = " << dt2 << "[s]\n";
cout << "calc = " << dt4 << "[s]\n";
cout << "Acc\n";
}
void cpustd()
{
cout << "by raw cpu with vector\n";
std::vector<realp> *pvA;
pvA = new std::vector<realp>(N);
auto t0 = std::chrono::high_resolution_clock::now();
std::vector<realp>& aA = *pvA;
//initialization.
std::mt19937 mt(19);
std::uniform_real_distribution<realp> dist(-1, 1);
for (int32_t i = 0; i < N; i++)
{
aA[i] = dist(mt);
}
aA[0] = 0;
aA[N - 1] = 1;
auto t1 = std::chrono::high_resolution_clock::now();
for (int32_t cT = 0; cT < nT; cT++)
{
for (int32_t r = 0; r < 2; r++)
{
for (int32_t i = 0; i<(N - 2) / 2; i++)
{
int32_t iC = i * 2 + r + 1;
int32_t iE = iC + 1;
int32_t iW = iC - 1;
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
}
}
}
auto t2 = std::chrono::high_resolution_clock::now();
realp res = aA[N / 4];
auto t4 = std::chrono::high_resolution_clock::now();
double dt1 = (double)std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count() / 1000000;
double dt2 = (double)std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() / 1000000;
double dt4 = (double)std::chrono::duration_cast<std::chrono::microseconds>(t4 - t1).count() / 1000000;
cout << "Result = " << res << "\n";
cout << "init = " << dt1 << "[s]\n";
cout << "dt2 = " << dt2 << "[s]\n";
cout << "calc = " << dt4 << "[s]\n";
}
@smithakihide Is an hcc program runnable on Windows?