hcc: about the performance of C++ AMP mode

Is there any difference in performance between C++ AMP mode and HCC mode? I want to keep compatibility with the Windows platform, so the C++ AMP compatible mode is important to me. When I ran some programs with C++ AMP on both Windows and Linux (Ubuntu) in 2015, Windows ran about 2 times faster than Linux.

Nov 15 '17 16:11 smithakihide

I ran a comparison using a simple diffusion-equation code.

I examined both the single-precision and the double-precision versions. The common conditions are array size = 1048576 and loop count nT = 128. "init" is the time spent on initialization; "calc" is the time spent on calculation.

The compile command is: hcc `clamp-config --cxxflags --ldflags` amp_std_comp.cpp -O3 -o dd, on Ubuntu 16.04.3.

the result with single precision.

i5-4670 CPU @ 3.40GHz with normal loop and std::vector
init = 0.08781[s]
calc = 1.66423[s]   x1.00
RX 580 with parallel for and array
init = 0.210147[s]
calc = 0.029218[s]  x56.9
W8100 
init = 0.209308[s]
calc = 0.017998[s]  x92.5

the result of double precision,

i5-4670 CPU @ 3.40GHz with normal loop and std::vector
init = 0.076328[s]
calc = 0.959732[s] x1.00
RX 580 with parallel for and array
init = 0.183225[s]
calc = 0.033789[s] x28.4
W8100
init = 0.094943[s]
calc = 0.024426[s] x39.2

single precision on Windows 10 ver 1703 with VS 2017, Release build and AVX2.

Core i7 6800K @3.6GHz
init = 0.040035[s]
calc = 3.55448[s]   x0.46
RX 480
init = 0.06945[s]
calc = 0.054498[s]  x30.5
GTX 1080
init = 0.064302[s]
calc = 0.052711[s]  x31.6
Haswell GT2 mobile
init = 0.065724[s]
calc = 0.433497[s]  x3.84

double precision on Windows 10

Intel Core i7 6800K
init = 0.051084[s]
calc = 1.77646[s]  x0.54
RX 480
init = 0.075427[s]
calc = 0.081349[s]  x11.8
GTX 1080
init = 0.072133[s]
calc = 0.122203[s]  x7.85
R9 280X
init = 0.080332[s]
calc = 0.112282[s]  x8.55
Haswell GT2 mobile
init = 0.049524[s]
calc = 0.792195[s]   x1.21

Compared with the Windows environment, the Linux one is very fast. The double-precision data format also seems more suitable for modern CPU architectures.

When I compared similar conditions in 2015, the Windows environment ran the AMP code faster, so I can say that the hcc project has improved very much, and its C++ AMP mode is fast enough. I think this performance of HCC is sufficient for HPC usage. Also, if C++ AMP mode is compatible with OpenCL 1.2, we could use it for NVIDIA GPUs as well; because there should be no SPIR in the code produced by HCC, I think it would work.

Nov 19 '17 21:11 smithakihide

here is the whole code for this test.

#include <chrono>
#include <cstdint>
#include <exception>
#include <iostream>
#include <limits>
#include <random>
#include <string>
#include <thread>
#include <vector>

#include <amp.h>
#include <amp_math.h>

/// Lists every accelerator reported by the runtime on stdout and asks the user
/// (via stdin) to pick one by index.
///
/// @return A copy of the accelerator the user selected.
/// @throws int  -1 when the runtime reports no accelerators, or when stdin is
///              exhausted/broken before a valid index has been entered.
concurrency::accelerator ChoseAccelerator()
{
	using  std::cout;
	using  std::cin;
	using  std::vector;
	using  concurrency::accelerator;

	// No filtering is applied: every accelerator the runtime returns is offered.
	vector<accelerator> validAccl = accelerator::get_all();

	if (validAccl.empty())
	{
		cout << "there is no GPU suitable for GPGPU\n";
		throw - 1;
	}

	cout << "select the GPU" << "\n\n";
	for (std::size_t i = 0; i < validAccl.size(); ++i)
	{
		accelerator& candidate = validAccl[i];

		// The description is a wide string; narrow it character by character so
		// it can be written to the narrow cout stream (non-ASCII is truncated).
		const std::wstring wideName = candidate.get_description();
		std::string narrowName;
		narrowName.reserve(wideName.size());
		for (const wchar_t ch : wideName)
		{
			narrowName.push_back(static_cast<char>(ch));
		}

		cout << i << " : " << narrowName << "\n";
		// get_dedicated_memory() is documented as KB, so KB / 1024 / 1024 gives GB
		// (the previous "/ 1024 / 1000" mixed binary and decimal units).
		cout << "    memory                           : "
			<< static_cast<double>(candidate.get_dedicated_memory()) / (1024.0 * 1024.0) << " [GB]\n";
		cout << "    full double precision feature    : "
			<< (candidate.get_supports_double_precision() ? "true\n" : "false\n");
		cout << "    whether be used to display       : "
			<< (candidate.get_has_display() ? "true\n" : "false\n");
	}
	cout << "\nput a number to specify which GPU to use\n\n";

	for (;;)
	{
		cout << "the number = ";

		int selAcs = 0;
		if (!(cin >> selAcs))
		{
			// End of input or a broken stream can never produce a number;
			// bail out instead of prompting forever.
			if (cin.eof() || cin.bad())
			{
				cout << "\nno more input, cannot select a GPU\n";
				throw - 1;
			}

			cout << "please input a number\n";
			// Reset the fail state and drop the rest of the offending line so the
			// next extraction does not hit the same bad token again.
			cin.clear();
			cin.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
			continue;
		}
		cout << "\n";

		if (selAcs >= 0 && static_cast<std::size_t>(selAcs) < validAccl.size())
		{
			cout << selAcs << " is selected\n";
			return validAccl[static_cast<std::size_t>(selAcs)];
		}

		cout << "plsese set a number from 0 to " << validAccl.size() - 1 << "."
			<< "there is no GPU its number is" << selAcs << "\n";
	}
}

// File-scope shorthands used by main(), Acc() and cpustd().
using  std::cout;
using  concurrency::accelerator;
// Interactive accelerator picker (defined earlier in this file).
accelerator ChoseAccelerator();

// Accelerator used by Acc(); assigned in main() from ChoseAccelerator().
accelerator acc;
// Number of array elements; may be overridden by the first command-line argument.
int32_t N = 1024*1024;
// Number of outer time-step iterations; may be overridden by the second argument.
int32_t nT = 128;

// Element type of the benchmark arrays.
// NOTE(review): presumably switched to double for the double-precision runs — confirm.
using realp = float;
// Benchmark on the selected accelerator (concurrency::array + parallel_for_each).
void Acc();
// Benchmark on the CPU (std::vector + plain loops).
void cpustd();

/// Parses `text` as an integer and stores it in `target` when it is at least
/// `minValue`; otherwise leaves `target` untouched and reports why on stderr.
static void ReadIntArg(const char* text, const char* label, int32_t minValue, int32_t& target)
{
	try {
		const int parsed = std::stoi(text);
		if (parsed < minValue)
		{
			std::cerr << "ignoring " << label << " = " << parsed
				<< " (must be >= " << minValue << "), keeping " << target << "\n";
			return;
		}
		target = parsed;
	}
	catch (const std::exception&) {
		// std::stoi throws on non-numeric or out-of-range text.
		std::cerr << "ignoring " << label << " argument '" << text
			<< "' (not a valid integer), keeping " << target << "\n";
	}
}

/// Entry point: lets the user pick an accelerator, optionally overrides the
/// problem size (argv[1]) and the iteration count (argv[2]), then runs the
/// accelerator benchmark followed by the CPU benchmark.
///
/// @return 0 on success, 1 when no accelerator could be selected.
int main(int carg, char* varg[])
{
	try {
		acc = ChoseAccelerator();
	}
	catch (int) {
		// ChoseAccelerator() signals "nothing selectable" by throwing an int.
		std::cerr << "no accelerator selected, aborting\n";
		return 1;
	}

	if (carg > 1)
	{
		// The benchmarks write index N - 1 and launch (N - 2) / 2 work items,
		// so anything below 4 would give an empty or out-of-range domain.
		ReadIntArg(varg[1], "size", 4, N);
	}

	if (carg > 2)
	{
		ReadIntArg(varg[2], "nT", 0, nT);
	}

	cout << "carg = " << carg << "\n";
	cout << "size = " << N << "\n";
	cout << "nT = " << nT << "\n";


	Acc();
	cpustd();

	return 0;
}

void Acc()
{
	cout << "by amp with array\n";
	concurrency::array<realp> *pvA;

	pvA = new concurrency::array<realp>(N, acc.get_default_view());

	auto t0 = std::chrono::high_resolution_clock::now();
	concurrency::array_view<realp> aA = *pvA;
	//initialization.
	std::mt19937 mt(19);
	std::uniform_real_distribution<realp> dist(-1, 1);
	for (int32_t i = 0; i < N; i++)
	{
		aA[i] = dist(mt);
	}
	aA[0] = 0;
	aA[N - 1] = 1;
	aA.synchronize();

	auto t1 = std::chrono::high_resolution_clock::now();

	for (int32_t cT = 0; cT < nT; cT++)
	{
		for (int32_t r = 0; r < 2; r++)
		{
			concurrency::parallel_for_each(
				acc.get_default_view(),
				concurrency::extent<1>((N - 2) / 2),
				[=](concurrency::index<1> iR) restrict(amp) {
				int32_t iC = iR[0] * 2 + r + 1;
				int32_t iE = iC + 1;
				int32_t iW = iC - 1;
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
			});
		}
	}

	auto t2 = std::chrono::high_resolution_clock::now();
	aA.synchronize();


	realp res = aA[N / 4];
	auto t4 = std::chrono::high_resolution_clock::now();

	double dt1 = (double)std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count() / 1000000;
	double dt2 = (double)std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() / 1000000;
	double dt4 = (double)std::chrono::duration_cast<std::chrono::microseconds>(t4 - t1).count() / 1000000;

	cout << "Result = " << res << "\n";
	cout << "init = " << dt1 << "[s]\n";
	cout << "dt2 = " << dt2 << "[s]\n";
	cout << "calc = " << dt4 << "[s]\n";

	cout << "Acc\n";
}

void cpustd()
{
	cout << "by raw cpu with vector\n";


	std::vector<realp> *pvA;

	pvA = new std::vector<realp>(N);

	auto t0 = std::chrono::high_resolution_clock::now();
	std::vector<realp>& aA = *pvA;
	//initialization.
	std::mt19937 mt(19);
	std::uniform_real_distribution<realp> dist(-1, 1);
	for (int32_t i = 0; i < N; i++)
	{
		aA[i] = dist(mt);
	}
	aA[0] = 0;
	aA[N - 1] = 1;

	auto t1 = std::chrono::high_resolution_clock::now();

	for (int32_t cT = 0; cT < nT; cT++)
	{
		for (int32_t r = 0; r < 2; r++)
		{
			for (int32_t i = 0; i<(N - 2) / 2; i++)
			{
				int32_t iC = i * 2 + r + 1;
				int32_t iE = iC + 1;
				int32_t iW = iC - 1;
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
				aA[iC] = aA[iC] + 0.1*(aA[iE] - 2 * aA[iC] + aA[iW]);
			}
		}
	}
	auto t2 = std::chrono::high_resolution_clock::now();


	realp res = aA[N / 4];
	auto t4 = std::chrono::high_resolution_clock::now();

	double dt1 = (double)std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count() / 1000000;
	double dt2 = (double)std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() / 1000000;
	double dt4 = (double)std::chrono::duration_cast<std::chrono::microseconds>(t4 - t1).count() / 1000000;

	cout << "Result = " << res << "\n";
	cout << "init = " << dt1 << "[s]\n";
	cout << "dt2 = " << dt2 << "[s]\n";
	cout << "calc = " << dt4 << "[s]\n";
}

Nov 19 '17 21:11 smithakihide

@smithakihide Is an hcc program runnable on Windows?

Apr 30 '19 21:04 ghostplant