Performance drop
Expected Behavior
No performance drop
Actual Behavior
Performance drop of 25% with the changes of https://github.com/STEllAR-GROUP/hpx/commit/6f16e770c9f6ae809e9fbf15384ab11e40c390e6
Steps to Reproduce the Problem
Use restricted_thread_pool_executor together with executor_guided_chunk_size, as in my old examples in https://github.com/STEllAR-GROUP/hpx/pull/5117; a condensed sketch follows below.
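A minimal sketch of that pattern, condensed from the PR #5117 examples (the PU range 0-3 and the trivial loop body are placeholders of mine, not the original settings):

#include <hpx/execution.hpp>
#include <hpx/hpx_main.hpp>
#include <hpx/include/parallel_executors.hpp>
#include <hpx/parallel/algorithms/for_loop.hpp>

int main() {
    // Executor restricted to the first four worker threads of the default pool.
    hpx::parallel::execution::restricted_thread_pool_executor exec{ 0, 4 };
    // The full reproducer below additionally attaches a custom
    // executor_guided_chunk_size parameter via .with(...).
    hpx::for_loop( hpx::execution::par.on( exec ), 0, 1000, []( auto ) {} );
    return 0;
}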
Specifications
- Ubuntu clang version 14.0.0-1ubuntu1
@m-diers Marco, could you please be more specific about how to reproduce this? Do you have a small code example reproducing it?
The performance degradation is noticeable in simple algorithms such as the ones below (d09db41 is the merge of the relevant PR #6157, while 75838f5 is from before the merge).
[benchmark plots for hpx::for_each and hpx::transform, comparing d09db41 against 75838f5 over varying input sizes and per-element workloads]
Note that there was a small improvement in some cases (i.e. for CPU-heavy tasks and smaller input sizes), if that's of any interest.
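For reference, a timing loop of roughly this shape is enough to reproduce such a comparison between the two commits (a sketch only; the input size and the per-element work are placeholders, not the exact settings behind the plots):

#include <hpx/algorithm.hpp>
#include <hpx/chrono.hpp>
#include <hpx/execution.hpp>
#include <hpx/hpx_main.hpp>

#include <iostream>
#include <vector>

int main() {
    std::vector<double> v( 1 << 22, 1.0 );
    hpx::chrono::high_resolution_timer timer;
    // Same call on both builds; only the HPX commit underneath changes.
    hpx::for_each( hpx::execution::par, v.begin(), v.end(),
                   []( double& x ) { x = 2.0 * x + 1.0; } );
    std::cout << "for_each: " << timer.elapsed() << " s\n";
    return 0;
}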
@Pansysk75 thanks. Doesn't that show trends opposite to the measurements we did before? This PR binds threads to cores in a way that prevents them from being stolen by other cores, which could be the reason for the performance regression. Could you please try commenting out this code section: https://github.com/STEllAR-GROUP/hpx/blob/d05037b092ac4f5b5ab18c470d5856c02ab58cec/libs/core/algorithms/include/hpx/parallel/util/adapt_thread_priority.hpp#L31-L46 and try again?
@hkaiser
Here is the old example with an adapted workload.
Example
#include "hpx/runtime_distributed/find_all_localities.hpp"
#include <hpx/hpx_init.hpp>
#include <hpx/iostream.hpp>
#include <hpx/future.hpp>
#include <hpx/compute_local/host/numa_domains.hpp>
#include <hpx/include/parallel_executors.hpp>
#include <hpx/parallel/algorithms/for_loop.hpp>
namespace hpx::parallel::execution {
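// Thin wrapper that remembers its PU range so the chunk-size parameter below
// can query how many processing units the executor spans.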
struct restricted_thread_pool_executor_v : public restricted_thread_pool_executor {
public:
explicit restricted_thread_pool_executor_v( std::pair<std::size_t, std::size_t> num_pus )
: restricted_thread_pool_executor( num_pus.first,
num_pus.second )
, num_pus_( num_pus ) {
}
auto get_num_pus() const -> std::pair<std::size_t, std::size_t> {
return num_pus_;
}
private:
std::pair<std::size_t, std::size_t> num_pus_;
};
template<>
struct is_one_way_executor<restricted_thread_pool_executor_v> : std::true_type {
};
template<>
struct is_two_way_executor<restricted_thread_pool_executor_v> : std::true_type {
};
template<>
struct is_bulk_two_way_executor<restricted_thread_pool_executor_v> : std::true_type {
};
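// Executor parameter that creates exactly one chunk per PU of the restricted
// executor, instead of letting HPX pick the chunk size.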
struct executor_guided_chunk_size {
constexpr explicit executor_guided_chunk_size() = default;
template<typename Executor, typename F>
constexpr std::size_t get_chunk_size( Executor&& exec,
F&& f,
std::size_t cores,
std::size_t num_tasks ) const {
// One chunk per PU of the restricted executor (ceiling division),
// e.g. 540 iterations on 4 PUs -> chunks of 135 iterations each.
auto corecount( exec.get_num_pus().second );
//auto corecount( 4ul ); // hard-coded value used for testing (4 or 6 on my machines)
return ( std::max )( 1ul, ( num_tasks + corecount - 1 ) / corecount );
}
private:
friend class hpx::serialization::access;
template<typename Archive>
void serialize( Archive& /*ar*/,
const unsigned int /*version*/ ) {
}
};
template<>
struct is_executor_parameters<executor_guided_chunk_size>
: std::true_type {
};
}
using Target = std::array<std::size_t, 3>;
using Targets = std::vector<Target>;
class Component6177 : public hpx::components::component_base<Component6177> {
private:
using Grid = std::vector<std::vector<float>>;
static constexpr auto TimeSteps = 5000ul;
static constexpr auto DimX = 540ul;
static constexpr auto DimZ = 230ul;
static constexpr auto DimPad = 6ul;
public:
Component6177() = default;
auto execute( Target target ) -> Target {
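// Target layout: { locality id, first PU of the executor, number of PUs }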
hpx::parallel::execution::restricted_thread_pool_executor_v executor{ std::make_pair( target[1], target[2] ) };
Grid grida( DimX, Grid::value_type( DimZ, 1.f ) );
Grid padded( DimX + 2 * DimPad, Grid::value_type( DimZ + 2 * DimPad, 99.f ) );
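// Artificial stencil workload: repeatedly sweep the padded grid so that each
// chunk of the for_loop performs a measurable amount of work.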
for( auto t( 0ul ); t < TimeSteps; ++t ) {
hpx::for_loop( hpx::execution::par.on( executor ).with( hpx::parallel::execution::executor_guided_chunk_size{} ),
DimPad, DimX + DimPad,
[&]( auto i ) {
auto sign = i % 2;
for( auto k = DimPad; k < DimZ + DimPad; ++k ) {
grida[i - DimPad][k - DimPad] = 0.5 * padded[i][k] +
0.4 * ( padded[i + 1][k] + sign * padded[i - 1][k] ) +
0.3 * ( padded[i + 2][k] + sign * padded[i - 2][k] ) +
0.2 * ( padded[i + 3][k] + sign * padded[i - 3][k] ) +
0.1 * ( padded[i + 4][k] + sign * padded[i - 4][k] ) +
0.09 * ( padded[i + 5][k] + sign * padded[i - 5][k] ) +
0.08 * ( padded[i + 6][k] + sign * padded[i - 6][k] ) +
0.5 * padded[i][k] +
0.4 * ( padded[i][k + 1] + sign * padded[i][k - 1] ) +
0.3 * ( padded[i][k + 2] + sign * padded[i][k - 2] ) +
0.2 * ( padded[i][k + 3] + sign * padded[i][k - 3] ) +
0.1 * ( padded[i][k + 4] + sign * padded[i][k - 4] ) +
0.09 * ( padded[i][k + 5] + sign * padded[i][k - 5] ) +
0.08 * ( padded[i][k + 6] + sign * padded[i][k - 6] );
}
} );
}
return target;
}
auto targets( std::size_t hint ) const -> Targets {
auto numadomains( hpx::compute::host::numa_domains() );
auto numacount( numadomains.size() );
auto numasize( numadomains.front().num_pus().second );
auto executorsize( std::min( hint, numasize ) );
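// Grow the executor size until it evenly divides the PUs of one NUMA domain,
// e.g. 24 PUs with hint 5 -> executor size 6, i.e. four executors per domain.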
while( numasize % executorsize ) {
++executorsize;
}
Targets targets;
for( auto i( 0u ); i < ( numacount * numasize ); i += executorsize ) {
targets.emplace_back( Target{ hpx::get_locality_id(), i, executorsize } );
}
return targets;
}
HPX_DEFINE_COMPONENT_ACTION( Component6177, execute );
HPX_DEFINE_COMPONENT_ACTION( Component6177, targets );
};
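// Register the component and its actions so they can be invoked remotely.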
HPX_REGISTER_COMPONENT( hpx::components::component<Component6177>, Component6177 );
HPX_REGISTER_ACTION( Component6177::execute_action );
HPX_REGISTER_ACTION( Component6177::targets_action );
class Component6177Client : public hpx::components::client_base<Component6177Client, Component6177> {
using BaseType = hpx::components::client_base<Component6177Client, Component6177>;
public:
template<typename... Arguments>
explicit Component6177Client( Arguments... arguments )
: BaseType( std::move( arguments )... ) {
}
template<typename... Arguments>
auto execute( Arguments... arguments ) -> hpx::future<Target> {
return hpx::async<Component6177::execute_action>( this->get_id(), std::move( arguments )... );
}
template<typename... Arguments>
auto targets( Arguments... arguments ) -> hpx::future<Targets> {
return hpx::async<Component6177::targets_action>( this->get_id(), std::move( arguments )... );
}
};
int hpx_main() {
std::vector<Component6177Client> clients;
auto localities( hpx::find_all_localities() );
std::transform( std::begin( localities ), std::end( localities ),
std::back_inserter( clients ),
[]( auto& loc ) {
return hpx::new_<Component6177Client>( loc );
} );
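// One client per locality; the code below indexes 'clients' by locality id
// (Target[0]), relying on find_all_localities() listing localities in id order.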
std::vector<hpx::future<Targets>> targets;
for( auto& client : clients ) {
targets.emplace_back( client.targets( 4ul ) );
}
std::vector<hpx::future<Target>> results;
std::for_each( std::begin( targets ), std::end( targets ),
[&]( auto&& target ) {
for( auto&& subtarget : target.get() ) {
results.emplace_back( clients[subtarget[0]].execute( subtarget ) );
}
} );
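// Re-issue work until 25 shots have been launched in total: wait for any
// result to become ready, then immediately schedule the next shot on the same
// target.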
for( auto counter( results.size() ); counter < 25ul; ++counter ) {
hpx::wait_any( results );
auto res( std::find_if( std::begin( results ), std::end( results ), []( auto& result ) { return result.has_value(); } ) );
auto result( res->get() );
*res = clients[result[0]].execute( result );
std::cout << "Shot " << counter << " on " << result[0] << " " << result[1] << ":" << result[2] << std::endl;
}
hpx::wait_all( results );
return hpx::finalize();
}
int main( int argc, char* argv[] ) {
return hpx::init( argc, argv );
}
Unfortunately, I also get a compilation error with the commit I mentioned. I have entered my fixed value there for testing (see the commented-out line in get_chunk_size); the executor and the chunk size still need to be passed through properly.
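As a possible workaround sketch, the PU count could be stored in the chunk-size parameter itself instead of being queried from the executor (fixed_count_chunk_size is a hypothetical name of mine; it keeps the old four-argument get_chunk_size signature from the example above, and the is_executor_parameters and serialization boilerplate would mirror executor_guided_chunk_size):

struct fixed_count_chunk_size {
    explicit fixed_count_chunk_size( std::size_t pus )
    : pus_( pus ) {
    }
    template<typename Executor, typename F>
    constexpr std::size_t get_chunk_size( Executor&&,
                                          F&&,
                                          std::size_t /*cores*/,
                                          std::size_t num_tasks ) const {
        // Same ceiling division as executor_guided_chunk_size, but with the
        // core count supplied at construction time.
        return ( std::max )( 1ul, ( num_tasks + pus_ - 1 ) / pus_ );
    }
private:
    std::size_t pus_;
};

It could then be attached in execute() via .with( fixed_count_chunk_size{ target[2] } ).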
Runtime (before vs. with the commit):
- AMD Ryzen 9 5950X: 10 s vs 25 s
- AMD EPYC 7401P: 16 s vs 32 s
@hkaiser That partially eliminates the performance drop.
Application runtime (before the commit vs. with the code section commented out):
- AMD Ryzen 9 5950X (1 NUMA node): 10 s vs 11 s
- AMD EPYC 7401P (4 NUMA nodes): 16 s vs 21 s
Here is another data point indicating that we have not entirely fixed the performance regression yet:

The yellow line is what we should strive for; the pink one represents the current state.