performance: align structures for 64-bit platforms
More info about PR changes
If you are well versed in optimization for C/C++/C# compilers, you probably know about pahole, a tool for analyzing the memory layout of structures.
I used pahole (https://linux.die.net/man/1/pahole), a utility from the Linux kernel ecosystem that analyzes how structures are laid out in memory and reports fields that are not well aligned, as well as structures whose size exceeds a CPU cache line (64 bytes by default). It is therefore very important that structure sizes are multiples of 64 bytes or slightly less (for example 64 or 128 bytes, or less: 56 bytes, 112 bytes, etc.); otherwise the CPU needs more clock cycles to access them. Proper alignment also gives a strong performance boost.
The structure fields were reordered while keeping the code as readable and as close to the original code style as possible.
This commit reduces the CPU time spent moving, copying and creating objects of the changed structures.
The smaller a structure or class is, the higher the chance that it fits in the CPU cache. Most processors are already 64-bit, so the change won't make anything worse.
Example in real production
A Google developer simply realigned structure fields; the results are here:
- https://www.phoronix.com/news/Linux-6.8-Networking
- https://lore.kernel.org/lkml/[email protected]/
Info about technique:
- https://softwareengineering.stackexchange.com/questions/328775/how-important-is-memory-alignment-does-it-still-matter
- https://wr.informatik.uni-hamburg.de/_media/teaching/wintersemester_2013_2014/epc-14-haase-svenhendrik-alignmentinc-presentation.pdf
- https://hpc.rz.rptu.de/Tutorials/AVX/alignment.shtml
- https://www.intel.com/content/www/us/en/developer/articles/technical/data-alignment-to-assist-vectorization.html
- https://en.wikipedia.org/wiki/Data_structure_alignment
- https://stackoverflow.com/a/20882083
- https://zijishi.xyz/post/optimization-technique/learning-to-use-data-alignment/
Pahole example with HudElements:
- The comment `/* XXX {n} bytes hole, try to pack */` shows where optimization is possible by rearranging the order of fields in structures and classes.
The master branch uses 13 CPU cache lines for HudElements; my PR uses 12. That's a big difference in hot-path code.
Master branch
class HudElements {
struct exec_entry {
int pos; /* 0 4 */
/* XXX 4 bytes hole, try to pack */
string value; /* 8 32 */
string ret; /* 40 32 */
/* size: 72, cachelines: 2, members: 3 */
/* sum members: 68, holes: 1, sum holes: 4 */
/* last cacheline: 8 bytes */
};
/* tag__fprintf: const_type tag not supported! */;
enum display_servers {
UNKNOWN = 0,
WAYLAND = 1,
XWAYLAND = 2,
XORG = 3,
};
struct hud_colors {
bool convert; /* 0 1 */
bool update; /* 1 1 */
/* XXX 2 bytes hole, try to pack */
struct ImVec4 cpu; /* 4 16 */
struct ImVec4 gpu; /* 20 16 */
struct ImVec4 vram; /* 36 16 */
struct ImVec4 ram; /* 52 16 */
/* --- cacheline 1 boundary (64 bytes) was 4 bytes ago --- */
struct ImVec4 swap; /* 68 16 */
struct ImVec4 engine; /* 84 16 */
struct ImVec4 io; /* 100 16 */
struct ImVec4 frametime; /* 116 16 */
/* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */
struct ImVec4 background; /* 132 16 */
struct ImVec4 text; /* 148 16 */
struct ImVec4 media_player; /* 164 16 */
struct ImVec4 wine; /* 180 16 */
/* --- cacheline 3 boundary (192 bytes) was 4 bytes ago --- */
struct ImVec4 horizontal_separator; /* 196 16 */
struct ImVec4 battery; /* 212 16 */
struct ImVec4 gpu_load_low; /* 228 16 */
struct ImVec4 gpu_load_med; /* 244 16 */
/* --- cacheline 4 boundary (256 bytes) was 4 bytes ago --- */
struct ImVec4 gpu_load_high; /* 260 16 */
struct ImVec4 cpu_load_low; /* 276 16 */
struct ImVec4 cpu_load_med; /* 292 16 */
struct ImVec4 cpu_load_high; /* 308 16 */
/* --- cacheline 5 boundary (320 bytes) was 4 bytes ago --- */
struct ImVec4 fps_value_low; /* 324 16 */
struct ImVec4 fps_value_med; /* 340 16 */
struct ImVec4 fps_value_high; /* 356 16 */
struct ImVec4 text_outline; /* 372 16 */
/* --- cacheline 6 boundary (384 bytes) was 4 bytes ago --- */
struct ImVec4 network; /* 388 16 */
/* size: 404, cachelines: 7, members: 27 */
/* sum members: 402, holes: 1, sum holes: 2 */
/* last cacheline: 20 bytes */
};
public:
struct swapchain_stats * sw_stats; /* 0 8 */
class shared_ptr<overlay_params> params; /* 8 16 */
float ralign_width; /* 24 4 */
float old_scale; /* 28 4 */
float res_width; /* 32 4 */
float res_height; /* 36 4 */
bool is_vulkan; /* 40 1 */
bool gamemode_bol; /* 41 1 */
bool vkbasalt_bol; /* 42 1 */
/* XXX 1 byte hole, try to pack */
int place; /* 44 4 */
int text_column; /* 48 4 */
int table_columns_count; /* 52 4 */
pid_t g_gamescopePid; /* 56 4 */
int g_fsrUpscale; /* 60 4 */
int g_fsrSharpness; /* 64 4 */
/* XXX 4 bytes hole, try to pack */
time_point last_exec; /* 72 8 */
class vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > options; /* 80 24 */
class vector<Function, std::allocator<Function> > ordered_functions; /* 104 24 */
class vector<float, std::allocator<float> > gamescope_debug_latency; /* 128 24 */
class vector<float, std::allocator<float> > gamescope_debug_app; /* 152 24 */
int min; /* 176 4 */
int max; /* 180 4 */
int gpu_core_max; /* 184 4 */
int gpu_mem_max; /* 188 4 */
int cpu_temp_max; /* 192 4 */
int gpu_temp_max; /* 196 4 */
const class vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > permitted_params; /* 200 24 */
class vector<HudElements::exec_entry, std::allocator<HudElements::exec_entry> > exec_list; /* 224 24 */
time_point overlay_start; /* 248 8 */
uint32_t vendorID; /* 256 4 */
int hdr_status; /* 260 4 */
int refresh; /* 264 4 */
unsigned int vsync; /* 268 4 */
enum display_servers display_server; /* 272 4 */
/* XXX 4 bytes hole, try to pack */
class unique_ptr<Net, std::default_delete<Net> > net; /* 280 8 */
class unique_ptr<Shell, std::default_delete<Shell> > shell; /* 288 8 */
void sort_elements(class HudElements *, const struct pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<c);
void legacy_elements(class HudElements *, const struct overlay_params *);
void update_exec(class HudElements *);
int convert_to_fahrenheit(class HudElements *, int);
void version(void);
void time(void);
void gpu_stats(void);
void cpu_stats(void);
void core_load(void);
void io_stats(void);
void vram(void);
void proc_vram(void);
void ram(void);
void procmem(void);
void fps(void);
void engine_version(void);
void gpu_name(void);
void vulkan_driver(void);
void arch(void);
void wine(void);
void frame_timing(void);
void media_player(void);
void resolution(void);
void show_fps_limit(void);
void custom_text_center(void);
void custom_text(void);
void vkbasalt(void);
void gamemode(void);
void graphs(void);
void _exec(void);
void battery(void);
void fps_only(void);
void gamescope_fsr(void);
void gamescope_frame_timing(void);
void device_battery(void);
void frame_count(void);
void fan(void);
void throttling_status(void);
void exec_name(void);
void duration(void);
void fps_metrics(void);
void hdr(void);
void refresh_rate(void);
void winesync(void);
void present_mode(void);
void network(void);
void _display_session(void);
void fex_stats(void);
void ftrace(void);
void convert_colors(class HudElements *, const struct overlay_params &);
void convert_colors(class HudElements *, bool, const struct overlay_params &);
struct hud_colors colors; /* 296 404 */
/* XXX last struct has 1 hole */
void TextColored(class HudElements *, struct ImVec4, const char *, ...);
/* --- cacheline 10 boundary (640 bytes) was 60 bytes ago --- */
struct array<VkPresentModeKHR, 6> presentModes; /* 700 24 */
/* XXX 4 bytes hole, try to pack */
/* --- cacheline 11 boundary (704 bytes) was 24 bytes ago --- */
class map<VkPresentModeKHR, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::less<VkPresentModeKHR>, std::allocator<std::pair<const VkPresentModeKHR, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > presentModeMap; /* 728 48 */
/* --- cacheline 12 boundary (768 bytes) was 8 bytes ago --- */
VkPresentModeKHR cur_present_mode; /* 776 4 */
string get_present_mode(class HudElements *);
/* size: 784, cachelines: 13, members: 40 */
/* sum members: 767, holes: 4, sum holes: 13 */
/* padding: 4 */
/* member types with holes: 1, total: 1 */
/* last cacheline: 16 bytes */
};
My PR branch
class HudElements {
enum display_servers {
UNKNOWN = 0,
WAYLAND = 1,
XWAYLAND = 2,
XORG = 3,
};
struct exec_entry {
int pos; /* 0 4 */
/* XXX 4 bytes hole, try to pack */
string value; /* 8 32 */
string ret; /* 40 32 */
/* size: 72, cachelines: 2, members: 3 */
/* sum members: 68, holes: 1, sum holes: 4 */
/* last cacheline: 8 bytes */
};
/* tag__fprintf: const_type tag not supported! */;
struct hud_colors {
bool convert; /* 0 1 */
bool update; /* 1 1 */
/* XXX 2 bytes hole, try to pack */
struct ImVec4 cpu; /* 4 16 */
struct ImVec4 gpu; /* 20 16 */
struct ImVec4 vram; /* 36 16 */
struct ImVec4 ram; /* 52 16 */
/* --- cacheline 1 boundary (64 bytes) was 4 bytes ago --- */
struct ImVec4 swap; /* 68 16 */
struct ImVec4 engine; /* 84 16 */
struct ImVec4 io; /* 100 16 */
struct ImVec4 frametime; /* 116 16 */
/* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */
struct ImVec4 background; /* 132 16 */
struct ImVec4 text; /* 148 16 */
struct ImVec4 media_player; /* 164 16 */
struct ImVec4 wine; /* 180 16 */
/* --- cacheline 3 boundary (192 bytes) was 4 bytes ago --- */
struct ImVec4 horizontal_separator; /* 196 16 */
struct ImVec4 battery; /* 212 16 */
struct ImVec4 gpu_load_low; /* 228 16 */
struct ImVec4 gpu_load_med; /* 244 16 */
/* --- cacheline 4 boundary (256 bytes) was 4 bytes ago --- */
struct ImVec4 gpu_load_high; /* 260 16 */
struct ImVec4 cpu_load_low; /* 276 16 */
struct ImVec4 cpu_load_med; /* 292 16 */
struct ImVec4 cpu_load_high; /* 308 16 */
/* --- cacheline 5 boundary (320 bytes) was 4 bytes ago --- */
struct ImVec4 fps_value_low; /* 324 16 */
struct ImVec4 fps_value_med; /* 340 16 */
struct ImVec4 fps_value_high; /* 356 16 */
struct ImVec4 text_outline; /* 372 16 */
/* --- cacheline 6 boundary (384 bytes) was 4 bytes ago --- */
struct ImVec4 network; /* 388 16 */
/* size: 404, cachelines: 7, members: 27 */
/* sum members: 402, holes: 1, sum holes: 2 */
/* last cacheline: 20 bytes */
};
public:
struct swapchain_stats * sw_stats; /* 0 8 */
class shared_ptr<overlay_params> params; /* 8 16 */
time_point last_exec; /* 24 8 */
time_point overlay_start; /* 32 8 */
class unique_ptr<Net, std::default_delete<Net> > net; /* 40 8 */
class unique_ptr<Shell, std::default_delete<Shell> > shell; /* 48 8 */
class vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > options; /* 56 24 */
class vector<Function, std::allocator<Function> > ordered_functions; /* 80 24 */
class vector<float, std::allocator<float> > gamescope_debug_latency; /* 104 24 */
class vector<float, std::allocator<float> > gamescope_debug_app; /* 128 24 */
const class vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > permitted_params; /* 152 24 */
class vector<HudElements::exec_entry, std::allocator<HudElements::exec_entry> > exec_list; /* 176 24 */
VkPresentModeKHR cur_present_mode; /* 200 4 */
float ralign_width; /* 204 4 */
float old_scale; /* 208 4 */
float res_width; /* 212 4 */
float res_height; /* 216 4 */
int place; /* 220 4 */
int text_column; /* 224 4 */
int table_columns_count; /* 228 4 */
pid_t g_gamescopePid; /* 232 4 */
int g_fsrUpscale; /* 236 4 */
int g_fsrSharpness; /* 240 4 */
int min; /* 244 4 */
int max; /* 248 4 */
int gpu_core_max; /* 252 4 */
int gpu_mem_max; /* 256 4 */
int cpu_temp_max; /* 260 4 */
int gpu_temp_max; /* 264 4 */
uint32_t vendorID; /* 268 4 */
int hdr_status; /* 272 4 */
int refresh; /* 276 4 */
unsigned int vsync; /* 280 4 */
enum display_servers display_server; /* 284 4 */
bool is_vulkan; /* 288 1 */
bool gamemode_bol; /* 289 1 */
bool vkbasalt_bol; /* 290 1 */
/* XXX 1 byte hole, try to pack */
void sort_elements(class HudElements *, const struct pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<c);
void legacy_elements(class HudElements *, const struct overlay_params *);
void update_exec(class HudElements *);
int convert_to_fahrenheit(class HudElements *, int);
void version(void);
void time(void);
void gpu_stats(void);
void cpu_stats(void);
void core_load(void);
void io_stats(void);
void vram(void);
void proc_vram(void);
void ram(void);
void procmem(void);
void fps(void);
void engine_version(void);
void gpu_name(void);
void vulkan_driver(void);
void arch(void);
void wine(void);
void frame_timing(void);
void media_player(void);
void resolution(void);
void show_fps_limit(void);
void custom_text_center(void);
void custom_text(void);
void vkbasalt(void);
void gamemode(void);
void graphs(void);
void _exec(void);
void battery(void);
void fps_only(void);
void gamescope_fsr(void);
void gamescope_frame_timing(void);
void device_battery(void);
void frame_count(void);
void fan(void);
void throttling_status(void);
void exec_name(void);
void duration(void);
void fps_metrics(void);
void hdr(void);
void refresh_rate(void);
void winesync(void);
void present_mode(void);
void network(void);
void _display_session(void);
void fex_stats(void);
void ftrace(void);
void convert_colors(class HudElements *, const struct overlay_params &);
void convert_colors(class HudElements *, bool, const struct overlay_params &);
struct hud_colors colors; /* 292 404 */
/* XXX last struct has 1 hole */
void TextColored(class HudElements *, struct ImVec4, const char *, ...);
/* --- cacheline 10 boundary (640 bytes) was 56 bytes ago --- */
struct array<VkPresentModeKHR, 6> presentModes; /* 696 24 */
/* --- cacheline 11 boundary (704 bytes) was 16 bytes ago --- */
class map<VkPresentModeKHR, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::less<VkPresentModeKHR>, std::allocator<std::pair<const VkPresentModeKHR, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > presentModeMap; /* 720 48 */
string get_present_mode(class HudElements *);
/* size: 768, cachelines: 12, members: 40 */
/* sum members: 767, holes: 1, sum holes: 1 */
/* member types with holes: 1, total: 1 */
};
The linked examples are about kernel TCP/netdev internals, which MangoHud doesn't use, so they don't justify this change. Without benchmarks in MangoHud itself, this feels like high churn for purely theoretical gains. Since we still ship 32-bit builds, layouts optimized for 64-bit also risk being neutral or even negative on 32-bit.