Introduce Graph Profiler
Here is an attempt at reintroducing the original whole-graph profiler (LLAMA_PERF) with some additional features. It is not ready to be merged into master, but it is useful for profiling different models (on the CPU).
Features:
- Profiling data is collected per op / per thread and aggregated during the report
- We collect three per-op events: START, SYNC, END
  - This allows for measuring the sync overhead per op
- All timing data is in nanoseconds
- The output is a markdown table
  - See the example below (nice and readable output both in the terminal and rendered)
  - The output includes a compact dump of all tensor dims and data types of the op (see Flash Attention in the sample below)
    - This might be handy in other places
- Changes to `graph_compute` are minimal / clean (see the sketch after this list)
  - If `GGML_GRAPH_PROFILER` is not defined there is zero overhead (well, besides the extra pointer in the graph)
  - If the profiler is compiled in but not enabled the overhead is relatively small (3 extra if checks per graph node)
    - We can probably optimize that out further (i.e. a separate compute loop without profiling calls)
- Tested on a couple of platforms (M2 Max and Snapdragon Gen 3) with and without TSAN
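To make the "minimal / clean changes" point concrete, here is a rough sketch of how the hooks sit around each node in the CPU compute loop. The `ggml_graph_profile_event` signature matches the declarations in `ggml-profile.h` quoted further down in this thread, but the `GGML_PROF_OP_*` event constant names and the surrounding loop body are illustrative, not the exact code from the branch:

```c
// Sketch only: the GGML_PROF_OP_* constant names are hypothetical.
// Each worker thread (ith) records three timestamps per node; the gap
// between SYNC and END is the per-op synchronization overhead.
#ifdef GGML_GRAPH_PROFILER
    ggml_graph_profile_event(cgraph, GGML_PROF_OP_START, node_n, ith); // op starts
#endif

    ggml_compute_forward(&params, node);                               // the actual op work

#ifdef GGML_GRAPH_PROFILER
    ggml_graph_profile_event(cgraph, GGML_PROF_OP_SYNC, node_n, ith);  // this thread is done
#endif

    ggml_barrier(tp);                                                  // wait for the other threads

#ifdef GGML_GRAPH_PROFILER
    ggml_graph_profile_event(cgraph, GGML_PROF_OP_END, node_n, ith);   // all threads synced
#endif
```

When `GGML_GRAPH_PROFILER` is not defined the hooks compile away entirely, which is where the zero-overhead property comes from; when it is defined but profiling is disabled, each call is a cheap early-out check.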
Known issues:
- Using an env var is just a hack for now
  - I'm thinking of adding `ggml_init_params.graph_profile`, or it'll be moved into the backend params. If set, it'll enable profiling for all the graphs created from that context
  - The params will probably be a string (`filename:format` or something like that)
- The `profile_data` allocation is leaky
- Currently it only works with the CPU backend
  - If there is interest it should be easy to extend to other backends, where they could update the per-node/per-thread `ggml_profile_timing` data (they'd have to collect it on the accelerator and then export it into this common format); see the sketch below
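For reference, a minimal sketch of what such a per-node/per-thread timing record could look like. The struct name comes from the mention above, but the field layout is an assumption for illustration; only the three-event model and the nanosecond unit are taken from this PR's description:

```c
#include <stdint.h>

// Assumed layout (illustrative, not the actual ggml-profile.h definition):
// one record per (node, thread). The report would derive:
//   proc  = nsec_sync - nsec_start
//   sync  = nsec_end  - nsec_sync
//   total = nsec_end  - nsec_start
struct ggml_profile_timing {
    uint64_t nsec_start; // START event: op dispatched on this thread
    uint64_t nsec_sync;  // SYNC event: this thread finished its share of the work
    uint64_t nsec_end;   // END event: all threads passed the barrier
};
```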
See original PR #9647 for additional details.
Example of the terminal output:
```
| node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |
| -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |
| 0000 | GET_ROWS | 22863 | 47566042 | 47588905 | 4096:128256:1:1 x 2:1:1:1 -> 4096:2:1:1 | q4_0 x i32 -> f32 | inp_embd |
| 0001 | RMS_NORM | 96926 | 47859323 | 47956249 | 4096:2:1:1 -> 4096:2:1:1 | f32 -> f32 | norm-0 |
| 0002 | MUL | 11875 | 36215730 | 36227605 | 4096:2:1:1 x 4096:1:1:1 -> 4096:2:1:1 | f32 x f32 -> f32 | attn_norm-0 |
| 0003 | MUL_MAT | 27633698 | 21271875 | 48905573 | 4096:4096:1:1 x 4096:2:1:1 -> 4096:2:1:1 | q4_0_4x8 x f32 -> f32 | Qcur-0 |
| 0004 | RESHAPE | 9167 | 22963435 | 22972602 | 4096:2:1:1 -> 128:32:1:1 | f32 -> f32 | Qcur-0 (reshaped) |
| 0005 | ROPE | 1787239 | 22215678 | 24002917 | 128:32:1:1 x 2:1:1:1 -> 128:32:1:1 | f32 x i32 -> f32 | Qcur-0 |
| 0006 | MUL_MAT | 20612237 | 698388 | 21310625 | 4096:1024:1:1 x 4096:2:1:1 -> 1024:2:1:1 | q4_0_4x8 x f32 -> f32 | Kcur-0 |
| 0007 | RESHAPE | 10989 | 7033 | 18022 | 1024:2:1:1 -> 128:8:1:1 | f32 -> f32 | Kcur-0 (reshaped) |
| 0008 | ROPE | 2566925 | 332969 | 2899894 | 128:8:1:1 x 2:1:1:1 -> 128:8:1:1 | f32 x i32 -> f32 | Kcur-0 |
| 0009 | MUL_MAT | 489845 | 98229 | 588074 | 4096:1024:1:1 x 4096:2:1:1 -> 1024:2:1:1 | q4_0_4x8 x f32 -> f32 | Vcur-0 |
| 0010 | VIEW | 7396 | 3074 | 10470 | 4194304:1:1:1 -> 2048:1:1:1 | q4_0 -> q4_0 | k_cache_view-0 |
| 0011 | CPY | 15782 | 7186 | 22968 | 128:8:1:1 x 2048:1:1:1 -> 2048:1:1:1 | f32 x q4_0 -> q4_0 | k_cache_view-0 (copy of Kcur-0) |
| 0012 | VIEW | 5836 | 2444 | 8280 | 4194304:1:1:1 -> 2048:1:1:1 | q4_0 -> q4_0 | v_cache_view-0 |
| 0013 | CPY | 14324 | 12865 | 27189 | 1024:2:1:1 x 2048:1:1:1 -> 2048:1:1:1 | f32 x q4_0 -> q4_0 | v_cache_view-0 (copy of Vcur-0) |
| 0014 | PERMUTE | 5572 | 2240 | 7812 | 128:32:1:1 -> 128:2:1:1 | f32 -> f32 | q-0 |
| 0015 | VIEW | 10469 | 3543 | 14012 | 4194304:1:1:1 -> 128:256:1:1 | q4_0 -> q4_0 | k-0 |
| 0016 | VIEW | 9375 | 3071 | 12446 | 4194304:1:1:1 -> 128:256:1:1 | q4_0 -> q4_0 | v-0 |
| 0017 | CPY | 10053 | 4635 | 14688 | 256:32:1:1 x 256:32:1:1 -> 256:32:1:1 | f32 x f16 -> f16 | KQ_mask (copy) |
| 0018 | FLASH_ATTN_EXT | 113753 | 17809 | 131562 | 128:2:1:1 x 128:256:1:1 x 128:256:1:1 x 256:32:1:1 -> 128:32:1:1 | f32 x q4_0 x q4_0 x f16 -> f32 | node_18 |
```
Same example rendered in Markdown:
| node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |
|---|---|---|---|---|---|---|---|
| 0000 | GET_ROWS | 22863 | 47566042 | 47588905 | 4096:128256:1:1 x 2:1:1:1 -> 4096:2:1:1 | q4_0 x i32 -> f32 | inp_embd |
| 0001 | RMS_NORM | 96926 | 47859323 | 47956249 | 4096:2:1:1 -> 4096:2:1:1 | f32 -> f32 | norm-0 |
| 0002 | MUL | 11875 | 36215730 | 36227605 | 4096:2:1:1 x 4096:1:1:1 -> 4096:2:1:1 | f32 x f32 -> f32 | attn_norm-0 |
| 0003 | MUL_MAT | 27633698 | 21271875 | 48905573 | 4096:4096:1:1 x 4096:2:1:1 -> 4096:2:1:1 | q4_0_4x8 x f32 -> f32 | Qcur-0 |
| 0004 | RESHAPE | 9167 | 22963435 | 22972602 | 4096:2:1:1 -> 128:32:1:1 | f32 -> f32 | Qcur-0 (reshaped) |
| 0005 | ROPE | 1787239 | 22215678 | 24002917 | 128:32:1:1 x 2:1:1:1 -> 128:32:1:1 | f32 x i32 -> f32 | Qcur-0 |
| 0006 | MUL_MAT | 20612237 | 698388 | 21310625 | 4096:1024:1:1 x 4096:2:1:1 -> 1024:2:1:1 | q4_0_4x8 x f32 -> f32 | Kcur-0 |
| 0007 | RESHAPE | 10989 | 7033 | 18022 | 1024:2:1:1 -> 128:8:1:1 | f32 -> f32 | Kcur-0 (reshaped) |
| 0008 | ROPE | 2566925 | 332969 | 2899894 | 128:8:1:1 x 2:1:1:1 -> 128:8:1:1 | f32 x i32 -> f32 | Kcur-0 |
| 0009 | MUL_MAT | 489845 | 98229 | 588074 | 4096:1024:1:1 x 4096:2:1:1 -> 1024:2:1:1 | q4_0_4x8 x f32 -> f32 | Vcur-0 |
| 0010 | VIEW | 7396 | 3074 | 10470 | 4194304:1:1:1 -> 2048:1:1:1 | q4_0 -> q4_0 | k_cache_view-0 |
| 0011 | CPY | 15782 | 7186 | 22968 | 128:8:1:1 x 2048:1:1:1 -> 2048:1:1:1 | f32 x q4_0 -> q4_0 | k_cache_view-0 (copy of Kcur-0) |
| 0012 | VIEW | 5836 | 2444 | 8280 | 4194304:1:1:1 -> 2048:1:1:1 | q4_0 -> q4_0 | v_cache_view-0 |
| 0013 | CPY | 14324 | 12865 | 27189 | 1024:2:1:1 x 2048:1:1:1 -> 2048:1:1:1 | f32 x q4_0 -> q4_0 | v_cache_view-0 (copy of Vcur-0) |
| 0014 | PERMUTE | 5572 | 2240 | 7812 | 128:32:1:1 -> 128:2:1:1 | f32 -> f32 | q-0 |
| 0015 | VIEW | 10469 | 3543 | 14012 | 4194304:1:1:1 -> 128:256:1:1 | q4_0 -> q4_0 | k-0 |
| 0016 | VIEW | 9375 | 3071 | 12446 | 4194304:1:1:1 -> 128:256:1:1 | q4_0 -> q4_0 | v-0 |
| 0017 | CPY | 10053 | 4635 | 14688 | 256:32:1:1 x 256:32:1:1 -> 256:32:1:1 | f32 x f16 -> f16 | KQ_mask (copy) |
| 0018 | FLASH_ATTN_EXT | 113753 | 17809 | 131562 | 128:2:1:1 x 128:256:1:1 x 128:256:1:1 x 256:32:1:1 -> 128:32:1:1 | f32 x q4_0 x q4_0 x f16 -> f32 | node_18 |
@max-krasnyansky I am using the graph-profiler branch but I'm unsure how to trigger and get the profiling details. Any docs, commands or references would be appreciated. Thanks.
> @max-krasnyansky I am using the graph-profiler branch but I'm unsure how to trigger and get the profiling details. Any docs, commands or references would be appreciated. Thanks.
Sorry for the delay. I thought the example was included in the PR description, but it looks like I forgot to add it.
Here is how to build (arm64-ubuntu):
```
CC=clang-18 CXX=clang++-18 CFLAGS="-march=armv8.7a" CXXFLAGS="-march=armv8.7a" cmake -G Ninja -D GGML_GRAPH_PROFILER=ON -B build-arm64
cmake --build build-arm64
```
And here is how to run:
```
GGML_GRAPH_PROFILE=stderr ./build-arm64/bin/llama-cli -m ../gguf/llama-v3.1-8b-instruct.q4_0_4_8.gguf -p Hi -n 1
```
This will get you the output I included in the PR:
```
ggml-profile:| node idx | op name | proc (nsec) | sync (nsec) | total (nsec) | op dims | op types | tensor name |
ggml-profile:| -------: | :------ | ----------: | ----------: | -----------: | ------: | -------: | ----------: |
ggml-profile:| 0000 | GET_ROWS | 6269425 | 135449736 | 141719161 | 4096:128256:1:1 x 2:1:1:1 -> 4096:2:1:1 | q4_0 x i32 -> f32 | inp_embd |
ggml-profile:| 0001 | RMS_NORM | 1038728 | 154384695 | 155423423 | 4096:2:1:1 -> 4096:2:1:1 | f32 -> f32 | norm-0 |
```
Hi, I am also trying to figure out how to profile properly with llama.cpp. In my case, I would like to see performance beyond the node level: for example, the aggregated time of all the nodes generated by `build_attn_mha`. Ideally I would like a time-series view similar to nsys, which could tell me how much time is spent on the attention part of the llama model. Do you have any idea how to implement this in llama.cpp?
I think a good approach can be that for each `ggml_tensor` we add another field besides `op` which records where this op is coming from, so that we can differentiate the same op coming from multiple operations, for example the same `mat_mul` from attn or mlp.
> I think a good approach can be that for each `ggml_tensor` we add another field besides `op` which records where this op is coming from, so that we can differentiate the same op coming from multiple operations, for example the same `mat_mul` from attn or mlp.
For that, I'm thinking it might make sense to insert dummy graph nodes that record profiling data. You could also post-process the profile output and combine nodes by tensor name (i.e. add up all the tensors for attention layer-X).
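To illustrate the post-processing idea, here is a small hedged sketch (not part of the PR) that sums the "proc (nsec)" column of the profiler's markdown output for all rows matching a substring, so you can e.g. total everything named `attn_norm` or everything for layer 0. It assumes the column layout of the tables above:

```c
// Hypothetical post-processor, not part of this PR. Reads the profiler's
// markdown table on stdin and aggregates the "proc (nsec)" column over all
// rows containing the given substring.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char **argv) {
    const char *match = argc > 1 ? argv[1] : "attn";
    char line[1024];
    long long proc_total = 0;

    while (fgets(line, sizeof(line), stdin)) {
        char *row = strchr(line, '|');            // skip any "ggml-profile:" prefix
        if (!row || !strstr(row, match)) continue;
        if (strstr(row, "op name")) continue;     // skip the header row

        char *tok = strtok(row, "|");             // node idx
        tok = strtok(NULL, "|");                  // op name
        tok = strtok(NULL, "|");                  // proc (nsec)
        if (tok) proc_total += atoll(tok);
    }
    printf("aggregate proc for '%s': %lld nsec\n", match, proc_total);
    return 0;
}
```

Piping the `GGML_GRAPH_PROFILE=stderr` output through something like this would give you a per-component total; a proper time-series view like nsys would need more work on top.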
Hi, I was looking for a tool exactly like this to dump the actual graph operators during execution. Very useful. One piece of feedback: I needed the following change to compile in my environment (Windows 11 x64 + VS2022).
I was getting link errors such as `ggml-cpu.c.obj : error LNK2019: unresolved external symbol ggml_graph_profile_start referenced in function ggml_graph_compute`.
```diff
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 2cc42d4b0..acfa79fff 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -583,6 +583,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
     endif()
 
+    if (GGML_GRAPH_PROFILER)
+        target_link_libraries(${GGML_CPU_NAME} PRIVATE ggml-base)
+    endif()
+
     message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
     target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
     target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
diff --git a/ggml/src/ggml-profile.h b/ggml/src/ggml-profile.h
index 3f8fecc08..f63f019ce 100644
--- a/ggml/src/ggml-profile.h
+++ b/ggml/src/ggml-profile.h
@@ -77,11 +77,11 @@ static inline void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum g
 #else
 
-void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads);
-void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads);
-void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads);
-void ggml_graph_profile_free(struct ggml_cgraph *cg);
-void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);
+GGML_API void ggml_graph_profile_init(struct ggml_cgraph *cg, int n_threads);
+GGML_API void ggml_graph_profile_start(struct ggml_cgraph *cg, int n_threads);
+GGML_API void ggml_graph_profile_finish(struct ggml_cgraph *cg, int n_threads);
+GGML_API void ggml_graph_profile_free(struct ggml_cgraph *cg);
+GGML_API void ggml_graph_profile_event(const struct ggml_cgraph *cg, enum ggml_profile_event e, int node_n, int ith);
 
 #endif // GGML_GRAPH_PROFILER
```