llama-cpp-python Retrieve attention score for all input tokens per generated token

Is your feature request related to a problem? Please describe. In RAG scenarios, I think it would be a great help for telling whether an LLM is hallucinating or retrieving its information from the given context if we could get an attention score for all input tokens per generated token.

Describe the solution you'd like Having a callback-mechanism for every generated token, similar to the LogitsProcessor, that receives a list of scores.

Describe alternatives you've considered Calculating the scores by myself. But my knowledge of transformers is not sufficient.

Additional context I would like to build something like the "Attention tracing" in this repository, but with llama.cpp as backend.

Jan 29 '24 14:01 parallaxe

Hey @parallaxe the approach mentioned in that repo requires computing per-token attention from the outputs of specific transformer attention-head layers. This isn't currently supported by the llama.cpp API because the entire model is computed in a single forward pass and intermediate values are discarded before they can be read back. At best we can currently return the per-token logits, which will only tell you the relative confidence of the model in predicting that specific token given the previous sequence.

Hope that helps.

Jan 31 '24 04:01 abetlen

Thanks for clarifying!

Jan 31 '24 08:01 parallaxe

As I have looked around in the llama.cpp-project, I found this callback. A sample usage can be found here (but the int node_index must be removed) and a more extensive usage of the callback can be found here. This is relatively new, but it looks like the kind of callback that could be used to retrieve an attention score, or am I mistaken?

Feb 04 '24 20:02 parallaxe

Hey @parallaxe yes you're correct that that should work, right now I'm not exposing the ggml bindings directly in this project but that's doable (started on that in ggml-python but it's early stage), I'll look into it.

Feb 06 '24 03:02 abetlen

Hope this can be a good starting point!

First just need to update cmake to add the ggml_shared library.

CMakeLists.txt

cmake_minimum_required(VERSION 3.21)

project(llama_cpp)

option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON)
option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON)

# llama_cpp_python_install_target(<target>)
#
# Install every artifact kind of <target> (library, runtime, archive,
# framework, resource) into the `llama_cpp` package directory, in two places:
#   1. ${SKBUILD_PLATLIB_DIR}/llama_cpp      - the scikit-build-core wheel tree
#   2. ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp - the source tree; temporary fix for
#      https://github.com/scikit-build/scikit-build-core/issues/374
#
# Example:
#   llama_cpp_python_install_target(llama)
function(llama_cpp_python_install_target target)
    if (NOT TARGET ${target})
        message(FATAL_ERROR "llama_cpp_python_install_target: '${target}' is not a target")
    endif()

    foreach(package_dir IN ITEMS
        "${SKBUILD_PLATLIB_DIR}/llama_cpp"
        "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp"
    )
        install(
            TARGETS ${target}
            LIBRARY DESTINATION "${package_dir}"
            RUNTIME DESTINATION "${package_dir}"
            ARCHIVE DESTINATION "${package_dir}"
            FRAMEWORK DESTINATION "${package_dir}"
            RESOURCE DESTINATION "${package_dir}"
        )
    endforeach()
endfunction()

if (LLAMA_BUILD)
    # The Python bindings load the libraries through ctypes, so they must be shared.
    set(BUILD_SHARED_LIBS "On")

    # Building llama
    if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
        # Need to disable these llama.cpp flags on Apple x86_64,
        # otherwise users may encounter invalid instruction errors
        set(LLAMA_AVX "Off" CACHE BOOL "llama: enable AVX" FORCE)
        set(LLAMA_AVX2 "Off" CACHE BOOL "llama: enable AVX2" FORCE)
        set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
        set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
    endif()
    add_subdirectory(vendor/llama.cpp)
    llama_cpp_python_install_target(llama)

    # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
    install(
        FILES $<TARGET_RUNTIME_DLLS:llama>
        DESTINATION "${SKBUILD_PLATLIB_DIR}/llama_cpp"
    )
    install(
        FILES $<TARGET_RUNTIME_DLLS:llama>
        DESTINATION "${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp"
    )

    if (LLAVA_BUILD)
        # Building llava
        add_subdirectory(vendor/llama.cpp/examples/llava)
        set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
        # Set CUDA_ARCHITECTURES to OFF on windows
        if (WIN32)
            set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
        endif()
        llama_cpp_python_install_target(llava_shared)
    endif()

    # Ship ggml as its own shared library (named "ggml") so that the Python
    # side can bind ggml functions directly.
    set_target_properties(ggml_shared PROPERTIES OUTPUT_NAME "ggml")
    llama_cpp_python_install_target(ggml_shared)
endif()

Then write the minimal bindings to convert the ggml_tensor pointer to a numpy array.

ggml.py

import ctypes
import os
import pathlib
import sys

from typing import TYPE_CHECKING, List, TypeAlias


# Load the library
def _load_shared_library(lib_base_name: str):
    # Construct the paths to the possible shared library names
    _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__)))
    # Searching for the library in the current directory under the name "libllama" (default name
    # for llamacpp) and "llama" (default name for this repo)
    _lib_paths: List[pathlib.Path] = []
    # Determine the file extension based on the platform
    if sys.platform.startswith("linux"):
        _lib_paths += [
            _base_path / f"lib{lib_base_name}.so",
        ]
    elif sys.platform == "darwin":
        _lib_paths += [
            _base_path / f"lib{lib_base_name}.so",
            _base_path / f"lib{lib_base_name}.dylib",
        ]
    elif sys.platform == "win32":
        _lib_paths += [
            _base_path / f"{lib_base_name}.dll",
            _base_path / f"lib{lib_base_name}.dll",
        ]
    else:
        raise RuntimeError("Unsupported platform")

    if "LLAVA_CPP_LIB" in os.environ:
        lib_base_name = os.environ["LLAVA_CPP_LIB"]
        _lib = pathlib.Path(lib_base_name)
        _base_path = _lib.parent.resolve()
        _lib_paths = [_lib.resolve()]

    cdll_args = dict()  # type: ignore
    # Add the library directory to the DLL search path on Windows (if needed)
    if sys.platform == "win32" and sys.version_info >= (3, 8):
        os.add_dll_directory(str(_base_path))
        if "CUDA_PATH" in os.environ:
            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
        cdll_args["winmode"] = ctypes.RTLD_GLOBAL

    # Try to load the shared library, handling potential errors
    for _lib_path in _lib_paths:
        if _lib_path.exists():
            try:
                return ctypes.CDLL(str(_lib_path), **cdll_args)
            except Exception as e:
                raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")

    raise FileNotFoundError(
        f"Shared library with base name '{lib_base_name}' not found"
    )


# Load the library
# This runs at import time; `lib` is the handle that the wrapper functions in
# this module call through.
libname = "ggml"
lib = _load_shared_library(libname)


# Array-size constants used by the `ggml_tensor` layout below (ne/nb, src, name,
# op_params).
# NOTE(review): presumably these mirror the GGML_MAX_* macros of the ggml.h the
# library was built from -- they must match or the struct layout is wrong; confirm
# when updating the vendored llama.cpp.
GGML_MAX_DIMS = 4
GGML_MAX_SRC = 10
GGML_MAX_NAME = 64
GGML_MAX_OP_PARAMS = 64


# // n-dimensional tensor
# struct ggml_tensor {
#     enum ggml_type         type;
#     enum ggml_backend_type backend;

#     struct ggml_backend_buffer * buffer;

#     int64_t ne[GGML_MAX_DIMS]; // number of elements
#     size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
#                                // nb[0] = ggml_type_size(type)
#                                // nb[1] = nb[0]   * (ne[0] / ggml_blck_size(type)) + padding
#                                // nb[i] = nb[i-1] * ne[i-1]

#     // compute data
#     enum ggml_op op;

#     // op params - allocated as int32_t for alignment
#     int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];

#     bool is_param;

#     struct ggml_tensor * grad;
#     struct ggml_tensor * src[GGML_MAX_SRC];

#     // performance
#     int     perf_runs;
#     int64_t perf_cycles;
#     int64_t perf_time_us;

#     struct ggml_tensor * view_src;
#     size_t               view_offs;

#     void * data;

#     char name[GGML_MAX_NAME];

#     void * extra; // extra things e.g. for ggml-cuda.cu


#     char padding[8];
# };
class ggml_tensor(ctypes.Structure):
    """n-dimensional tensor

    ctypes mirror of the C `struct ggml_tensor` quoted in the comment above
    this class; the fields are declared in the same order as in that struct.

    Attributes:
        type (int): ggml_type
        backend (int): ggml_backend
        buffer (ctypes.pointer[ggml_backend_buffer]): pointer to backend buffer
        ne (ctypes.Array[ctypes.c_int64]): number of elements in each dimension
        nb (ctypes.Array[ctypes.c_size_t]): stride in bytes for each dimension
        op (int): ggml operation
        op_params (ctypes.Array[ctypes.c_int32]): `GGML_MAX_OP_PARAMS`-length array of operation parameters
        is_param (bool): is this a parameter tensor
        grad (ggml_tensor_p): reference to gradient tensor
        src (ctypes.Array[ggml_tensor_p]): `GGML_MAX_SRC`-length array of source tensors
        perf_runs (int): number of performance runs
        perf_cycles (int): number of cycles
        perf_time_us (int): time in microseconds
        view_src (ggml_tensor_p): pointer to tensor if this tensor is a view, None if the tensor is not a view
        view_offs (ctypes.c_size_t): offset into the data pointer of the view tensor
        data (ctypes.c_void_p): reference to raw tensor data
        name (bytes): name of tensor
        extra (ctypes.c_void_p): extra data (e.g. for CUDA)
        padding (bytes): 8 trailing padding bytes, as in the C struct
    """


# `_fields_` is assigned after the class statement because `grad`, `src` and
# `view_src` are pointers to `ggml_tensor` itself, which must already exist.
# The two enum members and `op` are declared as plain C ints.
ggml_tensor._fields_ = [
    ("type", ctypes.c_int),
    ("backend", ctypes.c_int),
    ("buffer", ctypes.c_void_p),
    ("ne", ctypes.c_int64 * GGML_MAX_DIMS),
    ("nb", ctypes.c_size_t * GGML_MAX_DIMS),
    ("op", ctypes.c_int),
    (
        "op_params",
        # GGML_MAX_OP_PARAMS is a byte count; the array holds int32 slots.
        ctypes.c_int32 * (GGML_MAX_OP_PARAMS // ctypes.sizeof(ctypes.c_int32)),
    ),
    ("is_param", ctypes.c_bool),
    ("grad", ctypes.POINTER(ggml_tensor)),
    ("src", ctypes.POINTER(ggml_tensor) * GGML_MAX_SRC),
    ("perf_runs", ctypes.c_int),
    ("perf_cycles", ctypes.c_int64),
    ("perf_time_us", ctypes.c_int64),
    ("view_src", ctypes.POINTER(ggml_tensor)),
    ("view_offs", ctypes.c_size_t),
    ("data", ctypes.c_void_p),
    ("name", ctypes.c_char * GGML_MAX_NAME),
    ("extra", ctypes.c_void_p),
    ("padding", ctypes.c_char * 8),
]

# Size in bytes of one `ggml_tensor` as laid out by ctypes.
GGML_TENSOR_SIZE = ctypes.sizeof(ggml_tensor)

# Pointer-to-ggml_tensor type used in the signatures below. Type checkers see
# the string form; at runtime the value is `ctypes.POINTER(ggml_tensor)`.
ggml_tensor_p: TypeAlias = "ctypes._Pointer[ggml_tensor]" if TYPE_CHECKING else ctypes.POINTER(ggml_tensor)  # type: ignore

# typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
# ctypes prototype for that callback: returns a bool and receives
# (tensor pointer, `ask` flag, opaque user-data pointer).
ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE(
    ctypes.c_bool, ggml_tensor_p, ctypes.c_bool, ctypes.c_void_p
)


# GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
# C signature: (const struct ggml_tensor *, void *, size_t, size_t) -> void
lib.ggml_backend_tensor_get.restype = None
lib.ggml_backend_tensor_get.argtypes = [
    ggml_tensor_p,
    ctypes.c_void_p,
    ctypes.c_size_t,
    ctypes.c_size_t,
]


def ggml_backend_tensor_get(
    tensor: ggml_tensor_p, data: ctypes.c_void_p, offset: int, size: int
):
    """Call the C function `ggml_backend_tensor_get` with the given arguments.

    Args:
        tensor: Pointer to the tensor to read from.
        data: Destination buffer, passed through as `void *`.
        offset: Passed as the C `offset` argument (`size_t`).
        size: Passed as the C `size` argument (`size_t`).

    Returns:
        None; the C function is declared as returning void.
    """
    c_function = lib.ggml_backend_tensor_get
    return c_function(tensor, data, offset, size)


# GGML_API GGML_CALL size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
# C signature: (const struct ggml_tensor *) -> size_t
lib.ggml_nbytes.restype = ctypes.c_size_t
lib.ggml_nbytes.argtypes = [ggml_tensor_p]


def ggml_nbytes(tensor: ggml_tensor_p) -> int:
    """Return the result of the C function `ggml_nbytes` for `tensor`.

    Args:
        tensor: Pointer to the tensor to query.

    Returns:
        The `size_t` value reported by the library, as a Python int.
    """
    byte_count = lib.ggml_nbytes(tensor)
    return byte_count


# GGML_API GGML_CALL int64_t ggml_nelements   (const struct ggml_tensor * tensor);
# C signature: (const struct ggml_tensor *) -> int64_t
lib.ggml_nelements.restype = ctypes.c_int64
lib.ggml_nelements.argtypes = [ggml_tensor_p]


def ggml_nelements(tensor: ggml_tensor_p) -> int:
    """Return the result of the C function `ggml_nelements` for `tensor`.

    Args:
        tensor: Pointer to the tensor to query.

    Returns:
        The `int64_t` value reported by the library, as a Python int.
    """
    element_count = lib.ggml_nelements(tensor)
    return element_count


# GGML_API           int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
# C signature: (const struct ggml_tensor *) -> int
lib.ggml_n_dims.restype = ctypes.c_int
lib.ggml_n_dims.argtypes = [ggml_tensor_p]


def ggml_n_dims(tensor: ggml_tensor_p) -> int:
    """Return the result of the C function `ggml_n_dims` for `tensor`.

    Per the C header comment, the library returns 1 for scalars.

    Args:
        tensor: Pointer to the tensor to query.

    Returns:
        The `int` value reported by the library.
    """
    dim_count = lib.ggml_n_dims(tensor)
    return dim_count


def ggml_tensor_to_numpy(tensor: ggml_tensor_p):
    """Copy the contents of a float32 ggml tensor into a new NumPy array.

    The data is fetched with `ggml_backend_tensor_get`, so the returned array
    is an independent copy.

    ggml lists dimensions with the contiguous one first (`nb[0]` is the element
    size and `nb[i] = nb[i-1] * ne[i-1]`), whereas a C-ordered NumPy array has
    its contiguous axis last. The returned shape is therefore `ne[:n_dims]`
    reversed, so that `result[..., j, i]` addresses ggml element `(i, j, ...)`.

    Args:
        tensor: Pointer to the tensor to copy.

    Returns:
        A `numpy.ndarray` of dtype float32 with shape `reversed(ne[:n_dims])`.

    Raises:
        NotImplementedError: The tensor's type is not float32.
        ValueError: The byte size reported by ggml does not equal
            `nelements * 4`, so the data cannot be read as dense float32.
    """
    import numpy as np

    GGML_TYPE_F32 = 0  # value of GGML_TYPE_F32 in ggml.h's `enum ggml_type`

    header = tensor.contents
    if header.type != GGML_TYPE_F32:
        raise NotImplementedError(
            f"Only float32 tensors can be converted (got ggml type {header.type})"
        )

    nbytes = ggml_nbytes(tensor)
    nelements = ggml_nelements(tensor)
    data = np.empty(nelements, dtype=np.float32)
    # Never ask ggml to write more (or fewer) bytes than the buffer holds.
    if nbytes != data.nbytes:
        raise ValueError(
            f"Tensor reports {nbytes} bytes but {nelements} float32 elements "
            f"need {data.nbytes} bytes"
        )

    ggml_backend_tensor_get(
        tensor, ctypes.cast(data.ctypes.data, ctypes.c_void_p), 0, nbytes
    )
    # Reverse ggml's fastest-first dimension order into NumPy's C order.
    shape = tuple(header.ne[: ggml_n_dims(tensor)])[::-1]
    return data.reshape(shape)

You would also need to filter out the exact operators in the ggml graph for the attention head outputs, likely either by name or by operator type.

Feb 06 '24 05:02 abetlen

Oh wow, this looks great! Thank you! I will see how far I can get with it 😄

Feb 06 '24 09:02 parallaxe

Hey @parallaxe, I am also very interested in this feature. Have you managed to get the attention scores yet?

Apr 04 '24 08:04 reuank

Hi @reuank, I switched from llama-cpp-python to llama.cpp for other reasons, and started implementing an attention-score-collecting callback for the server implementation. I'm not sure how or whether this will end up as a PR, but I will push it as a fork once the implementation is good enough.

Apr 05 '24 06:04 parallaxe

+1 would love to see this feature.

Oct 22 '24 05:10 ekcrisp

+1 I would love to see this in the future! Please feel free to give any more guidance, given the llama.cpp refactor

May 02 '25 05:05 clarismiranda

I think I figured out how to do this. examples/eval-callback gives an example on how to use the callback. I took the code from eval-callback and put that into examples/embedding to hook up the callback. That works fine. Only thing then is to identify the correct tensor.

I believe it is in llama-graph.cpp in function build_attn_mha on line 1122. The line that says: kq = ggml_soft_max_ext(...) So after that line I added: ggml_set_name(kq, "kq");

The callback gives me 24 x {4, 4, 16, 1} tensors. I am using multilingual-e5-large-instruct, which has 24 blocks and 16 heads, and my input was 4 tokens, so that looks OK, I think.

Jun 01 '25 18:06 marcov-dart

llama-cpp-python llama-cpp-python copied to clipboard

Retrieve attention score for all input tokens per generated token

llama-cpp-python
llama-cpp-python copied to clipboard