llama-cpp-python
llama-cpp-python copied to clipboard
Retrieve attention score for all input tokens per generated token
Is your feature request related to a problem? Please describe. In RAG scenarios, I think it would be a great help in telling whether an LLM is hallucinating or retrieving its information from the given context if we could get an attention score for all input tokens per generated token.
Describe the solution you'd like Having a callback-mechanism for every generated token, similar to the LogitsProcessor, that receives a list of scores.
Describe alternatives you've considered Calculating the scores by myself. But my knowledge of transformers is not sufficient.
Additional context I would like to build something like the "Attention tracing" in this repository, but with llama.cpp as backend.
Hey @parallaxe, the approach mentioned in that repo requires computing per-token attention from the outputs of specific transformer attention-head layers. This isn't currently supported by the llama.cpp API because the entire model is computed in a single forward pass and intermediate values are discarded before they can be inspected. At best we can currently return the per-token logits; these will only tell you the relative confidence of the model in predicting that specific token given the previous sequence.
Hope that helps.
Thanks for clarifying!
As I have looked around in the llama.cpp-project, I found this callback. A sample usage can be found here (but the int node_index must be removed) and a more extensive usage of the callback can be found here.
This is relatively new, but it looks like the kind of callback that could be used to retrieve an attention score, or am I mistaken?
Hey @parallaxe yes you're correct that that should work, right now I'm not exposing the ggml bindings directly in this project but that's doable (started on that in ggml-python but it's early stage), I'll look into it.
Hope this can be a good starting point!
First just need to update cmake to add the ggml_shared library.
CMakeLists.txt
cmake_minimum_required(VERSION 3.21)

project(llama_cpp)

option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON)
option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON)

# Install every artifact kind of `target` into the llama_cpp package directory.
#
# Two install rules are generated per target, in this order:
#   1. ${SKBUILD_PLATLIB_DIR}/llama_cpp        (the package built by scikit-build-core)
#   2. ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp   (temporary fix for
#      https://github.com/scikit-build/scikit-build-core/issues/374)
#
# Example: llama_cpp_python_install_target(llama)
function(llama_cpp_python_install_target target)
    foreach(package_dir IN ITEMS
            ${SKBUILD_PLATLIB_DIR}/llama_cpp
            ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp)
        install(
            TARGETS ${target}
            LIBRARY DESTINATION ${package_dir}
            RUNTIME DESTINATION ${package_dir}
            ARCHIVE DESTINATION ${package_dir}
            FRAMEWORK DESTINATION ${package_dir}
            RESOURCE DESTINATION ${package_dir}
        )
    endforeach()
endfunction()

if (LLAMA_BUILD)
    set(BUILD_SHARED_LIBS "On")

    # Building llama
    if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
        # Need to disable these llama.cpp flags on Apple x86_64,
        # otherwise users may encounter invalid instruction errors
        set(LLAMA_AVX "Off" CACHE BOOL "llama: enable AVX" FORCE)
        set(LLAMA_AVX2 "Off" CACHE BOOL "llama: enable AVX2" FORCE)
        set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
        set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
    endif()

    add_subdirectory(vendor/llama.cpp)
    llama_cpp_python_install_target(llama)

    # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
    # Ship the DLLs that the llama target needs at runtime next to it, in both
    # package directories.
    foreach(package_dir IN ITEMS
            ${SKBUILD_PLATLIB_DIR}/llama_cpp
            ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp)
        install(
            FILES $<TARGET_RUNTIME_DLLS:llama>
            DESTINATION ${package_dir}
        )
    endforeach()

    if (LLAVA_BUILD)
        # Building llava
        add_subdirectory(vendor/llama.cpp/examples/llava)
        set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
        # Set CUDA_ARCHITECTURES to OFF on windows
        if (WIN32)
            set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
        endif()
        llama_cpp_python_install_target(llava_shared)
    endif()

    # Expose the ggml shared library under the plain name "ggml" so the Python
    # bindings can load it from the package directory.
    set_target_properties(ggml_shared PROPERTIES OUTPUT_NAME "ggml")
    llama_cpp_python_install_target(ggml_shared)
endif()
Then write the minimal bindings to convert the ggml_tensor pointer to a numpy array.
ggml.py
import ctypes
import os
import pathlib
import sys
from typing import TYPE_CHECKING, List, TypeAlias
# Shared-library loader
def _load_shared_library(lib_base_name: str):
    """Find and load the shared library called ``lib_base_name``.

    The library is searched for in the directory containing this module, under
    the platform's usual file names (``lib<name>.so`` on Linux, ``lib<name>.so``
    or ``lib<name>.dylib`` on macOS, ``<name>.dll`` or ``lib<name>.dll`` on
    Windows).

    The search can be overridden with the environment variable
    ``<LIB_BASE_NAME>_CPP_LIB`` (the base name upper-cased, e.g. ``GGML_CPP_LIB``
    for ``"ggml"``); when set, its value is used as the only candidate path.

    Args:
        lib_base_name: Library name without ``lib`` prefix or file extension.

    Returns:
        The loaded ``ctypes.CDLL`` handle.

    Raises:
        RuntimeError: If the platform is unsupported, or a candidate file exists
            but cannot be loaded.
        FileNotFoundError: If no candidate file exists.
    """
    # Directory that contains this module; the build installs the libraries here.
    _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__)))

    # Candidate file names depend on the platform.
    _lib_paths: List[pathlib.Path] = []
    if sys.platform.startswith("linux"):
        _lib_paths += [
            _base_path / f"lib{lib_base_name}.so",
        ]
    elif sys.platform == "darwin":
        _lib_paths += [
            _base_path / f"lib{lib_base_name}.so",
            _base_path / f"lib{lib_base_name}.dylib",
        ]
    elif sys.platform == "win32":
        _lib_paths += [
            _base_path / f"{lib_base_name}.dll",
            _base_path / f"lib{lib_base_name}.dll",
        ]
    else:
        raise RuntimeError("Unsupported platform")

    # Per-library override. Deriving the variable name from the base name keeps
    # one library's override (e.g. LLAVA_CPP_LIB) from redirecting another's load.
    _override_var = f"{lib_base_name.upper()}_CPP_LIB"
    if _override_var in os.environ:
        lib_base_name = os.environ[_override_var]
        _lib = pathlib.Path(lib_base_name)
        _base_path = _lib.parent.resolve()
        _lib_paths = [_lib.resolve()]

    cdll_args = dict()  # type: ignore

    # Add the library directory to the DLL search path on Windows (if needed)
    if sys.platform == "win32" and sys.version_info >= (3, 8):
        os.add_dll_directory(str(_base_path))
        if "CUDA_PATH" in os.environ:
            for _subdir in ("bin", "lib"):
                _cuda_dir = os.path.join(os.environ["CUDA_PATH"], _subdir)
                # add_dll_directory fails on a missing directory; a CUDA install
                # without one of these folders should not break loading.
                if os.path.isdir(_cuda_dir):
                    os.add_dll_directory(_cuda_dir)
        cdll_args["winmode"] = ctypes.RTLD_GLOBAL

    # Load the first candidate that exists; a load failure is reported
    # immediately instead of moving on to the next candidate.
    for _lib_path in _lib_paths:
        if _lib_path.exists():
            try:
                return ctypes.CDLL(str(_lib_path), **cdll_args)
            except Exception as e:
                raise RuntimeError(
                    f"Failed to load shared library '{_lib_path}': {e}"
                ) from e

    raise FileNotFoundError(
        f"Shared library with base name '{lib_base_name}' not found"
    )
# Load the ggml shared library once at import time; `lib` is the ctypes handle
# that every binding below calls into.
libname = "ggml"
lib = _load_shared_library(libname)
GGML_MAX_DIMS = 4
GGML_MAX_SRC = 10
GGML_MAX_NAME = 64
GGML_MAX_OP_PARAMS = 64
# // n-dimensional tensor
# struct ggml_tensor {
# enum ggml_type type;
# enum ggml_backend_type backend;
# struct ggml_backend_buffer * buffer;
# int64_t ne[GGML_MAX_DIMS]; // number of elements
# size_t nb[GGML_MAX_DIMS]; // stride in bytes:
# // nb[0] = ggml_type_size(type)
# // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
# // nb[i] = nb[i-1] * ne[i-1]
# // compute data
# enum ggml_op op;
# // op params - allocated as int32_t for alignment
# int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
# bool is_param;
# struct ggml_tensor * grad;
# struct ggml_tensor * src[GGML_MAX_SRC];
# // performance
# int perf_runs;
# int64_t perf_cycles;
# int64_t perf_time_us;
# struct ggml_tensor * view_src;
# size_t view_offs;
# void * data;
# char name[GGML_MAX_NAME];
# void * extra; // extra things e.g. for ggml-cuda.cu
# char padding[8];
# };
class ggml_tensor(ctypes.Structure):
"""n-dimensional tensor
Attributes:
type (int): ggml_type
backend (int): ggml_backend
buffer (ctypes.pointer[ggml_backend_buffer]): pointer to backend buffer
ne (ctypes.Array[ctypes.c_int64]): number of elements in each dimension
nb (ctypes.Array[ctypes.c_size_t]): stride in bytes for each dimension
op (int): ggml operation
op_params (ctypes.Array[ctypes.c_int32]): `GGML_MAX_OP_PARAMS`-length array of operation parameters
is_param (bool): is this a parameter tensor
grad (ggml_tensor_p): reference to gradient tensor
src (ctypes.Array[ggml_tensor_p]): `GGML_MAX_SRC`-length array of source tensors
perf_runs (int): number of performance runs
perf_cycles (int): number of cycles
perf_time_us (int): time in microseconds
view_src (ggml_tensor_p): pointer to tensor if this tensor is a view, None if the tensor is not a view
view_offs (ctypes.c_size_t): offset into the data pointer of the view tensor
data (ctypes.c_void_p): reference to raw tensor data
name (bytes): name of tensor
extra (ctypes.c_void_p): extra data (e.g. for CUDA)
"""
ggml_tensor._fields_ = [
("type", ctypes.c_int),
("backend", ctypes.c_int),
("buffer", ctypes.c_void_p),
("ne", ctypes.c_int64 * GGML_MAX_DIMS),
("nb", ctypes.c_size_t * GGML_MAX_DIMS),
("op", ctypes.c_int),
(
"op_params",
ctypes.c_int32 * (GGML_MAX_OP_PARAMS // ctypes.sizeof(ctypes.c_int32)),
),
("is_param", ctypes.c_bool),
("grad", ctypes.POINTER(ggml_tensor)),
("src", ctypes.POINTER(ggml_tensor) * GGML_MAX_SRC),
("perf_runs", ctypes.c_int),
("perf_cycles", ctypes.c_int64),
("perf_time_us", ctypes.c_int64),
("view_src", ctypes.POINTER(ggml_tensor)),
("view_offs", ctypes.c_size_t),
("data", ctypes.c_void_p),
("name", ctypes.c_char * GGML_MAX_NAME),
("extra", ctypes.c_void_p),
("padding", ctypes.c_char * 8),
]
GGML_TENSOR_SIZE = ctypes.sizeof(ggml_tensor)
ggml_tensor_p: TypeAlias = "ctypes._Pointer[ggml_tensor]" if TYPE_CHECKING else ctypes.POINTER(ggml_tensor) # type: ignore
# typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
# ctypes function-pointer type for the callback declared above: wrap a Python
# callable with it to obtain a pointer that can be handed to C code.
ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE(
    ctypes.c_bool,    # return value
    ggml_tensor_p,    # struct ggml_tensor * t
    ctypes.c_bool,    # bool ask
    ctypes.c_void_p,  # void * user_data
)
# GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
lib.ggml_backend_tensor_get.restype = None
lib.ggml_backend_tensor_get.argtypes = [
    ggml_tensor_p,
    ctypes.c_void_p,
    ctypes.c_size_t,
    ctypes.c_size_t,
]


def ggml_backend_tensor_get(
    tensor: ggml_tensor_p, data: ctypes.c_void_p, offset: int, size: int
):
    """Forward to the C function ``ggml_backend_tensor_get``.

    Args:
        tensor: Pointer to the source tensor.
        data: Destination buffer (``void *`` in the C prototype).
        offset: ``size_t offset`` argument of the C prototype.
        size: ``size_t size`` argument of the C prototype.

    Returns:
        None; the C function is declared ``void``.
    """
    return lib.ggml_backend_tensor_get(tensor, data, offset, size)
# GGML_API GGML_CALL size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
lib.ggml_nbytes.restype = ctypes.c_size_t
lib.ggml_nbytes.argtypes = [ggml_tensor_p]


def ggml_nbytes(tensor: ggml_tensor_p) -> int:
    """Forward to the C function ``ggml_nbytes`` and return its ``size_t`` result.

    Args:
        tensor: Pointer to the tensor to query.
    """
    return lib.ggml_nbytes(tensor)
# GGML_API GGML_CALL int64_t ggml_nelements   (const struct ggml_tensor * tensor);
lib.ggml_nelements.restype = ctypes.c_int64
lib.ggml_nelements.argtypes = [ggml_tensor_p]


def ggml_nelements(tensor: ggml_tensor_p) -> int:
    """Forward to the C function ``ggml_nelements`` and return its ``int64_t`` result.

    Args:
        tensor: Pointer to the tensor to query.
    """
    return lib.ggml_nelements(tensor)
# GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
lib.ggml_n_dims.restype = ctypes.c_int
lib.ggml_n_dims.argtypes = [ggml_tensor_p]


def ggml_n_dims(tensor: ggml_tensor_p) -> int:
    """Forward to the C function ``ggml_n_dims``.

    Per the C header comment, the result is 1 for scalars.

    Args:
        tensor: Pointer to the tensor to query.
    """
    return lib.ggml_n_dims(tensor)
def ggml_tensor_to_numpy(tensor: ggml_tensor_p):
    """Copy a ggml tensor's data into a new numpy array.

    The returned array has shape ``ne[:ggml_n_dims(tensor)]`` and is indexed in
    ggml order: ``result[i0, i1, ...]`` is the element at ggml index
    ``(i0, i1, ...)``. ggml makes ``ne[0]`` the fastest-varying dimension
    (``nb[0]`` is the element size), so the flat buffer is reshaped in
    Fortran order; use ``result.T`` for the row-major view with the dimensions
    reversed.

    Args:
        tensor: Pointer to the tensor to copy.

    Returns:
        A numpy array of dtype float32 or float16 that owns a copy of the data.

    Raises:
        NotImplementedError: If the tensor type is neither F32 nor F16.
        ValueError: If the tensor's byte size does not equal
            ``nelements * itemsize``, in which case a flat copy would be wrong.
    """
    import numpy as np

    # enum ggml_type: GGML_TYPE_F32 = 0, GGML_TYPE_F16 = 1. Other types
    # (quantized blocks, integers) are not plain float arrays and must not be
    # reinterpreted as float32.
    dtype_by_ggml_type = {0: np.float32, 1: np.float16}

    ggml_type = tensor.contents.type
    if ggml_type not in dtype_by_ggml_type:
        raise NotImplementedError(
            f"Unsupported ggml tensor type {ggml_type}; only F32 (0) and F16 (1) are supported"
        )
    dtype = np.dtype(dtype_by_ggml_type[ggml_type])

    nbytes = ggml_nbytes(tensor)
    nelements = ggml_nelements(tensor)
    # Guard the raw copy below: the destination holds exactly
    # nelements * itemsize bytes, so any other size would either overrun the
    # buffer or leave part of it uninitialised.
    if nbytes != nelements * dtype.itemsize:
        raise ValueError(
            f"Tensor occupies {nbytes} bytes but {nelements} elements of {dtype} "
            f"need {nelements * dtype.itemsize} bytes"
        )

    data = np.empty(nelements, dtype=dtype)
    ggml_backend_tensor_get(
        tensor, ctypes.cast(data.ctypes.data, ctypes.c_void_p), 0, nbytes
    )

    shape = tuple(tensor.contents.ne[: ggml_n_dims(tensor)])
    # order="F": the first axis varies fastest, matching ggml's memory layout.
    return data.reshape(shape, order="F")
You would also need to filter out the exact operators in the ggml graph for the attention head outputs, likely either by name or by operator type.
Oh wow, this looks great! Thank you! I will see how far I can get with it 😄
Hey @parallaxe, I am also very interested in this feature. Have you managed to get the attention scores yet?
Hi @reuank, I switched from llama-cpp-python to llama.cpp for other reasons, and started implementing an attention-score-collecting callback for the server implementation. I'm not sure how / if this may end up in a PR, but I will push it as a fork once the implementation is good enough.
+1 would love to see this feature.
+1 I would love to see this in the future! Please feel free to give any more guidance, given the llama.cpp refactor
I think I figured out how to do this. examples/eval-callback gives an example on how to use the callback. I took the code from eval-callback and put that into examples/embedding to hook up the callback. That works fine. Only thing then is to identify the correct tensor.
I believe it is in llama-graph.cpp in function build_attn_mha on line 1122. The line that says: kq = ggml_soft_max_ext(...) So after that line I added: ggml_set_name(kq, "kq");
The callback gives me 24 x {4, 4, 16, 1} tensors. I am using multilingual-e5-large-instruct, which has 24 blocks and 16 heads, and my input was 4 tokens, so that looks OK, I think.