llama.cpp
llama.cpp copied to clipboard
grammar, json, llama: replace push with emplace where possible
This is mostly a code refactoring, but it also leaves less code in the insert functions.
📈 llama.cpp server for bench-server-baseline on Standard_NC4as_T4_v3 for phi-2-q4_0: 550 iterations 🚀
Expand details for performance related PR only
- Concurrent users: 8, duration: 10m
- HTTP request : avg=8508.88ms p(95)=20898.67ms fails=, finish reason: stop=481 truncated=69
- Prompt processing (pp): avg=98.93tk/s p(95)=450.98tk/s
- Token generation (tg): avg=34.62tk/s p(95)=48.4tk/s
- ggml-org/models/phi-2/ggml-model-q4_0.gguf parallel=8 ctx-size=16384 ngl=33 batch-size=2048 ubatch-size=256 pp=1024 pp+tg=2048 branch=emplace-cpp11 commit=c08d69f9245d00a9b7c863dc312096dcfcca6670
More
---
config:
    xyChart:
        titleFontSize: 12
        width: 900
        height: 600
    themeVariables:
        xyChart:
            titleColor: "#000000"
---
xychart-beta
title "llama.cpp bench-server-baseline on Standard_NC4as_T4_v3
duration=10m 550 iterations"
y-axis "llamacpp:prompt_tokens_seconds"
x-axis "llamacpp:prompt_tokens_seconds" 1715654350 --> 1715654982
line [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 377.95, 377.95, 377.95, 377.95, 377.95, 935.26, 935.26, 935.26, 935.26, 935.26, 950.18, 950.18, 950.18, 950.18, 950.18, 942.57, 942.57, 942.57, 942.57, 942.57, 958.73, 958.73, 958.73, 958.73, 958.73, 990.24, 990.24, 990.24, 990.24, 990.24, 977.58, 977.58, 977.58, 977.58, 977.58, 966.32, 966.32, 966.32, 966.32, 966.32, 961.56, 961.56, 961.56, 961.56, 961.56, 966.56, 966.56, 966.56, 966.56, 966.56, 957.54, 957.54, 957.54, 957.54, 957.54, 976.34, 976.34, 976.34, 976.34, 976.34, 998.64, 998.64, 998.64, 998.64, 998.64, 989.63, 989.63, 989.63, 989.63, 989.63, 1012.2, 1012.2, 1012.2, 1012.2, 1012.2, 1010.21, 1010.21, 1010.21, 1010.21, 1010.21, 1006.78, 1006.78, 1006.78, 1006.78, 1006.78, 1007.85, 1007.85, 1007.85, 1007.85, 1007.85, 1008.34, 1008.34, 1008.34, 1008.34, 1008.34, 1001.61, 1001.61, 1001.61, 1001.61, 1001.61, 996.36, 996.36, 996.36, 996.36, 996.36, 998.25, 998.25, 998.25, 998.25, 998.25, 997.09, 997.09, 997.09, 997.09, 997.09, 1001.19, 1001.19, 1001.19, 1001.19, 1001.19, 997.57, 997.57, 997.57, 997.57, 997.57, 992.96, 992.96, 992.96, 992.96, 992.96, 1002.88, 1002.88, 1002.88, 1002.88, 1002.88, 996.34, 996.34, 996.34, 996.34, 996.34, 988.25, 988.25, 988.25, 988.25, 988.25, 986.01, 986.01, 986.01, 986.01, 986.01, 987.67, 987.67, 987.67, 987.67, 987.67, 984.0, 984.0, 984.0, 984.0, 984.0, 980.7, 980.7, 980.7, 980.7, 980.7, 980.78, 980.78, 980.78, 980.78, 980.78, 988.85, 988.85, 988.85, 988.85, 988.85, 993.49, 993.49, 993.49, 993.49, 993.49, 996.07, 996.07, 996.07, 996.07, 996.07, 990.94, 990.94, 990.94, 990.94, 990.94, 989.13, 989.13, 989.13, 989.13, 989.13, 989.82, 989.82, 989.82, 989.82, 989.82, 985.57, 985.57, 985.57, 985.57, 985.57, 990.74, 990.74, 990.74, 990.74, 990.74, 985.79, 985.79, 985.79, 985.79, 985.79, 966.61, 966.61, 966.61, 966.61, 966.61, 963.75, 963.75, 963.75, 963.75, 963.75, 959.76, 959.76, 959.76, 959.76, 959.76, 952.62, 952.62, 952.62, 952.62, 952.62, 950.82, 950.82, 950.82, 950.82, 
950.82, 954.47, 954.47, 954.47, 954.47, 954.47, 949.34, 949.34, 949.34, 949.34, 949.34, 944.24, 944.24, 944.24, 944.24, 944.24, 946.92, 946.92, 946.92, 946.92, 946.92, 945.36, 945.36, 945.36, 945.36, 945.36, 950.15, 950.15, 950.15, 950.15, 950.15, 948.35, 948.35, 948.35, 948.35, 948.35, 948.11, 948.11, 948.11, 948.11, 948.11, 948.54, 948.54, 948.54, 948.54, 948.54, 948.81, 948.81, 948.81, 948.81, 948.81, 949.26, 949.26, 949.26, 949.26, 949.26, 950.63, 950.63, 950.63, 950.63, 950.63, 950.38, 950.38, 950.38, 950.38, 950.38, 950.38, 950.38]
More
---
config:
    xyChart:
        titleFontSize: 12
        width: 900
        height: 600
    themeVariables:
        xyChart:
            titleColor: "#000000"
---
xychart-beta
title "llama.cpp bench-server-baseline on Standard_NC4as_T4_v3
duration=10m 550 iterations"
y-axis "llamacpp:predicted_tokens_seconds"
x-axis "llamacpp:predicted_tokens_seconds" 1715654350 --> 1715654982
line [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 38.9, 38.9, 38.9, 38.9, 38.9, 27.98, 27.98, 27.98, 27.98, 27.98, 26.91, 26.91, 26.91, 26.91, 26.91, 28.2, 28.2, 28.2, 28.2, 28.2, 29.52, 29.52, 29.52, 29.52, 29.52, 30.42, 30.42, 30.42, 30.42, 30.42, 31.57, 31.57, 31.57, 31.57, 31.57, 32.56, 32.56, 32.56, 32.56, 32.56, 32.98, 32.98, 32.98, 32.98, 32.98, 33.46, 33.46, 33.46, 33.46, 33.46, 33.65, 33.65, 33.65, 33.65, 33.65, 33.53, 33.53, 33.53, 33.53, 33.53, 32.91, 32.91, 32.91, 32.91, 32.91, 31.98, 31.98, 31.98, 31.98, 31.98, 31.83, 31.83, 31.83, 31.83, 31.83, 31.89, 31.89, 31.89, 31.89, 31.89, 32.01, 32.01, 32.01, 32.01, 32.01, 31.77, 31.77, 31.77, 31.77, 31.77, 31.52, 31.52, 31.52, 31.52, 31.52, 31.27, 31.27, 31.27, 31.27, 31.27, 31.2, 31.2, 31.2, 31.2, 31.2, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.34, 31.48, 31.48, 31.48, 31.48, 31.48, 31.2, 31.2, 31.2, 31.2, 31.2, 30.86, 30.86, 30.86, 30.86, 30.86, 31.01, 31.01, 31.01, 31.01, 31.01, 31.18, 31.18, 31.18, 31.18, 31.18, 31.33, 31.33, 31.33, 31.33, 31.33, 31.42, 31.42, 31.42, 31.42, 31.42, 31.49, 31.49, 31.49, 31.49, 31.49, 31.47, 31.47, 31.47, 31.47, 31.47, 31.31, 31.31, 31.31, 31.31, 31.31, 31.25, 31.25, 31.25, 31.25, 31.25, 31.05, 31.05, 31.05, 31.05, 31.05, 31.21, 31.21, 31.21, 31.21, 31.21, 31.29, 31.29, 31.29, 31.29, 31.29, 31.43, 31.43, 31.43, 31.43, 31.43, 31.53, 31.53, 31.53, 31.53, 31.53, 31.47, 31.47, 31.47, 31.47, 31.47, 31.08, 31.08, 31.08, 31.08, 31.08, 30.51, 30.51, 30.51, 30.51, 30.51, 29.89, 29.89, 29.89, 29.89, 29.89, 29.7, 29.7, 29.7, 29.7, 29.7, 29.77, 29.77, 29.77, 29.77, 29.77, 29.89, 29.89, 29.89, 29.89, 29.89, 29.98, 29.98, 29.98, 29.98, 29.98, 30.13, 30.13, 30.13, 30.13, 30.13, 30.08, 30.08, 30.08, 30.08, 30.08, 29.98, 29.98, 29.98, 29.98, 29.98, 29.9, 29.9, 29.9, 29.9, 29.9, 29.89, 29.89, 29.89, 29.89, 29.89, 30.03, 30.03, 30.03, 30.03, 30.03, 30.21, 30.21, 30.21, 
30.21, 30.21, 30.31, 30.31, 30.31, 30.31, 30.31, 30.38, 30.38, 30.38, 30.38, 30.38, 30.41, 30.41, 30.41, 30.41, 30.41, 30.39, 30.39, 30.39, 30.39, 30.39, 30.39, 30.39]
Details
More
---
config:
    xyChart:
        titleFontSize: 12
        width: 900
        height: 600
    themeVariables:
        xyChart:
            titleColor: "#000000"
---
xychart-beta
title "llama.cpp bench-server-baseline on Standard_NC4as_T4_v3
duration=10m 550 iterations"
y-axis "llamacpp:kv_cache_usage_ratio"
x-axis "llamacpp:kv_cache_usage_ratio" 1715654350 --> 1715654982
line [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07, 0.07, 0.07, 0.07, 0.07, 0.35, 0.35, 0.35, 0.35, 0.35, 0.26, 0.26, 0.26, 0.26, 0.26, 0.13, 0.13, 0.13, 0.13, 0.13, 0.17, 0.17, 0.17, 0.17, 0.17, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.11, 0.11, 0.11, 0.11, 0.11, 0.16, 0.16, 0.16, 0.16, 0.16, 0.18, 0.18, 0.18, 0.18, 0.18, 0.12, 0.12, 0.12, 0.12, 0.12, 0.14, 0.14, 0.14, 0.14, 0.14, 0.28, 0.28, 0.28, 0.28, 0.28, 0.18, 0.18, 0.18, 0.18, 0.18, 0.24, 0.24, 0.24, 0.24, 0.24, 0.14, 0.14, 0.14, 0.14, 0.14, 0.18, 0.18, 0.18, 0.18, 0.18, 0.2, 0.2, 0.2, 0.2, 0.2, 0.25, 0.25, 0.25, 0.25, 0.25, 0.3, 0.3, 0.3, 0.3, 0.3, 0.14, 0.14, 0.14, 0.14, 0.14, 0.16, 0.16, 0.16, 0.16, 0.16, 0.18, 0.18, 0.18, 0.18, 0.18, 0.31, 0.31, 0.31, 0.31, 0.31, 0.13, 0.13, 0.13, 0.13, 0.13, 0.12, 0.12, 0.12, 0.12, 0.12, 0.33, 0.33, 0.33, 0.33, 0.33, 0.29, 0.29, 0.29, 0.29, 0.29, 0.24, 0.24, 0.24, 0.24, 0.24, 0.13, 0.13, 0.13, 0.13, 0.13, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.2, 0.2, 0.2, 0.2, 0.2, 0.19, 0.19, 0.19, 0.19, 0.19, 0.33, 0.33, 0.33, 0.33, 0.33, 0.16, 0.16, 0.16, 0.16, 0.16, 0.18, 0.18, 0.18, 0.18, 0.18, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.34, 0.34, 0.34, 0.34, 0.34, 0.46, 0.46, 0.46, 0.46, 0.46, 0.49, 0.49, 0.49, 0.49, 0.49, 0.41, 0.41, 0.41, 0.41, 0.41, 0.39, 0.39, 0.39, 0.39, 0.39, 0.17, 0.17, 0.17, 0.17, 0.17, 0.09, 0.09, 0.09, 0.09, 0.09, 0.24, 0.24, 0.24, 0.24, 0.24, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.28, 0.28, 0.28, 0.28, 0.28, 0.24, 0.24, 0.24, 0.24, 0.24, 0.29, 0.29, 0.29, 0.29, 0.29, 0.11, 0.11, 0.11, 0.11, 0.11, 0.12, 0.12, 0.12, 0.12, 0.12, 0.1, 0.1, 0.1, 0.1, 0.1, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.19, 0.26, 0.26]
More
---
config:
    xyChart:
        titleFontSize: 12
        width: 900
        height: 600
    themeVariables:
        xyChart:
            titleColor: "#000000"
---
xychart-beta
title "llama.cpp bench-server-baseline on Standard_NC4as_T4_v3
duration=10m 550 iterations"
y-axis "llamacpp:requests_processing"
x-axis "llamacpp:requests_processing" 1715654350 --> 1715654982
line [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.0, 8.0, 8.0, 8.0, 8.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 3.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 7.0, 7.0, 7.0, 7.0, 7.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 7.0, 7.0, 7.0, 7.0, 7.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, 3.0, 3.0, 5.0, 5.0, 5.0, 5.0, 5.0, 7.0, 7.0, 7.0, 7.0, 7.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 1.0, 1.0, 1.0, 1.0, 1.0, 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 3.0, 3.0, 3.0, 3.0, 3.0, 6.0, 6.0, 6.0, 6.0, 6.0, 7.0, 7.0, 7.0, 7.0, 7.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0, 5.0, 5.0, 5.0, 5.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0, 3.0, 3.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 7.0, 7.0, 7.0, 7.0, 7.0, 6.0, 6.0, 6.0, 6.0, 6.0, 1.0, 1.0]