runtime
runtime copied to clipboard
Integrate SimdUnicode for AVX-512
Contributes to https://github.com/dotnet/runtime/issues/103781. This covers only AVX-512; other ISAs can be added if/once this is approved/merged.
I did some cleanup, such as replacing some SIMD APIs with cross-platform ones/operators. By the way, I don't believe that ISimdVector can be used here. Also, I removed the initial "skip ASCII data" part since we already have a workhorse for that.
cc @lemire, @Nick-Nuon let me know if you want to change something (including credits in THIRD-PARTY-NOTICES.TXT)
TODO: do some ad-hoc testing, make sure test coverage is good
Tagging subscribers to this area: @dotnet/area-system-text-encoding See info in area-owners.md if you want to be subscribed.
@EgorBot -intel
using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using System.Text.Unicode;
// Entry point: hands the Bench class to BenchmarkDotNet and forwards the command-line arguments.
BenchmarkRunner.Run<Bench>(args: args);
/// <summary>
/// BenchmarkDotNet benchmarks that feed fixed-size prefixes of non-ASCII UTF-8 text
/// to <see cref="Encoding.GetCharCount(byte[])"/> and <see cref="Utf8.IsValid"/>.
/// </summary>
public class Bench
{
    /// <summary>
    /// Produces the inputs for <see cref="GetUtf8Bytes"/>: the first 1000, 500, 250 and 100
    /// bytes of a CJK filler text, each as an independent array.
    /// </summary>
    /// <remarks>
    /// The cut is made at a byte offset, not at a character boundary, so a prefix may end
    /// in the middle of a multi-byte sequence.
    /// </remarks>
    public static IEnumerable<byte[]> GetUtf8BytesData()
    {
        // CJK placeholder text, encoded as UTF-8 by the u8 literal.
        byte[] source = "唐聞球方五保査禁答近確掲著協世好知長。育乗江校上価話戒宏口自森特室堂討。陸迎奔必秋最量注好枚挑周。間父癒曲在近真権幕覧超持樹件芸保展島船点。齢度約治末価埼坂内辞千故資接藤雨約宿県。定戻業担伸立発告敗家響意球禎。呼真局験善体続得新税知群孫大場。変省創与毎容開拡作北経眺間。樹野市現館開分供同南費海。投以画露両装知全茨済力上速田弘変掲材保内。王野嗅結択芸合験覧託委致就近資。励意親者著識連愚戦親能精球信相準大避一。民覧過走最国転開社加砲者度座図。提著学月牟止百県意能宝質約投分記加。中長塚相選暇版経田経問下訟全報府。要事集細両体要特義点必周優載治山集摘。手機掛果題銀料新政庁分堀住画禁信。味表柄読必望著後入協攻末源安 案志検江水口宿言京並属需就一生断導。通崎楽大最放新属健戦維議本金部兜素定市船"u8.ToArray();

        // Largest first, matching the order in which the cases are handed to BenchmarkDotNet.
        foreach (int length in new[] { 1000, 500, 250, 100 })
        {
            yield return source.AsSpan(0, length).ToArray();
        }
    }

    /// <summary>
    /// Measures <see cref="Encoding.GetCharCount(byte[])"/> on <see cref="Encoding.UTF8"/>.
    /// Note that, despite the method name, this counts chars rather than producing bytes.
    /// </summary>
    /// <param name="str">UTF-8 encoded input supplied by <see cref="GetUtf8BytesData"/>.</param>
    /// <returns>The value returned by <c>Encoding.UTF8.GetCharCount</c> for <paramref name="str"/>.</returns>
    [Benchmark]
    [ArgumentsSource(nameof(GetUtf8BytesData))]
    public int GetUtf8Bytes(byte[] str)
    {
        return Encoding.UTF8.GetCharCount(str);
    }

    /// <summary>
    /// Produces the inputs for <see cref="ValidateUtf8"/>: the first 1000, 500, 250 and 100
    /// bytes of a Cyrillic filler text, each as an independent array.
    /// </summary>
    /// <remarks>
    /// The cut is made at a byte offset, not at a character boundary, so a prefix may end
    /// in the middle of a multi-byte sequence.
    /// </remarks>
    public static IEnumerable<byte[]> ValidateUtf8Data()
    {
        // Cyrillic placeholder text (mixed with ASCII spaces and punctuation), encoded as UTF-8.
        byte[] source = "Лорем ипсум долор сит амет, хас тале феугаит ех, мел дицит сонет сцрипта ид? Еррорибус темпорибус адверсариум про те, видит ностер хас не, яуод феугаит цу ест. Но дицунт рецусабо диссентиас цум, оптион евертитур ан вих. Но мел антиопам молестиае, продессет абхорреант витуператорибус ат сит, дицант глориатур персецути при еу. При еяуидем пхаедрум рецусабо ех, не вим ерант вертерем Ехерци семпер те нец. Ид нолуиссе детерруиссет нам, яуо ан адхуц дицит пертинациа, мел тота цлита цомпрехенсам ид? Ид аугуе граецис еффициенди вис, ат анимал фиерент инструцтиор пер, не виде еффициенди при!"u8.ToArray();

        // Largest first, matching the order in which the cases are handed to BenchmarkDotNet.
        foreach (int length in new[] { 1000, 500, 250, 100 })
        {
            yield return source.AsSpan(0, length).ToArray();
        }
    }

    /// <summary>
    /// Measures <see cref="Utf8.IsValid"/> on the supplied bytes.
    /// </summary>
    /// <param name="str">UTF-8 encoded input supplied by <see cref="ValidateUtf8Data"/>.</param>
    /// <returns>The value returned by <c>Utf8.IsValid</c> for <paramref name="str"/>.</returns>
    [Benchmark]
    [ArgumentsSource(nameof(ValidateUtf8Data))]
    public bool ValidateUtf8(byte[] str)
    {
        return Utf8.IsValid(str);
    }
}
Benchmark results on Intel
BenchmarkDotNet v0.13.12, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Intel Xeon Platinum 8370C CPU 2.80GHz, 1 CPU, 16 logical and 8 physical cores
Job-CDTPVO : .NET 9.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Job-AJLHHY : .NET 9.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Toolchain | str | Mean | Error | Ratio |
|---|---|---|---|---|---|
| GetUtf8Bytes | Main | Byte[1000] | 240.52 ns | 0.114 ns | 1.00 |
| GetUtf8Bytes | PR | Byte[1000] | 149.37 ns | 0.055 ns | 0.62 |
| GetUtf8Bytes | Main | Byte[100] | 67.71 ns | 0.475 ns | 0.28 |
| GetUtf8Bytes | PR | Byte[100] | 120.54 ns | 0.410 ns | 0.50 |
| GetUtf8Bytes | Main | Byte[250] | 103.12 ns | 0.448 ns | 0.43 |
| GetUtf8Bytes | PR | Byte[250] | 173.88 ns | 0.251 ns | 0.72 |
| GetUtf8Bytes | Main | Byte[500] | 160.72 ns | 0.367 ns | 0.67 |
| GetUtf8Bytes | PR | Byte[500] | 177.64 ns | 0.348 ns | 0.74 |
| ValidateUtf8 | Main | Byte[1000] | 342.15 ns | 0.037 ns | 1.00 |
| ValidateUtf8 | PR | Byte[1000] | 132.77 ns | 0.044 ns | 0.39 |
| ValidateUtf8 | Main | Byte[100] | 35.08 ns | 0.014 ns | 0.10 |
| ValidateUtf8 | PR | Byte[100] | 75.52 ns | 0.018 ns | 0.22 |
| ValidateUtf8 | Main | Byte[250] | 75.96 ns | 0.070 ns | 0.22 |
| ValidateUtf8 | PR | Byte[250] | 124.09 ns | 0.162 ns | 0.36 |
| ValidateUtf8 | Main | Byte[500] | 169.20 ns | 0.019 ns | 0.49 |
| ValidateUtf8 | PR | Byte[500] | 111.96 ns | 0.022 ns | 0.33 |
@EgorBot -intel -amd
using System.Text;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using System.Text.Unicode;
// Entry point: hands the Bench class to BenchmarkDotNet and forwards the command-line arguments.
BenchmarkRunner.Run<Bench>(args: args);
/// <summary>
/// BenchmarkDotNet benchmarks that feed fixed-size prefixes of non-ASCII UTF-8 text
/// to <see cref="Encoding.GetCharCount(byte[])"/> and <see cref="Utf8.IsValid"/>.
/// </summary>
public class Bench
{
    /// <summary>
    /// Produces the inputs for <see cref="GetUtf8Bytes"/>: the first 1000, 500, 250 and 100
    /// bytes of a CJK filler text, each as an independent array.
    /// </summary>
    /// <remarks>
    /// The cut is made at a byte offset, not at a character boundary, so a prefix may end
    /// in the middle of a multi-byte sequence.
    /// </remarks>
    public static IEnumerable<byte[]> GetUtf8BytesData()
    {
        // CJK placeholder text, encoded as UTF-8 by the u8 literal.
        byte[] source = "唐聞球方五保査禁答近確掲著協世好知長。育乗江校上価話戒宏口自森特室堂討。陸迎奔必秋最量注好枚挑周。間父癒曲在近真権幕覧超持樹件芸保展島船点。齢度約治末価埼坂内辞千故資接藤雨約宿県。定戻業担伸立発告敗家響意球禎。呼真局験善体続得新税知群孫大場。変省創与毎容開拡作北経眺間。樹野市現館開分供同南費海。投以画露両装知全茨済力上速田弘変掲材保内。王野嗅結択芸合験覧託委致就近資。励意親者著識連愚戦親能精球信相準大避一。民覧過走最国転開社加砲者度座図。提著学月牟止百県意能宝質約投分記加。中長塚相選暇版経田経問下訟全報府。要事集細両体要特義点必周優載治山集摘。手機掛果題銀料新政庁分堀住画禁信。味表柄読必望著後入協攻末源安 案志検江水口宿言京並属需就一生断導。通崎楽大最放新属健戦維議本金部兜素定市船"u8.ToArray();

        // Largest first, matching the order in which the cases are handed to BenchmarkDotNet.
        foreach (int length in new[] { 1000, 500, 250, 100 })
        {
            yield return source.AsSpan(0, length).ToArray();
        }
    }

    /// <summary>
    /// Measures <see cref="Encoding.GetCharCount(byte[])"/> on <see cref="Encoding.UTF8"/>.
    /// Note that, despite the method name, this counts chars rather than producing bytes.
    /// </summary>
    /// <param name="str">UTF-8 encoded input supplied by <see cref="GetUtf8BytesData"/>.</param>
    /// <returns>The value returned by <c>Encoding.UTF8.GetCharCount</c> for <paramref name="str"/>.</returns>
    [Benchmark]
    [ArgumentsSource(nameof(GetUtf8BytesData))]
    public int GetUtf8Bytes(byte[] str)
    {
        return Encoding.UTF8.GetCharCount(str);
    }

    /// <summary>
    /// Produces the inputs for <see cref="ValidateUtf8"/>: the first 1000, 500, 250 and 100
    /// bytes of a Cyrillic filler text, each as an independent array.
    /// </summary>
    /// <remarks>
    /// The cut is made at a byte offset, not at a character boundary, so a prefix may end
    /// in the middle of a multi-byte sequence.
    /// </remarks>
    public static IEnumerable<byte[]> ValidateUtf8Data()
    {
        // Cyrillic placeholder text (mixed with ASCII spaces and punctuation), encoded as UTF-8.
        byte[] source = "Лорем ипсум долор сит амет, хас тале феугаит ех, мел дицит сонет сцрипта ид? Еррорибус темпорибус адверсариум про те, видит ностер хас не, яуод феугаит цу ест. Но дицунт рецусабо диссентиас цум, оптион евертитур ан вих. Но мел антиопам молестиае, продессет абхорреант витуператорибус ат сит, дицант глориатур персецути при еу. При еяуидем пхаедрум рецусабо ех, не вим ерант вертерем Ехерци семпер те нец. Ид нолуиссе детерруиссет нам, яуо ан адхуц дицит пертинациа, мел тота цлита цомпрехенсам ид? Ид аугуе граецис еффициенди вис, ат анимал фиерент инструцтиор пер, не виде еффициенди при!"u8.ToArray();

        // Largest first, matching the order in which the cases are handed to BenchmarkDotNet.
        foreach (int length in new[] { 1000, 500, 250, 100 })
        {
            yield return source.AsSpan(0, length).ToArray();
        }
    }

    /// <summary>
    /// Measures <see cref="Utf8.IsValid"/> on the supplied bytes.
    /// </summary>
    /// <param name="str">UTF-8 encoded input supplied by <see cref="ValidateUtf8Data"/>.</param>
    /// <returns>The value returned by <c>Utf8.IsValid</c> for <paramref name="str"/>.</returns>
    [Benchmark]
    [ArgumentsSource(nameof(ValidateUtf8Data))]
    public bool ValidateUtf8(byte[] str)
    {
        return Utf8.IsValid(str);
    }
}
Benchmark results on Intel
BenchmarkDotNet v0.13.12, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Intel Xeon Platinum 8370C CPU 2.80GHz, 1 CPU, 16 logical and 8 physical cores
Job-ARQQWD : .NET 9.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Job-JZIDFW : .NET 9.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Toolchain | str | Mean | Error | Ratio |
|---|---|---|---|---|---|
| GetUtf8Bytes | Main | Byte[1000] | 238.02 ns | 1.036 ns | 1.00 |
| GetUtf8Bytes | PR | Byte[1000] | 117.88 ns | 0.013 ns | 0.50 |
| GetUtf8Bytes | Main | Byte[100] | 69.25 ns | 0.287 ns | 0.29 |
| GetUtf8Bytes | PR | Byte[100] | 61.40 ns | 0.437 ns | 0.26 |
| GetUtf8Bytes | Main | Byte[250] | 108.00 ns | 0.512 ns | 0.45 |
| GetUtf8Bytes | PR | Byte[250] | 121.58 ns | 0.433 ns | 0.51 |
| GetUtf8Bytes | Main | Byte[500] | 160.09 ns | 0.493 ns | 0.67 |
| GetUtf8Bytes | PR | Byte[500] | 125.87 ns | 0.509 ns | 0.53 |
| ValidateUtf8 | Main | Byte[1000] | 342.05 ns | 0.111 ns | 1.00 |
| ValidateUtf8 | PR | Byte[1000] | 114.51 ns | 0.011 ns | 0.33 |
| ValidateUtf8 | Main | Byte[100] | 35.04 ns | 0.007 ns | 0.10 |
| ValidateUtf8 | PR | Byte[100] | 23.51 ns | 0.008 ns | 0.07 |
| ValidateUtf8 | Main | Byte[250] | 76.36 ns | 0.105 ns | 0.22 |
| ValidateUtf8 | PR | Byte[250] | 73.53 ns | 0.051 ns | 0.21 |
| ValidateUtf8 | Main | Byte[500] | 169.19 ns | 0.008 ns | 0.49 |
| ValidateUtf8 | PR | Byte[500] | 84.19 ns | 0.015 ns | 0.25 |
Benchmark results on Amd
BenchmarkDotNet v0.13.12, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
AMD EPYC 7763, 1 CPU, 16 logical and 8 physical cores
Job-GWHBIB : .NET 9.0.0 (42.42.42.42424), X64 RyuJIT AVX2
Job-LYEYRH : .NET 9.0.0 (42.42.42.42424), X64 RyuJIT AVX2
| Method | Toolchain | str | Mean | Error | Ratio |
|---|---|---|---|---|---|
| GetUtf8Bytes | Main | Byte[1000] | 211.16 ns | 0.075 ns | 1.00 |
| GetUtf8Bytes | PR | Byte[1000] | 100.20 ns | 0.025 ns | 0.47 |
| GetUtf8Bytes | Main | Byte[100] | 67.82 ns | 0.137 ns | 0.32 |
| GetUtf8Bytes | PR | Byte[100] | 61.96 ns | 0.298 ns | 0.29 |
| GetUtf8Bytes | Main | Byte[250] | 97.97 ns | 0.077 ns | 0.46 |
| GetUtf8Bytes | PR | Byte[250] | 109.18 ns | 0.098 ns | 0.52 |
| GetUtf8Bytes | Main | Byte[500] | 150.22 ns | 0.184 ns | 0.71 |
| GetUtf8Bytes | PR | Byte[500] | 113.65 ns | 0.109 ns | 0.54 |
| ValidateUtf8 | Main | Byte[1000] | 331.35 ns | 0.245 ns | 1.00 |
| ValidateUtf8 | PR | Byte[1000] | 95.34 ns | 0.021 ns | 0.29 |
| ValidateUtf8 | Main | Byte[100] | 36.10 ns | 0.032 ns | 0.11 |
| ValidateUtf8 | PR | Byte[100] | 22.56 ns | 0.009 ns | 0.07 |
| ValidateUtf8 | Main | Byte[250] | 77.43 ns | 0.056 ns | 0.23 |
| ValidateUtf8 | PR | Byte[250] | 60.51 ns | 0.034 ns | 0.18 |
| ValidateUtf8 | Main | Byte[500] | 167.75 ns | 0.581 ns | 0.51 |
| ValidateUtf8 | PR | Byte[500] | 73.30 ns | 0.022 ns | 0.22 |
Question regarding the PR title: it seems to be using AVX2 (256-bit), not AVX-512.
@huoyaoyuan I think that's a good point. This does seem to be AVX2 (which is not a bad idea).