runtime
runtime copied to clipboard
Remove JitGenericHandleCache
- It was only used for overflow scenarios from the generic dictionary (which don't happen), virtual resolution scenarios for creating delegates, and a few other rare R2R scenarios
- Replace the virtual resolution scenarios with a cache of the affected data in managed code, and move the helpers to managed
- Just remove the pointless checks from within the various normal generic lookup paths
- Use the GenericCache type which was previously only used on NativeAOT for Generic Virtual method resolution paths.
Tagging subscribers to this area: @mangod9 See info in area-owners.md if you want to be subscribed.
@EgorBot -intel
using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;
namespace VirtualDispatchChecks
{
class _1
{
public virtual string Method() => "A";
}
class _2 : _1
{
public override string Method() => "B";
}
class _3 : _1
{
public override string Method() => "C";
}
class _4 : _1
{
public override string Method() => "D";
}
class _5 : _1
{
public override string Method() => "E";
}
class _6 : _1
{
public override string Method() => "F";
}
class _7 : _1
{
public override string Method() => "F";
}
class _8 : _1
{
public override string Method() => "F";
}
class _9 : _1
{
public override string Method() => "F";
}
class _10 : _1
{
public override string Method() => "F";
}
public class ResolveVirtualRepeatedly
{
static _1 _1 = new _1();
static _1 _2 = new _2();
static _1 _3 = new _3();
static _1 _4 = new _4();
static _1 _5 = new _5();
static _1 _6 = new _6();
static _1 _7 = new _7();
static _1 _8 = new _8();
static _1 _9 = new _9();
static _1 _10 = new _10();
[MethodImpl(MethodImplOptions.NoInlining)]
static _1 GetA(int index)
{
switch (index % 10)
{
case 0:
return _10;
case 1: return _1;
case 2: return _2;
case 3: return _3;
case 4: return _4;
case 5: return _5;
case 6: return _6;
case 7: return _7;
case 8: return _8;
case 9: return _9;
}
return null!;
}
[MethodImpl(MethodImplOptions.NoInlining)]
static Func<string> GetAction(int index)
{
return GetA(index).Method;
}
[Benchmark]
public void Get1Virtual()
{
GetAction(0);
}
[Benchmark]
public void Get10Virtual()
{
GetAction(0);
GetAction(1);
GetAction(2);
GetAction(3);
GetAction(4);
GetAction(5);
GetAction(6);
GetAction(7);
GetAction(8);
GetAction(9);
}
}
}
@davidwrighton I have a small limitation in my bot (I plan to fix it), but currently the way it works - it clones main, build corerun, then it applies PR patch, for that it just does wget https://github.com/dotnet/runtime/pull/106843.patch (adds .patch to PR URL basically) which may fail for PRs with large changes/commits (typically, squashing commits via force push helps) -- I'll take a look today to use a better approach to build diff (to clone actual branch).
@EgorBot -intel
using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;
namespace VirtualDispatchChecks
{
class _1
{
public virtual string Method() => "A";
}
class _2 : _1
{
public override string Method() => "B";
}
class _3 : _1
{
public override string Method() => "C";
}
class _4 : _1
{
public override string Method() => "D";
}
class _5 : _1
{
public override string Method() => "E";
}
class _6 : _1
{
public override string Method() => "F";
}
class _7 : _1
{
public override string Method() => "F";
}
class _8 : _1
{
public override string Method() => "F";
}
class _9 : _1
{
public override string Method() => "F";
}
class _10 : _1
{
public override string Method() => "F";
}
public class ResolveVirtualRepeatedly
{
static _1 _1 = new _1();
static _1 _2 = new _2();
static _1 _3 = new _3();
static _1 _4 = new _4();
static _1 _5 = new _5();
static _1 _6 = new _6();
static _1 _7 = new _7();
static _1 _8 = new _8();
static _1 _9 = new _9();
static _1 _10 = new _10();
[MethodImpl(MethodImplOptions.NoInlining)]
static _1 GetA(int index)
{
switch (index % 10)
{
case 0:
return _10;
case 1: return _1;
case 2: return _2;
case 3: return _3;
case 4: return _4;
case 5: return _5;
case 6: return _6;
case 7: return _7;
case 8: return _8;
case 9: return _9;
}
return null!;
}
[MethodImpl(MethodImplOptions.NoInlining)]
static Func<string> GetAction(int index)
{
return GetA(index).Method;
}
[Benchmark]
public void Get1Virtual()
{
GetAction(0);
}
[Benchmark]
public void Get10Virtual()
{
GetAction(0);
GetAction(1);
GetAction(2);
GetAction(3);
GetAction(4);
GetAction(5);
GetAction(6);
GetAction(7);
GetAction(8);
GetAction(9);
}
}
}
Benchmark results on Intel
BenchmarkDotNet v0.14.0, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Intel Xeon Platinum 8370C CPU 2.80GHz, 1 CPU, 8 logical and 4 physical cores
Job-IAJWTB : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Job-ELTAZC : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
RatioSD=0.03
| Method | Toolchain | Mean | Error | Ratio |
|---|---|---|---|---|
| Get1Virtual | Main | 13.88 ns | 0.278 ns | 1.00 |
| Get1Virtual | PR | 14.25 ns | 0.319 ns | 1.03 |
| Get10Virtual | Main | 156.04 ns | 3.011 ns | 1.00 |
| Get10Virtual | PR | 167.20 ns | 1.656 ns | 1.07 |
@EgorBot -intel -arm64
using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;
namespace VirtualDispatchChecks
{
class _1
{
public virtual string Method() => "A";
}
class _2 : _1
{
public override string Method() => "B";
}
class _3 : _1
{
public override string Method() => "C";
}
class _4 : _1
{
public override string Method() => "D";
}
class _5 : _1
{
public override string Method() => "E";
}
class _6 : _1
{
public override string Method() => "F";
}
class _7 : _1
{
public override string Method() => "F";
}
class _8 : _1
{
public override string Method() => "F";
}
class _9 : _1
{
public override string Method() => "F";
}
class _10 : _1
{
public override string Method() => "F";
}
public class ResolveVirtualRepeatedly
{
static _1 _1 = new _1();
static _1 _2 = new _2();
static _1 _3 = new _3();
static _1 _4 = new _4();
static _1 _5 = new _5();
static _1 _6 = new _6();
static _1 _7 = new _7();
static _1 _8 = new _8();
static _1 _9 = new _9();
static _1 _10 = new _10();
[MethodImpl(MethodImplOptions.NoInlining)]
static _1 GetA(int index)
{
switch (index % 10)
{
case 0:
return _10;
case 1: return _1;
case 2: return _2;
case 3: return _3;
case 4: return _4;
case 5: return _5;
case 6: return _6;
case 7: return _7;
case 8: return _8;
case 9: return _9;
}
return null!;
}
[MethodImpl(MethodImplOptions.NoInlining)]
static Func<string> GetAction(int index)
{
return GetA(index).Method;
}
[Benchmark]
public void Get1Virtual()
{
GetAction(0);
}
[Benchmark]
public void Get10Virtual()
{
GetAction(0);
GetAction(1);
GetAction(2);
GetAction(3);
GetAction(4);
GetAction(5);
GetAction(6);
GetAction(7);
GetAction(8);
GetAction(9);
}
}
}
Benchmark results on Intel
BenchmarkDotNet v0.14.0, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Intel Xeon Platinum 8370C CPU 2.80GHz, 1 CPU, 8 logical and 4 physical cores
Job-UAKKKJ : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Job-SUOWDN : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Toolchain | Mean | Error | Ratio |
|---|---|---|---|---|
| Get1Virtual | Main | 16.24 ns | 0.228 ns | 1.00 |
| Get1Virtual | PR | 17.15 ns | 0.188 ns | 1.06 |
| Get10Virtual | Main | 164.94 ns | 3.211 ns | 1.00 |
| Get10Virtual | PR | 179.95 ns | 3.583 ns | 1.09 |
Benchmark results on Arm64
BenchmarkDotNet v0.14.0, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Unknown processor
Job-GBFISH : .NET 10.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD
Job-UCOFWF : .NET 10.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD
| Method | Toolchain | Mean | Error | Ratio |
|---|---|---|---|---|
| Get1Virtual | Main | 20.61 ns | 0.022 ns | 1.00 |
| Get1Virtual | PR | 21.18 ns | 0.149 ns | 1.03 |
| Get10Virtual | Main | 215.70 ns | 0.238 ns | 1.00 |
| Get10Virtual | PR | 308.22 ns | 0.914 ns | 1.43 |
PS: I have fixed the patch applying so commit squashing is no longer needed. Also, you can use -perf to get output of perf for a benchmark (only one [Benchmark] is expected) before/after.
@EgorBot -arm64 -perf
using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;
namespace VirtualDispatchChecks
{
class _1
{
public virtual string Method() => "A";
}
class _2 : _1
{
public override string Method() => "B";
}
class _3 : _1
{
public override string Method() => "C";
}
class _4 : _1
{
public override string Method() => "D";
}
class _5 : _1
{
public override string Method() => "E";
}
class _6 : _1
{
public override string Method() => "F";
}
class _7 : _1
{
public override string Method() => "F";
}
class _8 : _1
{
public override string Method() => "F";
}
class _9 : _1
{
public override string Method() => "F";
}
class _10 : _1
{
public override string Method() => "F";
}
public class ResolveVirtualRepeatedly
{
static _1 _1 = new _1();
static _1 _2 = new _2();
static _1 _3 = new _3();
static _1 _4 = new _4();
static _1 _5 = new _5();
static _1 _6 = new _6();
static _1 _7 = new _7();
static _1 _8 = new _8();
static _1 _9 = new _9();
static _1 _10 = new _10();
[MethodImpl(MethodImplOptions.NoInlining)]
static _1 GetA(int index)
{
switch (index % 10)
{
case 0:
return _10;
case 1: return _1;
case 2: return _2;
case 3: return _3;
case 4: return _4;
case 5: return _5;
case 6: return _6;
case 7: return _7;
case 8: return _8;
case 9: return _9;
}
return null!;
}
[MethodImpl(MethodImplOptions.NoInlining)]
static Func<string> GetAction(int index)
{
return GetA(index).Method;
}
[Benchmark]
public void Get10Virtual()
{
GetAction(0);
GetAction(1);
GetAction(2);
GetAction(3);
GetAction(4);
GetAction(5);
GetAction(6);
GetAction(7);
GetAction(8);
GetAction(9);
}
}
}
Benchmark results on Arm64
BenchmarkDotNet v0.14.0, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Unknown processor
Job-YQNLAB : .NET 10.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD
Job-AMUJHL : .NET 10.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD
| Method | Toolchain | Mean | Error | Ratio |
|---|---|---|---|---|
| Get10Virtual | Main | 229.3 ns | 2.22 ns | 1.00 |
| Get10Virtual | PR | 246.1 ns | 1.62 ns | 1.07 |
Flame graphs: Main vs PR 🔥 Hot asm: Main vs PR Hot functions: Main vs PR
For clean perf results, make sure you have just one [Benchmark] in your app.
An analysis of these results indicates that
- While universally slower with the new code, we are close enough for the performance of this relatively rare scenario.
- Arm64 showed significantly lower performance once, but that appears to be a case of unlucky hash values in the hash function. Since the has function is not meaningfully changed with this PR, this is considered immaterial.
- When running in actual use, this should be slightly faster. Benchmark.NET results are gathered with tiered compilation disabled, and the codegen for the function is slightly better when tiered.
Benchmark.NET results are gathered with tiered compilation disabled, and the codegen for the function is slightly better when tiered.
In the bot it's not turned off, in order to disable it, you can pass --envvars DOTNET_TieredCompilation:0. (it's standard BDN CLI args).
E.g. you can see Tier names in https://gist.github.com/EgorBot/df35cdd0ebee41f43a59d1c44cfa5595 (OptimizedTier1)
@EgorBot -intel -arm64
using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;
namespace VirtualDispatchChecks
{
class _1
{
public virtual string Method() => "A";
}
class _2 : _1
{
public override string Method() => "B";
}
class _3 : _1
{
public override string Method() => "C";
}
class _4 : _1
{
public override string Method() => "D";
}
class _5 : _1
{
public override string Method() => "E";
}
class _6 : _1
{
public override string Method() => "F";
}
class _7 : _1
{
public override string Method() => "F";
}
class _8 : _1
{
public override string Method() => "F";
}
class _9 : _1
{
public override string Method() => "F";
}
class _10 : _1
{
public override string Method() => "F";
}
public class ResolveVirtualRepeatedly
{
static _1 _1 = new _1();
static _1 _2 = new _2();
static _1 _3 = new _3();
static _1 _4 = new _4();
static _1 _5 = new _5();
static _1 _6 = new _6();
static _1 _7 = new _7();
static _1 _8 = new _8();
static _1 _9 = new _9();
static _1 _10 = new _10();
[MethodImpl(MethodImplOptions.NoInlining)]
static _1 GetA(int index)
{
switch (index % 10)
{
case 0:
return _10;
case 1: return _1;
case 2: return _2;
case 3: return _3;
case 4: return _4;
case 5: return _5;
case 6: return _6;
case 7: return _7;
case 8: return _8;
case 9: return _9;
}
return null!;
}
[MethodImpl(MethodImplOptions.NoInlining)]
static Func<string> GetAction(int index)
{
return GetA(index).Method;
}
[Benchmark]
public void Get1Virtual()
{
GetAction(0);
}
[Benchmark]
public void Get10Virtual()
{
GetAction(0);
GetAction(1);
GetAction(2);
GetAction(3);
GetAction(4);
GetAction(5);
GetAction(6);
GetAction(7);
GetAction(8);
GetAction(9);
}
}
}
Benchmark results on Intel
I am not able to reproduce this regression on Dev Box Amd. The PR is ~7% faster compared to baseline on that machine.