runtime Remove JitGenericHandleCache

trafficstars

It was only used for overflow scenarios from the generic dictionary (which don't happen), virtual resolution scenarios for creating delegates, and a few other rare R2R scenarios
Replace the virtual resolution scenarios with a cache of the affected data in managed code, and move the helpers to managed
Just remove the pointless checks from within the various normal generic lookup paths
Use the GenericCache type which was previously only used on NativeAOT for Generic Virtual method resolution paths.

Aug 22 '24 22:08 davidwrighton

Tagging subscribers to this area: @mangod9 See info in area-owners.md if you want to be subscribed.

Aug 22 '24 22:08 dotnet-policy-service[bot]

@EgorBot -intel

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;

namespace VirtualDispatchChecks
{

    class _1
    {
        public virtual string Method() => "A";
    }

    class _2 : _1
    {
        public override string Method() => "B";
    }

    class _3 : _1
    {
        public override string Method() => "C";
    }

    class _4 : _1
    {
        public override string Method() => "D";
    }

    class _5 : _1
    {
        public override string Method() => "E";
    }
    class _6 : _1
    {
        public override string Method() => "F";
    }
    class _7 : _1
    {
        public override string Method() => "F";
    }

    class _8 : _1
    {
        public override string Method() => "F";
    }

    class _9 : _1
    {
        public override string Method() => "F";
    }

    class _10 : _1
    {
        public override string Method() => "F";
    }

    public class ResolveVirtualRepeatedly
    {
        static _1 _1 = new _1();
        static _1 _2 = new _2();
        static _1 _3 = new _3();
        static _1 _4 = new _4();
        static _1 _5 = new _5();
        static _1 _6 = new _6();
        static _1 _7 = new _7();
        static _1 _8 = new _8();
        static _1 _9 = new _9();
        static _1 _10 = new _10();
        [MethodImpl(MethodImplOptions.NoInlining)]
        static _1 GetA(int index)
        {
            switch (index % 10)
            {
                case 0:
                    return _10;
                case 1: return _1;
                case 2: return _2;
                case 3: return _3;
                case 4: return _4;
                case 5: return _5;
                case 6: return _6;
                case 7: return _7;
                case 8: return _8;
                case 9: return _9;
            }
            return null!;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static Func<string> GetAction(int index)
        {
            return GetA(index).Method;
        }

        [Benchmark]
        public void Get1Virtual()
        {
            GetAction(0);
        }

        [Benchmark]
        public void Get10Virtual()
        {
            GetAction(0);
            GetAction(1);
            GetAction(2);
            GetAction(3);
            GetAction(4);
            GetAction(5);
            GetAction(6);
            GetAction(7);
            GetAction(8);
            GetAction(9);
        }
    }
}

Aug 26 '24 20:08 davidwrighton

❌ Failed on Intel: PR patch apply failed (try to squash commits in your PR?)

Build log

Aug 26 '24 20:08 EgorBot

@davidwrighton I have a small limitation in my bot (I plan to fix it), but currently the way it works - it clones main, build corerun, then it applies PR patch, for that it just does wget https://github.com/dotnet/runtime/pull/106843.patch (adds .patch to PR URL basically) which may fail for PRs with large changes/commits (typically, squashing commits via force push helps) -- I'll take a look today to use a better approach to build diff (to clone actual branch).

Aug 26 '24 20:08 EgorBo

@EgorBot -intel

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;

namespace VirtualDispatchChecks
{

    class _1
    {
        public virtual string Method() => "A";
    }

    class _2 : _1
    {
        public override string Method() => "B";
    }

    class _3 : _1
    {
        public override string Method() => "C";
    }

    class _4 : _1
    {
        public override string Method() => "D";
    }

    class _5 : _1
    {
        public override string Method() => "E";
    }
    class _6 : _1
    {
        public override string Method() => "F";
    }
    class _7 : _1
    {
        public override string Method() => "F";
    }

    class _8 : _1
    {
        public override string Method() => "F";
    }

    class _9 : _1
    {
        public override string Method() => "F";
    }

    class _10 : _1
    {
        public override string Method() => "F";
    }

    public class ResolveVirtualRepeatedly
    {
        static _1 _1 = new _1();
        static _1 _2 = new _2();
        static _1 _3 = new _3();
        static _1 _4 = new _4();
        static _1 _5 = new _5();
        static _1 _6 = new _6();
        static _1 _7 = new _7();
        static _1 _8 = new _8();
        static _1 _9 = new _9();
        static _1 _10 = new _10();
        [MethodImpl(MethodImplOptions.NoInlining)]
        static _1 GetA(int index)
        {
            switch (index % 10)
            {
                case 0:
                    return _10;
                case 1: return _1;
                case 2: return _2;
                case 3: return _3;
                case 4: return _4;
                case 5: return _5;
                case 6: return _6;
                case 7: return _7;
                case 8: return _8;
                case 9: return _9;
            }
            return null!;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static Func<string> GetAction(int index)
        {
            return GetA(index).Method;
        }

        [Benchmark]
        public void Get1Virtual()
        {
            GetAction(0);
        }

        [Benchmark]
        public void Get10Virtual()
        {
            GetAction(0);
            GetAction(1);
            GetAction(2);
            GetAction(3);
            GetAction(4);
            GetAction(5);
            GetAction(6);
            GetAction(7);
            GetAction(8);
            GetAction(9);
        }
    }
}

Aug 26 '24 21:08 davidwrighton

Benchmark results on Intel

BenchmarkDotNet v0.14.0, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Intel Xeon Platinum 8370C CPU 2.80GHz, 1 CPU, 8 logical and 4 physical cores
  Job-IAJWTB : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
  Job-ELTAZC : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
RatioSD=0.03

Method	Toolchain	Mean	Error	Ratio
Get1Virtual	Main	13.88 ns	0.278 ns	1.00
Get1Virtual	PR	14.25 ns	0.319 ns	1.03

Get10Virtual	Main	156.04 ns	3.011 ns	1.00
Get10Virtual	PR	167.20 ns	1.656 ns	1.07

BDN_Artifacts.zip

Aug 26 '24 21:08 EgorBot

@EgorBot -intel -arm64

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;

namespace VirtualDispatchChecks
{

    class _1
    {
        public virtual string Method() => "A";
    }

    class _2 : _1
    {
        public override string Method() => "B";
    }

    class _3 : _1
    {
        public override string Method() => "C";
    }

    class _4 : _1
    {
        public override string Method() => "D";
    }

    class _5 : _1
    {
        public override string Method() => "E";
    }
    class _6 : _1
    {
        public override string Method() => "F";
    }
    class _7 : _1
    {
        public override string Method() => "F";
    }

    class _8 : _1
    {
        public override string Method() => "F";
    }

    class _9 : _1
    {
        public override string Method() => "F";
    }

    class _10 : _1
    {
        public override string Method() => "F";
    }

    public class ResolveVirtualRepeatedly
    {
        static _1 _1 = new _1();
        static _1 _2 = new _2();
        static _1 _3 = new _3();
        static _1 _4 = new _4();
        static _1 _5 = new _5();
        static _1 _6 = new _6();
        static _1 _7 = new _7();
        static _1 _8 = new _8();
        static _1 _9 = new _9();
        static _1 _10 = new _10();
        [MethodImpl(MethodImplOptions.NoInlining)]
        static _1 GetA(int index)
        {
            switch (index % 10)
            {
                case 0:
                    return _10;
                case 1: return _1;
                case 2: return _2;
                case 3: return _3;
                case 4: return _4;
                case 5: return _5;
                case 6: return _6;
                case 7: return _7;
                case 8: return _8;
                case 9: return _9;
            }
            return null!;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static Func<string> GetAction(int index)
        {
            return GetA(index).Method;
        }

        [Benchmark]
        public void Get1Virtual()
        {
            GetAction(0);
        }

        [Benchmark]
        public void Get10Virtual()
        {
            GetAction(0);
            GetAction(1);
            GetAction(2);
            GetAction(3);
            GetAction(4);
            GetAction(5);
            GetAction(6);
            GetAction(7);
            GetAction(8);
            GetAction(9);
        }
    }
}

Aug 27 '24 18:08 davidwrighton

Benchmark results on Intel

BenchmarkDotNet v0.14.0, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Intel Xeon Platinum 8370C CPU 2.80GHz, 1 CPU, 8 logical and 4 physical cores
  Job-UAKKKJ : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
  Job-SUOWDN : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI

Method	Toolchain	Mean	Error	Ratio
Get1Virtual	Main	16.24 ns	0.228 ns	1.00
Get1Virtual	PR	17.15 ns	0.188 ns	1.06

Get10Virtual	Main	164.94 ns	3.211 ns	1.00
Get10Virtual	PR	179.95 ns	3.583 ns	1.09

BDN_Artifacts.zip

Aug 27 '24 19:08 EgorBot

Benchmark results on Arm64

BenchmarkDotNet v0.14.0, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Unknown processor
  Job-GBFISH : .NET 10.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD
  Job-UCOFWF : .NET 10.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD

Method	Toolchain	Mean	Error	Ratio
Get1Virtual	Main	20.61 ns	0.022 ns	1.00
Get1Virtual	PR	21.18 ns	0.149 ns	1.03

Get10Virtual	Main	215.70 ns	0.238 ns	1.00
Get10Virtual	PR	308.22 ns	0.914 ns	1.43

BDN_Artifacts.zip

Aug 27 '24 19:08 EgorBot

PS: I have fixed the patch applying so commit squashing is no longer needed. Also, you can use -perf to get output of perf for a benchmark (only one [Benchmark] is expected) before/after.

@EgorBot -arm64 -perf

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;

namespace VirtualDispatchChecks
{

    class _1
    {
        public virtual string Method() => "A";
    }

    class _2 : _1
    {
        public override string Method() => "B";
    }

    class _3 : _1
    {
        public override string Method() => "C";
    }

    class _4 : _1
    {
        public override string Method() => "D";
    }

    class _5 : _1
    {
        public override string Method() => "E";
    }
    class _6 : _1
    {
        public override string Method() => "F";
    }
    class _7 : _1
    {
        public override string Method() => "F";
    }

    class _8 : _1
    {
        public override string Method() => "F";
    }

    class _9 : _1
    {
        public override string Method() => "F";
    }

    class _10 : _1
    {
        public override string Method() => "F";
    }

    public class ResolveVirtualRepeatedly
    {
        static _1 _1 = new _1();
        static _1 _2 = new _2();
        static _1 _3 = new _3();
        static _1 _4 = new _4();
        static _1 _5 = new _5();
        static _1 _6 = new _6();
        static _1 _7 = new _7();
        static _1 _8 = new _8();
        static _1 _9 = new _9();
        static _1 _10 = new _10();
        [MethodImpl(MethodImplOptions.NoInlining)]
        static _1 GetA(int index)
        {
            switch (index % 10)
            {
                case 0:
                    return _10;
                case 1: return _1;
                case 2: return _2;
                case 3: return _3;
                case 4: return _4;
                case 5: return _5;
                case 6: return _6;
                case 7: return _7;
                case 8: return _8;
                case 9: return _9;
            }
            return null!;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static Func<string> GetAction(int index)
        {
            return GetA(index).Method;
        }

        [Benchmark]
        public void Get10Virtual()
        {
            GetAction(0);
            GetAction(1);
            GetAction(2);
            GetAction(3);
            GetAction(4);
            GetAction(5);
            GetAction(6);
            GetAction(7);
            GetAction(8);
            GetAction(9);
        }
    }
}

Aug 27 '24 19:08 EgorBo

Benchmark results on Arm64

BenchmarkDotNet v0.14.0, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Unknown processor
  Job-YQNLAB : .NET 10.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD
  Job-AMUJHL : .NET 10.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD

Method	Toolchain	Mean	Error	Ratio
Get10Virtual	Main	229.3 ns	2.22 ns	1.00
Get10Virtual	PR	246.1 ns	1.62 ns	1.07

BDN_Artifacts.zip

Flame graphs: Main vs PR 🔥 Hot asm: Main vs PR Hot functions: Main vs PR

For clean perf results, make sure you have just one [Benchmark] in your app.

Aug 27 '24 19:08 EgorBot

An analysis of these results indicates that

While universally slower with the new code, we are close enough for the performance of this relatively rare scenario.
Arm64 showed significantly lower performance once, but that appears to be a case of unlucky hash values in the hash function. Since the has function is not meaningfully changed with this PR, this is considered immaterial.
When running in actual use, this should be slightly faster. Benchmark.NET results are gathered with tiered compilation disabled, and the codegen for the function is slightly better when tiered.

Aug 27 '24 22:08 davidwrighton

Benchmark.NET results are gathered with tiered compilation disabled, and the codegen for the function is slightly better when tiered.

In the bot it's not turned off, in order to disable it, you can pass --envvars DOTNET_TieredCompilation:0. (it's standard BDN CLI args). E.g. you can see Tier names in https://gist.github.com/EgorBot/df35cdd0ebee41f43a59d1c44cfa5595 (OptimizedTier1)

Aug 27 '24 22:08 EgorBo

@EgorBot -intel -arm64

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;

namespace VirtualDispatchChecks
{

    class _1
    {
        public virtual string Method() => "A";
    }

    class _2 : _1
    {
        public override string Method() => "B";
    }

    class _3 : _1
    {
        public override string Method() => "C";
    }

    class _4 : _1
    {
        public override string Method() => "D";
    }

    class _5 : _1
    {
        public override string Method() => "E";
    }
    class _6 : _1
    {
        public override string Method() => "F";
    }
    class _7 : _1
    {
        public override string Method() => "F";
    }

    class _8 : _1
    {
        public override string Method() => "F";
    }

    class _9 : _1
    {
        public override string Method() => "F";
    }

    class _10 : _1
    {
        public override string Method() => "F";
    }

    public class ResolveVirtualRepeatedly
    {
        static _1 _1 = new _1();
        static _1 _2 = new _2();
        static _1 _3 = new _3();
        static _1 _4 = new _4();
        static _1 _5 = new _5();
        static _1 _6 = new _6();
        static _1 _7 = new _7();
        static _1 _8 = new _8();
        static _1 _9 = new _9();
        static _1 _10 = new _10();
        [MethodImpl(MethodImplOptions.NoInlining)]
        static _1 GetA(int index)
        {
            switch (index % 10)
            {
                case 0:
                    return _10;
                case 1: return _1;
                case 2: return _2;
                case 3: return _3;
                case 4: return _4;
                case 5: return _5;
                case 6: return _6;
                case 7: return _7;
                case 8: return _8;
                case 9: return _9;
            }
            return null!;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static Func<string> GetAction(int index)
        {
            return GetA(index).Method;
        }

        [Benchmark]
        public void Get1Virtual()
        {
            GetAction(0);
        }

        [Benchmark]
        public void Get10Virtual()
        {
            GetAction(0);
            GetAction(1);
            GetAction(2);
            GetAction(3);
            GetAction(4);
            GetAction(5);
            GetAction(6);
            GetAction(7);
            GetAction(8);
            GetAction(9);
        }
    }
}

Sep 04 '24 21:09 davidwrighton

Benchmark results on Intel

I am not able to reproduce this regression on Dev Box Amd. The PR is ~7% faster compared to baseline on that machine.

Sep 17 '24 07:09 jkotas

runtime runtime copied to clipboard

Remove JitGenericHandleCache

runtime
runtime copied to clipboard