runtime icon indicating copy to clipboard operation
runtime copied to clipboard

Remove JitGenericHandleCache

Open davidwrighton opened this issue 1 year ago • 7 comments
trafficstars

  • It was only used for overflow scenarios from the generic dictionary (which don't happen), virtual resolution scenarios for creating delegates, and a few other rare R2R scenarios
  • Replace the virtual resolution scenarios with a cache of the affected data in managed code, and move the helpers to managed
  • Just remove the pointless checks from within the various normal generic lookup paths
  • Use the GenericCache type which was previously only used on NativeAOT for Generic Virtual method resolution paths.

davidwrighton avatar Aug 22 '24 22:08 davidwrighton

Tagging subscribers to this area: @mangod9 See info in area-owners.md if you want to be subscribed.

@EgorBot -intel

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;

namespace VirtualDispatchChecks
{

    class _1
    {
        public virtual string Method() => "A";
    }

    class _2 : _1
    {
        public override string Method() => "B";
    }

    class _3 : _1
    {
        public override string Method() => "C";
    }

    class _4 : _1
    {
        public override string Method() => "D";
    }

    class _5 : _1
    {
        public override string Method() => "E";
    }
    class _6 : _1
    {
        public override string Method() => "F";
    }
    class _7 : _1
    {
        public override string Method() => "F";
    }

    class _8 : _1
    {
        public override string Method() => "F";
    }

    class _9 : _1
    {
        public override string Method() => "F";
    }

    class _10 : _1
    {
        public override string Method() => "F";
    }

    public class ResolveVirtualRepeatedly
    {
        static _1 _1 = new _1();
        static _1 _2 = new _2();
        static _1 _3 = new _3();
        static _1 _4 = new _4();
        static _1 _5 = new _5();
        static _1 _6 = new _6();
        static _1 _7 = new _7();
        static _1 _8 = new _8();
        static _1 _9 = new _9();
        static _1 _10 = new _10();
        [MethodImpl(MethodImplOptions.NoInlining)]
        static _1 GetA(int index)
        {
            switch (index % 10)
            {
                case 0:
                    return _10;
                case 1: return _1;
                case 2: return _2;
                case 3: return _3;
                case 4: return _4;
                case 5: return _5;
                case 6: return _6;
                case 7: return _7;
                case 8: return _8;
                case 9: return _9;
            }
            return null!;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static Func<string> GetAction(int index)
        {
            return GetA(index).Method;
        }

        [Benchmark]
        public void Get1Virtual()
        {
            GetAction(0);
        }

        [Benchmark]
        public void Get10Virtual()
        {
            GetAction(0);
            GetAction(1);
            GetAction(2);
            GetAction(3);
            GetAction(4);
            GetAction(5);
            GetAction(6);
            GetAction(7);
            GetAction(8);
            GetAction(9);
        }
    }
}

davidwrighton avatar Aug 26 '24 20:08 davidwrighton

❌ Failed on Intel: PR patch apply failed (try to squash commits in your PR?)

Build log

EgorBot avatar Aug 26 '24 20:08 EgorBot

@davidwrighton I have a small limitation in my bot (I plan to fix it), but currently the way it works - it clones main, build corerun, then it applies PR patch, for that it just does wget https://github.com/dotnet/runtime/pull/106843.patch (adds .patch to PR URL basically) which may fail for PRs with large changes/commits (typically, squashing commits via force push helps) -- I'll take a look today to use a better approach to build diff (to clone actual branch).

EgorBo avatar Aug 26 '24 20:08 EgorBo

@EgorBot -intel

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;

namespace VirtualDispatchChecks
{

    class _1
    {
        public virtual string Method() => "A";
    }

    class _2 : _1
    {
        public override string Method() => "B";
    }

    class _3 : _1
    {
        public override string Method() => "C";
    }

    class _4 : _1
    {
        public override string Method() => "D";
    }

    class _5 : _1
    {
        public override string Method() => "E";
    }
    class _6 : _1
    {
        public override string Method() => "F";
    }
    class _7 : _1
    {
        public override string Method() => "F";
    }

    class _8 : _1
    {
        public override string Method() => "F";
    }

    class _9 : _1
    {
        public override string Method() => "F";
    }

    class _10 : _1
    {
        public override string Method() => "F";
    }

    public class ResolveVirtualRepeatedly
    {
        static _1 _1 = new _1();
        static _1 _2 = new _2();
        static _1 _3 = new _3();
        static _1 _4 = new _4();
        static _1 _5 = new _5();
        static _1 _6 = new _6();
        static _1 _7 = new _7();
        static _1 _8 = new _8();
        static _1 _9 = new _9();
        static _1 _10 = new _10();
        [MethodImpl(MethodImplOptions.NoInlining)]
        static _1 GetA(int index)
        {
            switch (index % 10)
            {
                case 0:
                    return _10;
                case 1: return _1;
                case 2: return _2;
                case 3: return _3;
                case 4: return _4;
                case 5: return _5;
                case 6: return _6;
                case 7: return _7;
                case 8: return _8;
                case 9: return _9;
            }
            return null!;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static Func<string> GetAction(int index)
        {
            return GetA(index).Method;
        }

        [Benchmark]
        public void Get1Virtual()
        {
            GetAction(0);
        }

        [Benchmark]
        public void Get10Virtual()
        {
            GetAction(0);
            GetAction(1);
            GetAction(2);
            GetAction(3);
            GetAction(4);
            GetAction(5);
            GetAction(6);
            GetAction(7);
            GetAction(8);
            GetAction(9);
        }
    }
}

davidwrighton avatar Aug 26 '24 21:08 davidwrighton

Benchmark results on Intel
BenchmarkDotNet v0.14.0, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Intel Xeon Platinum 8370C CPU 2.80GHz, 1 CPU, 8 logical and 4 physical cores
  Job-IAJWTB : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
  Job-ELTAZC : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
RatioSD=0.03
Method Toolchain Mean Error Ratio
Get1Virtual Main 13.88 ns 0.278 ns 1.00
Get1Virtual PR 14.25 ns 0.319 ns 1.03
Get10Virtual Main 156.04 ns 3.011 ns 1.00
Get10Virtual PR 167.20 ns 1.656 ns 1.07

BDN_Artifacts.zip

EgorBot avatar Aug 26 '24 21:08 EgorBot

@EgorBot -intel -arm64

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;

namespace VirtualDispatchChecks
{

    class _1
    {
        public virtual string Method() => "A";
    }

    class _2 : _1
    {
        public override string Method() => "B";
    }

    class _3 : _1
    {
        public override string Method() => "C";
    }

    class _4 : _1
    {
        public override string Method() => "D";
    }

    class _5 : _1
    {
        public override string Method() => "E";
    }
    class _6 : _1
    {
        public override string Method() => "F";
    }
    class _7 : _1
    {
        public override string Method() => "F";
    }

    class _8 : _1
    {
        public override string Method() => "F";
    }

    class _9 : _1
    {
        public override string Method() => "F";
    }

    class _10 : _1
    {
        public override string Method() => "F";
    }

    public class ResolveVirtualRepeatedly
    {
        static _1 _1 = new _1();
        static _1 _2 = new _2();
        static _1 _3 = new _3();
        static _1 _4 = new _4();
        static _1 _5 = new _5();
        static _1 _6 = new _6();
        static _1 _7 = new _7();
        static _1 _8 = new _8();
        static _1 _9 = new _9();
        static _1 _10 = new _10();
        [MethodImpl(MethodImplOptions.NoInlining)]
        static _1 GetA(int index)
        {
            switch (index % 10)
            {
                case 0:
                    return _10;
                case 1: return _1;
                case 2: return _2;
                case 3: return _3;
                case 4: return _4;
                case 5: return _5;
                case 6: return _6;
                case 7: return _7;
                case 8: return _8;
                case 9: return _9;
            }
            return null!;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static Func<string> GetAction(int index)
        {
            return GetA(index).Method;
        }

        [Benchmark]
        public void Get1Virtual()
        {
            GetAction(0);
        }

        [Benchmark]
        public void Get10Virtual()
        {
            GetAction(0);
            GetAction(1);
            GetAction(2);
            GetAction(3);
            GetAction(4);
            GetAction(5);
            GetAction(6);
            GetAction(7);
            GetAction(8);
            GetAction(9);
        }
    }
}

davidwrighton avatar Aug 27 '24 18:08 davidwrighton

Benchmark results on Intel
BenchmarkDotNet v0.14.0, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Intel Xeon Platinum 8370C CPU 2.80GHz, 1 CPU, 8 logical and 4 physical cores
  Job-UAKKKJ : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
  Job-SUOWDN : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method Toolchain Mean Error Ratio
Get1Virtual Main 16.24 ns 0.228 ns 1.00
Get1Virtual PR 17.15 ns 0.188 ns 1.06
Get10Virtual Main 164.94 ns 3.211 ns 1.00
Get10Virtual PR 179.95 ns 3.583 ns 1.09

BDN_Artifacts.zip

EgorBot avatar Aug 27 '24 19:08 EgorBot

Benchmark results on Arm64
BenchmarkDotNet v0.14.0, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Unknown processor
  Job-GBFISH : .NET 10.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD
  Job-UCOFWF : .NET 10.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD
Method Toolchain Mean Error Ratio
Get1Virtual Main 20.61 ns 0.022 ns 1.00
Get1Virtual PR 21.18 ns 0.149 ns 1.03
Get10Virtual Main 215.70 ns 0.238 ns 1.00
Get10Virtual PR 308.22 ns 0.914 ns 1.43

BDN_Artifacts.zip

EgorBot avatar Aug 27 '24 19:08 EgorBot

PS: I have fixed the patch applying so commit squashing is no longer needed. Also, you can use -perf to get output of perf for a benchmark (only one [Benchmark] is expected) before/after.

@EgorBot -arm64 -perf

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;

namespace VirtualDispatchChecks
{

    class _1
    {
        public virtual string Method() => "A";
    }

    class _2 : _1
    {
        public override string Method() => "B";
    }

    class _3 : _1
    {
        public override string Method() => "C";
    }

    class _4 : _1
    {
        public override string Method() => "D";
    }

    class _5 : _1
    {
        public override string Method() => "E";
    }
    class _6 : _1
    {
        public override string Method() => "F";
    }
    class _7 : _1
    {
        public override string Method() => "F";
    }

    class _8 : _1
    {
        public override string Method() => "F";
    }

    class _9 : _1
    {
        public override string Method() => "F";
    }

    class _10 : _1
    {
        public override string Method() => "F";
    }

    public class ResolveVirtualRepeatedly
    {
        static _1 _1 = new _1();
        static _1 _2 = new _2();
        static _1 _3 = new _3();
        static _1 _4 = new _4();
        static _1 _5 = new _5();
        static _1 _6 = new _6();
        static _1 _7 = new _7();
        static _1 _8 = new _8();
        static _1 _9 = new _9();
        static _1 _10 = new _10();
        [MethodImpl(MethodImplOptions.NoInlining)]
        static _1 GetA(int index)
        {
            switch (index % 10)
            {
                case 0:
                    return _10;
                case 1: return _1;
                case 2: return _2;
                case 3: return _3;
                case 4: return _4;
                case 5: return _5;
                case 6: return _6;
                case 7: return _7;
                case 8: return _8;
                case 9: return _9;
            }
            return null!;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static Func<string> GetAction(int index)
        {
            return GetA(index).Method;
        }

        [Benchmark]
        public void Get10Virtual()
        {
            GetAction(0);
            GetAction(1);
            GetAction(2);
            GetAction(3);
            GetAction(4);
            GetAction(5);
            GetAction(6);
            GetAction(7);
            GetAction(8);
            GetAction(9);
        }
    }
}

EgorBo avatar Aug 27 '24 19:08 EgorBo

Benchmark results on Arm64
BenchmarkDotNet v0.14.0, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Unknown processor
  Job-YQNLAB : .NET 10.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD
  Job-AMUJHL : .NET 10.0.0 (42.42.42.42424), Arm64 RyuJIT AdvSIMD
Method Toolchain Mean Error Ratio
Get10Virtual Main 229.3 ns 2.22 ns 1.00
Get10Virtual PR 246.1 ns 1.62 ns 1.07

BDN_Artifacts.zip

Flame graphs: Main vs PR 🔥 Hot asm: Main vs PR Hot functions: Main vs PR

For clean perf results, make sure you have just one [Benchmark] in your app.

EgorBot avatar Aug 27 '24 19:08 EgorBot

An analysis of these results indicates that

  1. While universally slower with the new code, we are close enough for the performance of this relatively rare scenario.
  2. Arm64 showed significantly lower performance once, but that appears to be a case of unlucky hash values in the hash function. Since the has function is not meaningfully changed with this PR, this is considered immaterial.
  3. When running in actual use, this should be slightly faster. Benchmark.NET results are gathered with tiered compilation disabled, and the codegen for the function is slightly better when tiered.

davidwrighton avatar Aug 27 '24 22:08 davidwrighton

Benchmark.NET results are gathered with tiered compilation disabled, and the codegen for the function is slightly better when tiered.

In the bot it's not turned off, in order to disable it, you can pass --envvars DOTNET_TieredCompilation:0. (it's standard BDN CLI args). E.g. you can see Tier names in https://gist.github.com/EgorBot/df35cdd0ebee41f43a59d1c44cfa5595 (OptimizedTier1)

EgorBo avatar Aug 27 '24 22:08 EgorBo

@EgorBot -intel -arm64

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;

namespace VirtualDispatchChecks
{

    class _1
    {
        public virtual string Method() => "A";
    }

    class _2 : _1
    {
        public override string Method() => "B";
    }

    class _3 : _1
    {
        public override string Method() => "C";
    }

    class _4 : _1
    {
        public override string Method() => "D";
    }

    class _5 : _1
    {
        public override string Method() => "E";
    }
    class _6 : _1
    {
        public override string Method() => "F";
    }
    class _7 : _1
    {
        public override string Method() => "F";
    }

    class _8 : _1
    {
        public override string Method() => "F";
    }

    class _9 : _1
    {
        public override string Method() => "F";
    }

    class _10 : _1
    {
        public override string Method() => "F";
    }

    public class ResolveVirtualRepeatedly
    {
        static _1 _1 = new _1();
        static _1 _2 = new _2();
        static _1 _3 = new _3();
        static _1 _4 = new _4();
        static _1 _5 = new _5();
        static _1 _6 = new _6();
        static _1 _7 = new _7();
        static _1 _8 = new _8();
        static _1 _9 = new _9();
        static _1 _10 = new _10();
        [MethodImpl(MethodImplOptions.NoInlining)]
        static _1 GetA(int index)
        {
            switch (index % 10)
            {
                case 0:
                    return _10;
                case 1: return _1;
                case 2: return _2;
                case 3: return _3;
                case 4: return _4;
                case 5: return _5;
                case 6: return _6;
                case 7: return _7;
                case 8: return _8;
                case 9: return _9;
            }
            return null!;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static Func<string> GetAction(int index)
        {
            return GetA(index).Method;
        }

        [Benchmark]
        public void Get1Virtual()
        {
            GetAction(0);
        }

        [Benchmark]
        public void Get10Virtual()
        {
            GetAction(0);
            GetAction(1);
            GetAction(2);
            GetAction(3);
            GetAction(4);
            GetAction(5);
            GetAction(6);
            GetAction(7);
            GetAction(8);
            GetAction(9);
        }
    }
}

davidwrighton avatar Sep 04 '24 21:09 davidwrighton

Benchmark results on Intel

I am not able to reproduce this regression on Dev Box Amd. The PR is ~7% faster compared to baseline on that machine.

jkotas avatar Sep 17 '24 07:09 jkotas