sdk icon indicating copy to clipboard operation
sdk copied to clipboard

Surprising `is` test performance

Open jensjoha opened this issue 6 months ago • 2 comments

I noticed that an is test in our code gave rise to code like this:

        ;; v5 <- InstanceOf:12(v3 IS NullValue<dynamic>, instantiator_type_args(v0), function_type_args(v0)) T{bool}
0x7fa178cada4b    493b4670               cmpq rax,[thr+0x70]   null
0x7fa178cada4f    0f8497000000           jz 0x00007fa178cadaec
        ;; InlineInstanceof
        ;; InstantiatedTypeWithArgumentsTest
0x7fa178cada55    a801                   test al,1
0x7fa178cada57    0f848f000000           jz 0x00007fa178cadaec
0x7fa178cada5d    448b48ff               movl r9,[rax-0x1]
0x7fa178cada61    41c1e90c               shrl r9,12
0x7fa178cada65    4981f9e1000000         cmpq r9,0xe1
0x7fa178cada6c    0f8483000000           jz 0x00007fa178cadaf5
        ;; Subtype1TestCacheLookup
0x7fa178cada72    8b70ff                 movl rsi,[rax-0x1]
0x7fa178cada75    c1ee0c                 shrl rsi,12
0x7fa178cada78    4d8b8e10070000         movq r9,[thr+0x710]
0x7fa178cada7f    4d8b4918               movq r9,[r9+0x18]
0x7fa178cada83    4d8b0cf1               movq r9,[r9+rsi*8]
0x7fa178cada87    4d8b494f               movq r9,[r9+0x4f]
0x7fa178cada8b    4d3b4e70               cmpq r9,[thr+0x70]   null
0x7fa178cada8f    0f8457000000           jz 0x00007fa178cadaec
0x7fa178cada95    458b490f               movl r9,[r9+0xf]
0x7fa178cada99    49c1e903               shrq r9,3
0x7fa178cada9d    4981f9e1000000         cmpq r9,0xe1
0x7fa178cadaa4    0f844b000000           jz 0x00007fa178cadaf5
0x7fa178cadaaa    4d8b8fe7170000         movq r9,[pp+0x17e7]   SubtypeTestCache(1, 0)
0x7fa178cadab1    4d8b9f57150000         movq tmp,[pp+0x1557]   Code([Stub] Subtype1TestCache)
0x7fa178cadab8    41ff5307               call [tmp+0x7]
0x7fa178cadabc    4d3b4670               cmpq r8,[thr+0x70]   null
0x7fa178cadac0    740f                   jz 0x00007fa178cadad1
        ;; BoolTest
0x7fa178cadac2    41f6c010               testb r8,0x10
0x7fa178cadac6    0f8429000000           jz 0x00007fa178cadaf5
0x7fa178cadacc    e91b000000             jmp 0x00007fa178cadaec
0x7fa178cadad1    498b9fef170000         movq rbx,[pp+0x17ef]   Type: NullValue
0x7fa178cadad8    4d8b8ff7170000         movq r9,[pp+0x17f7]   SubtypeTestCache(1, 0)
0x7fa178cadadf    4d8b9f970d0000         movq tmp,[pp+0xd97]   Code([Stub] InstanceOf)
0x7fa178cadae6    41ff5307               call [tmp+0x7]
0x7fa178cadaea    eb0d                   jmp 0x00007fa178cadaf9
0x7fa178cadaec    498b8680000000         movq rax,[thr+0x80]   false
0x7fa178cadaf3    eb04                   jmp 0x00007fa178cadaf9
0x7fa178cadaf5    498b4678               movq rax,[thr+0x78]   true
        ;; ParallelMove rax <- rax
        ;; Branch if StrictCompare:14(===, v5, v6) goto (3, 4)
        ;; BoolTest
0x7fa178cadaf9    a810                   test al,0x10
0x7fa178cadafb    0f8505000000           jnz 0x00007fa178cadb06

which seems like a lot.

I've constructed the following benchmark:

class Foo1<T> {
  const Foo1();

  @override
  String toString() => "Foo1<$T>";
}

class Foo2<T> {
  const Foo2();

  @override
  String toString() => "Foo2<$T>";
}

class Foo3<T> {
  const Foo3();

  @override
  String toString() => "Foo3<$T>";
}

enum Foo1_1 implements Foo1<Object> { A, B, C }

enum Foo1_2 implements Foo1<Object> { A, B, C }

enum Foo1_3 implements Foo1<Object> { A, B, C }

enum Foo2_1 implements Foo2<Object> { A, B, C }

enum Foo2_2 implements Foo2<Object> { A, B, C }

enum Foo2_3 implements Foo2<Object> { A, B, C }

enum Foo3_1 implements Foo3<Object> { A, B, C }

enum Foo3_2 implements Foo3<Object> { A, B, C }

enum Foo3_3 implements Foo3<Object> { A, B, C }

void main(List args) {
  for(int j = 0; j < 3; j++) {
  int result;
  Stopwatch stopwatch = new Stopwatch();
  if (args.first == "1") {
    stopwatch.start();
    // Only Foo1* enums.
    result = fooMain1([
      ...args,
      null,
      Foo1_1.A,
      Foo1_2.A,
      Foo1_3.A,
      null,
      "hello",
    ]);
    stopwatch.stop();
  } else if (args.first == "2") {
    // Only Foo2 instantiations.
    stopwatch.start();
    result = fooMain2([
      ...args,
      null,
      const Foo2<String>(),
      const Foo2<int>(),
      const Foo2<double>(),
      null,
      "hello"
    ]);
    stopwatch.stop();
  } else if (args.first == "3") {
    // Both Foo3* enums and Foo3 instantiations.
    stopwatch.start();
    result = fooMain3([
      ...args,
      null,
      Foo3_1.A,
      Foo3_2.A,
      const Foo3<String>(),
      null,
      "hello",
    ]);
    stopwatch.stop();
  } else {
    throw "Need 1, 2 or 3";
  }
  print("Got $result in ${stopwatch.elapsedMilliseconds} ms");
  }
}
const int iterations = 100000000;

@pragma("vm:never-inline")
int fooMain1(List args) {
  int result = 0;
  for (int i = 0; i < iterations; i++) {
    for (dynamic x in args) {
      if (x is Foo1) {
        result++;
      }
    }
  }
  return result;
}

@pragma("vm:never-inline")
int fooMain2(List args) {
  int result = 0;
  for (int i = 0; i < iterations; i++) {
    for (dynamic x in args) {
      if (x is Foo2) {
        result++;
      }
    }
  }
  return result;
}

@pragma("vm:never-inline")
int fooMain3(List args) {
  int result = 0;
  for (int i = 0; i < iterations; i++) {
    for (dynamic x in args) {
      if (x is Foo3) {
        result++;
      }
    }
  }
  return result;
}

that illustrates what's happening, at least in AOT:

fooMain1 is only given Foo1* enums and creates code like this:

42425   v20 <- LoadClassId(v147 T{X0?}) [1, 614] int64
   0x000000000006d2d8 <+172>:   test   $0x1,%cl
   0x000000000006d2db <+175>:   mov    $0x3c,%edx
   0x000000000006d2e0 <+180>:   je     0x6d2e8 <fooMain1+188>
   0x000000000006d2e2 <+182>:   mov    -0x1(%rcx),%edx
   0x000000000006d2e5 <+185>:   shr    $0xc,%edx

42426   Branch if TestRange(v20 in [581-583]) T{bool} goto (7, 8)
   0x000000000006d2e8 <+188>:   lea    -0x245(%rdx),%rcx
   0x000000000006d2ef <+195>:   cmp    $0x2,%rcx
   0x000000000006d2f3 <+199>:   ja     0x6d309 <fooMain1+221>

On my machine this runs in ~1730 ms (out/ReleaseX64/dart-sdk/bin/dartaotruntime is_test_benchmark.aot 1).

fooMain2 is only given (const) values of Foo2 and creates code like this:

42349   v20 <- LoadClassId(v150 T{X0?}) [1, 614] int64
   0x000000000006d1a4 <+172>:   test   $0x1,%cl
   0x000000000006d1a7 <+175>:   mov    $0x3c,%edx
   0x000000000006d1ac <+180>:   je     0x6d1b4 <fooMain2+188>
   0x000000000006d1ae <+182>:   mov    -0x1(%rcx),%edx
   0x000000000006d1b1 <+185>:   shr    $0xc,%edx

42350   Branch if EqualityCompare(v20 == v21) T{bool} goto (7, 8)
   0x000000000006d1b4 <+188>:   cmp    $0xe2,%rdx
   0x000000000006d1bb <+195>:   jne    0x6d1d1 <fooMain2+217>

On my machine this runs in ~1360 ms (out/ReleaseX64/dart-sdk/bin/dartaotruntime is_test_benchmark.aot 2).

fooMain3 is given both Foo3* enums and (const) values of Foo3. One would think it could do the combination of the above and do a cid range check and a cid equality test (or if placing things differently maybe just a larger range check) - but instead it creates code like this:

42267   v16 <- InstanceOf:36(v154 T{X0?} IS Foo3<dynamic>, instantiator_type_args(v0), function_type_args(v0)) T{bool}
   0x000000000006cfd8 <+180>:   cmp    0x70(%r14),%rax
   0x000000000006cfdc <+184>:   je     0x6d079 <fooMain3+341>

42268   InlineInstanceof
42269   InstantiatedTypeWithArgumentsTest
   0x000000000006cfe2 <+190>:   test   $0x1,%al
   0x000000000006cfe4 <+192>:   je     0x6d079 <fooMain3+341>
   0x000000000006cfea <+198>:   mov    -0x1(%rax),%r9d
   0x000000000006cfee <+202>:   shr    $0xc,%r9d
   0x000000000006cff2 <+206>:   cmp    $0xe1,%r9
   0x000000000006cff9 <+213>:   je     0x6d082 <fooMain3+350>

42270   Subtype1TestCacheLookup
   0x000000000006cfff <+219>:   mov    -0x1(%rax),%esi
   0x000000000006d002 <+222>:   shr    $0xc,%esi
   0x000000000006d005 <+225>:   mov    0x718(%r14),%r9
   0x000000000006d00c <+232>:   mov    0x18(%r9),%r9
   0x000000000006d010 <+236>:   mov    (%r9,%rsi,8),%r9
   0x000000000006d014 <+240>:   mov    0x4f(%r9),%r9
   0x000000000006d018 <+244>:   cmp    0x70(%r14),%r9
   0x000000000006d01c <+248>:   je     0x6d079 <fooMain3+341>
   0x000000000006d022 <+254>:   mov    0xf(%r9),%r9d
   0x000000000006d026 <+258>:   shr    $0x3,%r9
   0x000000000006d02a <+262>:   cmp    $0xe1,%r9
   0x000000000006d031 <+269>:   je     0x6d082 <fooMain3+350>
   0x000000000006d037 <+275>:   mov    0x1867(%r15),%r9
   0x000000000006d03e <+282>:   mov    0x1557(%r15),%r11
   0x000000000006d045 <+289>:   call   *0x7(%r11)
   0x000000000006d049 <+293>:   cmp    0x70(%r14),%r8
   0x000000000006d04d <+297>:   je     0x6d05e <fooMain3+314>

42271   BoolTest
   0x000000000006d04f <+299>:   test   $0x10,%r8b
   0x000000000006d053 <+303>:   je     0x6d082 <fooMain3+350>
   0x000000000006d059 <+309>:   jmp    0x6d079 <fooMain3+341>
   0x000000000006d05e <+314>:   mov    0x186f(%r15),%rbx
   0x000000000006d065 <+321>:   mov    0x1877(%r15),%r9
   0x000000000006d06c <+328>:   mov    0xd97(%r15),%r11
   0x000000000006d073 <+335>:   call   *0x7(%r11)
   0x000000000006d077 <+339>:   jmp    0x6d086 <fooMain3+354>
   0x000000000006d079 <+341>:   mov    0x80(%r14),%rax
   0x000000000006d080 <+348>:   jmp    0x6d086 <fooMain3+354>
   0x000000000006d082 <+350>:   mov    0x78(%r14),%rax

42272   ParallelMove rax <- rax
42273   Branch if StrictCompare:38(===, v16, v8) goto (7, 8)
42274   BoolTest
   0x000000000006d086 <+354>:   test   $0x10,%al
   0x000000000006d088 <+356>:   jne    0x6d09e <fooMain3+378>

On my machine this runs in ~3350 ms (out/ReleaseX64/dart-sdk/bin/dartaotruntime is_test_benchmark.aot 3).

Interestingly the runtimes for non-aot-Dart are ~4100 ms (1), ~2770 ms (2) and ~3630 ms (3) - even though it seemingly does the "InlineInstanceof" thing for all of them.

jensjoha avatar Jun 03 '25 07:06 jensjoha

@jensjoha could you give us an idea of how critical this performance improvement would be on the code you are analyzing.

a-siva avatar Jun 04 '25 20:06 a-siva

I landed https://dart-review.googlesource.com/c/sdk/+/432340 which 'hacked around it' on the instance I was looking at. That did give a ~1% speedup of the analyzer analyzing the CFE.

jensjoha avatar Jun 10 '25 09:06 jensjoha