Halide icon indicating copy to clipboard operation
Halide copied to clipboard

Pipeline with two async producers produce incorrect results

Open vksnk opened this issue 1 year ago • 4 comments

The following test produces incorrect results:

{
        Func producer1("producer1"), producer2("producer2"), consumer("consumer");
        Var x, y, xo, yo, xi, yi;

        producer1(x, y) = x + y;
        producer2(x, y) = x * y;
        consumer(x, y) = producer1(x - 1, y - 1) + producer2(x, y) + producer1(x + 1, y + 1);

        consumer
            .compute_root();
        producer1
            .compute_at(consumer, y)
            .store_at(consumer, Var::outermost())
            .fold_storage(y, 4)
            .async();
        producer2
            .compute_at(consumer, y)
            .store_at(consumer, Var::outermost())
            .fold_storage(y, 1)
            .async();

        Buffer<int> out = consumer.realize({128, 128});

        out.for_each_element([&](int x, int y) {
            int correct = 2 * (x + y) + x * y;
            if (out(x, y) != correct) {
                printf("out(%d, %d) = %d instead of %d\n",
                       x, y, out(x, y), correct);
                exit(1);
            }
        });
    }

The final IR looks wrong to me:

 let producer1.folding_semaphore._2 = (struct halide_semaphore_t *)alloca(16)
 halide_semaphore_init(producer1.folding_semaphore._2, 4)
 allocate producer1[int32 * (consumer.extent.0 + 2) * 4]
 let producer1.semaphore_0 = (struct halide_semaphore_t *)alloca(16)
 halide_semaphore_init(producer1.semaphore_0, 0)
 let producer1.semaphore_0_4 = (struct halide_semaphore_t *)alloca(16)
 halide_semaphore_init(producer1.semaphore_0_4, 0)
 let producer2.folding_semaphore._0 = (struct halide_semaphore_t *)alloca(16)
 halide_semaphore_init(producer2.folding_semaphore._0, 1)
 allocate producer2[int32 * consumer.extent.0 * 1]
 let producer2.semaphore_0 = (struct halide_semaphore_t *)alloca(16)
 halide_semaphore_init(producer2.semaphore_0, 0)
 fork {
  let t109 = consumer.min.0 + consumer.min.1
  for (consumer.s0.v1.$n.rebased, 0, consumer.extent.1 + 2) {
   let t104 = select(0 < consumer.s0.v1.$n.rebased, 1, 3)
   acquire (producer1.folding_semaphore._2, t104) {
    produce producer1 {
     let t111 = (((consumer.min.1 + consumer.s0.v1.$n.rebased) + 3) % 4)*(consumer.extent.0 + 2)
     let t110 = consumer.s0.v1.$n.rebased + t109
     for (producer1.s0.v0.rebased, 0, consumer.extent.0 + 2) {
      producer1[producer1.s0.v0.rebased + t111] = (producer1.s0.v0.rebased + t110) + -2
     }
     halide_semaphore_release(producer1.semaphore_0, 1)
     halide_semaphore_release(producer1.semaphore_0_4, 1)
    }
   }
  }
 } {
  for (consumer.s0.v1.$n.rebased, 0, consumer.extent.1 + 2) {
   let t86 = select((consumer.s0.v1.$n.rebased < 2) && (0 < consumer.s0.v1.$n.rebased), 0, 1)
   acquire (producer2.folding_semaphore._0, t86) {
    acquire (producer1.semaphore_0_4, 1) {
     if (2 <= consumer.s0.v1.$n.rebased) {
      consume producer1 {
       produce producer2 {
        let t112 = consumer.min.1 + consumer.s0.v1.$n.rebased
        for (producer2.s0.v0.rebased, 0, consumer.extent.0) {
         producer2[producer2.s0.v0.rebased] = (t112 + -2)*(consumer.min.0 + producer2.s0.v0.rebased)
        }
        halide_semaphore_release(producer2.semaphore_0, 1)
       }
      }
     }
    }
   }
  }
 } {
  produce consumer {
   let t113 = 0 - (consumer.min.1*consumer.stride.1)
   for (consumer.s0.v1.$n.rebased, 0, consumer.extent.1 + 2) {
    acquire (producer1.semaphore_0, 1) {
     if (2 <= consumer.s0.v1.$n.rebased) {
      consume producer1 {
       acquire (producer2.semaphore_0, 1) {
        consume producer2 {
         let t114 = consumer.min.1 + consumer.s0.v1.$n.rebased
         for (consumer.s0.v0.rebased, 0, consumer.extent.0) {
          consumer[(((t114 + -2)*consumer.stride.1) + t113) + consumer.s0.v0.rebased] = producer2[consumer.s0.v0.rebased] + (producer1[((((t114 + 3) % 4)*(consumer.extent.0 + 2)) + consumer.s0.v0.rebased) + 2] + producer1[(((t114 + 1) % 4)*(consumer.extent.0 + 2)) + consumer.s0.v0.rebased])
         }
        }
       }
      }
     }
     halide_semaphore_release(producer2.folding_semaphore._0, 1)
    }
    let t103 = select(consumer.s0.v1.$n.rebased < (consumer.extent.1 + 1), 1, 3)
    halide_semaphore_release(producer1.folding_semaphore._2, t103)
   }
  }
 }
 free producer1
 free producer2
}

For example, producer1 and producer2 seem to be connected via producer1.semaphore_0_4 semaphore, but there is no actual dependency between them.

vksnk avatar Nov 28 '23 23:11 vksnk

Does #7868 complain about this schedule?

abadams avatar Nov 29 '23 16:11 abadams

I can definitely check it, but if it does that means that #7868 is incorrect. The pipeline and its schedule should be perfectly valid, it's just two async producers with no interdependencies of any kind.

vksnk avatar Nov 29 '23 17:11 vksnk

I'm curious if the reason for the miscompilation is due to the same IR mangling that I tried for forbid in that PR.

abadams avatar Nov 29 '23 17:11 abadams

Just checked and #7868 doesn't complain about this schedule.

vksnk avatar Nov 29 '23 17:11 vksnk

Not sure which PR fixed it, but I just checked and this is passing on main

abadams avatar Jun 04 '24 18:06 abadams