Halide
Halide copied to clipboard
Pipeline with two async producers produce incorrect results
The following test produces incorrect results:
{
Func producer1("producer1"), producer2("producer2"), consumer("consumer");
Var x, y, xo, yo, xi, yi;
producer1(x, y) = x + y;
producer2(x, y) = x * y;
consumer(x, y) = producer1(x - 1, y - 1) + producer2(x, y) + producer1(x + 1, y + 1);
consumer
.compute_root();
producer1
.compute_at(consumer, y)
.store_at(consumer, Var::outermost())
.fold_storage(y, 4)
.async();
producer2
.compute_at(consumer, y)
.store_at(consumer, Var::outermost())
.fold_storage(y, 1)
.async();
Buffer<int> out = consumer.realize({128, 128});
out.for_each_element([&](int x, int y) {
int correct = 2 * (x + y) + x * y;
if (out(x, y) != correct) {
printf("out(%d, %d) = %d instead of %d\n",
x, y, out(x, y), correct);
exit(1);
}
});
}
The final IR looks wrong to me:
let producer1.folding_semaphore._2 = (struct halide_semaphore_t *)alloca(16)
halide_semaphore_init(producer1.folding_semaphore._2, 4)
allocate producer1[int32 * (consumer.extent.0 + 2) * 4]
let producer1.semaphore_0 = (struct halide_semaphore_t *)alloca(16)
halide_semaphore_init(producer1.semaphore_0, 0)
let producer1.semaphore_0_4 = (struct halide_semaphore_t *)alloca(16)
halide_semaphore_init(producer1.semaphore_0_4, 0)
let producer2.folding_semaphore._0 = (struct halide_semaphore_t *)alloca(16)
halide_semaphore_init(producer2.folding_semaphore._0, 1)
allocate producer2[int32 * consumer.extent.0 * 1]
let producer2.semaphore_0 = (struct halide_semaphore_t *)alloca(16)
halide_semaphore_init(producer2.semaphore_0, 0)
fork {
let t109 = consumer.min.0 + consumer.min.1
for (consumer.s0.v1.$n.rebased, 0, consumer.extent.1 + 2) {
let t104 = select(0 < consumer.s0.v1.$n.rebased, 1, 3)
acquire (producer1.folding_semaphore._2, t104) {
produce producer1 {
let t111 = (((consumer.min.1 + consumer.s0.v1.$n.rebased) + 3) % 4)*(consumer.extent.0 + 2)
let t110 = consumer.s0.v1.$n.rebased + t109
for (producer1.s0.v0.rebased, 0, consumer.extent.0 + 2) {
producer1[producer1.s0.v0.rebased + t111] = (producer1.s0.v0.rebased + t110) + -2
}
halide_semaphore_release(producer1.semaphore_0, 1)
halide_semaphore_release(producer1.semaphore_0_4, 1)
}
}
}
} {
for (consumer.s0.v1.$n.rebased, 0, consumer.extent.1 + 2) {
let t86 = select((consumer.s0.v1.$n.rebased < 2) && (0 < consumer.s0.v1.$n.rebased), 0, 1)
acquire (producer2.folding_semaphore._0, t86) {
acquire (producer1.semaphore_0_4, 1) {
if (2 <= consumer.s0.v1.$n.rebased) {
consume producer1 {
produce producer2 {
let t112 = consumer.min.1 + consumer.s0.v1.$n.rebased
for (producer2.s0.v0.rebased, 0, consumer.extent.0) {
producer2[producer2.s0.v0.rebased] = (t112 + -2)*(consumer.min.0 + producer2.s0.v0.rebased)
}
halide_semaphore_release(producer2.semaphore_0, 1)
}
}
}
}
}
}
} {
produce consumer {
let t113 = 0 - (consumer.min.1*consumer.stride.1)
for (consumer.s0.v1.$n.rebased, 0, consumer.extent.1 + 2) {
acquire (producer1.semaphore_0, 1) {
if (2 <= consumer.s0.v1.$n.rebased) {
consume producer1 {
acquire (producer2.semaphore_0, 1) {
consume producer2 {
let t114 = consumer.min.1 + consumer.s0.v1.$n.rebased
for (consumer.s0.v0.rebased, 0, consumer.extent.0) {
consumer[(((t114 + -2)*consumer.stride.1) + t113) + consumer.s0.v0.rebased] = producer2[consumer.s0.v0.rebased] + (producer1[((((t114 + 3) % 4)*(consumer.extent.0 + 2)) + consumer.s0.v0.rebased) + 2] + producer1[(((t114 + 1) % 4)*(consumer.extent.0 + 2)) + consumer.s0.v0.rebased])
}
}
}
}
}
halide_semaphore_release(producer2.folding_semaphore._0, 1)
}
let t103 = select(consumer.s0.v1.$n.rebased < (consumer.extent.1 + 1), 1, 3)
halide_semaphore_release(producer1.folding_semaphore._2, t103)
}
}
}
free producer1
free producer2
}
For example, producer1 and producer2 seem to be connected via producer1.semaphore_0_4
semaphore, but there is no actual dependency between them.
Does #7868 complain about this schedule?
I can definitely check it, but if it does that means that #7868 is incorrect. The pipeline and its schedule should be perfectly valid, it's just two async producers with no interdependencies of any kind.
I'm curious if the reason for the miscompilation is due to the same IR mangling that I tried for forbid in that PR.
Just checked and #7868 doesn't complain about this schedule.
Not sure which PR fixed it, but I just checked and this is passing on main