loopy
[bug] lp.add_prefetch adds a prefetch instruction but does not replace accesses with the prefetched variable
Here's a reproducer:
# Reproducer: build a kernel, request a prefetch of "D", then print the
# transformed kernel so the resulting instructions can be inspected.
import loopy as lp
# The kernel has a single domain, parameterized by N_e, and three
# instructions with ids J_fetch_rule, prcmpt_j_redn and insn.
# The only read of "D" is inside the reduction in prcmpt_j_redn.
t_unit = lp.make_kernel(
[
"[N_e] -> { [r, x, i_tile, i_inner_outer, i_inner_inner, i_prcmpt, r_prcmpt, e_outer, e_inner, J_dim_0, J_dim_1, j_tile, j_inner] : (i_prcmpt) mod 4 = 0 and 0 <= r <= 2 and 0 <= x <= 2 and i_tile >= 0 and i_inner_inner >= 0 and -4i_inner_outer <= i_inner_inner <= 34 - 9i_tile - 4i_inner_outer and i_inner_inner <= 8 - 4i_inner_outer and i_inner_inner <= 3 and -i_inner_inner <= i_prcmpt <= 34 - 9i_tile - i_inner_inner and i_prcmpt <= 8 - i_inner_inner and 0 <= r_prcmpt <= 2 and e_inner >= 0 and -14e_outer <= e_inner <= 13 and e_inner < N_e - 14e_outer and 0 <= J_dim_0 <= 2 and 0 <= J_dim_1 <= 2 and j_inner >= 0 and -35j_tile <= j_inner <= 34 - 35j_tile and j_inner <= 34 }",
],
'''
<> J_fetch[J_dim_0, J_dim_1] = J[J_dim_0, J_dim_1, e_inner + 14*e_outer] {id=J_fetch_rule}
<> subst_0[i_prcmpt, r_prcmpt] = reduce(sum, [j_tile, j_inner], D[r_prcmpt, i_prcmpt + 9*i_tile + i_inner_inner, j_inner + j_tile*35]*u[e_inner + e_outer*14, j_inner + j_tile*35]) {id=prcmpt_j_redn}
_fe_out[x, e_inner + e_outer*14, i_tile*9 + i_inner_inner + i_inner_outer*4] = reduce(sum, [r], subst_0[4*i_inner_outer, r]*J_fetch[x, r]) {id=insn, dep=J_fetch_rule:prcmpt_j_redn}
''', lang_version=(2018, 2))
# Request a prefetch of "D". The third positional argument lists the inames
# i_inner_inner, r_prcmpt, i_prcmpt and j_inner; e_outer, i_tile and j_tile
# are passed as fetch_outer_inames, and the temporary is requested in the
# LOCAL address space with no default tag.
# NOTE(review): the third positional argument is presumably add_prefetch's
# sweep_inames parameter -- confirm against the loopy API reference.
t_unit = lp.add_prefetch(t_unit, "D", ["i_inner_inner", "r_prcmpt",
"i_prcmpt", "j_inner"],
fetch_outer_inames=frozenset(["e_outer",
"i_tile",
"j_tile"]),
temporary_address_space=lp.AddressSpace.LOCAL,
default_tag=None)
# Print the transformed translation unit.
print(t_unit)
That generates the kernel:
---------------------------------------------------------------------------
INSTRUCTIONS:
for e_outer, i_tile, D_dim_1, j_tile, D_dim_0, D_dim_2
D_fetch[D_dim_0, D_dim_1, D_dim_2] = D[D_dim_0, D_dim_1 + 9*i_tile, D_dim_2] {id=D_fetch_rule}
end i_tile, D_dim_1, j_tile, D_dim_0, D_dim_2
for e_inner, J_dim_1, J_dim_0
↱ J_fetch[J_dim_0, J_dim_1] = J[J_dim_0, J_dim_1, e_inner + 14*e_outer] {id=J_fetch_rule}
│ end J_dim_1, J_dim_0
│ for i_prcmpt, r_prcmpt, i_tile, i_inner_inner
│↱ subst_0[i_prcmpt, r_prcmpt] = reduce(sum, [j_tile, j_inner], D[r_prcmpt, i_prcmpt + 9*i_tile + i_inner_inner, j_inner + j_tile*35]*u[e_inner + e_outer*14, j_inner + j_tile*35]) {id=prcmpt_j_redn}
││ end i_prcmpt, r_prcmpt
││ for x, i_inner_outer
└└ _fe_out[x, e_inner + e_outer*14, i_tile*9 + i_inner_inner + i_inner_outer*4] = reduce(sum, [r], subst_0[4*i_inner_outer, r]*J_fetch[x, r]) {id=insn}
end e_inner, e_outer, i_tile, i_inner_inner, x, i_inner_outer
---------------------------------------------------------------------------
Notice how the transformed kernel assigns D_fetch (in D_fetch_rule) without using it anywhere: the prcmpt_j_redn instruction still reads D directly instead of D_fetch.