loopy
loopy copied to clipboard
Duplicating an iname results in an unschedulable kernel
I'm not sure why it becomes unschedulable.
import loopy as lp
import numpy as np
from pymbolic.primitives import *
import immutables
# Build the "e2p_from_single_box" kernel used as the reproducer.
# The call takes, in order: the list of loop domains, the instruction text,
# the list of argument / temporary declarations, and a few keyword options.
e2p_from_single_box_knl = lp.make_kernel(
[
# Loop domains, one set string per entry.
# - The *_outer/*_inner pairs combine as inner + 32*outer; the matching
#   rewrites are recorded in applied_iname_rewrites below.
# - e2p_zero_idx has an empty domain (the constraint "1 = 0" never holds).
# - iknl and iknl_0 are both pinned to the single value 0.
# - e2p_iorder1, e2p_x2, e2p_x1 and e2p_iorder2, e2p_y2, e2p_y1 all have
#   domains parametrized (directly or transitively) by e2p_x0, the iname
#   that is split and duplicated at the end of this script.
"[ntgt_boxes] -> { [itgt_box] : 0 <= itgt_box < ntgt_boxes }",
"{ [idim, idim_0] : 0 <= idim <= 2 and 0 <= idim_0 <= 2 }",
"{ [itgt_offset_outer, itgt_offset_inner] : itgt_offset_inner >= 0 and -32itgt_offset_outer <= itgt_offset_inner <= 46 - 32itgt_offset_outer and itgt_offset_inner <= 31 }",
"{ [icoeff_outer, icoeff_inner] : icoeff_inner >= 0 and -32icoeff_outer <= icoeff_inner <= 120 - 32icoeff_outer and icoeff_inner <= 31 }",
"{ [iknl, iknl_0] : iknl = 0 and iknl_0 = 0 }",
"{ [dummy] : 0 <= dummy <= 31 }",
"[ntargets] -> { [] : ntargets > 0 }",
"{ [e2p_idim] : 0 <= e2p_idim <= 2 }",
"{ [e2p_iorder0] : 0 < e2p_iorder0 <= 10 }",
"{ [e2p_zero_idx] : 1 = 0 }",
"{ [e2p_icoeff_outer, e2p_icoeff_inner] : e2p_icoeff_inner >= 0 and -32e2p_icoeff_outer <= e2p_icoeff_inner <= 120 - 32e2p_icoeff_outer and e2p_icoeff_inner <= 31 }",
"{ [e2p_x0] : 0 <= e2p_x0 <= 10 }",
"[e2p_x0] -> { [e2p_iorder1] : e2p_x0 <= e2p_iorder1 <= 10 }",
"[e2p_iorder1, e2p_x0] -> { [e2p_x2] : 0 <= e2p_x2 <= e2p_iorder1 - e2p_x0 }",
"[e2p_iorder1, e2p_x0, e2p_x2] -> { [e2p_x1] : e2p_x1 = e2p_iorder1 - e2p_x0 - e2p_x2 }",
"[e2p_x0] -> { [e2p_iorder2] : e2p_x0 <= e2p_iorder2 <= 10 }",
"[e2p_iorder2, e2p_x0] -> { [e2p_y2] : 0 <= e2p_y2 <= e2p_iorder2 - e2p_x0 }",
"[e2p_iorder2, e2p_x0, e2p_y2] -> { [e2p_y1] : e2p_y1 = e2p_iorder2 - e2p_x0 - e2p_y2 }",
],
# Instruction text. Every statement ends with a brace-enclosed option list
# giving its id, its dependencies (dep=...) and the inames it runs under
# (inames=...). The two "... nop" statements assign nothing and only carry
# an id and dependencies. The statement with id=e2p_update_coeffs is the
# one targeted by duplicate_inames at the end of this script.
'''
kernel_scaling = (1 / 4)*3.141592653589793**(-1) {id=kernel_scaling, inames=+dummy:itgt_box}
tgt_ibox = target_boxes[itgt_box] {id=fetch_init0, inames=dummy:itgt_box}
itgt_start = box_target_starts[tgt_ibox] {id=fetch_init1, dep=fetch_init0, inames=dummy:itgt_box}
itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] {id=fetch_init2, dep=fetch_init0:fetch_init1, inames=dummy:itgt_box}
center[idim] = centers[idim, tgt_ibox] {id=fetch_center, dep=fetch_init0, inames=dummy:itgt_box:idim}
coeffs[icoeff_inner + icoeff_outer*32] = src_expansions[tgt_ibox + (-1)*src_base_ibox, icoeff_inner + icoeff_outer*32] {id=fetch_coeffs, dep=fetch_init0, inames=icoeff_outer:itgt_box:icoeff_inner}
itgt = itgt_start + itgt_offset_inner + itgt_offset_outer*32 {id=insn, dep=fetch_init1, inames=itgt_offset_outer:itgt_offset_inner:itgt_box}
run_itgt = itgt < itgt_end {id=insn_0, dep=fetch_init2:insn, inames=itgt_offset_outer:itgt_offset_inner:itgt_box}
tgt[idim_0] = targets[idim_0, itgt] {id=fetch_tgt, dep=insn:insn_0, inames=itgt_offset_outer:itgt_offset_inner:itgt_box:idim_0}
result_temp[iknl_0] = 0 {id=init_result, dep=insn_0, inames=itgt_offset_outer:itgt_offset_inner:itgt_box:iknl_0}
... nop {id=e2p__start, dep=fetch_coeffs:fetch_tgt:insn_0:init_result:fetch_center:insn, inames=+itgt_offset_outer:itgt_offset_inner:itgt_box}
e2p_b[e2p_idim] = (tgt[e2p_idim] + (-1)*center[e2p_idim])*(1 / rscale) {id=e2p_set_b, dep=e2p__start, inames=itgt_offset_outer:itgt_offset_inner:e2p_idim:itgt_box}
e2p_power_b[e2p_idim, e2p_zero_idx] = 0 {id=e2p_zero_monomials, dep=e2p__start, inames=+itgt_offset_inner:e2p_idim:itgt_offset_outer:e2p_zero_idx:itgt_box}
e2p_power_b[e2p_idim, 0] = 1 {id=e2p_init_monomials, dep=e2p__start:e2p_zero_monomials, inames=+itgt_offset_outer:itgt_offset_inner:e2p_idim:itgt_box}
e2p_power_b[e2p_idim, e2p_iorder0] = e2p_power_b[e2p_idim, e2p_iorder0 + -1]*e2p_b[e2p_idim]*(1 / e2p_iorder0) {id=e2p_update_monomials, dep=e2p_set_b:e2p_init_monomials:e2p__start, inames=+itgt_offset_inner:e2p_idim:itgt_offset_outer:e2p_iorder0:itgt_box}
e2p_coeffs_copy[e2p_icoeff_inner + e2p_icoeff_outer*32] = coeffs[e2p_icoeff_inner + e2p_icoeff_outer*32] {id=e2p_copy_coeffs, dep=e2p__start, inames=+e2p_icoeff_outer:e2p_icoeff_inner:itgt_box:itgt_offset_outer}
e2p_coeffs_copy[((e2p_x0 % 2 + e2p_x1 + e2p_x2)*(e2p_x0 % 2 + e2p_x1 + e2p_x2 + 1)*(e2p_x0 % 2 + e2p_x1 + e2p_x2 + 2)) // 6 + (e2p_x0 % 2 + e2p_x1 + e2p_x2 + 2)*(e2p_x0 % 2) + (-1)*(((e2p_x0 % 2)*(e2p_x0 % 2 + 1)) // 2) + e2p_x1 if e2p_x0 % 2 + e2p_x1 + e2p_x2 < 1 else (2*(e2p_x0 % 2 + e2p_x1 + e2p_x2)*(2 + e2p_x0 % 2 + e2p_x1 + e2p_x2 + -2) + (e2p_x0 % 2)*(3 + (-1)*(e2p_x0 % 2) + 2*(e2p_x0 % 2 + e2p_x1 + e2p_x2))) // 2 + e2p_x1] = e2p_coeffs_copy[(((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2)*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 1)*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 2)) // 6 + ((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 2)*((e2p_x0 + -2) % 2) + (-1)*((((e2p_x0 + -2) % 2)*((e2p_x0 + -2) % 2 + 1)) // 2) + e2p_x1 + 2 if (e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 < 1 else (2*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2)*(2 + (e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + -2) + ((e2p_x0 + -2) % 2)*(3 + (-1)*((e2p_x0 + -2) % 2) + 2*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2))) // 2 + e2p_x1 + 2]*(-1.0) + e2p_coeffs_copy[(((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2)*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 1)*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 2)) // 6 + ((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 2)*((e2p_x0 + -2) % 2) + (-1)*((((e2p_x0 + -2) % 2)*((e2p_x0 + -2) % 2 + 1)) // 2) + e2p_x1 if (e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 < 1 else (2*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2)*(2 + (e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + -2) + ((e2p_x0 + -2) % 2)*(3 + (-1)*((e2p_x0 + -2) % 2) + 2*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2))) // 2 + e2p_x1]*(-1.0) {id=e2p_update_coeffs, dep=e2p__start:e2p_copy_coeffs, inames=+e2p_x2:e2p_iorder1:itgt_offset_outer:e2p_x0:e2p_x1:itgt_box}
result_temp[0] = result_temp[0] + e2p_coeffs_copy[((e2p_x0 % 2 + e2p_y1 + e2p_y2)*(e2p_x0 % 2 + e2p_y1 + e2p_y2 + 1)*(e2p_x0 % 2 + e2p_y1 + e2p_y2 + 2)) // 6 + (e2p_x0 % 2 + e2p_y1 + e2p_y2 + 2)*(e2p_x0 % 2) + (-1)*(((e2p_x0 % 2)*(e2p_x0 % 2 + 1)) // 2) + e2p_y1 if e2p_x0 % 2 + e2p_y1 + e2p_y2 < 1 else (2*(e2p_x0 % 2 + e2p_y1 + e2p_y2)*(2 + e2p_x0 % 2 + e2p_y1 + e2p_y2 + -2) + (e2p_x0 % 2)*(3 + (-1)*(e2p_x0 % 2) + 2*(e2p_x0 % 2 + e2p_y1 + e2p_y2))) // 2 + e2p_y1]*e2p_power_b[0, e2p_x0]*e2p_power_b[1, e2p_y1]*e2p_power_b[2, e2p_y2] {id=e2p_write_0, dep=e2p_update_monomials:e2p_update_coeffs:e2p__start, inames=+itgt_offset_inner:itgt_offset_outer:e2p_iorder2:e2p_x0:e2p_y1:e2p_y2:itgt_box}
... nop {id=update_result, dep=e2p_write_0:e2p_update_monomials:e2p_zero_monomials:e2p_update_coeffs:e2p_set_b:e2p_init_monomials:e2p_copy_coeffs, inames=+itgt_offset_outer:itgt_offset_inner:itgt_box}
result[iknl, itgt] = result_temp[iknl]*kernel_scaling {id=write_result, dep=update_result:insn:insn_0:kernel_scaling, inames=iknl:itgt_offset_inner:itgt_box:itgt_offset_outer}
''', [
# Kernel arguments. Entries declared with dtype=None carry no type here;
# the add_and_infer_dtypes call later in this script names concrete dtypes
# for the global arrays and for rscale.
lp.GlobalArg(
name="targets", dtype=None,
shape=(3, Variable('ntargets')), for_atomic=False),
lp.GlobalArg(
name="box_target_starts", dtype=None,
shape=None, for_atomic=False),
lp.GlobalArg(
name="box_target_counts_nonchild", dtype=None,
shape=None, for_atomic=False),
lp.GlobalArg(
name="centers", dtype=None,
shape=(3, Variable('naligned_boxes')), for_atomic=False),
lp.ValueArg(
name="rscale",
dtype=None),
lp.GlobalArg(
name="result", dtype=None,
shape=(1, Variable('ntargets')), for_atomic=False),
lp.GlobalArg(
name="src_expansions", dtype=None,
shape=(Variable('nsrc_level_boxes'), 121), for_atomic=False),
lp.ValueArg(
name="nsrc_level_boxes",
dtype=np.int32),
lp.ValueArg(
name="naligned_boxes",
dtype=np.int32),
lp.ValueArg(
name="src_base_ibox",
dtype=np.int32),
lp.ValueArg(
name="ntargets",
dtype=np.int32),
lp.ValueArg(
name="ntgt_boxes",
dtype=None),
lp.GlobalArg(
name="target_boxes", dtype=None,
shape=(Variable('ntgt_boxes'),), for_atomic=False),
# Temporaries. All use address_space=lp.auto except coeffs and
# e2p_coeffs_copy, which are explicitly placed in the LOCAL address space.
lp.TemporaryVariable(
name="kernel_scaling",
shape=(), for_atomic=False,
address_space=lp.auto,
read_only=False,
),
lp.TemporaryVariable(
name="tgt_ibox",
shape=(), for_atomic=False,
address_space=lp.auto,
read_only=False,
),
lp.TemporaryVariable(
name="itgt_start",
shape=(), for_atomic=False,
address_space=lp.auto,
read_only=False,
),
lp.TemporaryVariable(
name="itgt_end",
shape=(), for_atomic=False,
address_space=lp.auto,
read_only=False,
),
lp.TemporaryVariable(
name="center",
shape=(3,), for_atomic=False,
address_space=lp.auto,
read_only=False,
),
lp.TemporaryVariable(
name="coeffs",
shape=(121,), for_atomic=False,
address_space=lp.AddressSpace.LOCAL,
read_only=False,
),
lp.TemporaryVariable(
name="itgt",
shape=(), for_atomic=False,
address_space=lp.auto,
read_only=False,
),
lp.TemporaryVariable(
name="run_itgt",
shape=(), for_atomic=False,
address_space=lp.auto,
read_only=False,
),
lp.TemporaryVariable(
name="tgt",
shape=(3,), for_atomic=False,
address_space=lp.auto,
read_only=False,
),
lp.TemporaryVariable(
name="result_temp",
shape=(1,), for_atomic=False,
address_space=lp.auto,
read_only=False,
),
lp.TemporaryVariable(
name="e2p_b",
shape=(3,), for_atomic=False,
address_space=lp.auto,
read_only=False,
),
lp.TemporaryVariable(
name="e2p_power_b",
shape=(3, 11), for_atomic=False,
address_space=lp.auto,
read_only=False,
),
lp.TemporaryVariable(
name="e2p_coeffs_copy",
shape=(121,), for_atomic=False,
address_space=lp.AddressSpace.LOCAL,
read_only=False,
),
],
# Kernel-level options. applied_iname_rewrites records the three
# "x -> x_inner + 32*x_outer" substitutions (itgt_offset, icoeff, e2p_icoeff)
# that correspond to the *_outer/*_inner domains declared above.
lang_version=(2018, 2),
iname_slab_increments=immutables.Map({'itgt_offset_outer': (0, 0), 'e2p_icoeff_outer': (0, 0), 'icoeff_outer': (0, 0)}),
applied_iname_rewrites=({Variable('itgt_offset'): Sum((Variable('itgt_offset_inner'), Product((Variable('itgt_offset_outer'), 32))))}, {Variable('icoeff'): Sum((Variable('icoeff_inner'), Product((Variable('icoeff_outer'), 32))))}, {Variable('e2p_icoeff'): Sum((Variable('e2p_icoeff_inner'), Product((Variable('e2p_icoeff_outer'), 32))))}),
name="e2p_from_single_box",
)
# Attach the iname tags one call at a time, in the listed order:
# "unr" tags, "l.0" tags, and a single "g.0" tag on itgt_box.
for _iname_tag in (
    "idim_0:unr",
    "itgt_offset_inner:l.0",
    "idim:unr",
    "e2p_idim:unr",
    "iknl_0:unr",
    "e2p_iorder1:l.0",
    "e2p_icoeff_inner:l.0",
    "e2p_iorder0:unr",
    "icoeff_inner:l.0",
    "e2p_iorder2:unr",
    "iknl:unr",
    "itgt_box:g.0",
    "dummy:l.0",
):
    e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, _iname_tag)
# Driver for the reproducer: generate code once for the kernel as built,
# then split e2p_x0, duplicate the resulting inner iname inside a single
# instruction, and generate code again.
knl = lp.merge([e2p_from_single_box_knl])

# Concrete dtypes for the arguments that were declared with dtype=None.
arg_dtypes = {
    "targets": np.float64,
    "box_target_starts": np.int32,
    "box_target_counts_nonchild": np.int32,
    "target_boxes": np.int32,
    "centers": np.float64,
    "rscale": np.float64,
    "result": np.float64,
    "src_expansions": np.float64,
}
knl = lp.add_and_infer_dtypes(knl, arg_dtypes)

# Baseline: code generation before any further transformation.
baseline_code = lp.generate_code_v2(knl).device_code()
print(baseline_code)

# Split e2p_x0 by 2, then give the e2p_update_coeffs instruction its own
# copy of the inner iname.
knl = lp.split_iname(knl, "e2p_x0", 2)
knl = lp.duplicate_inames(
    knl,
    "e2p_x0_inner",
    within="id:e2p_update_coeffs",
)

# Per the report at the top of this file, this second code generation is
# where the kernel turns out to be unschedulable.
# NOTE(review): the domains of e2p_iorder1, e2p_x2 and e2p_x1 are
# parametrized by e2p_x0; presumably they still refer to the original
# e2p_x0_inner after the duplication -- TODO confirm whether that is the cause.
transformed_code = lp.generate_code_v2(knl).device_code()
print(transformed_code)