loopy icon indicating copy to clipboard operation
loopy copied to clipboard

Duplicating an iname results in an unschedulable kernel

Open isuruf opened this issue 1 year ago • 2 comments

I'm not sure why it becomes unschedulable.

import loopy as lp
import numpy as np
from pymbolic.primitives import *
import immutables

# Kernel under test for this reproducer. lp.make_kernel receives, in order:
# a list of loop-domain strings, one multi-line instruction string, and a list
# of argument / temporary-variable declarations, followed by keyword options.
e2p_from_single_box_knl = lp.make_kernel(
    [
    # Loop domains, one string per (group of) iname(s).
    # Points worth noticing when reading the instructions below:
    #   * the e2p_zero_idx domain carries the constraint "1 = 0", which no
    #     point satisfies, so that domain is empty;
    #   * e2p_x0 is used as a parameter by the e2p_iorder1 / e2p_x2 / e2p_x1
    #     domains and by the e2p_iorder2 / e2p_y2 / e2p_y1 domains;
    #   * e2p_x1 and e2p_y1 are each fixed by an equality in terms of the
    #     other inames of their group.
    # e2p_x0 is the iname that the script later splits and duplicates.
    "[ntgt_boxes] -> { [itgt_box] : 0 <= itgt_box < ntgt_boxes }",
    "{ [idim, idim_0] : 0 <= idim <= 2 and 0 <= idim_0 <= 2 }",
    "{ [itgt_offset_outer, itgt_offset_inner] : itgt_offset_inner >= 0 and -32itgt_offset_outer <= itgt_offset_inner <= 46 - 32itgt_offset_outer and itgt_offset_inner <= 31 }",
    "{ [icoeff_outer, icoeff_inner] : icoeff_inner >= 0 and -32icoeff_outer <= icoeff_inner <= 120 - 32icoeff_outer and icoeff_inner <= 31 }",
    "{ [iknl, iknl_0] : iknl = 0 and iknl_0 = 0 }",
    "{ [dummy] : 0 <= dummy <= 31 }",
    "[ntargets] -> { [] : ntargets > 0 }",
    "{ [e2p_idim] : 0 <= e2p_idim <= 2 }",
    "{ [e2p_iorder0] : 0 < e2p_iorder0 <= 10 }",
    "{ [e2p_zero_idx] : 1 = 0 }",
    "{ [e2p_icoeff_outer, e2p_icoeff_inner] : e2p_icoeff_inner >= 0 and -32e2p_icoeff_outer <= e2p_icoeff_inner <= 120 - 32e2p_icoeff_outer and e2p_icoeff_inner <= 31 }",
    "{ [e2p_x0] : 0 <= e2p_x0 <= 10 }",
    "[e2p_x0] -> { [e2p_iorder1] : e2p_x0 <= e2p_iorder1 <= 10 }",
    "[e2p_iorder1, e2p_x0] -> { [e2p_x2] : 0 <= e2p_x2 <= e2p_iorder1 - e2p_x0 }",
    "[e2p_iorder1, e2p_x0, e2p_x2] -> { [e2p_x1] : e2p_x1 = e2p_iorder1 - e2p_x0 - e2p_x2 }",
    "[e2p_x0] -> { [e2p_iorder2] : e2p_x0 <= e2p_iorder2 <= 10 }",
    "[e2p_iorder2, e2p_x0] -> { [e2p_y2] : 0 <= e2p_y2 <= e2p_iorder2 - e2p_x0 }",
    "[e2p_iorder2, e2p_x0, e2p_y2] -> { [e2p_y1] : e2p_y1 = e2p_iorder2 - e2p_x0 - e2p_y2 }",
    ],
    # Instruction block. Each instruction ends with an option group in braces
    # giving its id, its dependencies (dep=) and its inames (inames=).
    # e2p_x0 is listed in the inames of both e2p_update_coeffs and e2p_write_0,
    # and e2p_write_0 depends on e2p_update_coeffs.
    '''

    kernel_scaling = (1 / 4)*3.141592653589793**(-1) {id=kernel_scaling, inames=+dummy:itgt_box}
    tgt_ibox = target_boxes[itgt_box] {id=fetch_init0, inames=dummy:itgt_box}
    itgt_start = box_target_starts[tgt_ibox] {id=fetch_init1, dep=fetch_init0, inames=dummy:itgt_box}
    itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] {id=fetch_init2, dep=fetch_init0:fetch_init1, inames=dummy:itgt_box}
    center[idim] = centers[idim, tgt_ibox] {id=fetch_center, dep=fetch_init0, inames=dummy:itgt_box:idim}
    coeffs[icoeff_inner + icoeff_outer*32] = src_expansions[tgt_ibox + (-1)*src_base_ibox, icoeff_inner + icoeff_outer*32] {id=fetch_coeffs, dep=fetch_init0, inames=icoeff_outer:itgt_box:icoeff_inner}
    itgt = itgt_start + itgt_offset_inner + itgt_offset_outer*32 {id=insn, dep=fetch_init1, inames=itgt_offset_outer:itgt_offset_inner:itgt_box}
    run_itgt = itgt < itgt_end {id=insn_0, dep=fetch_init2:insn, inames=itgt_offset_outer:itgt_offset_inner:itgt_box}
    tgt[idim_0] = targets[idim_0, itgt] {id=fetch_tgt, dep=insn:insn_0, inames=itgt_offset_outer:itgt_offset_inner:itgt_box:idim_0}
    result_temp[iknl_0] = 0 {id=init_result, dep=insn_0, inames=itgt_offset_outer:itgt_offset_inner:itgt_box:iknl_0}
    ... nop {id=e2p__start, dep=fetch_coeffs:fetch_tgt:insn_0:init_result:fetch_center:insn, inames=+itgt_offset_outer:itgt_offset_inner:itgt_box}
    e2p_b[e2p_idim] = (tgt[e2p_idim] + (-1)*center[e2p_idim])*(1 / rscale) {id=e2p_set_b, dep=e2p__start, inames=itgt_offset_outer:itgt_offset_inner:e2p_idim:itgt_box}
    e2p_power_b[e2p_idim, e2p_zero_idx] = 0 {id=e2p_zero_monomials, dep=e2p__start, inames=+itgt_offset_inner:e2p_idim:itgt_offset_outer:e2p_zero_idx:itgt_box}
    e2p_power_b[e2p_idim, 0] = 1 {id=e2p_init_monomials, dep=e2p__start:e2p_zero_monomials, inames=+itgt_offset_outer:itgt_offset_inner:e2p_idim:itgt_box}
    e2p_power_b[e2p_idim, e2p_iorder0] = e2p_power_b[e2p_idim, e2p_iorder0 + -1]*e2p_b[e2p_idim]*(1 / e2p_iorder0) {id=e2p_update_monomials, dep=e2p_set_b:e2p_init_monomials:e2p__start, inames=+itgt_offset_inner:e2p_idim:itgt_offset_outer:e2p_iorder0:itgt_box}
    e2p_coeffs_copy[e2p_icoeff_inner + e2p_icoeff_outer*32] = coeffs[e2p_icoeff_inner + e2p_icoeff_outer*32] {id=e2p_copy_coeffs, dep=e2p__start, inames=+e2p_icoeff_outer:e2p_icoeff_inner:itgt_box:itgt_offset_outer}
    e2p_coeffs_copy[((e2p_x0 % 2 + e2p_x1 + e2p_x2)*(e2p_x0 % 2 + e2p_x1 + e2p_x2 + 1)*(e2p_x0 % 2 + e2p_x1 + e2p_x2 + 2)) // 6 + (e2p_x0 % 2 + e2p_x1 + e2p_x2 + 2)*(e2p_x0 % 2) + (-1)*(((e2p_x0 % 2)*(e2p_x0 % 2 + 1)) // 2) + e2p_x1 if e2p_x0 % 2 + e2p_x1 + e2p_x2 < 1 else (2*(e2p_x0 % 2 + e2p_x1 + e2p_x2)*(2 + e2p_x0 % 2 + e2p_x1 + e2p_x2 + -2) + (e2p_x0 % 2)*(3 + (-1)*(e2p_x0 % 2) + 2*(e2p_x0 % 2 + e2p_x1 + e2p_x2))) // 2 + e2p_x1] = e2p_coeffs_copy[(((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2)*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 1)*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 2)) // 6 + ((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 2)*((e2p_x0 + -2) % 2) + (-1)*((((e2p_x0 + -2) % 2)*((e2p_x0 + -2) % 2 + 1)) // 2) + e2p_x1 + 2 if (e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 < 1 else (2*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2)*(2 + (e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + -2) + ((e2p_x0 + -2) % 2)*(3 + (-1)*((e2p_x0 + -2) % 2) + 2*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2))) // 2 + e2p_x1 + 2]*(-1.0) + e2p_coeffs_copy[(((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2)*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 1)*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 2)) // 6 + ((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 2)*((e2p_x0 + -2) % 2) + (-1)*((((e2p_x0 + -2) % 2)*((e2p_x0 + -2) % 2 + 1)) // 2) + e2p_x1 if (e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 < 1 else (2*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2)*(2 + (e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + -2) + ((e2p_x0 + -2) % 2)*(3 + (-1)*((e2p_x0 + -2) % 2) + 2*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2))) // 2 + e2p_x1]*(-1.0) {id=e2p_update_coeffs, dep=e2p__start:e2p_copy_coeffs, inames=+e2p_x2:e2p_iorder1:itgt_offset_outer:e2p_x0:e2p_x1:itgt_box}
    result_temp[0] = result_temp[0] + e2p_coeffs_copy[((e2p_x0 % 2 + e2p_y1 + e2p_y2)*(e2p_x0 % 2 + e2p_y1 + e2p_y2 + 1)*(e2p_x0 % 2 + e2p_y1 + e2p_y2 + 2)) // 6 + (e2p_x0 % 2 + e2p_y1 + e2p_y2 + 2)*(e2p_x0 % 2) + (-1)*(((e2p_x0 % 2)*(e2p_x0 % 2 + 1)) // 2) + e2p_y1 if e2p_x0 % 2 + e2p_y1 + e2p_y2 < 1 else (2*(e2p_x0 % 2 + e2p_y1 + e2p_y2)*(2 + e2p_x0 % 2 + e2p_y1 + e2p_y2 + -2) + (e2p_x0 % 2)*(3 + (-1)*(e2p_x0 % 2) + 2*(e2p_x0 % 2 + e2p_y1 + e2p_y2))) // 2 + e2p_y1]*e2p_power_b[0, e2p_x0]*e2p_power_b[1, e2p_y1]*e2p_power_b[2, e2p_y2] {id=e2p_write_0, dep=e2p_update_monomials:e2p_update_coeffs:e2p__start, inames=+itgt_offset_inner:itgt_offset_outer:e2p_iorder2:e2p_x0:e2p_y1:e2p_y2:itgt_box}
    ... nop {id=update_result, dep=e2p_write_0:e2p_update_monomials:e2p_zero_monomials:e2p_update_coeffs:e2p_set_b:e2p_init_monomials:e2p_copy_coeffs, inames=+itgt_offset_outer:itgt_offset_inner:itgt_box}
    result[iknl, itgt] = result_temp[iknl]*kernel_scaling {id=write_result, dep=update_result:insn:insn_0:kernel_scaling, inames=iknl:itgt_offset_inner:itgt_box:itgt_offset_outer}
    ''', [
        # Kernel arguments. Several dtypes are left as None here; the script
        # passes dtypes for most of them to lp.add_and_infer_dtypes further
        # down (ntgt_boxes is not in that mapping -- presumably inferred).
        lp.GlobalArg(
            name="targets", dtype=None,
            shape=(3, Variable('ntargets')), for_atomic=False),
        lp.GlobalArg(
            name="box_target_starts", dtype=None,
            shape=None, for_atomic=False),
        lp.GlobalArg(
            name="box_target_counts_nonchild", dtype=None,
            shape=None, for_atomic=False),
        lp.GlobalArg(
            name="centers", dtype=None,
            shape=(3, Variable('naligned_boxes')), for_atomic=False),
        lp.ValueArg(
            name="rscale",
            dtype=None),
        lp.GlobalArg(
            name="result", dtype=None,
            shape=(1, Variable('ntargets')), for_atomic=False),
        lp.GlobalArg(
            name="src_expansions", dtype=None,
            shape=(Variable('nsrc_level_boxes'), 121), for_atomic=False),
        lp.ValueArg(
            name="nsrc_level_boxes",
            dtype=np.int32),
        lp.ValueArg(
            name="naligned_boxes",
            dtype=np.int32),
        lp.ValueArg(
            name="src_base_ibox",
            dtype=np.int32),
        lp.ValueArg(
            name="ntargets",
            dtype=np.int32),
        lp.ValueArg(
            name="ntgt_boxes",
            dtype=None),
        lp.GlobalArg(
            name="target_boxes", dtype=None,
            shape=(Variable('ntgt_boxes'),), for_atomic=False),
        # Temporaries. All use address_space=lp.auto except coeffs and
        # e2p_coeffs_copy, which are explicitly placed in
        # lp.AddressSpace.LOCAL.
        lp.TemporaryVariable(
            name="kernel_scaling",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="tgt_ibox",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="itgt_start",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="itgt_end",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="center",
            shape=(3,), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="coeffs",
            shape=(121,), for_atomic=False,
            address_space=lp.AddressSpace.LOCAL,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="itgt",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="run_itgt",
            shape=(), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="tgt",
            shape=(3,), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="result_temp",
            shape=(1,), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="e2p_b",
            shape=(3,), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="e2p_power_b",
            shape=(3, 11), for_atomic=False,
            address_space=lp.auto,
            read_only=False,
            ),
        lp.TemporaryVariable(
            name="e2p_coeffs_copy",
            shape=(121,), for_atomic=False,
            address_space=lp.AddressSpace.LOCAL,
            read_only=False,
            ),
        ],
        # Keyword options. applied_iname_rewrites records three rewrites of
        # the form  X -> X_inner + X_outer*32  for itgt_offset, icoeff and
        # e2p_icoeff; iname_slab_increments lists the three *_outer inames.
        lang_version=(2018, 2),
        iname_slab_increments=immutables.Map({'itgt_offset_outer': (0, 0), 'e2p_icoeff_outer': (0, 0), 'icoeff_outer': (0, 0)}),
        applied_iname_rewrites=({Variable('itgt_offset'): Sum((Variable('itgt_offset_inner'), Product((Variable('itgt_offset_outer'), 32))))}, {Variable('icoeff'): Sum((Variable('icoeff_inner'), Product((Variable('icoeff_outer'), 32))))}, {Variable('e2p_icoeff'): Sum((Variable('e2p_icoeff_inner'), Product((Variable('e2p_icoeff_outer'), 32))))}),
        name="e2p_from_single_box",
        )

# Apply the iname tags one "iname:tag" spec at a time, in the listed order,
# then pass the tagged kernel (as a one-element list) to lp.merge.
for _iname_tag_spec in (
        "idim_0:unr",
        "itgt_offset_inner:l.0",
        "idim:unr",
        "e2p_idim:unr",
        "iknl_0:unr",
        "e2p_iorder1:l.0",
        "e2p_icoeff_inner:l.0",
        "e2p_iorder0:unr",
        "icoeff_inner:l.0",
        "e2p_iorder2:unr",
        "iknl:unr",
        "itgt_box:g.0",
        "dummy:l.0",
        ):
    e2p_from_single_box_knl = lp.tag_inames(
        e2p_from_single_box_knl, _iname_tag_spec)

knl = lp.merge([e2p_from_single_box_knl])
            
# Dtypes handed to lp.add_and_infer_dtypes, keyed by argument name.
_arg_dtypes = {
    "targets": np.float64,
    "box_target_starts": np.int32,
    "box_target_counts_nonchild": np.int32,
    "target_boxes": np.int32,
    "centers": np.float64,
    "rscale": np.float64,
    "result": np.float64,
    "src_expansions": np.float64,
}
knl = lp.add_and_infer_dtypes(knl, _arg_dtypes)

# First code generation: the kernel before any further transformation.
_code_before = lp.generate_code_v2(knl)
print(_code_before.device_code())

# Split e2p_x0 with factor 2, then duplicate the resulting e2p_x0_inner iname
# only within the instruction whose id is e2p_update_coeffs.
knl = lp.split_iname(
    knl,
    "e2p_x0",
    2,
)
knl = lp.duplicate_inames(
    knl,
    "e2p_x0_inner",
    within="id:e2p_update_coeffs",
)

# Second code generation, on the transformed kernel.
_code_after = lp.generate_code_v2(knl)
print(_code_after.device_code())

isuruf avatar Feb 12 '23 17:02 isuruf