loopy icon indicating copy to clipboard operation
loopy copied to clipboard

[Enhancement]: Parallelizing a very long kernel is slow

Open kaushikcfd opened this issue 4 years ago • 6 comments

import loopy as lp
import vmprof


def make_kernel():
    """Assemble a loopy kernel made of many independent copy statements.

    Instruction ``i`` assigns ``x{i}[idim{i},jdim{i}]`` to
    ``y{i}[idim{i},jdim{i}]``.  Each ``x{i}``/``y{i}`` pair is declared as a
    global argument of shape ``(k1, k2)`` with dtype ``float``, and one
    domain over ``(idim{i}, jdim{i})`` is generated for every index in
    ``range(n_domains)``.

    Returns the object produced by ``lp.make_kernel``.
    """
    n_insn = 2000
    # 'n_domains' is intentionally larger than 'n_insn': per the original
    # author, the domain list also includes the iname domains of sub-array
    # refs.
    n_domains = 6000
    k1, k2 = 100, 10

    # One domain per index, bounding idim{i} by k1 and jdim{i} by k2.
    domains = []
    for i in range(n_domains):
        domains.append(
            f"{{[idim{i},jdim{i}]: 0<=idim{i}<{k1} and 0<=jdim{i}<{k2}}}")

    # One argument declaration naming both x{i} and y{i} per instruction.
    x_args = []
    for i in range(n_insn):
        x_args.append(
            lp.GlobalArg(f"x{i}, y{i}", shape=(k1, k2), dtype=float))

    # All instructions are passed as a single newline-separated string.
    insn_text = "\n".join(
        f"y{i}[idim{i},jdim{i}] = x{i}[idim{i},jdim{i}]"
        for i in range(n_insn))

    return lp.make_kernel(domains, insn_text, x_args,
                          lang_version=(2018, 2))


def parallelize(knl, max_insns=26):
    """Apply the same iname-splitting transformations to successive instructions.

    For instruction index ``i`` the loop ``idim{i}`` is chunked into ``nwg``
    pieces (outer iname tagged ``g.0``), the resulting ``idim{i}_inner`` is
    split by ``nwi[0]`` (inner iname tagged ``l.1``), and ``jdim{i}`` is split
    by ``nwi[1]`` (inner iname tagged ``l.0``).

    :arg knl: the kernel to transform; it must expose ``instructions`` and
        contain inames ``idim{i}``/``jdim{i}`` for every processed index.
    :arg max_insns: maximum number of instructions to transform before
        giving up (each one is slow, which is the point of this reproducer).
        Pass *None* to transform every instruction.
    :returns: the transformed kernel.
    """
    nwg = 48
    nwi = (16, 2)

    # The transformations below rebind 'knl', so take the count up front.
    n_total = len(knl.instructions)

    # parallelize each instruction
    for i in range(n_total):
        if max_insns is not None and i >= max_insns:
            # 'i' instructions (indices 0..i-1) have been transformed so far.
            print(f"I quit after parallelizing {i} instructions.")
            break

        bigger_loop = f"idim{i}"
        smaller_loop = f"jdim{i}"
        knl = lp.chunk_iname(knl, bigger_loop, nwg,
                             outer_tag="g.0",
                             within=f"iname:{bigger_loop}")
        knl = lp.split_iname(knl, f"{bigger_loop}_inner",
                             nwi[0], inner_tag="l.1",
                             within=f"iname:{bigger_loop}_inner")
        knl = lp.split_iname(knl, smaller_loop,
                             nwi[1], inner_tag="l.0",
                             within=f"iname:{smaller_loop}")

        print(f"Done transforming insn i={i}....")

    # Hand the result back; previously the transformed kernel was discarded.
    return knl


if __name__ == "__main__":
    knl = make_kernel()

    # Profile only the transformation phase; vmprof is handed the file
    # descriptor of "test.prof".
    with open("test.prof", "w+b") as f:
        vmprof.enable(f.fileno())
        try:
            parallelize(knl)
        finally:
            # Always stop the profiler, even if a transformation raises,
            # so it is not left enabled when the file gets closed.
            vmprof.disable()

/cc @inducer @isuruf

kaushikcfd avatar Apr 13 '21 18:04 kaushikcfd

Takes roughly 8 seconds to process each instruction on a Xeon 2680v2. vmprof profile:

100.0% parallelize  100.0%  /home/kgk2/temp/reproduce_big_loopy_program.py:27
 38.9%|. chunk_iname  38.9%  loopy/loopy/transform/iname.py:367
  4.3%|.|. wrapper  11.1%  pytools/__init__.py:690
  4.3%|.|.|. get_iname_bounds  100.0%  loopy/loopy/kernel/__init__.py:1043
  4.3%|.|.|.|. get_inames_domain  99.2%  loopy/loopy/kernel/__init__.py:690
  3.9%|.|. wrapper  9.9%  islpy/__init__.py:876
  3.8%|.|.|. obj_get_var_dict  98.6%  islpy/__init__.py:596
  3.4%|.|.|.|. wrapper  90.4%  islpy/__init__.py:876
  0.2%|.|.|.|. <native symbol 0x53bf41>  4.5%  -:0
  0.1%|.|.|.|. <native symbol 0x529851>  3.1%  -:0
  0.1%|.|. _PyFunction_Vectorcall  0.1%  -:0
 30.7%|.|. _split_iname_backend  78.7%  loopy/loopy/transform/iname.py:211
  1.3%|.|.|. wrapper  4.3%  pytools/__init__.py:690
  1.3%|.|.|.|. all_inames  100.0%  loopy/loopy/kernel/__init__.py:795
  1.4%|.|.|. get_var_name_generator  4.7%  loopy/loopy/kernel/__init__.py:509
  0.1%|.|.|.|. __init__  7.1%  loopy/loopy/kernel/__init__.py:51
  1.3%|.|.|.|. wrapper  92.9%  pytools/__init__.py:690
  3.9%|.|.|. <listcomp>  12.8%  loopy/loopy/transform/iname.py:253
  3.9%|.|.|.|. _split_iname_in_set  98.9%  loopy/loopy/transform/iname.py:160
  2.6%|.|.|. copy  8.4%  loopy/loopy/kernel/__init__.py:1614
  2.6%|.|.|.|. copy  100.0%  pytools/__init__.py:371
  8.3%|.|.|. map_kernel  27.2%  loopy/loopy/symbolic.py:1034
  0.9%|.|.|.|. <listcomp>  10.4%  loopy/loopy/symbolic.py:1035
  4.8%|.|.|.|. <listcomp>  57.1%  loopy/loopy/symbolic.py:1053
  2.7%|.|.|.|. copy  32.5%  loopy/loopy/kernel/__init__.py:1614
  2.5%|.|.|. finish_kernel  8.3%  loopy/loopy/symbolic.py:941
  2.5%|.|.|.|. copy  100.0%  loopy/loopy/kernel/__init__.py:1614
  3.8%|.|.|. tag_inames  12.5%  loopy/loopy/transform/iname.py:657
  1.3%|.|.|.|. wrapper  33.7%  pytools/__init__.py:690
  2.5%|.|.|.|. copy  66.0%  loopy/loopy/kernel/__init__.py:1614
  6.6%|.|.|. remove_unused_inames  21.5%  loopy/loopy/transform/iname.py:1173
  0.1%|.|.|.|. get_used_inames  0.8%  loopy/loopy/transform/iname.py:1160
  3.9%|.|.|.|. wrapper  58.9%  islpy/__init__.py:876
  2.5%|.|.|.|. copy  38.4%  loopy/loopy/kernel/__init__.py:1614
 61.0%|. split_iname  61.0%  loopy/loopy/transform/iname.py:325
 60.9%|.|. _split_iname_backend  100.0%  loopy/loopy/transform/iname.py:211
  2.6%|.|.|. wrapper  4.3%  pytools/__init__.py:690
  2.6%|.|.|.|. all_inames  100.0%  loopy/loopy/kernel/__init__.py:795
  2.9%|.|.|. get_var_name_generator  4.7%  loopy/loopy/kernel/__init__.py:509
  0.2%|.|.|.|. __init__  7.7%  loopy/loopy/kernel/__init__.py:51
  2.7%|.|.|.|. wrapper  92.3%  pytools/__init__.py:690
  7.8%|.|.|. <listcomp>  12.9%  loopy/loopy/transform/iname.py:253
  7.8%|.|.|.|. _split_iname_in_set  99.3%  loopy/loopy/transform/iname.py:160
  5.2%|.|.|. copy  8.6%  loopy/loopy/kernel/__init__.py:1614
  5.2%|.|.|.|. copy  100.0%  pytools/__init__.py:371
 16.4%|.|.|. map_kernel  26.9%  loopy/loopy/symbolic.py:1034
  1.7%|.|.|.|. <listcomp>  10.6%  loopy/loopy/symbolic.py:1035
  9.5%|.|.|.|. <listcomp>  57.9%  loopy/loopy/symbolic.py:1053
  5.2%|.|.|.|. copy  31.5%  loopy/loopy/kernel/__init__.py:1614
  5.1%|.|.|. finish_kernel  8.3%  loopy/loopy/symbolic.py:941
  5.1%|.|.|.|. copy  100.0%  loopy/loopy/kernel/__init__.py:1614
  7.7%|.|.|. tag_inames  12.7%  loopy/loopy/transform/iname.py:657
  2.7%|.|.|.|. wrapper  34.7%  pytools/__init__.py:690
  5.1%|.|.|.|. copy  65.2%  loopy/loopy/kernel/__init__.py:1614
 13.1%|.|.|. remove_unused_inames  21.6%  loopy/loopy/transform/iname.py:1173
  0.1%|.|.|.|. get_used_inames  0.6%  loopy/loopy/transform/iname.py:1160
  7.7%|.|.|.|. wrapper  58.9%  islpy/__init__.py:876
  5.1%|.|.|.|. copy  38.6%  loopy/loopy/kernel/__init__.py:1614
  0.1%|.|.|.|. _PyFunction_Vectorcall  0.6%  -:0

kaushikcfd avatar Apr 13 '21 18:04 kaushikcfd

Kernel copying takes the longest time. I'm not sure why copying takes that amount of time.

isuruf avatar Apr 13 '21 19:04 isuruf

@isuruf: Yep, `LoopKernel` has a very large list of domains, and we traverse it on every call to `LoopKernel.__init__`.

Rough breakdown of costs in terms of disjoint high level operations:

Routine name Time spent Possible solution
LoopKernel.__init__ 38% Better way of representing domains.
InameSplitter.map_kernel 15% Traverses expressions in args, unnecessary
_split_iname_in_set 12% Better way of representing domains.
islpy.obj_get_var_dict or islpy.get_var_names 16% cache them in islpy
LoopKernel.get_var_name_generator 5% ??
LoopKernel.all_variable_names ??

kaushikcfd avatar Apr 13 '21 19:04 kaushikcfd

I don't even understand why LoopKernel.all_inames() doesn't return frozenset(self.inames.keys()).

isuruf avatar Apr 13 '21 20:04 isuruf

@inducer, what are your thoughts on memoizing get_var_dict and get_var_names in islpy?

isuruf avatar Apr 13 '21 21:04 isuruf

> @inducer, what are your thoughts on memoizing get_var_dict and get_var_names in islpy?

If it helps, go for it. You'll probably need to add a pytools dep.

inducer avatar Apr 13 '21 23:04 inducer