loopy
loopy copied to clipboard
[Enhancement]: Parallelizing a very long kernel is slow
import loopy as lp
import vmprof
def make_kernel():
    """Build the large loopy kernel used to reproduce the slow transformation.

    The kernel consists of 2000 independent copy instructions of the form
    ``y<i>[idim<i>,jdim<i>] = x<i>[idim<i>,jdim<i>]``, each with its own pair
    of inames, plus 6000 two-dimensional domains (extents 100 x 10).

    :returns: whatever :func:`loopy.make_kernel` returns for this input.
    """
    num_insns = 2000
    # The number of domains must exceed the number of instructions, since
    # (per the original report) the domain list also includes the iname
    # domains of sub-array refs.
    num_domains = 6000
    extent_i = 100
    extent_j = 10

    # One copy statement and one (x, y) argument declaration per instruction.
    statements = []
    array_args = []
    for n in range(num_insns):
        statements.append(f"y{n}[idim{n},jdim{n}] = x{n}[idim{n},jdim{n}]")
        array_args.append(
            lp.GlobalArg(f"x{n}, y{n}", shape=(extent_i, extent_j), dtype=float))

    # One rectangular domain per (idim, jdim) iname pair.
    iname_domains = []
    for n in range(num_domains):
        iname_domains.append(
            f"{{[idim{n},jdim{n}]: "
            f"0<=idim{n}<{extent_i} and 0<=jdim{n}<{extent_j}}}")

    program_text = "\n".join(statements)
    return lp.make_kernel(
        iname_domains, program_text, array_args, lang_version=(2018, 2))
def parallelize(knl):
    """Apply iname chunking/splitting to the leading instructions of *knl*.

    For each instruction index ``i`` the loop ``idim<i>`` is chunked into 48
    pieces (outer iname tagged ``"g.0"``), the resulting ``idim<i>_inner`` is
    split by 16 (inner iname tagged ``"l.1"``), and ``jdim<i>`` is split by 2
    (inner iname tagged ``"l.0"``).  Processing stops once instruction index
    25 has been transformed, because each step is slow on large kernels.

    :arg knl: the kernel to transform; its instructions are expected to use
        inames named ``idim<i>`` / ``jdim<i>``.
    :returns: the transformed kernel.  (Previously the result was discarded
        and ``None`` was returned.)
    """
    nwg = 48
    nwi = (16, 2)

    # The range is evaluated once, up front; rebinding 'knl' inside the loop
    # does not affect the number of iterations.
    for i in range(len(knl.instructions)):
        bigger_loop = f"idim{i}"
        smaller_loop = f"jdim{i}"

        knl = lp.chunk_iname(knl, bigger_loop, nwg,
                             outer_tag="g.0",
                             within=f"iname:{bigger_loop}")
        knl = lp.split_iname(knl, f"{bigger_loop}_inner",
                             nwi[0], inner_tag="l.1",
                             within=f"iname:{bigger_loop}_inner")
        knl = lp.split_iname(knl, smaller_loop,
                             nwi[1], inner_tag="l.0",
                             within=f"iname:{smaller_loop}")

        if i == 25:
            # Indices 0..25 have been transformed at this point, i.e. i + 1
            # instructions; report the actual count.
            print(f"I quit after parallelizing {i + 1} instructions.")
            break

        print(f"Done transforming insn i={i}....")

    # Hand the transformed kernel back so the work is not thrown away.
    return knl
if __name__ == "__main__":
    # Kernel construction happens outside the profiled region, so the
    # profile only covers the transformations.
    knl = make_kernel()

    with open("test.prof", "w+b") as f:
        vmprof.enable(f.fileno())
        try:
            parallelize(knl)
        finally:
            # Always stop the profiler, even if a transformation raises, so it
            # is not left enabled while the output file is being closed.
            vmprof.disable()
/cc @inducer @isuruf
Takes roughly 8 seconds to process each instruction on a Xeon 2680v2. vmprof profile:
100.0% parallelize 100.0% /home/kgk2/temp/reproduce_big_loopy_program.py:27
38.9%|. chunk_iname 38.9% loopy/loopy/transform/iname.py:367
4.3%|.|. wrapper 11.1% pytools/__init__.py:690
4.3%|.|.|. get_iname_bounds 100.0% loopy/loopy/kernel/__init__.py:1043
4.3%|.|.|.|. get_inames_domain 99.2% loopy/loopy/kernel/__init__.py:690
3.9%|.|. wrapper 9.9% islpy/__init__.py:876
3.8%|.|.|. obj_get_var_dict 98.6% islpy/__init__.py:596
3.4%|.|.|.|. wrapper 90.4% islpy/__init__.py:876
0.2%|.|.|.|. <native symbol 0x53bf41> 4.5% -:0
0.1%|.|.|.|. <native symbol 0x529851> 3.1% -:0
0.1%|.|. _PyFunction_Vectorcall 0.1% -:0
30.7%|.|. _split_iname_backend 78.7% loopy/loopy/transform/iname.py:211
1.3%|.|.|. wrapper 4.3% pytools/__init__.py:690
1.3%|.|.|.|. all_inames 100.0% loopy/loopy/kernel/__init__.py:795
1.4%|.|.|. get_var_name_generator 4.7% loopy/loopy/kernel/__init__.py:509
0.1%|.|.|.|. __init__ 7.1% loopy/loopy/kernel/__init__.py:51
1.3%|.|.|.|. wrapper 92.9% pytools/__init__.py:690
3.9%|.|.|. <listcomp> 12.8% loopy/loopy/transform/iname.py:253
3.9%|.|.|.|. _split_iname_in_set 98.9% loopy/loopy/transform/iname.py:160
2.6%|.|.|. copy 8.4% loopy/loopy/kernel/__init__.py:1614
2.6%|.|.|.|. copy 100.0% pytools/__init__.py:371
8.3%|.|.|. map_kernel 27.2% loopy/loopy/symbolic.py:1034
0.9%|.|.|.|. <listcomp> 10.4% loopy/loopy/symbolic.py:1035
4.8%|.|.|.|. <listcomp> 57.1% loopy/loopy/symbolic.py:1053
2.7%|.|.|.|. copy 32.5% loopy/loopy/kernel/__init__.py:1614
2.5%|.|.|. finish_kernel 8.3% loopy/loopy/symbolic.py:941
2.5%|.|.|.|. copy 100.0% loopy/loopy/kernel/__init__.py:1614
3.8%|.|.|. tag_inames 12.5% loopy/loopy/transform/iname.py:657
1.3%|.|.|.|. wrapper 33.7% pytools/__init__.py:690
2.5%|.|.|.|. copy 66.0% loopy/loopy/kernel/__init__.py:1614
6.6%|.|.|. remove_unused_inames 21.5% loopy/loopy/transform/iname.py:1173
0.1%|.|.|.|. get_used_inames 0.8% loopy/loopy/transform/iname.py:1160
3.9%|.|.|.|. wrapper 58.9% islpy/__init__.py:876
2.5%|.|.|.|. copy 38.4% loopy/loopy/kernel/__init__.py:1614
61.0%|. split_iname 61.0% loopy/loopy/transform/iname.py:325
60.9%|.|. _split_iname_backend 100.0% loopy/loopy/transform/iname.py:211
2.6%|.|.|. wrapper 4.3% pytools/__init__.py:690
2.6%|.|.|.|. all_inames 100.0% loopy/loopy/kernel/__init__.py:795
2.9%|.|.|. get_var_name_generator 4.7% loopy/loopy/kernel/__init__.py:509
0.2%|.|.|.|. __init__ 7.7% loopy/loopy/kernel/__init__.py:51
2.7%|.|.|.|. wrapper 92.3% pytools/__init__.py:690
7.8%|.|.|. <listcomp> 12.9% loopy/loopy/transform/iname.py:253
7.8%|.|.|.|. _split_iname_in_set 99.3% loopy/loopy/transform/iname.py:160
5.2%|.|.|. copy 8.6% loopy/loopy/kernel/__init__.py:1614
5.2%|.|.|.|. copy 100.0% pytools/__init__.py:371
16.4%|.|.|. map_kernel 26.9% loopy/loopy/symbolic.py:1034
1.7%|.|.|.|. <listcomp> 10.6% loopy/loopy/symbolic.py:1035
9.5%|.|.|.|. <listcomp> 57.9% loopy/loopy/symbolic.py:1053
5.2%|.|.|.|. copy 31.5% loopy/loopy/kernel/__init__.py:1614
5.1%|.|.|. finish_kernel 8.3% loopy/loopy/symbolic.py:941
5.1%|.|.|.|. copy 100.0% loopy/loopy/kernel/__init__.py:1614
7.7%|.|.|. tag_inames 12.7% loopy/loopy/transform/iname.py:657
2.7%|.|.|.|. wrapper 34.7% pytools/__init__.py:690
5.1%|.|.|.|. copy 65.2% loopy/loopy/kernel/__init__.py:1614
13.1%|.|.|. remove_unused_inames 21.6% loopy/loopy/transform/iname.py:1173
0.1%|.|.|.|. get_used_inames 0.6% loopy/loopy/transform/iname.py:1160
7.7%|.|.|.|. wrapper 58.9% islpy/__init__.py:876
5.1%|.|.|.|. copy 38.6% loopy/loopy/kernel/__init__.py:1614
0.1%|.|.|.|. _PyFunction_Vectorcall 0.6% -:0
Kernel copying takes the longest time. I'm not sure why copying takes that amount of time.
@isuruf: Yep, LoopKernel has a very large domains list, and we traverse it in every call to LoopKernel.__init__.
Rough breakdown of costs in terms of disjoint high level operations:
| Routine name | Time spent | Possible solution |
|---|---|---|
| LoopKernel.__init__ | 38% | Better way of representing domains. |
| InameSplitter.map_kernel | 15% | Traverses expressions in args, unnecessary |
| _split_iname_in_set | 12% | Better way of representing domains. |
| islpy.obj_get_var_dict or islpy.get_var_names | 16% | cache them in islpy |
| LoopKernel.get_var_name_generator | 5% | ?? |
| LoopKernel.all_variable_names | ?? | |
I don't even understand why LoopKernel.all_inames() doesn't return frozenset(self.inames.keys()).
@inducer, what are your thoughts on memoizing get_var_dict and get_var_names in islpy?
> @inducer, what are your thoughts on memoizing get_var_dict and get_var_names in islpy?
If it helps, go for it. You'll probably need to add a pytools dep.