mlir-aie
mlir-aie copied to clipboard
[experiment] Support unplaced TileOps
This branch is an experiment to see what it takes to support an unplaced form of the aie dialect.
It is based on a simple extension to the aie.tile op that accepts ? as the column or row operand, meaning that the column or row has not been physically placed yet:
// unplaced tile
%tile_c_r = aie.tile(?, ?)
// unplaced shim
%shim_noc_tile_c_0 = aie.tile(?, 0)
// unplaced memtile
%mem_tile_c_0 = aie.tile(?, 1)
To test this, I added a "null placer" to IRON's placers.py:
class NullPlacer(Placer):
    """Placer that performs no physical placement.

    Every worker or ObjectFifo endpoint that still requests a generic tile
    (AnyComputeTile, AnyMemTile or AnyShimTile) is given a placeholder Tile
    in which -1 marks a coordinate as "not placed"; these placeholders are
    what show up as ``?`` operands in the emitted ``aie.tile`` ops. Only the
    kind of tile is pinned down, through the second coordinate: 0 for shim
    tiles, 1 for mem tiles, and -1 (unplaced) for compute tiles. Tiles that
    were already placed explicitly are left untouched.
    """

    def __init__(self):
        super().__init__()

    def make_placement(
        self,
        device: Device,
        rt: Runtime,
        workers: list[Worker],
        object_fifos: list[ObjectFifoHandle],
    ) -> None:
        """Assign placeholder tiles to all generically-placed components.

        Args:
            device: Target device. Unused, since no real coordinates are chosen.
            rt: Runtime of the design. Unused by this placer.
            workers: Workers whose tiles (and buffers) are to be placed.
            object_fifos: ObjectFifo handles whose endpoints are to be placed.
        """
        for worker in workers:
            if worker.tile == AnyComputeTile:
                # Fully unplaced compute tile: both coordinates unknown.
                worker.place(Tile(-1, -1))
            # Buffers always follow their worker, whether the worker's tile
            # is a placeholder or was placed explicitly by the user.
            for buffer in worker.buffers:
                buffer.place(worker.tile)

        for of in object_fifos:
            for ofe in of.all_of_endpoints():
                # A fresh Tile is created per endpoint so that every
                # unplaced endpoint stays a distinct tile until a later
                # placement step decides which ones may be merged.
                if ofe.tile == AnyMemTile:
                    ofe.place(Tile(-1, 1))
                elif ofe.tile == AnyComputeTile:
                    ofe.place(Tile(-1, -1))
                elif ofe.tile == AnyShimTile:
                    ofe.place(Tile(-1, 0))
With this placer, an unplaced IRON design emits unplaced MLIR:
# place_test.py
@construct_and_print_module
def shim_three_in(module):
    """Build a design with three input ObjectFifos, each feeding one worker.

    Nothing is placed explicitly; the program is resolved with NullPlacer,
    so every tile in the resulting module is left unplaced.
    """
    n = 1024
    n_ty = np.ndarray[(n,), np.dtype[np.int32]]
    n_inputs = 3

    of_ins = [ObjectFifo(n_ty, name=f"in_{i}") for i in range(n_inputs)]

    def core_fn(of_in):
        # Empty core body: only the placement of the worker matters here.
        pass

    workers = [Worker(core_fn, [of_in.cons()]) for of_in in of_ins]

    rt = Runtime()
    with rt.sequence(n_ty, n_ty, n_ty) as (A, B, C):
        rt.start(*workers)
        rt.fill(of_ins[0].prod(), A)
        rt.fill(of_ins[1].prod(), B)
        rt.fill(of_ins[2].prod(), C)

    module = Program(NPU2Col2(), rt).resolve_program(NullPlacer())
    return module
emits:
module {
aie.device(npu2_2col) {
%shim_noc_tile_c_0 = aie.tile(?, 0)
%tile_c_r = aie.tile(?, ?)
%shim_noc_tile_c_0_0 = aie.tile(?, 0)
%tile_c_r_1 = aie.tile(?, ?)
%shim_noc_tile_c_0_2 = aie.tile(?, 0)
%tile_c_r_3 = aie.tile(?, ?)
aie.objectfifo @in_0(%shim_noc_tile_c_0, {%tile_c_r}, 2 : i32) : !aie.objectfifo<memref<1024xi32>>
aie.objectfifo @in_2(%shim_noc_tile_c_0_0, {%tile_c_r_1}, 2 : i32) : !aie.objectfifo<memref<1024xi32>>
aie.objectfifo @in_1(%shim_noc_tile_c_0_2, {%tile_c_r_3}, 2 : i32) : !aie.objectfifo<memref<1024xi32>>
%core_c_r = aie.core(%tile_c_r) {
aie.end
}
%core_c_r_4 = aie.core(%tile_c_r_3) {
aie.end
}
%core_c_r_5 = aie.core(%tile_c_r_1) {
aie.end
}
aiex.runtime_sequence @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>) {
%0 = aiex.dma_configure_task_for @in_0 {
aie.dma_bd(%arg0 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
aie.end
}
aiex.dma_start_task(%0)
%1 = aiex.dma_configure_task_for @in_1 {
aie.dma_bd(%arg1 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
aie.end
}
aiex.dma_start_task(%1)
%2 = aiex.dma_configure_task_for @in_2 {
aie.dma_bd(%arg2 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
aie.end
}
aiex.dma_start_task(%2)
}
}
}
which can then be placed with the MLIR pass added in this branch (-aie-sequential-placer):
$ python place_test.py | aie-opt -canonicalize -aie-sequential-placer -canonicalize
module {
aie.device(npu2_2col) {
%tile_0_2 = aie.tile(0, 2)
%tile_0_3 = aie.tile(0, 3)
%shim_noc_tile_0_0 = aie.tile(0, 0)
%tile_0_4 = aie.tile(0, 4)
aie.objectfifo @in_0(%shim_noc_tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1024xi32>>
aie.objectfifo @in_2(%shim_noc_tile_0_0, {%tile_0_3}, 2 : i32) : !aie.objectfifo<memref<1024xi32>>
aie.objectfifo @in_1(%shim_noc_tile_0_0, {%tile_0_4}, 2 : i32) : !aie.objectfifo<memref<1024xi32>>
%core_0_2 = aie.core(%tile_0_2) {
aie.end
}
%core_0_4 = aie.core(%tile_0_4) {
aie.end
}
%core_0_3 = aie.core(%tile_0_3) {
aie.end
}
aiex.runtime_sequence @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>) {
%0 = aiex.dma_configure_task_for @in_0 {
aie.dma_bd(%arg0 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
aie.end
}
aiex.dma_start_task(%0)
%1 = aiex.dma_configure_task_for @in_1 {
aie.dma_bd(%arg1 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
aie.end
}
aiex.dma_start_task(%1)
%2 = aiex.dma_configure_task_for @in_2 {
aie.dma_bd(%arg2 : memref<1024xi32>, 0, 1024, [<size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1, stride = 0>, <size = 1024, stride = 1>]) {burst_length = 0 : i32}
aie.end
}
aiex.dma_start_task(%2)
}
}
}