taichi Stuck when running demo_quantized_simulation

Stuck when running demo_quantized_simulation_letters with bls on

Open strongoier opened this issue 2 years ago • 0 comments

Describe the bug Running https://github.com/taichi-dev/taichi_elements/blob/2552c476838ee02071afd16ff2465613610c6f2f/demo/demo_quantized_simulation_letters.py with use_bls=True (L95) will get stuck.

To Reproduce I managed to shorten the script from 1000+ lines to 100+ lines but I could not go further. Almost any further change will save the script. Here is the shortened version:

import taichi as ti

ti.init(arch=ti.cuda,
        device_memory_GB=20,
        offline_cache=False,
        print_kernel_nvptx=True,
        print_ir=True)
res = (256, 256, 256)
dim = 3
grid_size = 4096
dx = 1 / res[0]
inv_dx = 1.0 / dx
default_dt = 2e-2 * dx
input_grid = 0
x = ti.Vector.field(dim, dtype=ti.f32)
F = ti.Matrix.field(dim, dim, dtype=ti.f32)
indices = ti.ijk
offset = tuple(-grid_size // 2 for _ in range(dim))
num_grids = 2
grid_block_size = 128
leaf_block_size = 4
grids = []
grid_vs = []
grid_ms = []
pids = []
for g in range(num_grids):
    # Grid node momentum/velocity
    grid_v = ti.Vector.field(dim, dtype=ti.f32)
    grid_m = ti.field(dtype=ti.f32)
    pid = ti.field(ti.i32)
    grid_vs.append(grid_v)
    # Grid node mass
    grid_ms.append(grid_m)
    grid = ti.root.pointer(indices, grid_size // grid_block_size)
    block = grid.pointer(indices, grid_block_size // leaf_block_size)
    grids.append(grid)
    def block_component(c):
        block.dense(indices, leaf_block_size).place(c, offset=offset)
    block_component(grid_m)
    for d in range(dim):
        block_component(grid_v.get_scalar_field(d))
    pids.append(pid)
    block_offset = tuple(o // leaf_block_size for o in offset)
    block.dynamic(ti.axes(dim), 1024 * 1024, chunk_size=leaf_block_size**dim * 8).place(
                  pid, offset=block_offset + (0, ))
# An empirically optimal chunk size is 1/10 of the expected particle number
chunk_size = 2**23
particle = ti.root.dynamic(ti.i, 2**30, chunk_size)
particle.place(x, F)

def stencil_range():
    return ti.ndrange(*((3, ) * dim))

@ti.kernel
def build_pid(pid: ti.template(), grid_m: ti.template()):
    """
    grid has blocking (e.g. 4x4x4), we wish to put the particles from each block into a GPU block,
    then used shared memory (ti.block_local) to accelerate
    """
    ti.loop_config(block_dim=64)
    for p in x:
        base = int(ti.floor(x[p] * inv_dx - 0.5)) - ti.Vector(list(offset))
        # Pid grandparent is `block`
        base_pid = ti.rescale_index(grid_m, pid.parent(2), base)
        ti.append(pid.parent(), base_pid, p)

@ti.kernel
def g2p2g(dt: ti.f32, pid: ti.template(), grid_v_in: ti.template(),
          grid_v_out: ti.template(), grid_m_out: ti.template()):
    ti.loop_config(block_dim=256)
    #ti.no_activate(self.particle)
    ti.block_local(grid_m_out)
    for d in ti.static(range(dim)):
        ti.block_local(grid_v_in.get_scalar_field(d))
        ti.block_local(grid_v_out.get_scalar_field(d))
    for I in ti.grouped(pid):
        p = pid[I]
        # G2P
        base = ti.floor(x[p] * 256.0 - 0.5).cast(int)
        Im = ti.rescale_index(pid, grid_m_out, I)
        for D in ti.static(range(dim)):
            base[D] = ti.assume_in_range(base[D], Im[D], 0, 1)
        fx = x[p] * 256.0 - base.cast(float)
        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1.0)**2, 0.5 * (fx - 0.5)**2]
        new_v = ti.Vector.zero(ti.f32, dim)
        C = ti.Matrix.zero(ti.f32, dim, dim)
        # Loop over 3x3 grid node neighborhood
        for offset in ti.static(ti.grouped(stencil_range())):
            dpos = offset.cast(float) - fx
            g_v = grid_v_in[base + offset]
            weight = 1.0
            for d in ti.static(range(dim)):
                weight *= w[offset[d]][d]
            new_v += g_v
            C += weight * g_v.outer_product(dpos)

        # P2G
        base = ti.floor(x[p] * 256.0 - 0.5).cast(int)
        for D in ti.static(range(dim)):
            base[D] = ti.assume_in_range(base[D], Im[D], -1, 2)
        fx = x[p] * 256.0 - base.cast(float)
        w2 = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
        F[p] = C
        # Loop over 3x3 grid node neighborhood
        for offset in ti.static(ti.grouped(stencil_range())):
            weight = 1.0
            for d in ti.static(range(dim)):
                weight *= w2[offset[d]][d]
            grid_v_out[base + offset] += weight * new_v
            grid_m_out[base + offset] += weight

def step(frame_dt):
    substeps = int(frame_dt / default_dt) + 1
    dt = frame_dt / substeps
    frame_time_left = frame_dt
    while frame_time_left > 0:
        print('.', end='', flush=True)
        frame_time_left -= dt
        output_grid = 1 - input_grid
        grids[output_grid].deactivate_all()
        build_pid(pids[input_grid], grid_ms[input_grid])
        g2p2g(dt, pids[input_grid], grid_vs[input_grid], grid_vs[output_grid], grid_ms[output_grid])

@ti.kernel
def seed_from_voxels():
    for i in range(30000):
        x[i] = ti.Vector([-0.5 * 0.6 - 0.5, 1.1, 0.1])
        F[i] = ti.Matrix.identity(ti.f32, dim)

seed_from_voxels()
step(1e-2)

Aug 30 '22 07:08 strongoier

taichi taichi copied to clipboard

Stuck when running demo_quantized_simulation_letters with bls on

taichi
taichi copied to clipboard