taichi
taichi copied to clipboard
Stuck when running demo_quantized_simulation_letters with bls on
Describe the bug
Running https://github.com/taichi-dev/taichi_elements/blob/2552c476838ee02071afd16ff2465613610c6f2f/demo/demo_quantized_simulation_letters.py with use_bls=True (line 95 of that file) gets stuck: the program hangs and never finishes.
To Reproduce: I managed to shorten the script from 1000+ lines to 100+ lines, but I could not reduce it any further, because almost any additional change makes the hang disappear. Here is the shortened version:
import taichi as ti
# Initialise Taichi on the CUDA backend with the offline cache disabled, and
# ask it to print both the generated NVPTX and the IR.
# NOTE(review): device_memory_GB=20 presumably needs a GPU with at least that
# much memory -- confirm before running.
ti.init(arch=ti.cuda,
device_memory_GB=20,
offline_cache=False,
print_kernel_nvptx=True,
print_ir=True)
# Only res[0] is read below (to derive dx).
res = (256, 256, 256)
# Number of spatial dimensions used for every field and stencil.
dim = 3
# Extent of the sparse grid along each axis.
grid_size = 4096
# Grid spacing and its inverse (inv_dx evaluates to 256.0, the literal that
# g2p2g uses directly).
dx = 1 / res[0]
inv_dx = 1.0 / dx
# Upper bound on the substep size used by step().
default_dt = 2e-2 * dx
# Index of the grid that step() reads from; the other grid is the output.
input_grid = 0
# Per-particle fields; both are placed under the `particle` SNode further down.
x = ti.Vector.field(dim, dtype=ti.f32)
F = ti.Matrix.field(dim, dim, dtype=ti.f32)
# Axes used by every level of the grid hierarchy.
indices = ti.ijk
# Coordinate offset applied when placing grid fields: -grid_size // 2 per axis.
offset = tuple(-grid_size // 2 for _ in range(dim))
# Two grids are needed because step() uses one as input and one as output.
num_grids = 2
# Sizes of the two lower levels of the hierarchy (see the grid loop below).
grid_block_size = 128
leaf_block_size = 4
# Per-grid handles, filled by the loop below and indexed by grid number.
grids = []
grid_vs = []
grid_ms = []
pids = []
# Build `num_grids` identical sparse grid hierarchies:
#   root -> pointer (grid) -> pointer (block) -> dense leaf  (grid_m, grid_v)
#                                             -> dynamic list (pid)
# Statement order is kept exactly as in the report, because the reproduction
# is sensitive to changes.
for g in range(num_grids):
    # Grid node momentum/velocity
    grid_v = ti.Vector.field(dim, dtype=ti.f32)
    grid_m = ti.field(dtype=ti.f32)
    pid = ti.field(ti.i32)
    grid_vs.append(grid_v)
    # Grid node mass
    grid_ms.append(grid_m)
    grid = ti.root.pointer(indices, grid_size // grid_block_size)
    block = grid.pointer(indices, grid_block_size // leaf_block_size)
    grids.append(grid)

    def block_component(c):
        # Place scalar field `c` in a dense leaf under the current `block`.
        # Only called within this iteration, so the closure over `block`
        # always sees this iteration's SNode.
        block.dense(indices, leaf_block_size).place(c, offset=offset)

    block_component(grid_m)
    for d in range(dim):
        block_component(grid_v.get_scalar_field(d))

    pids.append(pid)
    # `offset` expressed in units of leaf blocks; the extra trailing 0 below
    # is the offset along the dynamic axis.
    block_offset = tuple(o // leaf_block_size for o in offset)
    block.dynamic(ti.axes(dim), 1024 * 1024,
                  chunk_size=leaf_block_size**dim * 8).place(
                      pid, offset=block_offset + (0, ))
# An empirically optimal chunk size is 1/10 of the expected particle number
chunk_size = 2**23
# Particle storage: a dynamic SNode along axis i, created with length argument
# 2**30 and the chunk size above; it holds both per-particle fields x and F.
particle = ti.root.dynamic(ti.i, 2**30, chunk_size)
particle.place(x, F)
def stencil_range():
    """Return a ``ti.ndrange`` with extent 3 along each of the ``dim`` axes.

    The kernels iterate over it (via ``ti.static(ti.grouped(...))``) to visit
    every node of the 3**dim stencil around a particle's base node.
    """
    return ti.ndrange(*((3, ) * dim))
@ti.kernel
def build_pid(pid: ti.template(), grid_m: ti.template()):
    """
    grid has blocking (e.g. 4x4x4), we wish to put the particles from each block into a GPU block,
    then use shared memory (ti.block_local) to accelerate
    """
    ti.loop_config(block_dim=64)
    for p in x:
        # Base grid node of particle p, shifted by the field placement offset.
        base = int(ti.floor(x[p] * inv_dx - 0.5)) - ti.Vector(list(offset))
        # Pid grandparent is `block`
        base_pid = ti.rescale_index(grid_m, pid.parent(2), base)
        # Append the particle index to the list of the block that owns `base`.
        ti.append(pid.parent(), base_pid, p)
# Fused grid-to-particle / particle-to-grid kernel of the shortened repro.
#
# For every particle index stored in `pid`, it gathers from `grid_v_in` over
# the 3**dim stencil around the particle's base node, stores the gathered
# matrix in F[p], then scatters into `grid_v_out` and `grid_m_out`.
#
# Parameters:
#   dt          -- substep size; not read anywhere in this shortened kernel.
#   pid         -- per-block particle index lists built by build_pid.
#   grid_v_in   -- vector field that is read (cached with ti.block_local).
#   grid_v_out  -- vector field that is accumulated into (ti.block_local).
#   grid_m_out  -- scalar field that is accumulated into (ti.block_local).
#
# NOTE(review): the arithmetic (e.g. the unweighted `new_v += g_v`) is whatever
# survived shortening of the original script. It is intentionally left
# untouched, since the report says almost any change hides the hang.
# The documentation lives in comments rather than a docstring so the kernel
# body itself stays exactly as reported.
@ti.kernel
def g2p2g(dt: ti.f32, pid: ti.template(), grid_v_in: ti.template(),
          grid_v_out: ti.template(), grid_m_out: ti.template()):
    ti.loop_config(block_dim=256)
    #ti.no_activate(self.particle)
    ti.block_local(grid_m_out)
    for d in ti.static(range(dim)):
        ti.block_local(grid_v_in.get_scalar_field(d))
        ti.block_local(grid_v_out.get_scalar_field(d))
    for I in ti.grouped(pid):
        p = pid[I]
        # G2P
        # 256.0 is the literal value of the module-level inv_dx.
        base = ti.floor(x[p] * 256.0 - 0.5).cast(int)
        Im = ti.rescale_index(pid, grid_m_out, I)
        for D in ti.static(range(dim)):
            base[D] = ti.assume_in_range(base[D], Im[D], 0, 1)
        fx = x[p] * 256.0 - base.cast(float)
        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1.0)**2, 0.5 * (fx - 0.5)**2]
        new_v = ti.Vector.zero(ti.f32, dim)
        C = ti.Matrix.zero(ti.f32, dim, dim)
        # Loop over the 3**dim grid node neighborhood.
        # (`offset` here shadows the module-level `offset` tuple.)
        for offset in ti.static(ti.grouped(stencil_range())):
            dpos = offset.cast(float) - fx
            g_v = grid_v_in[base + offset]
            weight = 1.0
            for d in ti.static(range(dim)):
                weight *= w[offset[d]][d]
            new_v += g_v
            C += weight * g_v.outer_product(dpos)
        # P2G
        base = ti.floor(x[p] * 256.0 - 0.5).cast(int)
        for D in ti.static(range(dim)):
            base[D] = ti.assume_in_range(base[D], Im[D], -1, 2)
        fx = x[p] * 256.0 - base.cast(float)
        w2 = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
        F[p] = C
        # Loop over the 3**dim grid node neighborhood.
        for offset in ti.static(ti.grouped(stencil_range())):
            weight = 1.0
            for d in ti.static(range(dim)):
                weight *= w2[offset[d]][d]
            grid_v_out[base + offset] += weight * new_v
            grid_m_out[base + offset] += weight
def step(frame_dt):
    """Advance the simulation by ``frame_dt`` using equal-sized substeps.

    The number of substeps is chosen so that each one is no larger than
    ``default_dt``. Every substep prints a progress dot, clears the output
    grid, rebuilds the per-block particle lists for the input grid and runs
    the fused ``g2p2g`` kernel.

    ``input_grid`` is never swapped in this shortened script, so every substep
    reads grid ``input_grid`` and writes grid ``1 - input_grid``.
    """
    substeps = int(frame_dt / default_dt) + 1
    dt = frame_dt / substeps
    frame_time_left = frame_dt
    while frame_time_left > 0:
        print('.', end='', flush=True)
        frame_time_left -= dt
        output_grid = 1 - input_grid
        grids[output_grid].deactivate_all()
        build_pid(pids[input_grid], grid_ms[input_grid])
        g2p2g(dt, pids[input_grid], grid_vs[input_grid], grid_vs[output_grid],
              grid_ms[output_grid])
# Seed 30000 particles: every particle gets the same x value and an identity
# matrix in F. (Comments are used instead of a docstring so the kernel body
# stays exactly as reported.)
@ti.kernel
def seed_from_voxels():
    for i in range(30000):
        x[i] = ti.Vector([-0.5 * 0.6 - 0.5, 1.1, 0.1])
        F[i] = ti.Matrix.identity(ti.f32, dim)
# Entry point of the reproduction: seed the particles, then advance one frame
# of length 1e-2. Per the report, the script gets stuck when run this way.
seed_from_voxels()
step(1e-2)