diffsim
Memory Leak problem in arcsim.step()
Hi, when I run a modified exp_inverse.py example to fold a cloth, there seems to be a memory leak: every epoch, when I check memory with htop, the memory used by exp_inverse.py keeps increasing, and the process is automatically killed if the run is long. Here is our code:
import torch
import arcsim
import gc
import time
import json
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

now = datetime.now()
timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')

steps = 30
epochs = 10
node_number = 0
handles = [25, 60, 30, 54]
losses = []
param_g = torch.zeros([steps, 12], dtype=torch.float64, requires_grad=True)

default_dir = 'results/' + time.ctime()
os.mkdir(default_dir)
out_path = default_dir + '/default_out'
os.mkdir(out_path)

with open('conf/rigidcloth/drag/drag.json', 'r') as f:
    config = json.load(f)

def save_config(config, file):
    with open(file, 'w') as f:
        json.dump(config, f)

save_config(config, out_path + '/conf.json')

torch.set_num_threads(16)
scalev = 1

def reset_sim(sim, epoch):
    if epoch < 20:
        arcsim.init_physics(out_path + '/conf.json', out_path + '/out%d' % epoch, False)
    else:
        arcsim.init_physics(out_path + '/conf.json', out_path + '/out', False)

def get_target_mesh():
    sim = arcsim.get_sim()
    arcsim.init_physics('conf/rigidcloth/fold_targets/target1.json', out_path + '/target', False)
    global node_number
    node_number = len(sim.cloths[0].mesh.nodes)
    ref = [sim.cloths[0].mesh.nodes[i].x.numpy() for i in range(node_number)]
    ref = torch.from_numpy(np.vstack(ref))
    return ref

def get_loss(sim, ref):
    reg = torch.norm(param_g, p=2) * 0.001
    loss = 0
    for i in range(ref.shape[0]):
        loss += torch.norm(ref[i] - sim.cloths[0].mesh.nodes[i].x) ** 2
    loss /= node_number
    loss += reg
    return loss

def run_sim(steps, sim, ref):
    # sim.obstacles[2].curr_state_mesh.dummy_node.x = param_g[1]
    print("step")
    for step in range(steps):
        print(step)
        for i in range(len(handles)):
            inc_v = param_g[step, 3 * i:3 * i + 3]
            sim.cloths[0].mesh.nodes[handles[i]].v += inc_v
            del inc_v
        arcsim.sim_step()
    loss = get_loss(sim, ref)
    return loss

@profile
def do_train(cur_step, optimizer, scheduler, sim):
    epoch = 0
    ref = get_target_mesh()
    print(ref)
    while True:
        reset_sim(sim, epoch)
        st = time.time()
        loss = run_sim(steps, sim, ref)
        en0 = time.time()
        optimizer.zero_grad()
        loss.backward()
        en1 = time.time()
        print("=======================================")
        f.write('epoch {}: loss={} \n'.format(epoch, loss.data))
        print('epoch {}: loss={} \n'.format(epoch, loss.data))
        print('forward time={}'.format(en0 - st))
        print('backward time={}'.format(en1 - en0))
        optimizer.step()
        # scheduler.step(epoch)
        losses.append(loss)
        if epoch >= epochs:
            break
        epoch = epoch + 1
        # break

def visualize_loss(losses, dir_name):
    plt.plot(losses)
    plt.title('losses')
    plt.xlabel('epochs')
    plt.ylabel('losses')
    plt.savefig(dir_name + '/' + 'loss.jpg')

with open(out_path + ('/log%s.txt' % timestamp), 'w', buffering=1) as f:
    tot_step = 1
    sim = arcsim.get_sim()
    # reset_sim(sim)
    lr = 10
    momentum = 0.4
    f.write('lr={} momentum={}\n'.format(lr, momentum))
    optimizer = torch.optim.SGD([{'params': param_g, 'lr': lr}], momentum=momentum)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 10, 2, eta_min=0.0001)
    for cur_step in range(tot_step):
        do_train(cur_step, optimizer, scheduler, sim)
    visualize_loss(losses, default_dir)
    print("done")
Also, I used memory_profiler to inspect the code and found that arcsim.step() takes most of the memory without releasing it.
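For context, the @profile decorator on do_train() comes from the memory_profiler package; the exact invocation is not shown above, so this is just a minimal sketch of how such profiling is typically wired up:

# Sketch of the profiling setup (assumes the memory_profiler package is installed).
# The explicit import makes @profile resolvable when running the script directly.
from memory_profiler import profile

@profile
def do_train(cur_step, optimizer, scheduler, sim):
    ...

# Running the script prints a per-line memory report:
#   python exp_inverse.py
# or, without the explicit import, via the module runner:
#   python -m memory_profiler exp_inverse.py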
Try adding arcsim.delete_mesh(sim.cloths[0].mesh) after your get_target_mesh() call is done, and then again in each loop iteration where you call reset_sim(), after the optimizer.step() call.
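Roughly, the placement would look like this (a sketch based on the snippet above and my reading of the advice, not tested):

# Sketch: freeing the ArcSim meshes explicitly inside do_train().
def do_train(cur_step, optimizer, scheduler, sim):
    epoch = 0
    ref = get_target_mesh()
    arcsim.delete_mesh(sim.cloths[0].mesh)       # free the target mesh once ref is extracted
    while True:
        reset_sim(sim, epoch)
        loss = run_sim(steps, sim, ref)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        arcsim.delete_mesh(sim.cloths[0].mesh)   # free the mesh created by this epoch's reset_sim()
        losses.append(loss)
        if epoch >= epochs:
            break
        epoch += 1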