continual-skeletons
continual-skeletons copied to clipboard
Out of Memory
Hello,thanks for your great work. And now,when i want to train model on PKUMMD dataset with ,for example cos_tr.py,however,i just adjusted the GraphDataset.py,the code as follow:
class GraphDataset(Dataset):
def __init__(
self,
data_path,
label_path,
random_choose=False,
random_shift=False,
random_move=False,
window_size=-1,
normalization=False,
mmap_mode="rb",
istraining=True,
):
self.data_path = data_path
self.label_path = label_path
self.random_choose = random_choose
self.random_shift = random_shift
self.random_move = random_move
self.window_size = window_size
self.normalization = normalization
self.mmap_mode = mmap_mode
self.istraining = istraining
self.inputs = []
self.load_data()
if normalization:
self.get_mean_map()
def load_data(self):
self.gen_data(self.data_path,self.label_path)
def gen_data(self,data_path,label_path):
with open(label_path,self.mmap_mode) as f:
target_all = pickle.load(f)
f.close()
with open(data_path,self.mmap_mode) as f:
self.skeleton_all = pickle.load(f)
f.close()
# self.skeleton_all = self.skeleton_all.astype(np.float32)
# target_all = target_all.astype(np.float32)
data = []
labels = []
sessions = target_all.keys()
self.enc_steps = 300
self.dec_steps = 8
for session in sessions:
target = target_all[session]
seed = np.random.randint(self.enc_steps) if self.istraining else 0
for start,end in zip(
range(seed,target.shape[0],1),
range(seed+self.enc_steps,target.shape[0]-self.dec_steps,1)
):
enc_target = target[start:end]
dec_target = target[end : end + self.dec_steps]
distance_target, class_h_target = self.get_distance_target(
target[start:end]
)
self.inputs.append(
[
session,
start,
end,
enc_target,
distance_target,
class_h_target,
dec_target,
]
)
def get_distance_target(self,target_vector):
target_matrix = np.zeros(self.enc_steps - 1)
target_argmax = target_vector[self.enc_steps - 1].argmax()
for i in range(self.enc_steps - 1):
if target_vector[i].argmax() == target_argmax:
target_matrix[i] = 1.0
return target_matrix, target_vector[self.enc_steps - 1]
def get_mean_map(self):
data = self.data
N, C, T, V, M = data.shape
self.mean_map = (
data.mean(axis=2, keepdims=True).mean(axis=4, keepdims=True).mean(axis=0)
)
self.std_map = (
data.transpose((0, 2, 4, 1, 3))
.reshape((N * T * M, C * V))
.std(axis=0)
.reshape((C, 1, V, 1))
)
def __len__(self):
return len(self.inputs)
def __iter__(self):
return self
def __getitem__(self, index):
(
session,
start,
end,
enc_target,
distance_target,
class_h_target,
dec_target,
) = self.inputs[index]
data_numpy = self.skeleton_all[session][start:end]
data_numpy = data_numpy.transpose((2,0,1,3))
C,T,V,S = data_numpy.shape
if data_numpy.shape[1] < self.enc_steps:
data_new = np.zeros((C,300,V,S),dtype=np.float32)
data_new[:,:T,:,:] = data_numpy
data_numpy = data_new
label = class_h_target
if self.normalization:
data_numpy = (data_numpy - self.mean_map) / self.std_map
if self.random_shift:
data_numpy = tools.random_shift(data_numpy)
if self.random_choose:
data_numpy = tools.random_choose(data_numpy, self.window_size)
elif self.window_size > 0:
data_numpy = tools.auto_pading(data_numpy, self.window_size)
if self.random_move:
data_numpy = tools.random_move(data_numpy)
return data_numpy, label, index
but when i run the code by cmdline as :python models/cos_tr/cos_tr.py --train --max_epochs 30 --id benchmark_costr_pkummdv1 --gpus "0,1,2,3" --profile_model --profile_model_num_runs 10 --forward_mode clip --batch_size 128 --num_workers 8 --dataset_name pkummd --dataset_classes ./datasets/pkummd/classes.yaml --dataset_train_data /data/pkummdv1_float32/train_subject_data_v1.pkl --dataset_val_data /data/pkummdv1_float32/test_subject_data_v1.pkl --dataset_train_labels /data/pkummdv1_float32/train_subject_label_thoum_v1.pkl --dataset_val_labels /data/pkummdv1_float32/test_subject_label_thoum_v1.pkl
the used memory would quickly rise to more than 100 G, but my total data is about 5G. and the output log is
`lightning: Global seed set to 123
ride: Running on host gpu-task-nod5
ride: ⭐️ View project repository at [email protected]:LukasHedegaard/continual-skeletons/tree/a8fe2937a33f24cce65c1f8c2fc41081bceda721
ride: Run data is saved locally at logs/run_logs/benchmark_costr_pkummdv1/version_6
ride: Logging using Tensorboard
ride: 💾 Saving logs/run_logs/benchmark_costr_pkummdv1/version_6/hparams.yaml
ride: 🚀 Running training
continual: Temporal stride of 2 will result in skipped outputs every 1 / 2 steps
continual: Temporal stride of 2 will result in skipped outputs every 1 / 2 steps
continual: Temporal stride of 2 will result in skipped outputs every 1 / 2 steps
continual: Temporal stride of 2 will result in skipped outputs every 1 / 2 steps
models: Input shape (C, T, V, S) = (3, 300, 25, 2)
models: Receptive field 449
models: Init frames 144
models: Pool size 75
models: Stride 4
models: Padding 152
models: Using Continual CallMode.FORWARD
ride: ✅ Checkpointing on val/loss with optimisation direction min
/home/yaoning.li/Anaconda/yes/envs/mmlab/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:110: LightningDeprecationWarning: Trainer(distrib uted_backend=ddp)
has been deprecated and will be removed in v1.5. Use Trainer(accelerator=ddp)
instead.
rank_zero_deprecation(
lightning: GPU available: True, used: True
lightning: TPU available: False, using: 0 TPU cores
lightning: IPU available: False, using: 0 IPUs
lightning: Global seed set to 123
lightning: initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/4`
i dont know why. Can you help me?Thank you !