
Out of Memory

Open · SugarMuzi opened this issue on Aug 30, 2023 · 0 comments

Hello, thanks for your great work. I want to train a model on the PKUMMD dataset with, for example, cos_tr.py. The only file I adjusted is GraphDataset.py; the code is as follows:

```python
import pickle

import numpy as np
from torch.utils.data import Dataset

import tools  # repo-local augmentation utilities; adjust the import path to match the project


class GraphDataset(Dataset):
    def __init__(
        self,
        data_path,
        label_path,
        random_choose=False,
        random_shift=False,
        random_move=False,
        window_size=-1,
        normalization=False,
        mmap_mode="rb",
        istraining=True,
    ):
        self.data_path = data_path
        self.label_path = label_path
        self.random_choose = random_choose
        self.random_shift = random_shift
        self.random_move = random_move
        self.window_size = window_size
        self.normalization = normalization
        # Despite the name, this is the mode passed to open() for the pickles,
        # not an np.load mmap_mode.
        self.mmap_mode = mmap_mode
        self.istraining = istraining
        self.inputs = []
        self.load_data()
        if normalization:
            self.get_mean_map()

    def load_data(self):
        self.gen_data(self.data_path, self.label_path)

    def gen_data(self, data_path, label_path):
        # The with-blocks already close the files; no explicit f.close() needed.
        with open(label_path, self.mmap_mode) as f:
            target_all = pickle.load(f)  # dict: session -> per-frame one-hot labels
        with open(data_path, self.mmap_mode) as f:
            self.skeleton_all = pickle.load(f)  # dict: session -> skeleton array

        self.enc_steps = 300
        self.dec_steps = 8
        for session in target_all.keys():
            target = target_all[session]

            # Random temporal offset during training; deterministic at test time.
            seed = np.random.randint(self.enc_steps) if self.istraining else 0

            # Enumerate stride-1 sliding windows of enc_steps frames, each
            # followed by dec_steps frames of anticipation targets.
            for start, end in zip(
                range(seed, target.shape[0], 1),
                range(seed + self.enc_steps, target.shape[0] - self.dec_steps, 1),
            ):
                enc_target = target[start:end]
                dec_target = target[end : end + self.dec_steps]
                distance_target, class_h_target = self.get_distance_target(
                    target[start:end]
                )
                self.inputs.append(
                    [
                        session,
                        start,
                        end,
                        enc_target,
                        distance_target,
                        class_h_target,
                        dec_target,
                    ]
                )

    def get_distance_target(self, target_vector):
        # 1.0 for every frame that shares the class of the last encoder frame.
        target_matrix = np.zeros(self.enc_steps - 1)
        target_argmax = target_vector[self.enc_steps - 1].argmax()
        for i in range(self.enc_steps - 1):
            if target_vector[i].argmax() == target_argmax:
                target_matrix[i] = 1.0
        return target_matrix, target_vector[self.enc_steps - 1]

    def get_mean_map(self):
        # NOTE: self.data is never assigned in this class, so calling this
        # (i.e. passing normalization=True) would raise AttributeError.
        data = self.data
        N, C, T, V, M = data.shape
        self.mean_map = (
            data.mean(axis=2, keepdims=True).mean(axis=4, keepdims=True).mean(axis=0)
        )
        self.std_map = (
            data.transpose((0, 2, 4, 1, 3))
            .reshape((N * T * M, C * V))
            .std(axis=0)
            .reshape((C, 1, V, 1))
        )

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        (
            session,
            start,
            end,
            enc_target,
            distance_target,
            class_h_target,
            dec_target,
        ) = self.inputs[index]
        data_numpy = self.skeleton_all[session][start:end]
        # (T, V, C, S) -> (C, T, V, S), matching the model's expected layout.
        data_numpy = data_numpy.transpose((2, 0, 1, 3))
        C, T, V, S = data_numpy.shape
        if T < self.enc_steps:
            # Zero-pad clips shorter than enc_steps frames.
            data_new = np.zeros((C, self.enc_steps, V, S), dtype=np.float32)
            data_new[:, :T, :, :] = data_numpy
            data_numpy = data_new
        label = class_h_target

        if self.normalization:
            data_numpy = (data_numpy - self.mean_map) / self.std_map
        if self.random_shift:
            data_numpy = tools.random_shift(data_numpy)
        if self.random_choose:
            data_numpy = tools.random_choose(data_numpy, self.window_size)
        elif self.window_size > 0:
            data_numpy = tools.auto_pading(data_numpy, self.window_size)
        if self.random_move:
            data_numpy = tools.random_move(data_numpy)

        return data_numpy, label, index
```
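For reference, to separate dataset construction from training, a minimal standalone probe like the following can show how much resident memory the `GraphDataset` itself takes, including the `self.inputs` window list. This is only a sketch: psutil is an extra dependency I assume here, and the pickle paths are the same ones as in my training command below.

```python
# Standalone probe: construct the dataset and watch resident memory (RSS).
import os
import psutil


def rss_mib() -> float:
    # Resident set size of the current process, in MiB.
    return psutil.Process(os.getpid()).memory_info().rss / 2**20


print(f"before: {rss_mib():.0f} MiB")
ds = GraphDataset(
    data_path="/data/pkummdv1_float32/train_subject_data_v1.pkl",
    label_path="/data/pkummdv1_float32/train_subject_label_thoum_v1.pkl",
)
print(f"after construction: {rss_mib():.0f} MiB, {len(ds)} windows")

x, y, i = ds[0]  # fetch one sample
print(f"after one __getitem__: {rss_mib():.0f} MiB, sample shape {x.shape}")
```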

But when I run the code from the command line as:

```
python models/cos_tr/cos_tr.py --train --max_epochs 30 --id benchmark_costr_pkummdv1 --gpus "0,1,2,3" --profile_model --profile_model_num_runs 10 --forward_mode clip --batch_size 128 --num_workers 8 --dataset_name pkummd --dataset_classes ./datasets/pkummd/classes.yaml --dataset_train_data /data/pkummdv1_float32/train_subject_data_v1.pkl --dataset_val_data /data/pkummdv1_float32/test_subject_data_v1.pkl --dataset_train_labels /data/pkummdv1_float32/train_subject_label_thoum_v1.pkl --dataset_val_labels /data/pkummdv1_float32/test_subject_label_thoum_v1.pkl
```

the used memory quickly rises to more than 100 GB, although my total data is only about 5 GB. The output log is:

```
lightning: Global seed set to 123
ride: Running on host gpu-task-nod5
ride: ⭐️ View project repository at [email protected]:LukasHedegaard/continual-skeletons/tree/a8fe2937a33f24cce65c1f8c2fc41081bceda721
ride: Run data is saved locally at logs/run_logs/benchmark_costr_pkummdv1/version_6
ride: Logging using Tensorboard
ride: 💾 Saving logs/run_logs/benchmark_costr_pkummdv1/version_6/hparams.yaml
ride: 🚀 Running training
continual: Temporal stride of 2 will result in skipped outputs every 1 / 2 steps
continual: Temporal stride of 2 will result in skipped outputs every 1 / 2 steps
continual: Temporal stride of 2 will result in skipped outputs every 1 / 2 steps
continual: Temporal stride of 2 will result in skipped outputs every 1 / 2 steps
models: Input shape (C, T, V, S) = (3, 300, 25, 2)
models: Receptive field 449
models: Init frames 144
models: Pool size 75
models: Stride 4
models: Padding 152
models: Using Continual CallMode.FORWARD
ride: ✅ Checkpointing on val/loss with optimisation direction min
/home/yaoning.li/Anaconda/yes/envs/mmlab/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:110: LightningDeprecationWarning: Trainer(distributed_backend=ddp) has been deprecated and will be removed in v1.5. Use Trainer(accelerator=ddp) instead.
  rank_zero_deprecation(
lightning: GPU available: True, used: True
lightning: TPU available: False, using: 0 TPU cores
lightning: IPU available: False, using: 0 IPUs
lightning: Global seed set to 123
lightning: initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/4
```
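One thing I notice in the log is that with `--gpus "0,1,2,3"` Lightning's DDP backend starts one process per GPU, and each of those processes forks `num_workers=8` dataloader workers, so whatever the dataset object holds in memory can end up resident in many processes at once. To see the per-worker footprint, a `worker_init_fn` along these lines could be passed to the `DataLoader`. Again just a sketch, with psutil assumed and `ds` being the `GraphDataset` instance from the probe above:

```python
# Sketch: log each DataLoader worker's resident memory as it starts.
import os
import psutil
from torch.utils.data import DataLoader


def log_worker_rss(worker_id: int) -> None:
    # Called once in every worker process right after it is forked.
    rss_mib = psutil.Process(os.getpid()).memory_info().rss / 2**20
    print(f"worker {worker_id} (pid {os.getpid()}): {rss_mib:.0f} MiB resident")


loader = DataLoader(
    ds,
    batch_size=128,
    num_workers=8,
    worker_init_fn=log_worker_rss,
)
```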

I don't know why this happens. Can you help me? Thank you!

SugarMuzi · Aug 30 '23 08:08