pytorch-ts Unstability in training for TransformerTempFlow model, requiring help

Unstability in training for TransformerTempFlow model, requiring help

Open vamp-ire-tap opened this issue 2 years ago • 6 comments

Hello, I have the following code as an example, abstracted from the original provided example, but when I start training the network, it gives me NANs and other stability issues even for random sample data, and also for real application data. Could the authors please help me debug this issue? I would be thankful!

from typing import List, Optional, Tuple

import torch
import torch.nn as nn

from gluonts.core.component import validated
from pts.modules import RealNVP, MAF, FlowOutput, MeanScaler, NOPScaler
from gluonts.torch.util import copy_parameters


class TransformerTempFlowTrainingNetwork(nn.Module):
    @validated()
    def __init__(
        self,
        input_size: int,
        d_model: int,
        num_heads: int,
        act_type: str,
        dropout_rate: float,
        dim_feedforward_scale: int,
        num_encoder_layers: int,
        num_decoder_layers: int,
        history_length: int,
        context_length: int,
        prediction_length: int,
        lags_seq: List[int],
        target_dim: int,
        conditioning_length: int,
        flow_type: str,
        n_blocks: int,
        hidden_size: int,
        n_hidden: int,
        dequantize: bool,
        scaling: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        
        # print('input_size', input_size)
        # print('d_model', d_model)
        # print('num_heads', num_heads)
        # print('act_type', act_type)
        # print('dropout_rate', dropout_rate)
        # print('dim_feedforward_scale', dim_feedforward_scale)
        # print('num_encoder_layers', num_encoder_layers)
        # print('num_decoder_layers', num_decoder_layers)
        # print('history_length', history_length)
        # print('context_length', context_length)
        # print('prediction_length', prediction_length)
        # print('lags_seq', lags_seq)
        # print('target_dim', target_dim)
        # print('conditioning_length', conditioning_length)
        # print('flow_type', flow_type)
        # print('n_blocks', n_blocks)
        # print('hidden_size', hidden_size)
        # print('n_hidden', n_hidden)
        # print('dequantize', dequantize)
        # print('scaling', scaling)
        
        self.target_dim = target_dim
        self.prediction_length = prediction_length
        self.context_length = context_length
        self.history_length = history_length
        self.scaling = scaling

        assert len(set(lags_seq)) == len(lags_seq), "no duplicated lags allowed!"
        lags_seq.sort()
        self.lags_seq = lags_seq

        self.encoder_input = nn.Linear(input_size, d_model)
        self.decoder_input = nn.Linear(input_size, d_model)

        # [B, T, d_model] where d_model / num_heads is int
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward_scale * d_model,
            dropout=dropout_rate,
            activation=act_type,
        )

        flow_cls = {
            "RealNVP": RealNVP,
            "MAF": MAF,
        }[flow_type]
        self.flow = flow_cls(
            input_size=target_dim,
            n_blocks=n_blocks,
            n_hidden=n_hidden,
            hidden_size=hidden_size,
            cond_label_size=conditioning_length,
        )
        self.dequantize = dequantize

        self.distr_output = FlowOutput(
            self.flow, input_size=target_dim, cond_size=conditioning_length
        )
        self.proj_dist_args = self.distr_output.get_args_proj(d_model)

        self.embed_dim = 1
        self.embed = nn.Embedding(
            num_embeddings=self.target_dim, embedding_dim=self.embed_dim
        )

        if self.scaling:
            self.scaler = MeanScaler(keepdim=True)
        else:
            self.scaler = NOPScaler(keepdim=True)

        # mask
        self.register_buffer(
            "tgt_mask",
            self.transformer.generate_square_subsequent_mask(prediction_length),
        )

    @staticmethod
    def get_lagged_subsequences(
        sequence: torch.Tensor,
        sequence_length: int,
        indices: List[int],
        subsequences_length: int = 1,
    ) -> torch.Tensor:
        """
        Returns lagged subsequences of a given sequence.
        Parameters
        ----------
        sequence
            the sequence from which lagged subsequences should be extracted.
            Shape: (N, T, C).
        sequence_length
            length of sequence in the T (time) dimension (axis = 1).
        indices
            list of lag indices to be used.
        subsequences_length
            length of the subsequences to be extracted.
        Returns
        --------
        lagged : Tensor
            a tensor of shape (N, S, C, I),
            where S = subsequences_length and I = len(indices),
            containing lagged subsequences.
            Specifically, lagged[i, :, j, k] = sequence[i, -indices[k]-S+j, :].
        """
        # we must have: history_length + begin_index >= 0
        # that is: history_length - lag_index - sequence_length >= 0
        # hence the following assert
        assert max(indices) + subsequences_length <= sequence_length, (
            f"lags cannot go further than history length, found lag "
            f"{max(indices)} while history length is only {sequence_length}"
        )
        assert all(lag_index >= 0 for lag_index in indices)

        lagged_values = []
        for lag_index in indices:
            begin_index = -lag_index - subsequences_length
            end_index = -lag_index if lag_index > 0 else None
            lagged_values.append(sequence[:, begin_index:end_index, ...].unsqueeze(1))
        return torch.cat(lagged_values, dim=1).permute(0, 2, 3, 1)

    def create_network_input(
        self,
        past_time_feat: torch.Tensor,
        past_target_cdf: torch.Tensor,
        past_observed_values: torch.Tensor,
        past_is_pad: torch.Tensor,
        future_time_feat: Optional[torch.Tensor],
        future_target_cdf: Optional[torch.Tensor],
        target_dimension_indicator: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor,]:
        """
        Unrolls the RNN encoder over past and, if present, future data.
        Returns outputs and state of the encoder, plus the scale of
        past_target_cdf and a vector of static features that was constructed
        and fed as input to the encoder. All tensor arguments should have NTC
        layout.

        Parameters
        ----------
        past_time_feat
            Past time features (batch_size, history_length, num_features)
        past_target_cdf
            Past marginal CDF transformed target values (batch_size,
            history_length, target_dim)
        past_observed_values
            Indicator whether or not the values were observed (batch_size,
            history_length, target_dim)
        past_is_pad
            Indicator whether the past target values have been padded
            (batch_size, history_length)
        future_time_feat
            Future time features (batch_size, prediction_length, num_features)
        future_target_cdf
            Future marginal CDF transformed target values (batch_size,
            prediction_length, target_dim)
        target_dimension_indicator
            Dimensionality of the time series (batch_size, target_dim)

        Returns
        -------
        outputs
            RNN outputs (batch_size, seq_len, num_cells)
        states
            RNN states. Nested list with (batch_size, num_cells) tensors with
        dimensions target_dim x num_layers x (batch_size, num_cells)
        scale
            Mean scales for the time series (batch_size, 1, target_dim)
        lags_scaled
            Scaled lags(batch_size, sub_seq_len, target_dim, num_lags)
        inputs
            inputs to the RNN

        """

        past_observed_values = torch.min(
            past_observed_values, 1 - past_is_pad.unsqueeze(-1)
        )

        if future_time_feat is None or future_target_cdf is None:
            time_feat = past_time_feat[:, -self.context_length :, ...]
            sequence = past_target_cdf
            sequence_length = self.history_length
            subsequences_length = self.context_length
        else:
            time_feat = torch.cat(
                (
                    past_time_feat[:, -self.context_length :, ...],
                    future_time_feat,
                ),
                dim=1,
            )
            sequence = torch.cat((past_target_cdf, future_target_cdf), dim=1)
            sequence_length = self.history_length + self.prediction_length
            subsequences_length = self.context_length + self.prediction_length

        # (batch_size, sub_seq_len, target_dim, num_lags)
        lags = self.get_lagged_subsequences(
            sequence=sequence,
            sequence_length=sequence_length,
            indices=self.lags_seq,
            subsequences_length=subsequences_length,
        )

        # scale is computed on the context length last units of the past target
        # scale shape is (batch_size, 1, target_dim)
        _, scale = self.scaler(
            past_target_cdf[:, -self.context_length :, ...],
            past_observed_values[:, -self.context_length :, ...],
        )

        # (batch_size, sub_seq_len, target_dim, num_lags)
        lags_scaled = lags / scale.unsqueeze(-1)

        # assert_shape(
        #     lags_scaled, (-1, unroll_length, self.target_dim, len(self.lags_seq)),
        # )

        input_lags = lags_scaled.reshape(
            (-1, subsequences_length, len(self.lags_seq) * self.target_dim)
        )

        # (batch_size, target_dim, embed_dim)
        index_embeddings = self.embed(target_dimension_indicator)
        # assert_shape(index_embeddings, (-1, self.target_dim, self.embed_dim))

        # (batch_size, seq_len, target_dim * embed_dim)
        repeated_index_embeddings = (
            index_embeddings.unsqueeze(1)
            .expand(-1, subsequences_length, -1, -1)
            .reshape((-1, subsequences_length, self.target_dim * self.embed_dim))
        )

        # (batch_size, sub_seq_len, input_dim)
        inputs = torch.cat((input_lags, repeated_index_embeddings, time_feat), dim=-1)

        return inputs, scale, index_embeddings

    def distr_args(self, decoder_output: torch.Tensor):
        """
        Returns the distribution of DeepVAR with respect to the RNN outputs.

        Parameters
        ----------
        rnn_outputs
            Outputs of the unrolled RNN (batch_size, seq_len, num_cells)
        scale
            Mean scale for each time series (batch_size, 1, target_dim)

        Returns
        -------
        distr
            Distribution instance
        distr_args
            Distribution arguments
        """
        (distr_args,) = self.proj_dist_args(decoder_output)#decoder_output: torch.Size([64, 24, 16])
        #distr_args: torch.Size([64, 24, 100])

        # # compute likelihood of target given the predicted parameters
        # distr = self.distr_output.distribution(distr_args, scale=scale)

        # return distr, distr_args
        return distr_args

    def forward(
        self,
        target_dimension_indicator: torch.Tensor,
        past_time_feat: torch.Tensor,
        past_target_cdf: torch.Tensor,
        past_observed_values: torch.Tensor,
        past_is_pad: torch.Tensor,
        future_time_feat: torch.Tensor,
        future_target_cdf: torch.Tensor,
        future_observed_values: torch.Tensor,
    ) -> Tuple[torch.Tensor, ...]:
        """
        Computes the loss for training DeepVAR, all inputs tensors representing
        time series have NTC layout.

        Parameters
        ----------
        target_dimension_indicator
            Indices of the target dimension (batch_size, target_dim)
        past_time_feat
            Dynamic features of past time series (batch_size, history_length,
            num_features)
        past_target_cdf
            Past marginal CDF transformed target values (batch_size,
            history_length, target_dim)
        past_observed_values
            Indicator whether or not the values were observed (batch_size,
            history_length, target_dim)
        past_is_pad
            Indicator whether the past target values have been padded
            (batch_size, history_length)
        future_time_feat
            Future time features (batch_size, prediction_length, num_features)
        future_target_cdf
            Future marginal CDF transformed target values (batch_size,
            prediction_length, target_dim)
        future_observed_values
            Indicator whether or not the future values were observed
            (batch_size, prediction_length, target_dim)

        Returns
        -------
        distr
            Loss with shape (batch_size, 1)
        likelihoods
            Likelihoods for each time step
            (batch_size, context + prediction_length, 1)
        distr_args
            Distribution arguments (context + prediction_length,
            number_of_arguments)
        """

        # seq_len = self.context_length + self.prediction_length

        # unroll the decoder in "training mode", i.e. by providing future data
        # as well
        inputs, scale, _ = self.create_network_input(
            past_time_feat=past_time_feat,
            past_target_cdf=past_target_cdf,
            past_observed_values=past_observed_values,
            past_is_pad=past_is_pad,
            future_time_feat=future_time_feat,
            future_target_cdf=future_target_cdf,
            target_dimension_indicator=target_dimension_indicator,
        )

        enc_inputs = inputs[:, : self.context_length, ...]
        dec_inputs = inputs[:, self.context_length :, ...]

        enc_out = self.transformer.encoder(
            self.encoder_input(enc_inputs).permute(1, 0, 2)
        )

        dec_output = self.transformer.decoder(
            self.decoder_input(dec_inputs).permute(1, 0, 2),
            enc_out,
            tgt_mask=self.tgt_mask,
        )

        if self.scaling:
            self.flow.scale = scale

        # we sum the last axis to have the same shape for all likelihoods
        # (batch_size, subseq_length, 1)
        if self.dequantize:
            future_target_cdf += torch.rand_like(future_target_cdf)

        distr_args = self.distr_args(decoder_output=dec_output.permute(1, 0, 2))
        # likelihoods = -self.flow.log_prob(target, distr_args).unsqueeze(-1)
        loss = -self.flow.log_prob(future_target_cdf, distr_args).unsqueeze(-1)

        # # assert_shape(likelihoods, (-1, seq_len, 1))

        # past_observed_values = torch.min(
        #     past_observed_values, 1 - past_is_pad.unsqueeze(-1)
        # )

        # # (batch_size, subseq_length, target_dim)
        # observed_values = torch.cat(
        #     (
        #         past_observed_values[:, -self.context_length :, ...],
        #         future_observed_values,
        #     ),
        #     dim=1,
        # )

        # # mask the loss at one time step if one or more observations is missing
        # # in the target dimensions (batch_size, subseq_length, 1)
        # loss_weights, _ = observed_values.min(dim=-1, keepdim=True)

        # # assert_shape(loss_weights, (-1, seq_len, 1))

        # loss = weighted_average(likelihoods, weights=loss_weights, dim=1)

        # assert_shape(loss, (-1, -1, 1))

        # self.distribution = distr

        # return (loss.mean(), likelihoods, distr_args)
        return loss.mean()#, distr_args


class TransformerTempFlowPredictionNetwork(TransformerTempFlowTrainingNetwork):
    def __init__(self, num_parallel_samples: int, **kwargs) -> None:
        super().__init__(**kwargs)
        self.num_parallel_samples = num_parallel_samples

        # for decoding the lags are shifted by one,
        # at the first time-step of the decoder a lag of one corresponds to
        # the last target value
        self.shifted_lags = [l - 1 for l in self.lags_seq]

    def sampling_decoder(
        self,
        past_target_cdf: torch.Tensor,
        target_dimension_indicator: torch.Tensor,
        time_feat: torch.Tensor,
        scale: torch.Tensor,
        enc_out: torch.Tensor,
    ) -> torch.Tensor:
        """
        Computes sample paths by unrolling the RNN starting with a initial
        input and state.

        Parameters
        ----------
        past_target_cdf
            Past marginal CDF transformed target values (batch_size,
            history_length, target_dim)
        target_dimension_indicator
            Indices of the target dimension (batch_size, target_dim)
        time_feat
            Dynamic features of future time series (batch_size, history_length,
            num_features)
        scale
            Mean scale for each time series (batch_size, 1, target_dim)
        begin_states
            List of initial states for the RNN layers (batch_size, num_cells)
        Returns
        --------
        sample_paths : Tensor
            A tensor containing sampled paths. Shape: (1, num_sample_paths,
            prediction_length, target_dim).
        """

        def repeat(tensor, dim=0):
            return tensor.repeat_interleave(repeats=self.num_parallel_samples, dim=dim)

        # blows-up the dimension of each tensor to
        # batch_size * self.num_sample_paths for increasing parallelism
        repeated_past_target_cdf = repeat(past_target_cdf)
        repeated_time_feat = repeat(time_feat)
        repeated_scale = repeat(scale)
        if self.scaling:
            self.flow.scale = repeated_scale
        repeated_target_dimension_indicator = repeat(target_dimension_indicator)
        repeated_enc_out = repeat(enc_out, dim=1)

        future_samples = []

        # for each future time-units we draw new samples for this time-unit
        # and update the state
        for k in range(self.prediction_length):
            print("k: ", k, '/',self.prediction_length)
            lags = self.get_lagged_subsequences(
                sequence=repeated_past_target_cdf,
                sequence_length=self.history_length + k,
                indices=self.shifted_lags,
                subsequences_length=1,
            )

            lags_scaled = lags / repeated_scale.unsqueeze(-1)

            input_lags = lags_scaled.reshape(
                (-1, 1, len(self.lags_seq) * self.target_dim)
            )

            # (batch_size, target_dim, embed_dim)
            index_embeddings = self.embed(repeated_target_dimension_indicator)
            # assert_shape(index_embeddings, (-1, self.target_dim, self.embed_dim))

            # (batch_size, seq_len, target_dim * embed_dim)
            repeated_index_embeddings = (
                index_embeddings.unsqueeze(1)
                .expand(-1, 1, -1, -1)
                .reshape((-1, 1, self.target_dim * self.embed_dim))
            )

            # (batch_size, sub_seq_len, input_dim)
            dec_input = torch.cat(
                (
                    input_lags,
                    repeated_index_embeddings,
                    repeated_time_feat[:, k : k + 1, ...],
                ),
                dim=-1,
            )

            dec_output = self.transformer.decoder(
                self.decoder_input(dec_input).permute(1, 0, 2), repeated_enc_out
            )

            distr_args = self.distr_args(decoder_output=dec_output.permute(1, 0, 2))

            # (batch_size, 1, target_dim)
            new_samples = self.flow.sample(cond=distr_args)

            # (batch_size, seq_len, target_dim)
            future_samples.append(new_samples)
            repeated_past_target_cdf = torch.cat(
                (repeated_past_target_cdf, new_samples), dim=1
            )

        # (batch_size * num_samples, prediction_length, target_dim)
        samples = torch.cat(future_samples, dim=1)

        # (batch_size, num_samples, prediction_length, target_dim)
        return samples.reshape(
            (
                -1,
                self.num_parallel_samples,
                self.prediction_length,
                self.target_dim,
            )
        )

    def forward(
        self,
        target_dimension_indicator: torch.Tensor,
        past_time_feat: torch.Tensor,
        past_target_cdf: torch.Tensor,
        past_observed_values: torch.Tensor,
        past_is_pad: torch.Tensor,
        future_time_feat: torch.Tensor,
    ) -> torch.Tensor:
        """
        Predicts samples given the trained DeepVAR model.
        All tensors should have NTC layout.
        Parameters
        ----------
        target_dimension_indicator
            Indices of the target dimension (batch_size, target_dim)
        past_time_feat
            Dynamic features of past time series (batch_size, history_length,
            num_features)
        past_target_cdf
            Past marginal CDF transformed target values (batch_size,
            history_length, target_dim)
        past_observed_values
            Indicator whether or not the values were observed (batch_size,
            history_length, target_dim)
        past_is_pad
            Indicator whether the past target values have been padded
            (batch_size, history_length)
        future_time_feat
            Future time features (batch_size, prediction_length, num_features)

        Returns
        -------
        sample_paths : Tensor
            A tensor containing sampled paths (1, num_sample_paths,
            prediction_length, target_dim).

        """

        # mark padded data as unobserved
        # (batch_size, target_dim, seq_len)
        past_observed_values = torch.min(
            past_observed_values, 1 - past_is_pad.unsqueeze(-1)
        )

        inputs, scale, static_feat = self.create_network_input(
            past_time_feat=past_time_feat,
            past_target_cdf=past_target_cdf,
            past_observed_values=past_observed_values,
            past_is_pad=past_is_pad,
            future_time_feat=None,
            future_target_cdf=None,
            target_dimension_indicator=target_dimension_indicator,
        )

        enc_out = self.transformer.encoder(self.encoder_input(inputs).permute(1, 0, 2))

        return self.sampling_decoder(
            past_target_cdf=past_target_cdf,
            target_dimension_indicator=target_dimension_indicator,
            time_feat=future_time_feat,
            scale=scale,
            enc_out=enc_out,
        )


input_size=1484
d_model=16
num_heads=4
act_type='gelu'
dropout_rate=0.1
dim_feedforward_scale=4
num_encoder_layers=3
num_decoder_layers=3
history_length=168
context_length=24
prediction_length=24
lags_seq=[1,24,144]
target_dim=370
conditioning_length=100
flow_type='RealNVP'
n_blocks=3
hidden_size=100
n_hidden=2
dequantize=True
scaling=True

num_parallel_samples=9

transformer_training = TransformerTempFlowTrainingNetwork(input_size,
                                    d_model,
                                    num_heads,
                                    act_type,
                                    dropout_rate,
                                    dim_feedforward_scale,
                                    num_encoder_layers,
                                    num_decoder_layers,
                                    history_length,
                                    context_length,
                                    prediction_length,
                                    lags_seq,
                                    target_dim,
                                    conditioning_length,
                                    flow_type,
                                    n_blocks,
                                    hidden_size,
                                    n_hidden,
                                    dequantize,
                                    scaling)



target_dim_for_unrolling = 370
target_dimension_indicator = torch.arange(target_dim_for_unrolling).repeat(64).reshape(64,target_dim_for_unrolling)
past_time_feat = torch.rand(64, 168, 4)
past_target_cdf = torch.rand(64, 168, target_dim_for_unrolling)#just the normal time series, no cdf, refer to the rename fields line of code in the original timegrad estimator
past_observed_values = torch.ones(64, 168, target_dim_for_unrolling)
past_is_pad = torch.zeros(64, 168)
future_time_feat = torch.rand(64, 24, 4)
future_target_cdf = torch.rand(64, 24, target_dim_for_unrolling)#just the normal time series
future_observed_values = torch.ones(64, 24, target_dim_for_unrolling)

optimizer = torch.optim.Adam(transformer_training.parameters())

for i in range(100):
    optimizer.zero_grad()
    loss = transformer_training(
            target_dimension_indicator,
            past_time_feat,
            past_target_cdf, 
            past_observed_values,
            past_is_pad,
            future_time_feat,
            future_target_cdf,
            future_observed_values) 
    print("i--->", i, "loss: ", loss.item())
    
    optimizer.step()
    
    
num_parallel_samples = 3

time_grad_prediction_net = TransformerTempFlowPredictionNetwork(num_parallel_samples=num_parallel_samples,
                                                                input_size=input_size,
                                                                d_model=d_model,
                                                                num_heads=num_heads,
                                                                act_type=act_type,
                                                                dropout_rate=dropout_rate,
                                                                dim_feedforward_scale=dim_feedforward_scale,
                                                                num_encoder_layers=num_encoder_layers,
                                                                num_decoder_layers=num_decoder_layers,
                                                                history_length=history_length,
                                                                context_length=context_length,
                                                                prediction_length=prediction_length,
                                                                lags_seq=lags_seq,
                                                                target_dim=target_dim,
                                                                conditioning_length=conditioning_length,
                                                                flow_type=flow_type,
                                                                n_blocks=n_blocks,
                                                                hidden_size=hidden_size,
                                                                n_hidden=n_hidden,
                                                                dequantize=dequantize,
                                                                scaling=scaling)

copy_parameters(transformer_training, time_grad_prediction_net)      
time_grad_prediction_net.eval()

pred = time_grad_prediction_net(target_dimension_indicator,
                                past_time_feat,
                                past_target_cdf,
                                past_observed_values,
                                past_is_pad,
                                future_time_feat
                                )

i---> 0 loss:  476.4208984375
i---> 1 loss:  821.38037109375
i---> 2 loss:  1180.3878173828125
i---> 3 loss:  1547.8414306640625
i---> 4 loss:  1917.1341552734375
i---> 5 loss:  2286.867919921875
i---> 6 loss:  2656.928466796875
i---> 7 loss:  3027.243896484375
i---> 8 loss:  3397.939208984375
i---> 9 loss:  3769.500732421875
i---> 10 loss:  4141.57763671875
i---> 11 loss:  4514.36328125
i---> 12 loss:  4888.3359375
i---> 13 loss:  5263.00927734375
i---> 14 loss:  5638.74755859375
i---> 15 loss:  6015.81689453125
i---> 16 loss:  6394.91650390625
i---> 17 loss:  6776.17626953125
i---> 18 loss:  7160.25732421875
i---> 19 loss:  7547.38330078125
i---> 20 loss:  7937.96875
i---> 21 loss:  8332.2666015625
i---> 22 loss:  8730.123046875
i---> 23 loss:  9131.365234375
i---> 24 loss:  9535.662109375
i---> 25 loss:  9942.6552734375
i---> 26 loss:  10352.0693359375
i---> 27 loss:  10763.4697265625
i---> 28 loss:  11176.57421875
i---> 29 loss:  11591.3662109375
i---> 30 loss:  12007.4755859375
i---> 31 loss:  12424.828125
i---> 32 loss:  12843.2275390625
i---> 33 loss:  13262.4892578125
i---> 34 loss:  13682.5009765625
i---> 35 loss:  inf
Traceback (most recent call last):

  File line: 682 in <module>
    loss = transformer_training(

  File ~/anaconda3/envs/yellow/lib/python3.9/site-packages/torch/nn/modules/module.py:1102 in _call_impl
    return forward_call(*input, **kwargs)

  File line:399 in forward
    loss = -self.flow.log_prob(future_target_cdf, distr_args).unsqueeze(-1)

  File /pts/modules/flows.py:345 in log_prob
    return torch.sum(self.base_dist.log_prob(u) + sum_log_abs_det_jacobians, dim=-1)

  File ~/anaconda3/envs/yellow/lib/python3.9/site-packages/torch/distributions/normal.py:73 in log_prob
    self._validate_sample(value)

  File ~/anaconda3/envs/yellow/lib/python3.9/site-packages/torch/distributions/distribution.py:288 in _validate_sample
    raise ValueError(

ValueError: Expected value argument (Tensor of shape (64, 24, 370)) to be within the support (Real()) of the distribution Normal(loc: torch.Size([370]), scale: torch.Size([370])), but found invalid values:
tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        ...,

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]]], grad_fn=<AddBackward0>)

Mar 15 '22 13:03 vamp-ire-tap

thanks for the detailed report.... since the model trains for a while I suppose the issue is with the data being considered. Can you kindly see if it works with DeepVAR estimator?

Recall that normalizing flows don't work well with discrete data so that could be one issue?

Mar 18 '22 07:03 kashif

Please, did you solve your problem? I have the exact same problem with the solar dataset.

Nov 08 '22 09:11 hanlaoshi

I believe with solar since the values in the dataset are somewhat discrete you will need to set the dequantize=True, in the estimator... can you kindly try?

Nov 08 '22 09:11 kashif

I believe with solar since the values in the dataset are somewhat discrete you will need to set the dequantize=True, in the estimator... can you kindly try?

@kashif Actually, 'dequantize=True 'was what I had set in the 'Transformer-MAF'.

Nov 08 '22 09:11 hanlaoshi

I believe with solar since the values in the dataset are somewhat discrete you will need to set the dequantize=True, in the estimator... can you kindly try?

Hi, @kashif @vamp-ire-tap In the past several days, I have conducted a number of experiments on "Transformer-MAF," and I have discovered that this problem may be caused by the version of PyTorch. Specifically, this exception vanished when I downgraded PyTorch from 1.13 to 1.10, and training and prediction progress are now fluid. I hope that anybody confronts this issue can benefit from this method.

Nov 09 '22 16:11 hanlaoshi

hmm ok strange... wonder why that could be...

Nov 09 '22 17:11 kashif

pytorch-ts pytorch-ts copied to clipboard

Unstability in training for TransformerTempFlow model, requiring help

pytorch-ts
pytorch-ts copied to clipboard