pytorch-ts
Instability in training for the TransformerTempFlow model, help needed
Hello, I have the following code as an example, abstracted from the originally provided example. When I start training the network, it gives me NaNs and other stability issues, both for random sample data and for real application data. Could the authors please help me debug this issue? I would be thankful!
from typing import List, Optional, Tuple
import torch
import torch.nn as nn
from gluonts.core.component import validated
from pts.modules import RealNVP, MAF, FlowOutput, MeanScaler, NOPScaler
from gluonts.torch.util import copy_parameters
class TransformerTempFlowTrainingNetwork(nn.Module):
@validated()
def __init__(
self,
input_size: int,
d_model: int,
num_heads: int,
act_type: str,
dropout_rate: float,
dim_feedforward_scale: int,
num_encoder_layers: int,
num_decoder_layers: int,
history_length: int,
context_length: int,
prediction_length: int,
lags_seq: List[int],
target_dim: int,
conditioning_length: int,
flow_type: str,
n_blocks: int,
hidden_size: int,
n_hidden: int,
dequantize: bool,
scaling: bool = True,
**kwargs,
) -> None:
super().__init__(**kwargs)
# print('input_size', input_size)
# print('d_model', d_model)
# print('num_heads', num_heads)
# print('act_type', act_type)
# print('dropout_rate', dropout_rate)
# print('dim_feedforward_scale', dim_feedforward_scale)
# print('num_encoder_layers', num_encoder_layers)
# print('num_decoder_layers', num_decoder_layers)
# print('history_length', history_length)
# print('context_length', context_length)
# print('prediction_length', prediction_length)
# print('lags_seq', lags_seq)
# print('target_dim', target_dim)
# print('conditioning_length', conditioning_length)
# print('flow_type', flow_type)
# print('n_blocks', n_blocks)
# print('hidden_size', hidden_size)
# print('n_hidden', n_hidden)
# print('dequantize', dequantize)
# print('scaling', scaling)
self.target_dim = target_dim
self.prediction_length = prediction_length
self.context_length = context_length
self.history_length = history_length
self.scaling = scaling
assert len(set(lags_seq)) == len(lags_seq), "no duplicated lags allowed!"
lags_seq.sort()
self.lags_seq = lags_seq
self.encoder_input = nn.Linear(input_size, d_model)
self.decoder_input = nn.Linear(input_size, d_model)
# [B, T, d_model] where d_model / num_heads is int
self.transformer = nn.Transformer(
d_model=d_model,
nhead=num_heads,
num_encoder_layers=num_encoder_layers,
num_decoder_layers=num_decoder_layers,
dim_feedforward=dim_feedforward_scale * d_model,
dropout=dropout_rate,
activation=act_type,
)
flow_cls = {
"RealNVP": RealNVP,
"MAF": MAF,
}[flow_type]
self.flow = flow_cls(
input_size=target_dim,
n_blocks=n_blocks,
n_hidden=n_hidden,
hidden_size=hidden_size,
cond_label_size=conditioning_length,
)
self.dequantize = dequantize
self.distr_output = FlowOutput(
self.flow, input_size=target_dim, cond_size=conditioning_length
)
self.proj_dist_args = self.distr_output.get_args_proj(d_model)
self.embed_dim = 1
self.embed = nn.Embedding(
num_embeddings=self.target_dim, embedding_dim=self.embed_dim
)
if self.scaling:
self.scaler = MeanScaler(keepdim=True)
else:
self.scaler = NOPScaler(keepdim=True)
# mask
self.register_buffer(
"tgt_mask",
self.transformer.generate_square_subsequent_mask(prediction_length),
)
@staticmethod
def get_lagged_subsequences(
sequence: torch.Tensor,
sequence_length: int,
indices: List[int],
subsequences_length: int = 1,
) -> torch.Tensor:
"""
Returns lagged subsequences of a given sequence.
Parameters
----------
sequence
the sequence from which lagged subsequences should be extracted.
Shape: (N, T, C).
sequence_length
length of sequence in the T (time) dimension (axis = 1).
indices
list of lag indices to be used.
subsequences_length
length of the subsequences to be extracted.
Returns
--------
lagged : Tensor
a tensor of shape (N, S, C, I),
where S = subsequences_length and I = len(indices),
containing lagged subsequences.
Specifically, lagged[i, :, j, k] = sequence[i, -indices[k]-S+j, :].
"""
# we must have: history_length + begin_index >= 0
# that is: history_length - lag_index - sequence_length >= 0
# hence the following assert
assert max(indices) + subsequences_length <= sequence_length, (
f"lags cannot go further than history length, found lag "
f"{max(indices)} while history length is only {sequence_length}"
)
assert all(lag_index >= 0 for lag_index in indices)
lagged_values = []
for lag_index in indices:
begin_index = -lag_index - subsequences_length
end_index = -lag_index if lag_index > 0 else None
lagged_values.append(sequence[:, begin_index:end_index, ...].unsqueeze(1))
return torch.cat(lagged_values, dim=1).permute(0, 2, 3, 1)
def create_network_input(
self,
past_time_feat: torch.Tensor,
past_target_cdf: torch.Tensor,
past_observed_values: torch.Tensor,
past_is_pad: torch.Tensor,
future_time_feat: Optional[torch.Tensor],
future_target_cdf: Optional[torch.Tensor],
target_dimension_indicator: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor,]:
"""
Unrolls the RNN encoder over past and, if present, future data.
Returns outputs and state of the encoder, plus the scale of
past_target_cdf and a vector of static features that was constructed
and fed as input to the encoder. All tensor arguments should have NTC
layout.
Parameters
----------
past_time_feat
Past time features (batch_size, history_length, num_features)
past_target_cdf
Past marginal CDF transformed target values (batch_size,
history_length, target_dim)
past_observed_values
Indicator whether or not the values were observed (batch_size,
history_length, target_dim)
past_is_pad
Indicator whether the past target values have been padded
(batch_size, history_length)
future_time_feat
Future time features (batch_size, prediction_length, num_features)
future_target_cdf
Future marginal CDF transformed target values (batch_size,
prediction_length, target_dim)
target_dimension_indicator
Dimensionality of the time series (batch_size, target_dim)
Returns
-------
outputs
RNN outputs (batch_size, seq_len, num_cells)
states
RNN states. Nested list with (batch_size, num_cells) tensors with
dimensions target_dim x num_layers x (batch_size, num_cells)
scale
Mean scales for the time series (batch_size, 1, target_dim)
lags_scaled
Scaled lags(batch_size, sub_seq_len, target_dim, num_lags)
inputs
inputs to the RNN
"""
past_observed_values = torch.min(
past_observed_values, 1 - past_is_pad.unsqueeze(-1)
)
if future_time_feat is None or future_target_cdf is None:
time_feat = past_time_feat[:, -self.context_length :, ...]
sequence = past_target_cdf
sequence_length = self.history_length
subsequences_length = self.context_length
else:
time_feat = torch.cat(
(
past_time_feat[:, -self.context_length :, ...],
future_time_feat,
),
dim=1,
)
sequence = torch.cat((past_target_cdf, future_target_cdf), dim=1)
sequence_length = self.history_length + self.prediction_length
subsequences_length = self.context_length + self.prediction_length
# (batch_size, sub_seq_len, target_dim, num_lags)
lags = self.get_lagged_subsequences(
sequence=sequence,
sequence_length=sequence_length,
indices=self.lags_seq,
subsequences_length=subsequences_length,
)
# scale is computed on the context length last units of the past target
# scale shape is (batch_size, 1, target_dim)
_, scale = self.scaler(
past_target_cdf[:, -self.context_length :, ...],
past_observed_values[:, -self.context_length :, ...],
)
# (batch_size, sub_seq_len, target_dim, num_lags)
lags_scaled = lags / scale.unsqueeze(-1)
# assert_shape(
# lags_scaled, (-1, unroll_length, self.target_dim, len(self.lags_seq)),
# )
input_lags = lags_scaled.reshape(
(-1, subsequences_length, len(self.lags_seq) * self.target_dim)
)
# (batch_size, target_dim, embed_dim)
index_embeddings = self.embed(target_dimension_indicator)
# assert_shape(index_embeddings, (-1, self.target_dim, self.embed_dim))
# (batch_size, seq_len, target_dim * embed_dim)
repeated_index_embeddings = (
index_embeddings.unsqueeze(1)
.expand(-1, subsequences_length, -1, -1)
.reshape((-1, subsequences_length, self.target_dim * self.embed_dim))
)
# (batch_size, sub_seq_len, input_dim)
inputs = torch.cat((input_lags, repeated_index_embeddings, time_feat), dim=-1)
return inputs, scale, index_embeddings
def distr_args(self, decoder_output: torch.Tensor):
"""
Returns the distribution of DeepVAR with respect to the RNN outputs.
Parameters
----------
rnn_outputs
Outputs of the unrolled RNN (batch_size, seq_len, num_cells)
scale
Mean scale for each time series (batch_size, 1, target_dim)
Returns
-------
distr
Distribution instance
distr_args
Distribution arguments
"""
# decoder_output: torch.Size([64, 24, 16])
# distr_args: torch.Size([64, 24, 100])
(distr_args,) = self.proj_dist_args(decoder_output)
# # compute likelihood of target given the predicted parameters
# distr = self.distr_output.distribution(distr_args, scale=scale)
# return distr, distr_args
return distr_args
def forward(
self,
target_dimension_indicator: torch.Tensor,
past_time_feat: torch.Tensor,
past_target_cdf: torch.Tensor,
past_observed_values: torch.Tensor,
past_is_pad: torch.Tensor,
future_time_feat: torch.Tensor,
future_target_cdf: torch.Tensor,
future_observed_values: torch.Tensor,
) -> Tuple[torch.Tensor, ...]:
"""
Computes the loss for training DeepVAR, all inputs tensors representing
time series have NTC layout.
Parameters
----------
target_dimension_indicator
Indices of the target dimension (batch_size, target_dim)
past_time_feat
Dynamic features of past time series (batch_size, history_length,
num_features)
past_target_cdf
Past marginal CDF transformed target values (batch_size,
history_length, target_dim)
past_observed_values
Indicator whether or not the values were observed (batch_size,
history_length, target_dim)
past_is_pad
Indicator whether the past target values have been padded
(batch_size, history_length)
future_time_feat
Future time features (batch_size, prediction_length, num_features)
future_target_cdf
Future marginal CDF transformed target values (batch_size,
prediction_length, target_dim)
future_observed_values
Indicator whether or not the future values were observed
(batch_size, prediction_length, target_dim)
Returns
-------
distr
Loss with shape (batch_size, 1)
likelihoods
Likelihoods for each time step
(batch_size, context + prediction_length, 1)
distr_args
Distribution arguments (context + prediction_length,
number_of_arguments)
"""
# seq_len = self.context_length + self.prediction_length
# unroll the decoder in "training mode", i.e. by providing future data
# as well
inputs, scale, _ = self.create_network_input(
past_time_feat=past_time_feat,
past_target_cdf=past_target_cdf,
past_observed_values=past_observed_values,
past_is_pad=past_is_pad,
future_time_feat=future_time_feat,
future_target_cdf=future_target_cdf,
target_dimension_indicator=target_dimension_indicator,
)
enc_inputs = inputs[:, : self.context_length, ...]
dec_inputs = inputs[:, self.context_length :, ...]
enc_out = self.transformer.encoder(
self.encoder_input(enc_inputs).permute(1, 0, 2)
)
dec_output = self.transformer.decoder(
self.decoder_input(dec_inputs).permute(1, 0, 2),
enc_out,
tgt_mask=self.tgt_mask,
)
if self.scaling:
self.flow.scale = scale
# we sum the last axis to have the same shape for all likelihoods
# (batch_size, subseq_length, 1)
if self.dequantize:
future_target_cdf += torch.rand_like(future_target_cdf)
distr_args = self.distr_args(decoder_output=dec_output.permute(1, 0, 2))
# likelihoods = -self.flow.log_prob(target, distr_args).unsqueeze(-1)
loss = -self.flow.log_prob(future_target_cdf, distr_args).unsqueeze(-1)
# # assert_shape(likelihoods, (-1, seq_len, 1))
# past_observed_values = torch.min(
# past_observed_values, 1 - past_is_pad.unsqueeze(-1)
# )
# # (batch_size, subseq_length, target_dim)
# observed_values = torch.cat(
# (
# past_observed_values[:, -self.context_length :, ...],
# future_observed_values,
# ),
# dim=1,
# )
# # mask the loss at one time step if one or more observations is missing
# # in the target dimensions (batch_size, subseq_length, 1)
# loss_weights, _ = observed_values.min(dim=-1, keepdim=True)
# # assert_shape(loss_weights, (-1, seq_len, 1))
# loss = weighted_average(likelihoods, weights=loss_weights, dim=1)
# assert_shape(loss, (-1, -1, 1))
# self.distribution = distr
# return (loss.mean(), likelihoods, distr_args)
return loss.mean()#, distr_args
class TransformerTempFlowPredictionNetwork(TransformerTempFlowTrainingNetwork):
def __init__(self, num_parallel_samples: int, **kwargs) -> None:
super().__init__(**kwargs)
self.num_parallel_samples = num_parallel_samples
# for decoding the lags are shifted by one,
# at the first time-step of the decoder a lag of one corresponds to
# the last target value
self.shifted_lags = [l - 1 for l in self.lags_seq]
def sampling_decoder(
self,
past_target_cdf: torch.Tensor,
target_dimension_indicator: torch.Tensor,
time_feat: torch.Tensor,
scale: torch.Tensor,
enc_out: torch.Tensor,
) -> torch.Tensor:
"""
Computes sample paths by unrolling the RNN starting with a initial
input and state.
Parameters
----------
past_target_cdf
Past marginal CDF transformed target values (batch_size,
history_length, target_dim)
target_dimension_indicator
Indices of the target dimension (batch_size, target_dim)
time_feat
Dynamic features of future time series (batch_size, history_length,
num_features)
scale
Mean scale for each time series (batch_size, 1, target_dim)
begin_states
List of initial states for the RNN layers (batch_size, num_cells)
Returns
--------
sample_paths : Tensor
A tensor containing sampled paths. Shape: (1, num_sample_paths,
prediction_length, target_dim).
"""
def repeat(tensor, dim=0):
return tensor.repeat_interleave(repeats=self.num_parallel_samples, dim=dim)
# blows-up the dimension of each tensor to
# batch_size * self.num_sample_paths for increasing parallelism
repeated_past_target_cdf = repeat(past_target_cdf)
repeated_time_feat = repeat(time_feat)
repeated_scale = repeat(scale)
if self.scaling:
self.flow.scale = repeated_scale
repeated_target_dimension_indicator = repeat(target_dimension_indicator)
repeated_enc_out = repeat(enc_out, dim=1)
future_samples = []
# for each future time-units we draw new samples for this time-unit
# and update the state
for k in range(self.prediction_length):
print("k: ", k, '/',self.prediction_length)
lags = self.get_lagged_subsequences(
sequence=repeated_past_target_cdf,
sequence_length=self.history_length + k,
indices=self.shifted_lags,
subsequences_length=1,
)
lags_scaled = lags / repeated_scale.unsqueeze(-1)
input_lags = lags_scaled.reshape(
(-1, 1, len(self.lags_seq) * self.target_dim)
)
# (batch_size, target_dim, embed_dim)
index_embeddings = self.embed(repeated_target_dimension_indicator)
# assert_shape(index_embeddings, (-1, self.target_dim, self.embed_dim))
# (batch_size, seq_len, target_dim * embed_dim)
repeated_index_embeddings = (
index_embeddings.unsqueeze(1)
.expand(-1, 1, -1, -1)
.reshape((-1, 1, self.target_dim * self.embed_dim))
)
# (batch_size, sub_seq_len, input_dim)
dec_input = torch.cat(
(
input_lags,
repeated_index_embeddings,
repeated_time_feat[:, k : k + 1, ...],
),
dim=-1,
)
dec_output = self.transformer.decoder(
self.decoder_input(dec_input).permute(1, 0, 2), repeated_enc_out
)
distr_args = self.distr_args(decoder_output=dec_output.permute(1, 0, 2))
# (batch_size, 1, target_dim)
new_samples = self.flow.sample(cond=distr_args)
# (batch_size, seq_len, target_dim)
future_samples.append(new_samples)
repeated_past_target_cdf = torch.cat(
(repeated_past_target_cdf, new_samples), dim=1
)
# (batch_size * num_samples, prediction_length, target_dim)
samples = torch.cat(future_samples, dim=1)
# (batch_size, num_samples, prediction_length, target_dim)
return samples.reshape(
(
-1,
self.num_parallel_samples,
self.prediction_length,
self.target_dim,
)
)
def forward(
self,
target_dimension_indicator: torch.Tensor,
past_time_feat: torch.Tensor,
past_target_cdf: torch.Tensor,
past_observed_values: torch.Tensor,
past_is_pad: torch.Tensor,
future_time_feat: torch.Tensor,
) -> torch.Tensor:
"""
Predicts samples given the trained DeepVAR model.
All tensors should have NTC layout.
Parameters
----------
target_dimension_indicator
Indices of the target dimension (batch_size, target_dim)
past_time_feat
Dynamic features of past time series (batch_size, history_length,
num_features)
past_target_cdf
Past marginal CDF transformed target values (batch_size,
history_length, target_dim)
past_observed_values
Indicator whether or not the values were observed (batch_size,
history_length, target_dim)
past_is_pad
Indicator whether the past target values have been padded
(batch_size, history_length)
future_time_feat
Future time features (batch_size, prediction_length, num_features)
Returns
-------
sample_paths : Tensor
A tensor containing sampled paths (1, num_sample_paths,
prediction_length, target_dim).
"""
# mark padded data as unobserved
# (batch_size, target_dim, seq_len)
past_observed_values = torch.min(
past_observed_values, 1 - past_is_pad.unsqueeze(-1)
)
inputs, scale, static_feat = self.create_network_input(
past_time_feat=past_time_feat,
past_target_cdf=past_target_cdf,
past_observed_values=past_observed_values,
past_is_pad=past_is_pad,
future_time_feat=None,
future_target_cdf=None,
target_dimension_indicator=target_dimension_indicator,
)
enc_out = self.transformer.encoder(self.encoder_input(inputs).permute(1, 0, 2))
return self.sampling_decoder(
past_target_cdf=past_target_cdf,
target_dimension_indicator=target_dimension_indicator,
time_feat=future_time_feat,
scale=scale,
enc_out=enc_out,
)
input_size=1484
d_model=16
num_heads=4
act_type='gelu'
dropout_rate=0.1
dim_feedforward_scale=4
num_encoder_layers=3
num_decoder_layers=3
history_length=168
context_length=24
prediction_length=24
lags_seq=[1,24,144]
target_dim=370
conditioning_length=100
flow_type='RealNVP'
n_blocks=3
hidden_size=100
n_hidden=2
dequantize=True
scaling=True
num_parallel_samples=9
transformer_training = TransformerTempFlowTrainingNetwork(input_size,
d_model,
num_heads,
act_type,
dropout_rate,
dim_feedforward_scale,
num_encoder_layers,
num_decoder_layers,
history_length,
context_length,
prediction_length,
lags_seq,
target_dim,
conditioning_length,
flow_type,
n_blocks,
hidden_size,
n_hidden,
dequantize,
scaling)
target_dim_for_unrolling = 370
target_dimension_indicator = torch.arange(target_dim_for_unrolling).repeat(64).reshape(64,target_dim_for_unrolling)
past_time_feat = torch.rand(64, 168, 4)
past_target_cdf = torch.rand(64, 168, target_dim_for_unrolling)  # just the normal time series, not a CDF; see the rename-fields step in the original TimeGrad estimator
past_observed_values = torch.ones(64, 168, target_dim_for_unrolling)
past_is_pad = torch.zeros(64, 168)
future_time_feat = torch.rand(64, 24, 4)
future_target_cdf = torch.rand(64, 24, target_dim_for_unrolling)  # just the normal time series
future_observed_values = torch.ones(64, 24, target_dim_for_unrolling)
optimizer = torch.optim.Adam(transformer_training.parameters())
for i in range(100):
optimizer.zero_grad()
loss = transformer_training(
target_dimension_indicator,
past_time_feat,
past_target_cdf,
past_observed_values,
past_is_pad,
future_time_feat,
future_target_cdf,
future_observed_values)
print("i--->", i, "loss: ", loss.item())
loss.backward()  # backpropagate so that optimizer.step() actually updates the parameters
optimizer.step()
num_parallel_samples = 3
time_grad_prediction_net = TransformerTempFlowPredictionNetwork(num_parallel_samples=num_parallel_samples,
input_size=input_size,
d_model=d_model,
num_heads=num_heads,
act_type=act_type,
dropout_rate=dropout_rate,
dim_feedforward_scale=dim_feedforward_scale,
num_encoder_layers=num_encoder_layers,
num_decoder_layers=num_decoder_layers,
history_length=history_length,
context_length=context_length,
prediction_length=prediction_length,
lags_seq=lags_seq,
target_dim=target_dim,
conditioning_length=conditioning_length,
flow_type=flow_type,
n_blocks=n_blocks,
hidden_size=hidden_size,
n_hidden=n_hidden,
dequantize=dequantize,
scaling=scaling)
copy_parameters(transformer_training, time_grad_prediction_net)
time_grad_prediction_net.eval()
pred = time_grad_prediction_net(target_dimension_indicator,
past_time_feat,
past_target_cdf,
past_observed_values,
past_is_pad,
future_time_feat
)
i---> 0 loss: 476.4208984375
i---> 1 loss: 821.38037109375
i---> 2 loss: 1180.3878173828125
i---> 3 loss: 1547.8414306640625
i---> 4 loss: 1917.1341552734375
i---> 5 loss: 2286.867919921875
i---> 6 loss: 2656.928466796875
i---> 7 loss: 3027.243896484375
i---> 8 loss: 3397.939208984375
i---> 9 loss: 3769.500732421875
i---> 10 loss: 4141.57763671875
i---> 11 loss: 4514.36328125
i---> 12 loss: 4888.3359375
i---> 13 loss: 5263.00927734375
i---> 14 loss: 5638.74755859375
i---> 15 loss: 6015.81689453125
i---> 16 loss: 6394.91650390625
i---> 17 loss: 6776.17626953125
i---> 18 loss: 7160.25732421875
i---> 19 loss: 7547.38330078125
i---> 20 loss: 7937.96875
i---> 21 loss: 8332.2666015625
i---> 22 loss: 8730.123046875
i---> 23 loss: 9131.365234375
i---> 24 loss: 9535.662109375
i---> 25 loss: 9942.6552734375
i---> 26 loss: 10352.0693359375
i---> 27 loss: 10763.4697265625
i---> 28 loss: 11176.57421875
i---> 29 loss: 11591.3662109375
i---> 30 loss: 12007.4755859375
i---> 31 loss: 12424.828125
i---> 32 loss: 12843.2275390625
i---> 33 loss: 13262.4892578125
i---> 34 loss: 13682.5009765625
i---> 35 loss: inf
Traceback (most recent call last):
File line: 682 in <module>
loss = transformer_training(
File ~/anaconda3/envs/yellow/lib/python3.9/site-packages/torch/nn/modules/module.py:1102 in _call_impl
return forward_call(*input, **kwargs)
File line:399 in forward
loss = -self.flow.log_prob(future_target_cdf, distr_args).unsqueeze(-1)
File /pts/modules/flows.py:345 in log_prob
return torch.sum(self.base_dist.log_prob(u) + sum_log_abs_det_jacobians, dim=-1)
File ~/anaconda3/envs/yellow/lib/python3.9/site-packages/torch/distributions/normal.py:73 in log_prob
self._validate_sample(value)
File ~/anaconda3/envs/yellow/lib/python3.9/site-packages/torch/distributions/distribution.py:288 in _validate_sample
raise ValueError(
ValueError: Expected value argument (Tensor of shape (64, 24, 370)) to be within the support (Real()) of the distribution Normal(loc: torch.Size([370]), scale: torch.Size([370])), but found invalid values:
tensor([[[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]],
[[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]],
[[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]],
...,
[[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]],
[[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]],
[[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]]], grad_fn=<AddBackward0>)
Thanks for the detailed report... since the model trains for a while, I suppose the issue is with the data being considered. Can you kindly see if it works with the DeepVAR estimator?
Recall that normalizing flows don't work well with discrete data, so that could be one issue.
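To illustrate the point about discrete data, here is a minimal sketch of uniform dequantization (the general technique, not pts's internal implementation; the tensor shapes just mirror the toy example above):

import torch

# Integer-valued targets (e.g. counts): a flow has to put density mass on
# isolated points, which tends to make the log-likelihood blow up.
discrete_target = torch.randint(0, 10, (64, 24, 370)).float()

# Uniform dequantization: spread each integer over [v, v + 1) by adding
# U(0, 1) noise, so the flow is trained on a continuous target instead.
dequantized_target = discrete_target + torch.rand_like(discrete_target)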
Please, did you solve your problem? I have the exact same problem with the solar dataset.
I believe with solar, since the values in the dataset are somewhat discrete, you will need to set dequantize=True in the estimator... can you kindly try?
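For context, a sketch of where that flag goes when using the estimator rather than the raw networks; the argument names and values below are assumptions based on the pts examples and the configuration above, so verify them against your installed version:

from pts import Trainer
from pts.model.transformer_tempflow import TransformerTempFlowEstimator

# Sketch only: frequency, epochs and batch size are placeholders.
estimator = TransformerTempFlowEstimator(
    input_size=1484,
    freq="H",                # assumption: hourly data
    prediction_length=24,
    context_length=24,
    target_dim=370,
    flow_type="RealNVP",
    dequantize=True,         # the flag being suggested here
    trainer=Trainer(epochs=20, batch_size=64),
)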
@kashif Actually, dequantize=True was what I had set in the Transformer-MAF.
Hi @kashif @vamp-ire-tap, over the past several days I have run a number of experiments on Transformer-MAF, and I have found that this problem may be caused by the PyTorch version. Specifically, the exception vanished when I downgraded PyTorch from 1.13 to 1.10, and training and prediction now run smoothly. I hope that anybody who hits this issue can benefit from this workaround.
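For anyone who wants to try the same workaround, a quick check (the exact 1.10 patch release and install command depend on your environment):

# e.g. pip install "torch==1.10.*"
import torch
print(torch.__version__)  # confirm a 1.10.x build is active before re-running training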
hmm ok strange... wonder why that could be...