WeatherBench
WeatherBench copied to clipboard
use multivariate series with data generator
Hi, I want to use a multivariate series.
So, I have for example t2m
and sm100
data. I want to use both in order to train the model but predict on t2m
.
I tried to use the data generator from here but when when I call the fit method, it throws
ValueError: applied function returned data with unexpected number of dimensions. Received 1 dimension(s) but expected 0 dimensions with names: ()
at the line y = self.data.isel(forecast_time=idxs + self.lead_time).values
in __getitem__
method.
Also, not that in the data generator , I have commented out the lines
self.n_samples = self.data.isel(forecast_time=slice(0, -lead_time)).shape[0]
self.init_time = self.data.isel(forecast_time=slice(None, -lead_time)).forecast_time
self.valid_time = self.data.isel(forecast_time=slice(lead_time, None)).forecast_time
( I am using self.n_samples = self.data.forecast_time.size
instead)
If I use it , it throws me:
TypeError: 'DataArray' object cannot be interpreted as an integer
Any ideas about that? Thanks!
the code:
import xarray as xr
import tensorflow as tf
import numpy as np
from collections import OrderedDict
#from tensorflow.keras.layers import Input, Conv2D, Dense
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Input, Conv2D, TimeDistributed,\
MaxPooling2D, Flatten, RepeatVector, Reshape, Lambda, GlobalAveragePooling2D,\
Bidirectional, ConvLSTM2D, BatchNormalization
t2m = xr.open_dataset("/home/ggousios/s2s-ai-challenge-agroapps/t2m.nc")
sm100 = xr.open_dataset("/home/ggousios/s2s-ai-challenge-agroapps/sm100.nc")
lead_time = t2m.isel(lead_time=0).lead_time
class PeriodicPadding2D(tf.keras.layers.Layer):
def __init__(self,
pad_width,
**kwargs):
super().__init__(**kwargs)
self.pad_width = pad_width
def call(self,
inputs,
**kwargs):
if self.pad_width == 0:
return inputs
inputs_padded = tf.concat(
[inputs[:, :, -self.pad_width:, :],
inputs,
inputs[:, :, :self.pad_width, :]],
axis=2)
# Zero padding in the lat direction
inputs_padded = tf.pad(inputs_padded,
[[0, 0],
[self.pad_width,
self.pad_width],
[0, 0],
[0, 0]])
return inputs_padded
def get_config(self):
config = super().get_config()
config.update({'pad_width': self.pad_width})
return config
class PeriodicConv2D(tf.keras.layers.Layer):
def __init__(self,
filters,
kernel_size,
conv_kwargs={},
**kwargs, ):
super().__init__(**kwargs)
self.filters = filters
self.kernel_size = kernel_size
self.conv_kwargs = conv_kwargs
if type(kernel_size) is not int:
assert kernel_size[0] == kernel_size[1], \
'PeriodicConv2D only works for square kernels'
kernel_size = kernel_size[0]
pad_width = (kernel_size - 1) // 2
self.padding = PeriodicPadding2D(pad_width)
self.conv = Conv2D(
filters, kernel_size, padding='valid', **conv_kwargs
)
def call(self, inputs):
return self.conv(self.padding(inputs))
def get_config(self):
config = super().get_config()
config.update({'filters': self.filters,
'kernel_size': self.kernel_size,
'conv_kwargs': self.conv_kwargs})
return config
class DataGenerator(tf.keras.utils.Sequence):
def __init__(self,
ds,
var_dict,
lead_time,
batch_size,
shuffle=True,
load=True,
mean=None,
std=None):
"""
Data generator for WeatherBench data.
Template from https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
Args:
ds: Dataset containing all variables
var_dict: Dictionary of the form {'var': level}. Use None for level if data is of single level
lead_time: Lead time in hours
batch_size: Batch size
shuffle: bool. If True, data is shuffled.
load: bool. If True, datadet is loaded into RAM.
mean: If None, compute mean from data.
std: If None, compute standard deviation from data.
"""
self.ds = ds
self.var_dict = var_dict
self.batch_size = batch_size
self.shuffle = shuffle
self.lead_time = lead_time
data = []
generic_level = xr.DataArray([1])#, coords={'realization': [1]}, dims=['realization'])
for var, levels in var_dict.items():
#try:
data.append(ds[var])#.sel(realization=levels))
#except ValueError:
# data.append(ds[var].expand_dims({'realization': generic_level}, 1))
self.data = xr.concat(data, 'realization').transpose('forecast_time', ...)
self.mean = self.data.mean(('forecast_time')).compute() if mean is None else mean
self.std = self.data.std('forecast_time').compute() if std is None else std
# Normalize
self.data = (self.data - self.mean) / self.std
#self.n_samples = self.data.isel(forecast_time=slice(0, -lead_time)).shape[0]
self.n_samples = self.data.forecast_time.size
# self.init_time = self.data.isel(forecast_time=slice(None, -lead_time)).forecast_time
# self.valid_time = self.data.isel(forecast_time=slice(lead_time, None)).forecast_time
self.on_epoch_end()
# For some weird reason calling .load() earlier messes up the mean and std computations
if load: print('Loading data into RAM'); self.data.load()
def __len__(self):
'Denotes the number of batches per epoch'
return int(np.ceil(self.n_samples / self.batch_size))
def __getitem__(self, i):
'Generate one batch of data'
idxs = self.idxs[i * self.batch_size:(i + 1) * self.batch_size]
X = self.data.isel(forecast_time=idxs).values
y = self.data.isel(forecast_time=idxs + self.lead_time).values
return X, y
def on_epoch_end(self):
'Updates indexes after each epoch'
self.idxs = np.arange(self.n_samples)
if self.shuffle == True:
np.random.shuffle(self.idxs)
datasets = [t2m, sm100]
ds = xr.merge(datasets)
# I am choosing only t2m (not tp) and sm100 here
dic = OrderedDict({'t2m': None, 'sm100': None})
dg_train = DataGenerator(
ds.sel(forecast_time=slice('2000', '2001')),
dic,
lead_time=lead_time,
batch_size=8,
load=True)
dg_valid = DataGenerator(
ds.sel(forecast_time=slice('2018', '2019')),
dic,
lead_time=lead_time,
batch_size=8,
mean=dg_train.mean,
std=dg_train.std,
shuffle=False)
def custom_categ_crossentropy(y_true, y_pred, sample_weight=None):
y_true = tf.one_hot(tf.cast(y_true,'int32'), depth=3)
loss = tf.keras.losses.CategoricalCrossentropy()(y_true, y_pred)
return loss
def build_cnn(filters,
kernels,
input_shape):
inputs = Input(batch_shape=(8,
121,
240,
1))
print(inputs.shape)
x = (PeriodicConv2D(filters,
kernels,
conv_kwargs={'activation':'relu'}))((inputs))
x = PeriodicConv2D(32,
5,
conv_kwargs={'activation':'relu'})(x)
output = Dense(3, activation='softmax')(x)
model = Model(inputs, output)
model.compile(optimizer='sgd',
loss=custom_categ_crossentropy,
metrics=['accuracy'])
print(model.summary())
return model
def fit():
model = build_cnn(64, 9, (121, 240, 1))
history = model.fit(dg_train,
epochs=10,
validation_data=dg_valid)
return history
history = fit()
The data