
Use a multivariate series with the data generator

ggous opened this issue on Sep 02 '21 · 0 comments

Hi, I want to use a multivariate series.

For example, I have t2m and sm100 data. I want to use both to train the model, but predict only t2m.
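
To make the layout I am after concrete, here is a toy sketch (the arrays are made up; only the dims mirror my data): the inputs stack both variables along a new channel dimension, while the target keeps only t2m.

import numpy as np
import xarray as xr

# Toy stand-ins for t2m and sm100 (made-up values, same dims as my data)
dims = ('forecast_time', 'latitude', 'longitude')
coords = {'forecast_time': np.arange(5), 'latitude': [0.0, 1.0], 'longitude': [0.0, 1.0]}
t2m_toy = xr.DataArray(np.random.rand(5, 2, 2), dims=dims, coords=coords, name='t2m')
sm100_toy = xr.DataArray(np.random.rand(5, 2, 2), dims=dims, coords=coords, name='sm100')

# Inputs: both variables stacked along a new channel dim -> shape (2, 5, 2, 2)
X = xr.concat([t2m_toy, sm100_toy], dim='variable')
# Target: t2m only
y = t2m_toy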

I tried to use the data generator from here, but when I call the fit method, it throws

ValueError: applied function returned data with unexpected number of dimensions. Received 1 dimension(s) but expected 0 dimensions with names: ()

at the line y = self.data.isel(forecast_time=idxs + self.lead_time).values in the __getitem__ method.
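
I think the same ValueError can be reproduced in isolation, and I suspect the cause is that self.lead_time is a 0-d DataArray (it comes from t2m.isel(lead_time=0).lead_time), so idxs + self.lead_time is xarray arithmetic rather than an integer offset. A minimal example, at least with my xarray version:

import numpy as np
import xarray as xr

lead = xr.DataArray(2)   # a 0-d DataArray, like t2m.isel(lead_time=0).lead_time
idxs = np.arange(3)
# idxs + lead            # raises: Received 1 dimension(s) but expected 0 dimensions
idxs + int(lead)         # fine once the lead is a plain int: array([2, 3, 4])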

Also, note that in the data generator I have commented out the lines

self.n_samples = self.data.isel(forecast_time=slice(0, -lead_time)).shape[0]
self.init_time = self.data.isel(forecast_time=slice(None, -lead_time)).forecast_time
self.valid_time = self.data.isel(forecast_time=slice(lead_time, None)).forecast_time

(I am using self.n_samples = self.data.forecast_time.size instead.)

If I keep those lines in, it throws:

TypeError: 'DataArray' object cannot be interpreted as an integer
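
That one also seems reproducible with a 0-d DataArray in place of an int; as far as I can tell, slice bounds have to be plain Python integers:

import numpy as np
import xarray as xr

da = xr.DataArray(np.arange(10), dims='forecast_time')
lead = xr.DataArray(2)                           # 0-d DataArray again
# da.isel(forecast_time=slice(0, -lead))         # TypeError: 'DataArray' object cannot be interpreted as an integer
da.isel(forecast_time=slice(0, -int(lead)))      # works with a plain int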

Any ideas about that? Thanks!

The code:

import xarray as xr
import tensorflow as tf
import numpy as np
from collections import OrderedDict
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, Conv2D
        
t2m = xr.open_dataset("/home/ggousios/s2s-ai-challenge-agroapps/t2m.nc")
sm100 = xr.open_dataset("/home/ggousios/s2s-ai-challenge-agroapps/sm100.nc")

# NB: this is a 0-d DataArray (a scalar coordinate), not a plain Python int
lead_time = t2m.isel(lead_time=0).lead_time

class PeriodicPadding2D(tf.keras.layers.Layer):
    def __init__(self,
                 pad_width, 
                 **kwargs):
        super().__init__(**kwargs)
        self.pad_width = pad_width

    def call(self, 
             inputs,
             **kwargs):
        if self.pad_width == 0:
            return inputs
        # Periodic (wrap-around) padding in the lon direction
        inputs_padded = tf.concat(
            [inputs[:, :, -self.pad_width:, :],
             inputs,
             inputs[:, :, :self.pad_width, :]],
            axis=2)
        # Zero padding in the lat direction
        inputs_padded = tf.pad(inputs_padded, 
                               [[0, 0],
                                [self.pad_width,
                                 self.pad_width],
                                [0, 0], 
                                [0, 0]])
        return inputs_padded

    def get_config(self):
        config = super().get_config()
        config.update({'pad_width': self.pad_width})
        return config


class PeriodicConv2D(tf.keras.layers.Layer):
    def __init__(self,
                 filters,
                 kernel_size,
                 conv_kwargs={},
                 **kwargs):
        super().__init__(**kwargs)
        self.filters = filters
        self.kernel_size = kernel_size
        self.conv_kwargs = conv_kwargs
        if type(kernel_size) is not int:
            assert kernel_size[0] == kernel_size[1], \
                'PeriodicConv2D only works for square kernels'
            kernel_size = kernel_size[0]
        pad_width = (kernel_size - 1) // 2
        self.padding = PeriodicPadding2D(pad_width)
        self.conv = Conv2D(
            filters, kernel_size, padding='valid', **conv_kwargs
        )

    def call(self, inputs):
        return self.conv(self.padding(inputs))

    def get_config(self):
        config = super().get_config()
        config.update({'filters': self.filters,
                       'kernel_size': self.kernel_size, 
                       'conv_kwargs': self.conv_kwargs})
        return config
    
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, 
                 ds,
                 var_dict,
                 lead_time,
                 batch_size, 
                 shuffle=True,
                 load=True,
                 mean=None,
                 std=None):
        """
        Data generator for WeatherBench data.
        Template from https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
        Args:
            ds: Dataset containing all variables
            var_dict: Dictionary of the form {'var': level}. Use None for level if data is of single level
            lead_time: Lead time in hours
            batch_size: Batch size
            shuffle: bool. If True, data is shuffled.
            load: bool. If True, dataset is loaded into RAM.
            mean: If None, compute mean from data.
            std: If None, compute standard deviation from data.
        """
        self.ds = ds
        self.var_dict = var_dict
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.lead_time = lead_time

        data = []
        generic_level = xr.DataArray([1])#, coords={'realization': [1]}, dims=['realization'])
        for var, levels in var_dict.items():
            #try:
            data.append(ds[var])#.sel(realization=levels))
            #except ValueError:
            #    data.append(ds[var].expand_dims({'realization': generic_level}, 1))

        self.data = xr.concat(data, 'realization').transpose('forecast_time', ...)
        self.mean = self.data.mean('forecast_time').compute() if mean is None else mean
        self.std = self.data.std('forecast_time').compute() if std is None else std
        # Normalize
        self.data = (self.data - self.mean) / self.std
    
        #self.n_samples = self.data.isel(forecast_time=slice(0, -lead_time)).shape[0]
        self.n_samples = self.data.forecast_time.size
        # self.init_time = self.data.isel(forecast_time=slice(None, -lead_time)).forecast_time
        # self.valid_time = self.data.isel(forecast_time=slice(lead_time, None)).forecast_time

        self.on_epoch_end()

        # For some weird reason calling .load() earlier messes up the mean and std computations
        if load:
            print('Loading data into RAM')
            self.data.load()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(self.n_samples / self.batch_size))

    def __getitem__(self, i):
        'Generate one batch of data'
        idxs = self.idxs[i * self.batch_size:(i + 1) * self.batch_size]
        X = self.data.isel(forecast_time=idxs).values
        y = self.data.isel(forecast_time=idxs + self.lead_time).values
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.idxs = np.arange(self.n_samples)
        if self.shuffle:
            np.random.shuffle(self.idxs)
            
            
datasets = [t2m, sm100]
ds = xr.merge(datasets)
# I am choosing only t2m (not tp) and sm100 here
dic = OrderedDict({'t2m': None, 'sm100': None})


dg_train = DataGenerator(
    ds.sel(forecast_time=slice('2000', '2001')),
    dic,
    lead_time=lead_time,
    batch_size=8,
    load=True)

dg_valid = DataGenerator(
    ds.sel(forecast_time=slice('2018', '2019')),
    dic,
    lead_time=lead_time,
    batch_size=8,
    mean=dg_train.mean,
    std=dg_train.std,
    shuffle=False)

def custom_categ_crossentropy(y_true, y_pred, sample_weight=None):
    # Targets come in as class indices; one-hot them before the categorical loss
    y_true = tf.one_hot(tf.cast(y_true, 'int32'), depth=3)
    loss = tf.keras.losses.CategoricalCrossentropy()(y_true, y_pred)
    return loss

    
def build_cnn(filters,
              kernels,
              input_shape):
    # NB: input_shape is currently unused; the batch shape is hard-coded below
    inputs = Input(batch_shape=(8, 121, 240, 1))
    print(inputs.shape)
    x = PeriodicConv2D(filters,
                       kernels,
                       conv_kwargs={'activation': 'relu'})(inputs)
    x = PeriodicConv2D(32,
                       5,
                       conv_kwargs={'activation': 'relu'})(x)
    output = Dense(3, activation='softmax')(x)
    model = Model(inputs, output)
    model.compile(optimizer='sgd',
                  loss=custom_categ_crossentropy,
                  metrics=['accuracy'])
    print(model.summary())
    return model
    
def fit():
    model = build_cnn(64, 9, (121, 240, 1))
    history = model.fit(dg_train,
                        epochs=10,
                        validation_data=dg_valid)
    return history

history = fit()
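
If it matters, I think the lead_time coordinate in this data is a timedelta, so my guess is it would first have to be converted into a number of forecast_time steps before it can be used as an index offset. Something like this (untested sketch; assumes the forecast_time spacing is regular and divides the lead evenly):

# Hypothetical conversion from a timedelta lead to an integer step offset
spacing = (ds.forecast_time[1] - ds.forecast_time[0]).values
lead_steps = int(t2m.lead_time.isel(lead_time=0).values / spacing)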
    

The data:
