ml_drought
ml_drought copied to clipboard
`Nowcast` models need to do some extra steps to REMOVE the NaN values.
In the final month, the values for the final feature (VHI) are ALL NaN
(in the DataLoader these are coded as -inf).
This is because the feature (VHI) is hidden from the model, and so we will have to do some extra preprocessing at this step to avoid errors when fitting the models.
x.shape
(35134, 12, 5)
x[:,-1,-1]
array([-inf, -inf, -inf, ..., -inf, -inf, -inf])
So for the linear model:
# ... scripts/models.py
experiment = 'nowcast'
predictor = LinearRegression(data_path, experiment=experiment)
predictor.train()
# ... src/models/regression.py
self.model.partial_fit(batch_x, batch_y.ravel())
Creates the error:
ValueError Traceback (most recent call last)
<ipython-input-23-5bc95e9d1f4d> in <module>
----> 1 regression(experiment='nowcast')
<ipython-input-15-acf066373942> in regression(experiment)
8
9 predictor = LinearRegression(data_path, experiment=experiment)
---> 10 predictor.train()
11 predictor.evaluate(save_preds=True)
12
~/ml_drought/src/models/regression.py in train(self, num_epochs, early_stopping, batch_size)
61 batch_x = batch_x.reshape(batch_x.shape[0],
62 batch_x.shape[1] * batch_x.shape[2])
---> 63 self.model.partial_fit(batch_x, batch_y.ravel())
64
65 train_pred_y = self.model.predict(batch_x)
~/miniconda3/envs/esowc-drought/lib/python3.7/site-packages/sklearn/linear_model/stochastic_gradient.py in partial_fit(self, X, y, sample_weight)
1151 learning_rate=self.learning_rate, max_iter=1,
1152 sample_weight=sample_weight, coef_init=None,
-> 1153 intercept_init=None)
1154
1155 def _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None,
~/miniconda3/envs/esowc-drought/lib/python3.7/site-packages/sklearn/linear_model/stochastic_gradient.py in _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, sample_weight, coef_init, intercept_init)
1097 max_iter, sample_weight, coef_init, intercept_init):
1098 X, y = check_X_y(X, y, "csr", copy=False, order='C', dtype=np.float64,
-> 1099 accept_large_sparse=False)
1100 y = y.astype(np.float64, copy=False)
1101
~/miniconda3/envs/esowc-drought/lib/python3.7/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
717 ensure_min_features=ensure_min_features,
718 warn_on_dtype=warn_on_dtype,
--> 719 estimator=estimator)
720 if multi_output:
721 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
~/miniconda3/envs/esowc-drought/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
540 if force_all_finite:
541 _assert_all_finite(array,
--> 542 allow_nan=force_all_finite == 'allow-nan')
543
544 if ensure_min_samples > 0:
~/miniconda3/envs/esowc-drought/lib/python3.7/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan)
54 not allow_nan and not np.isfinite(X).all()):
55 type_err = 'infinity' if allow_nan else 'NaN, infinity'
---> 56 raise ValueError(msg_err.format(type_err, X.dtype))
57 # for object dtype data, we only check for NaNs (GH-13254)
58 elif X.dtype == np.dtype('object') and not allow_nan:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
Maybe we can have some sort of "is this slice all NaN/inf?" check and then just drop that coordinate from the axes:
so we would select x[:, :-1, :-1]
from our x
values if the final indices along the 2nd and 3rd axes were all -inf.
I'm sure this is a relatively easy one-liner in the src/models/data.py
file? Maybe I'm wrong. Where else are we checking for NaNs and dropping them?