skforecast
skforecast copied to clipboard
RandomForestRegressor predicts constant values
I am trying to fit a RandomForestRegressor on the training dataset, but when I use the predict function the predicted values are constant over time. What is the reason for this? My goal is to forecast up to 72 hours ahead. Here is my code:
`from data_preparation import Preparation
from missing_timestamps import remove_duplicates
import pandas as pd
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster
from sklearn.ensemble import RandomForestRegressor
from skforecast.utils import save_forecaster
from skforecast.utils import load_forecaster
marvin
# Load the station file and select the "AMBIENT_TEMPERATURE" column.
# NOTE(review): `Preparation` is imported from the project-local
# `data_preparation` module; what it does with these arguments is not visible
# here -- confirm against that module.
data = Preparation(r'/home/ieftimska/operato-meteo-1/data/MAS_processed/ELES-MAS-5001.csv.gz', "AMBIENT_TEMPERATURE")
#data = Preparation(r'/home/iva/Desktop/operato-meteo-1/data/MAS_processed/ELES-MAS-5001.csv.gz', "AMBIENT_TEMPERATURE")
# presumably a chronological train/test split -- verify in `Preparation.split`
train, test = data.split()
# `remove_duplicates` (from `missing_timestamps`) returns a frame indexed by
# tz-naive, de-duplicated, sorted timestamps.
train_processed = remove_duplicates(train)
#train_processed_ = train_processed["AMBIENT_TEMPERATURE"].copy().squeeze()
test_processed = remove_duplicates(test)
#test_processed_ = test_processed["AMBIENT_TEMPERATURE"].copy().squeeze()
# Stitch both parts back together so the model is fitted on one series.
# NOTE(review): each part is de-duplicated and sorted on its own; nothing here
# re-checks for duplicates, ordering or gaps across the concatenated index.
whole_data = pd.concat([train_processed, test_processed])
# Rename the target column to "y" and the index to "datetime".
whole_data = whole_data.rename(columns={"AMBIENT_TEMPERATURE": "y"})
whole_data.index = whole_data.index.rename("datetime")
# NOTE(review): `forecaster` is never defined in the code shown -- the
# `ForecasterAutoreg(regressor=..., lags=...)` construction appears to be
# missing from this snippet.
# The label slice `:"2022"` selects rows up to and including 2022 (assumes the
# index is a DatetimeIndex, as produced by `remove_duplicates`).
forecaster.fit(y=whole_data.loc[:"2022", "y"])
# Persist the fitted forecaster and load it back.
# NOTE(review): the file name ends in `.py` although the file presumably holds
# a serialized forecaster object, not Python source -- confirm this is intended.
save_forecaster(forecaster, file_name='forecaster_random_forest.py', verbose=False)
forecaster_loaded = load_forecaster('forecaster_random_forest.py', verbose=True)
predictions = forecaster_loaded.predict(steps=864)

Here is the dataset: [ELES-MAS-5001.csv.gz](https://github.com/JoaquinAmatRodrigo/skforecast/files/12646861/ELES-MAS-5001.csv.gz)

This is the missing_timestamps script:
import pandas as pd
from datetime import timedelta
import numpy as np
def remove_duplicates(data):
    """Return ``data`` indexed by unique, timezone-naive, sorted timestamps.

    The ``"timestamp"`` column is parsed with ``pd.to_datetime``, its timezone
    information is dropped with ``tz_localize(None)``, rows sharing a
    timestamp are collapsed to the last occurrence, and the result is indexed
    and sorted by timestamp.

    The input frame is left untouched: the converted column is attached to a
    copy instead of being written back into ``data``. Writing into the
    caller's frame would change it as a side effect and can trigger pandas'
    ``SettingWithCopyWarning`` when ``data`` is itself a slice of a larger
    frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``"timestamp"`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by ``"timestamp"`` (tz-naive, unique, ascending).
    """
    naive_timestamps = pd.to_datetime(data["timestamp"]).dt.tz_localize(None)
    data_processed = (
        # ``assign`` returns a new frame, so ``data`` itself is not modified.
        data.assign(timestamp=naive_timestamps)
        # De-duplicate on the tz-naive values; the last row of each group wins.
        .drop_duplicates(subset="timestamp", keep="last")
        .set_index("timestamp")
        .sort_index()
    )
    return data_processed
def missing_data(data_processed, freq="5min"):
    """Locate gaps in the index and forward-fill them for plotting.

    Two consecutive index entries that are more than ``freq`` apart delimit a
    gap. For every gap, a regular grid with step ``freq`` is generated from
    the last timestamp before the gap up to the first timestamp after it. The
    starting point (an existing row) is dropped and each remaining grid point
    is paired with the first-column value of the row that precedes the gap.

    Note that the grid is closed on the right, so when the timestamp that
    ends the gap falls on the grid it is part of the returned timestamps.

    Parameters
    ----------
    data_processed : pandas.DataFrame
        Frame indexed by timestamps; the index is expected to be sorted in
        ascending order. Only the first column is used for the fill values.
    freq : str, default "5min"
        Expected sampling step. It must be a fixed-length frequency accepted
        by both ``pd.Timedelta`` and ``pd.date_range``. ``"5min"`` replaces
        the ``"5T"`` alias, which recent pandas versions deprecate.

    Returns
    -------
    tuple[list, list]
        ``(timestamps, values)``: two flat lists of equal length holding the
        generated timestamps and the value used to fill each of them. Both
        lists are empty when the index has no gaps.
    """
    step = pd.Timedelta(freq)
    deltas = pd.Series(data_processed.index).diff()

    # The first delta is NaT and never compares greater than ``step``, so
    # position 0 is never flagged and ``gap_end_pos - 1`` is always valid.
    gap_end_pos = np.flatnonzero((deltas > step).to_numpy())
    gap_start_pos = gap_end_pos - 1

    gap_starts = data_processed.index[gap_start_pos]
    gap_ends = data_processed.index[gap_end_pos]
    # First-column value of the last row seen before each gap.
    last_seen_values = [
        row[0] for row in data_processed.iloc[gap_start_pos].values
    ]

    missing_timestamps = []
    filled_values = []
    for start, end, value in zip(gap_starts, gap_ends, last_seen_values):
        # Skip the first grid point: it is the existing row before the gap.
        grid = pd.date_range(start=start, end=end, freq=freq)[1:]
        missing_timestamps.extend(grid)
        filled_values.extend([value] * len(grid))

    return missing_timestamps, filled_values