skpro
skpro copied to clipboard
Intervals/quantiles can be negative for models that can only make non-negative predictions
For models with log-link like Tweedie, Poisson, etc., that are constrained to only make non-negative predictions, one would expect the intervals to be non-negative as well. Below is a reproducible example.
#pip install tweedie
from tweedie import tweedie
import numpy as np
import pandas as pd
TWEEDIE_VARIANCE_POWER = 1.5
TWEEDIE_DISPERSION_PARAMETER = 20
def simulate_tweedie_glm_data():
# Number of parameters for model
P = 10
N = 5000
SEED = 2022
rng = np.random.RandomState(SEED)
X = rng.rand(N, P - 1)
X = np.hstack((np.ones((N, 1)), X))
beta = np.concatenate(([370], rng.randint(-10, 200, P - 1))) / 100
eta = np.dot(X, beta)
mu = np.exp(eta)
tweedie_variance_power = TWEEDIE_VARIANCE_POWER
dispersion_parameter = TWEEDIE_DISPERSION_PARAMETER
y = tweedie(mu=mu, p=tweedie_variance_power, phi=dispersion_parameter).rvs(N, random_state = SEED)
X = pd.DataFrame(X)
X.columns = ['intcpt' if i ==0 else 'feat_' + str(i) for i in range(P)]
X.drop('intcpt', axis=1, inplace=True)
return X, y
X, y = simulate_tweedie_glm_data()
(y<0).mean()
# 0.0
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import TweedieRegressor
from skpro.regression.residual import ResidualDouble
reg_mean = TweedieRegressor(power=TWEEDIE_VARIANCE_POWER)
reg_resid = RandomForestRegressor()
reg_proba = ResidualDouble(reg_mean, reg_resid)
reg_proba.fit(X, y)
y_pred_quantiles = reg_proba.predict_quantiles(X, alpha=[0.05, 0.5, 0.95])
(y_pred_quantiles < 0).mean(axis=0)
#array([0.1868, 0. , 0. ])