skpro icon indicating copy to clipboard operation
skpro copied to clipboard

Intervals/quantiles can be negative for models that can only make non-negative predictions

Open fsaforo1 opened this issue 7 months ago • 3 comments

For models with log-link like Tweedie, Poisson, etc., that are constrained to only make non-negative predictions, one would expect the intervals to be non-negative as well. Below is a reproducible example.

#pip install tweedie

from tweedie import tweedie
import numpy as np
import pandas as pd

TWEEDIE_VARIANCE_POWER = 1.5
TWEEDIE_DISPERSION_PARAMETER = 20

def simulate_tweedie_glm_data():
    # Number of parameters for model
    P = 10

    N = 5000

    SEED = 2022
    rng = np.random.RandomState(SEED)
    X = rng.rand(N, P - 1)
    X = np.hstack((np.ones((N, 1)), X))
    beta = np.concatenate(([370], rng.randint(-10, 200, P - 1))) / 100
    eta = np.dot(X, beta)
    mu = np.exp(eta)

    tweedie_variance_power = TWEEDIE_VARIANCE_POWER
    dispersion_parameter = TWEEDIE_DISPERSION_PARAMETER

    y = tweedie(mu=mu, p=tweedie_variance_power, phi=dispersion_parameter).rvs(N, random_state = SEED)
    X = pd.DataFrame(X)
    X.columns = ['intcpt' if i ==0 else 'feat_' + str(i) for i in range(P)]
    X.drop('intcpt', axis=1, inplace=True)

    return X, y


X, y = simulate_tweedie_glm_data()

(y<0).mean()
# 0.0


from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import TweedieRegressor
from skpro.regression.residual import ResidualDouble

reg_mean = TweedieRegressor(power=TWEEDIE_VARIANCE_POWER)
reg_resid = RandomForestRegressor()
reg_proba = ResidualDouble(reg_mean, reg_resid)
reg_proba.fit(X, y)

y_pred_quantiles = reg_proba.predict_quantiles(X, alpha=[0.05, 0.5, 0.95])

(y_pred_quantiles < 0).mean(axis=0)

#array([0.1868, 0.    , 0.    ])

fsaforo1 avatar Jul 11 '24 01:07 fsaforo1