
Large errors on hypergradient of logistic regression

agramfort opened this issue 3 years ago

We want to use cvxpylayers to compute the gradient of a test loss with respect to the regularization parameter used on the training data, for sparse models (Lasso and L1-regularized logistic regression). These gradients are also called hypergradients (gradients with respect to hyperparameters).
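To fix notation, this is the quantity we are after (a sketch in our own notation, not anything specific to cvxpylayers; ℓ is the squared loss for Lasso and the logistic loss for logistic regression):

\hat{\beta}(\lambda) \in \arg\min_{\beta} \; \frac{1}{n_{\text{train}}} \sum_{i \in \text{train}} \ell\big(x_i^\top \beta,\, y_i\big) + \lambda \,\|\beta\|_1,
\qquad
\text{hypergradient} = \frac{d}{d\lambda}\, \mathcal{L}_{\text{val}}\big(\hat{\beta}(\lambda)\big)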

Basically we aim to do with cvxpylayers what we did in our recent ICML paper: http://proceedings.mlr.press/v119/bertrand20a.html

While it runs fine for both models, we get surprisingly high errors for the logistic model using scipy.optimize.check_grad. We get:

4.49142934577651e-06 for Lasso
1.0356275707793758 for Logistic Regression

It can be even worse on other data.
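For reference, check_grad reports the 2-norm of the difference between the analytic gradient and a forward finite-difference approximation, roughly this (a sketch; check_grad_like is a name of ours, and get_val / get_grad are the closures defined in the test code below):

import numpy as np
from scipy.optimize import approx_fprime

def check_grad_like(get_val, get_grad, lam, eps=np.sqrt(np.finfo(float).eps)):
    # finite-difference gradient of the validation loss w.r.t. lambda
    fd_grad = approx_fprime(np.atleast_1d(lam), get_val, eps)
    # same quantity check_grad returns: ||analytic - finite difference||_2
    return np.linalg.norm(get_grad(lam) - fd_grad)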

Could it be a bug? Or just numerical issues due to log and exp?
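One thing we have not ruled out is solver accuracy: a quick experiment (a sketch, assuming the default diffcp/SCS backend; the eps and max_iters values are guesses, not tuned) would be to pass a tighter tolerance through solver_args and see whether the check_grad error shrinks:

# assumption: tighter SCS tolerance forwarded via cvxpylayers' solver_args;
# this only tests whether the discrepancy comes from an inaccurate solve
(beta_,) = layer(lambda_th, solver_args={"eps": 1e-9, "max_iters": 20000})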

cc @QB3 @Klopfe @vaiter @JosephSalmon @mblondel

See code below.

import numpy as np
from scipy.optimize import check_grad
import cvxpy as cp
import torch
from cvxpylayers.torch import CvxpyLayer

torch.set_default_dtype(torch.double)


def lasso_cvxpy(X, y, lam, idx_train, idx_val):
    Xtrain, Xtest, ytrain, ytest = map(
        torch.from_numpy,
        [X[idx_train, :], X[idx_val], y[idx_train], y[idx_val]],
    )

    n_samples_train, n_features = Xtrain.shape

    # set up variables and parameters
    beta_cp = cp.Variable(n_features)
    lambda_cp = cp.Parameter(nonneg=True)

    # set up objective
    loss = (1 / (2 * n_samples_train)) * cp.sum(
        cp.square(Xtrain @ beta_cp - ytrain)
    )
    reg = lambda_cp * cp.norm1(beta_cp)
    objective = loss + reg

    # define problem
    problem = cp.Problem(cp.Minimize(objective))
    assert problem.is_dpp()

    # solve problem
    layer = CvxpyLayer(problem, [lambda_cp], [beta_cp])
    lambda_th = torch.tensor(np.squeeze(lam), requires_grad=True)
    (beta_,) = layer(lambda_th)

    # get test loss and its gradient
    test_loss = (Xtest @ beta_ - ytest).pow(2).mean()
    test_loss.backward()

    val = test_loss.detach().numpy()
    grad = np.array(lambda_th.grad)
    return val, grad


def logreg_cvxpy(X, y, lam, idx_train, idx_val):
    assert np.all(np.unique(y) == np.array([-1, 1]))
    Xtrain, Xtest, ytrain, ytest = map(
        torch.from_numpy,
        [X[idx_train, :], X[idx_val], y[idx_train], y[idx_val]],
    )

    n_samples_train, n_features = Xtrain.shape

    # set up variables and parameters
    beta_cp = cp.Variable(n_features)
    lambda_cp = cp.Parameter(nonneg=True)

    # set up objective
    loss = (
        cp.sum(cp.logistic(cp.multiply(-ytrain, Xtrain @ beta_cp)))
        / n_samples_train
    )
    reg = lambda_cp * cp.norm(beta_cp, 1)
    objective = loss + reg

    # define problem
    problem = cp.Problem(cp.Minimize(objective))
    assert problem.is_dpp()

    # solve problem
    layer = CvxpyLayer(problem, parameters=[lambda_cp], variables=[beta_cp])
    alpha_th = torch.tensor(np.squeeze(lam), requires_grad=True)
    (beta_,) = layer(alpha_th)

    # get test loss and its gradient
    # test_loss = torch.mean(torch.nn.LogSigmoid()(ytest * (Xtest @ beta_)))
    test_loss = torch.mean(torch.log(1 + torch.exp(-ytest * (Xtest @ beta_))))
    test_loss.backward()

    val = test_loss.detach().numpy()
    grad = np.array(alpha_th.grad)
    return val, grad


n, m = 3, 20
rng = np.random.RandomState(42)
coef = rng.randn(n)
X = rng.randn(m, n)
y = X @ coef + 0.1 * rng.randn(m)
y_sign = np.sign(y)
rng.shuffle(y_sign)  # add some label noise

idx_train = np.arange(0, m // 2)
idx_val = np.arange(m // 2, m)

lam = 0.1

val, grad = lasso_cvxpy(X, y, lam, idx_train, idx_val)

dict_cvxpy_func = {
    "lasso": lasso_cvxpy,
    "logreg": logreg_cvxpy,
}


def test_hypergradient(model_name):
    cvxpy_func = dict_cvxpy_func[model_name]
    this_y = y
    if model_name == "logreg":
        this_y = y_sign

    def get_val(lam):
        val_cvxpy, _ = cvxpy_func(X, this_y, lam, idx_train, idx_val)
        return val_cvxpy

    def get_grad(lam):
        _, grad_cvxpy = cvxpy_func(X, this_y, lam, idx_train, idx_val)
        return grad_cvxpy

    grad_error = check_grad(get_val, get_grad, lam)
    print(grad_error)
    assert grad_error < 1


for model_name in dict_cvxpy_func.keys():
    test_hypergradient(model_name)

agramfort, Jan 15 '21 18:01