Large errors on hypergradient of logistic regression
We want to use cvxpylayers to compute the gradient of a test loss with respect to the regularization parameter used on the training data, for sparse models (Lasso and L1-regularized logistic regression). This gradient is also called a hypergradient (a gradient with respect to a hyperparameter). Basically, we aim to do with cvxpylayers what we did in our recent ICML paper: http://proceedings.mlr.press/v119/bertrand20a.html
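In other words, we differentiate the validation loss through the argmin of the regularized training problem; for the Lasso this reads (a sketch in our notation):

$$
\hat\beta(\lambda) \in \operatorname*{arg\,min}_{\beta}\; \frac{1}{2n_{\mathrm{train}}}\,\lVert X_{\mathrm{train}}\beta - y_{\mathrm{train}}\rVert_2^2 + \lambda\lVert\beta\rVert_1,
\qquad
\text{hypergradient} = \frac{\mathrm{d}}{\mathrm{d}\lambda}\,\mathcal{L}_{\mathrm{val}}\big(\hat\beta(\lambda)\big).
$$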
While it runs fine for both models, we get surprisingly high errors for the logistic model when checking the gradient with scipy.optimize.check_grad. We get:

- 4.49142934577651e-06 for the Lasso
- 1.0356275707793758 for logistic regression
The error can be even worse on other data. Could this be a bug, or just numerical issues due to the log and exp?
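For context on the numbers above: check_grad returns the 2-norm of the difference between the analytic gradient and a finite-difference approximation of it, so ~1e-6 means the two agree and ~1.0 means they are badly off. A minimal sketch of the comparison it performs (toy function of our choosing, not from the code below):

import numpy as np
from scipy.optimize import check_grad

f = lambda x: np.sin(x[0])               # toy scalar objective
g = lambda x: np.array([np.cos(x[0])])   # its analytic gradient
# check_grad returns ||g(x0) - finite_difference(f, x0)||_2
print(check_grad(f, g, np.array([0.5]))) # ~1e-8 when the gradient is correct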
cc @QB3 @Klopfe @vaiter @JosephSalmon @mblondel
See code below.
import numpy as np
from scipy.optimize import check_grad
import cvxpy as cp
import torch
from cvxpylayers.torch import CvxpyLayer
torch.set_default_dtype(torch.double)
def lasso_cvxpy(X, y, lam, idx_train, idx_val):
    Xtrain, Xtest, ytrain, ytest = map(
        torch.from_numpy,
        [X[idx_train, :], X[idx_val], y[idx_train], y[idx_val]],
    )
    n_samples_train, n_features = Xtrain.shape
    # set up variables and parameters
    beta_cp = cp.Variable(n_features)
    lambda_cp = cp.Parameter(nonneg=True)
    # set up objective
    loss = (1 / (2 * n_samples_train)) * cp.sum(
        cp.square(Xtrain @ beta_cp - ytrain)
    )
    reg = lambda_cp * cp.norm1(beta_cp)
    objective = loss + reg
    # define problem
    problem = cp.Problem(cp.Minimize(objective))
    assert problem.is_dpp()
    # solve problem
    layer = CvxpyLayer(problem, [lambda_cp], [beta_cp])
    lambda_th = torch.tensor(np.squeeze(lam), requires_grad=True)
    (beta_,) = layer(lambda_th)
    # get test loss and its gradient
    test_loss = (Xtest @ beta_ - ytest).pow(2).mean()
    test_loss.backward()
    val = test_loss.detach().numpy()
    grad = np.array(lambda_th.grad)
    return val, grad
def logreg_cvxpy(X, y, lam, idx_train, idx_val):
    assert np.all(np.unique(y) == np.array([-1, 1]))
    Xtrain, Xtest, ytrain, ytest = map(
        torch.from_numpy,
        [X[idx_train, :], X[idx_val], y[idx_train], y[idx_val]],
    )
    n_samples_train, n_features = Xtrain.shape
    # set up variables and parameters
    beta_cp = cp.Variable(n_features)
    lambda_cp = cp.Parameter(nonneg=True)
    # set up objective
    loss = (
        cp.sum(cp.logistic(cp.multiply(-ytrain, Xtrain @ beta_cp)))
        / n_samples_train
    )
    reg = lambda_cp * cp.norm(beta_cp, 1)
    objective = loss + reg
    # define problem
    problem = cp.Problem(cp.Minimize(objective))
    assert problem.is_dpp()
    # solve problem
    layer = CvxpyLayer(problem, parameters=[lambda_cp], variables=[beta_cp])
    alpha_th = torch.tensor(np.squeeze(lam), requires_grad=True)
    (beta_,) = layer(alpha_th)
    # get test loss and its gradient
    # equivalent formulation via LogSigmoid (note the minus sign):
    # test_loss = -torch.mean(torch.nn.LogSigmoid()(ytest * (Xtest @ beta_)))
    test_loss = torch.mean(torch.log(1 + torch.exp(-ytest * (Xtest @ beta_))))
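    # numerically stabler alternative (our suggestion, not from the original code):
    # torch.nn.functional.softplus(-z) computes log(1 + exp(-z)) without overflow
    # test_loss = torch.nn.functional.softplus(-ytest * (Xtest @ beta_)).mean()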
    test_loss.backward()
    val = test_loss.detach().numpy()
    grad = np.array(alpha_th.grad)
    return val, grad
n, m = 3, 20
rng = np.random.RandomState(42)
coef = rng.randn(n)
X = rng.randn(m, n)
y = X @ coef + 0.1 * rng.randn(m)
y_sign = np.sign(y)
rng.shuffle(y_sign) # add some label noise
idx_train = np.arange(0, m // 2)
idx_val = np.arange(m // 2, m)
lam = 0.1
val, grad = lasso_cvxpy(X, y, lam, idx_train, idx_val)
dict_cvxpy_func = {
    "lasso": lasso_cvxpy,
    "logreg": logreg_cvxpy,
}
def test_hypergradient(model_name):
    cvxpy_func = dict_cvxpy_func[model_name]
    this_y = y
    if model_name == "logreg":
        this_y = y_sign

    def get_val(lam):
        val_cvxpy, _ = cvxpy_func(X, this_y, lam, idx_train, idx_val)
        return val_cvxpy

    def get_grad(lam):
        _, grad_cvxpy = cvxpy_func(X, this_y, lam, idx_train, idx_val)
        return grad_cvxpy

    grad_error = check_grad(get_val, get_grad, lam)
    print(grad_error)
    assert grad_error < 1
for model_name in dict_cvxpy_func.keys():
    test_hypergradient(model_name)
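One thing worth ruling out before concluding it is a bug: solver accuracy. The layer is solved to the default tolerance of the underlying conic solver, and a loose solve could corrupt the implicit gradient much more for the logistic model than for the Lasso. A sketch (assuming the default SCS/diffcp backend, whose settings cvxpylayers forwards through solver_args): tighten the tolerance in the layer call and re-run the check.

# e.g. in logreg_cvxpy, replace the layer call with a tighter solve:
(beta_,) = layer(alpha_th, solver_args={"eps": 1e-9, "max_iters": 20000})

If the check_grad error then drops by orders of magnitude, the issue is numerical accuracy rather than the formulation.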