
TypeError: _inplace_paired_L2() missing 2 required positional arguments: 'A' and 'B'

Open angelotc opened this issue 3 years ago • 12 comments

Description

I get this error: TypeError: _inplace_paired_L2() missing 2 required positional arguments: 'A' and 'B'

Steps/Code to Reproduce

Example:

from sklearn.datasets import make_friedman1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


def friedman_np_to_df(X,y):
  return pd.DataFrame(X,columns=['x0','x1', 'x2', 'x3', 'x4']), pd.Series(y)

# Make training set
X_train, NA = make_friedman1(n_samples=1000, n_features=5, random_state = 1) # don't care about y, so call it NA
X_train, NA = friedman_np_to_df(X_train,NA)



#categorize training set based off of x0
domain_list = []
for i in range(len(X_train)):
  if X_train.iloc[i]['x0'] < 0.6:
    domain_list.append(1)
  else:
    domain_list.append(0)


X_train['domain'] = domain_list
# Set training set to where domain == 1 (x0 < 0.6)
X_train = X_train[X_train['domain']==1]
y_train = X_train.copy()
X_train = X_train.drop(columns = ['domain'])
y_train = y_train['domain']


# Make testing set with a different random_state
X_test, NA2 = make_friedman1(n_samples=1000, n_features=5, random_state = 3)
X_test, NA2 = friedman_np_to_df(X_test,NA2)


#categorize testing set based off of x0
domain_list = []
for i in range(len(X_test)):
  if X_test.iloc[i]['x0'] < 0.6:
    domain_list.append(1)
  else:
    domain_list.append(0)
X_test['domain'] = domain_list

y_test = X_test['domain'].copy()
X_test = X_test.drop(columns = ['domain'])


from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from metric_learn import LMNN
lmnn_knn = Pipeline(steps=[('lmnn', LMNN()), ('knn', KNeighborsClassifier())])
parameters = {'lmnn__k': [1, 2, 3], 'knn__n_neighbors': [1, 2]}
grid_lmnn_knn = GridSearchCV(lmnn_knn, parameters, n_jobs=-1, verbose=True)
grid_lmnn_knn.fit(X_train,y_train)
grid_lmnn_knn.score(X_test, y_test)

Expected Results

Example: No error is thrown and the score is calculated.

Actual Results

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.5s finished
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-54-e89c6a61ea02> in <module>()
      6 parameters = {'lmnn__k':[1, 2,3], 'knn__n_neighbors':[1 , 2]}
      7 grid_lmnn_knn = GridSearchCV(lmnn_knn, parameters, n_jobs=-1, verbose=True)
----> 8 grid_lmnn_knn.fit(X_train,y_train)
      9 grid_lmnn_knn.score(X_test, y_test)
     10 

7 frames
/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    737             refit_start_time = time.time()
    738             if y is not None:
--> 739                 self.best_estimator_.fit(X, y, **fit_params)
    740             else:
    741                 self.best_estimator_.fit(X, **fit_params)

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    348             This estimator
    349         """
--> 350         Xt, fit_params = self._fit(X, y, **fit_params)
    351         with _print_elapsed_time('Pipeline',
    352                                  self._log_message(len(self.steps) - 1)):

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    313                 message_clsname='Pipeline',
    314                 message=self._log_message(step_idx),
--> 315                 **fit_params_steps[name])
    316             # Replace the transformer of the step with the fitted
    317             # transformer. This is necessary when loading the transformer

/usr/local/lib/python3.7/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    726     with _print_elapsed_time(message_clsname, message):
    727         if hasattr(transformer, 'fit_transform'):
--> 728             res = transformer.fit_transform(X, y, **fit_params)
    729         else:
    730             res = transformer.fit(X, y, **fit_params).transform(X)

/usr/local/lib/python3.7/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    572         else:
    573             # fit method of arity 2 (supervised transformation)
--> 574             return self.fit(X, y, **fit_params).transform(X)
    575 
    576 

/usr/local/lib/python3.7/dist-packages/metric_learn/lmnn.py in fit(self, X, y)
    180     G, objective, total_active = self._loss_grad(X, L, dfG, k,
    181                                                  reg, target_neighbors,
--> 182                                                  label_inds)
    183 
    184     it = 1  # we already made one iteration

/usr/local/lib/python3.7/dist-packages/metric_learn/lmnn.py in _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds)
    246                                      label_inds, L)
    247 
--> 248     g0 = _inplace_paired_L2(*Lx[impostors])
    249 
    250     # we reorder the target neighbors

TypeError: _inplace_paired_L2() missing 2 required positional arguments: 'A' and 'B'

Versions

Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic
Python 3.7.10 (default, Feb 20 2021, 21:17:23) [GCC 7.5.0]
NumPy 1.19.5
SciPy 1.4.1
Scikit-Learn 0.22.2.post1
Metric-Learn 0.6.2

angelotc avatar Mar 31 '21 23:03 angelotc

Looks like either Lx or impostors was empty when computing the gradient of the loss.

Is X_train a numpy array or a Pandas DataFrame in your call to grid_lmnn_knn.fit(X_train, y_train)? If it's a DataFrame, could you try again using a plain numpy array?

In any case, we should do better checking here to surface a less opaque error message.
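
Something along these lines is what I have in mind inside _loss_grad (a rough sketch only, not tested; names follow the traceback above):

impostors = self._find_impostors(furthest_neighbors.ravel(), X,
                                 label_inds, L)
if len(impostors) == 0:
  # no impostor pairs were found, so Lx[impostors] has nothing to unpack;
  # fail with an explicit message instead of the cryptic TypeError above
  raise ValueError("LMNN could not find any impostors. This can happen when "
                   "the classes are already well separated or a class has very "
                   "few samples; try a smaller k or check your labels.")
g0 = _inplace_paired_L2(*Lx[impostors])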

perimosocordiae avatar Apr 01 '21 12:04 perimosocordiae

Yep, I have tried that.

Replaced the last two lines with this:

grid_lmnn_knn.fit(np.array(X_train),np.array(y_train))
grid_lmnn_knn.score(np.array(X_test), np.array(y_test))

Any other thoughts?

angelotc avatar Apr 01 '21 17:04 angelotc

I reproduced the issue locally, and it turns out that impostors is indeed empty when computing the gradient. See the similar report in gh-17, which apparently never resulted in a fix for this.

I haven't verified yet, but I suspect the new LMNN implementation coming in gh-309 will solve this for you. We should also make sure we add test coverage for the no-impostors case.
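
Something like this would be the shape of that test (a sketch only; the exact expectation depends on what the new implementation decides to do when no impostors exist):

import numpy as np
from metric_learn import LMNN

def test_lmnn_no_impostors():
  # two well-separated classes: each point's target neighbors are far closer
  # than any differently-labeled point, so the impostor search finds nothing
  rng = np.random.RandomState(0)
  X = np.vstack([rng.randn(20, 3), rng.randn(20, 3) + 100.0])
  y = np.array([0] * 20 + [1] * 20)
  lmnn = LMNN(k=3)
  lmnn.fit(X, y)  # should not crash
  assert lmnn.transform(X).shape == X.shape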

perimosocordiae avatar Apr 01 '21 20:04 perimosocordiae

Sounds good, I will patiently wait for that. If you have any workaround until then, let me know, as I have to present my findings to my research group by next Wednesday lol

angelotc avatar Apr 01 '21 20:04 angelotc

Here's a workaround. It just bails out entirely if no impostors can be found: https://github.com/scikit-learn-contrib/metric-learn/commit/612fcc4c74991dd377bc7aa9ea1741b9e8bc4f14

Not super elegant, but it should work okay.
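
The gist of the change (paraphrased, see the commit for the exact diff): an early return from _loss_grad when the impostor search comes back empty, plus a corresponding check in fit:

# in _loss_grad(), right after the impostor search:
if not impostors:
  return None, 0, 0

# in fit(), skip the update when _loss_grad bailed out:
if G is None:
  # TODO: raise a warning
  ...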

perimosocordiae avatar Apr 01 '21 23:04 perimosocordiae

Thank you for your work @perimosocordiae.

We are trying to apply metric learning in the materials science space, but we want to validate the approach on the Friedman dataset first before going all in on the diffusion datasets. Super weird that my pipeline is now not predicting any of the 0s correctly in my test set; I am not sure if this is the correct approach. In case you were interested in our use case: my PI is telling me to frame a classification problem with the Friedman dataset. Categorize the dataset, setting samples where x0 < 0.6 to 1 (sample is within domain) and the rest to 0, then apply metric learning to that and see if it performs well.

Code so far:

from sklearn.datasets import make_friedman1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def friedman_np_to_df(X,y):
  return pd.DataFrame(X,columns=['x0','x1', 'x2', 'x3', 'x4']), pd.Series(y)

# Make training set
X_train, NA = make_friedman1(n_samples=1000, n_features=5, random_state = 1) # don't care about y, so call it NA
X_train, NA = friedman_np_to_df(X_train,NA)



#categorize training set based off of x0
domain_list = []
for i in range(len(X_train)):
  if X_train.iloc[i]['x0'] < 0.6:
    domain_list.append(1)
  else:
    domain_list.append(0)


X_train['domain'] = domain_list
# Set training set to where domain == 1 (x0 < 0.6)
X_train =  X_train[X_train['domain']==1]
y_train = X_train.copy()
X_train = X_train.drop(columns = ['domain'])
y_train = y_train['domain']


# Make testing set with a different random_state
X_test, NA2 = make_friedman1(n_samples=1000, n_features=5, random_state = 3)
X_test, NA2 = friedman_np_to_df(X_test,NA2)


#categorize testing set based off of x0
domain_list = []
for i in range(len(X_test)):
  if X_test.iloc[i]['x0'] < 0.6:
    domain_list.append(1)
  else:
    domain_list.append(0)
X_test['domain'] = domain_list

y_test = X_test['domain'].copy()
X_test = X_test.drop(columns = ['domain'])


from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from metric_learn import LMNN
lmnn_knn = Pipeline(steps=[('lmnn', LMNN()), ('knn', KNeighborsClassifier())])
parameters = {'lmnn__init': ['pca', 'lda', 'identity', 'random'],
              'lmnn__k':[2,3],
              'knn__n_neighbors':[2,3],
              'knn__weights': ['uniform','distance'],
              'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'knn__leaf_size': [x for x in np.arange(1,30,5)],
              'knn__metric': ['euclidean', 'manhattan', 'mahalanobis', 'seuclidean', 'minkowski']}
grid_lmnn_knn = GridSearchCV(lmnn_knn, parameters,cv = 3, n_jobs=-1, verbose=True, scoring='f1')
grid_lmnn_knn.fit(np.array(X_train),np.array(y_train))
# grid_lmnn_knn.score(np.array(X_test), np.array(y_test))

predictions = grid_lmnn_knn.predict(X_test)
print(grid_lmnn_knn.best_estimator_)
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))



Output:

Pipeline(memory=None,
         steps=[('lmnn',
                 LMNN(convergence_tol=0.001, init='pca', k=2, learn_rate=1e-07,
                      max_iter=1000, min_iter=50, n_components=None,
                      preprocessor=None, random_state=None, regularization=0.5,
                      verbose=False)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=1,
                                      metric='manhattan', metric_params=None,
                                      n_jobs=None, n_neighbors=2, p=2,
                                      weights='uniform'))],
         verbose=False)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       387
           1       0.61      1.00      0.76       613

    accuracy                           0.61      1000
   macro avg       0.31      0.50      0.38      1000
weighted avg       0.38      0.61      0.47      1000

/usr/local/lib/python3.7/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

So yeah. Not sure why the f1-score is 0 for the 0 class. Maybe I am doing it wrong haha.
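
For reference, a quick check of the label balance in this run (plain pandas, nothing metric-learn specific):

print(y_train.value_counts())  # y_train was built from the domain == 1 subset above
print(y_test.value_counts())   # y_test keeps both classes (support 613 / 387 above)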

angelotc avatar Apr 02 '21 04:04 angelotc

So I think the reason my last attempt had bad performance on the Friedman dataset was that there were no examples of 0-labeled data in the training set. Now I include some 0-labeled samples in the training set. @perimosocordiae I found another issue with your branch:


from sklearn.datasets import make_friedman1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def friedman_np_to_df(X,y):
  return pd.DataFrame(X,columns=['x0','x1', 'x2', 'x3', 'x4']), pd.Series(y)

# Make training set
X_train, NA = make_friedman1(n_samples=1000, n_features=5, random_state = 1) # don't care about y, so call it NA
X_train, NA = friedman_np_to_df(X_train,NA)

#categorize training set based off of x0
domain_list = []
for i in range(len(X_train)):
  if X_train.iloc[i]['x0'] < 0.6:
    domain_list.append(1)
  else:
    domain_list.append(0)


X_train['domain'] = domain_list
# Training set: all domain == 1 samples plus the first 60 out-of-domain (domain == 0) samples

out_of_domain = X_train[X_train['domain'] == 0][:60]
X_train =  X_train[X_train['domain']==1]

X_train = pd.concat([out_of_domain, X_train])

y_train = X_train.copy()
X_train = X_train.drop(columns = ['domain'])
y_train = y_train['domain']


# Make testing set with a different random_state
X_test, NA2 = make_friedman1(n_samples=1000, n_features=5, random_state = 3)
X_test, NA2 = friedman_np_to_df(X_test,NA2)


#categorize testing set based off of x0
domain_list = []
for i in range(len(X_test)):
  if X_test.iloc[i]['x0'] < 0.6:
    domain_list.append(1)
  else:
    domain_list.append(0)
X_test['domain'] = domain_list

y_test = X_test['domain'].copy()
X_test = X_test.drop(columns = ['domain'])


from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from metric_learn import LMNN
lmnn_knn = Pipeline(steps=[('lmnn', LMNN()), ('knn', KNeighborsClassifier())])
parameters = {'lmnn__init': ['pca', 'lda', 'identity', 'random'],
              'lmnn__k':[2,3],
              'knn__n_neighbors':[2,3],
              'knn__weights': ['uniform','distance'],
              'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'knn__leaf_size': [x for x in np.arange(1,30,5)],
              'knn__metric': [ 'manhattan', 'mahalanobis', 'minkowski']}
grid_lmnn_knn = GridSearchCV(lmnn_knn, parameters,cv = 5, n_jobs=-1, verbose=True, scoring='f1')
grid_lmnn_knn.fit(np.array(X_train),np.array(y_train))
grid_lmnn_knn.score(np.array(X_test), np.array(y_test))



Output:

Fitting 5 folds for each of 2304 candidates, totalling 11520 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 202 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 1402 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 3402 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 6202 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 9802 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 11520 out of 11520 | elapsed:  3.1min finished
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-945bbdb8e331> in <module>()
     64               'knn__metric': [ 'manhattan', 'mahalanobis', 'minkowski']}
     65 grid_lmnn_knn = GridSearchCV(lmnn_knn, parameters,cv = 5, n_jobs=-1, verbose=True, scoring='f1')
---> 66 grid_lmnn_knn.fit(np.array(X_train),np.array(y_train))
     67 grid_lmnn_knn.score(np.array(X_test), np.array(y_test))
     68 

7 frames
/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    737             refit_start_time = time.time()
    738             if y is not None:
--> 739                 self.best_estimator_.fit(X, y, **fit_params)
    740             else:
    741                 self.best_estimator_.fit(X, **fit_params)

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    348             This estimator
    349         """
--> 350         Xt, fit_params = self._fit(X, y, **fit_params)
    351         with _print_elapsed_time('Pipeline',
    352                                  self._log_message(len(self.steps) - 1)):

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    313                 message_clsname='Pipeline',
    314                 message=self._log_message(step_idx),
--> 315                 **fit_params_steps[name])
    316             # Replace the transformer of the step with the fitted
    317             # transformer. This is necessary when loading the transformer

/usr/local/lib/python3.7/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    726     with _print_elapsed_time(message_clsname, message):
    727         if hasattr(transformer, 'fit_transform'):
--> 728             res = transformer.fit_transform(X, y, **fit_params)
    729         else:
    730             res = transformer.fit(X, y, **fit_params).transform(X)

/usr/local/lib/python3.7/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    572         else:
    573             # fit method of arity 2 (supervised transformation)
--> 574             return self.fit(X, y, **fit_params).transform(X)
    575 
    576 

/usr/local/lib/python3.7/dist-packages/metric_learn/lmnn.py in fit(self, X, y)
    180     G, objective, total_active = self._loss_grad(X, L, dfG, k,
    181                                                  reg, target_neighbors,
--> 182                                                  label_inds)
    183     if G is None:
    184       # TODO: raise a warning

/usr/local/lib/python3.7/dist-packages/metric_learn/lmnn.py in _loss_grad(self, X, L, dfG, k, reg, target_neighbors, label_inds)
    249     impostors = self._find_impostors(furthest_neighbors.ravel(), X,
    250                                      label_inds, L)
--> 251     if not impostors:
    252       return None, 0, 0
    253 

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

angelotc avatar Apr 04 '21 03:04 angelotc

Tried swapping it to the following, but it now goes into an infinite loop 😂

if not impostors.any():
  return None, 0, 0

angelotc avatar Apr 04 '21 18:04 angelotc

Sorry, try if len(impostors) == 0: instead.
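
For context, the difference between these checks on a numpy array (nothing metric-learn specific; this impostors is just a made-up example):

import numpy as np

impostors = np.array([[0, 3], [1, 4]])  # hypothetical impostor index pairs
print(impostors.any())      # True -- asks "is any element nonzero?", not "is it empty?"
print(len(impostors) == 0)  # False -- the emptiness test we actually want
# "if not impostors:" raises the ValueError above, because the truth value
# of a multi-element array is ambiguous

print(len(np.empty((0, 2), dtype=int)) == 0)  # True when no impostors were found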

perimosocordiae avatar Apr 04 '21 20:04 perimosocordiae

[screenshot] Super weird.

angelotc avatar Apr 04 '21 23:04 angelotc

Were you able to try the code from gh-309? I'm curious to see how it would handle this case.

perimosocordiae avatar Apr 05 '21 00:04 perimosocordiae

The code from the PR doesn't work either 🤣 I will post my results on that thread.

angelotc avatar Apr 05 '21 02:04 angelotc