
Only same or similar output from binary classifier model

Open · myeonghak opened this issue on Nov 26, 2020 · 0 comments

Hi! I'd like to share a problem I ran into with the xlearn binary classifier model.

I modified the code from this tutorial: https://www.analyticsvidhya.com/blog/2018/01/factorization-machines/

import warnings
import pandas as pd
import xlearn as xl

warnings.filterwarnings('ignore')

train = pd.read_csv('ffm_train.csv')

cols = ['Education', 'ApplicantIncome', 'Loan_Status', 'Credit_History']
train_sub = train[cols]
train_sub['Credit_History'].fillna(0, inplace=True)      # treat missing credit history as 0
dict_ls = {'Y': 1, 'N': 0}
train_sub['Loan_Status'].replace(dict_ls, inplace=True)  # encode the target as 0/1

from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(train_sub, test_size = 0.3, random_state = 5)


import numpy as np  # linear algebra
from sklearn.datasets import make_classification

class FFMFormatPandas:
    def __init__(self):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None

    def fit(self, df, y=None):
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        # each column (except the target) becomes a field
        if self.field_index_ is None:
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}

        if self.feature_index_ is not None:
            last_idx = max(list(self.feature_index_.values()))

        if self.feature_index_ is None:
            self.feature_index_ = dict()
            last_idx = 0

        # each (column, value) pair becomes a feature index;
        # each column also gets one index of its own, used for numeric columns
        for col in df.columns:
            vals = df[col].unique()
            for val in vals:
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            self.feature_index_[col] = last_idx
            last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        ffm = []
        # first token is the label (or 0 if no target column was given)
        if self.y is not None:
            ffm.append(str(row.loc[row.index == self.y].iloc[0]))
        if self.y is None:
            ffm.append(str(0))

        # remaining tokens are field:feature:value triples
        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            name = '{}_{}'.format(col, val)
            if col_type.kind == 'O':      # categorical: one-hot style, value 1
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col_type.kind == 'i':    # integer: keep the raw value
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})

########################### Let's build some data and test ############################

ffm_train = FFMFormatPandas()
ffm_train_data = ffm_train.fit_transform(X_train, y='Loan_Status')

# note: ffm_test is never used below; the encoder fitted on the training
# split is reused so field/feature indices stay consistent across files
ffm_test = FFMFormatPandas()
ffm_test_data = ffm_train.fit_transform(X_test, y='Loan_Status')



ffm_train_data.to_csv("./model_out/train_ffm.txt")  # these two calls turned out to be the problem
ffm_test_data.to_csv("./model_out/test_ffm.txt")
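To make the format concrete, here is what the formatter produces on a tiny made-up DataFrame (values and the resulting indices are hypothetical, only for illustration):

toy = pd.DataFrame({'Education': ['Graduate', 'Not Graduate'],
                    'ApplicantIncome': [4583, 3000],
                    'Loan_Status': [1, 0]})
print(FFMFormatPandas().fit_transform(toy, y='Loan_Status').tolist())
# roughly: ['1 1:0:1 0:5:4583', '0 1:1:1 0:5:3000']
# i.e. each line is "label field:feature:value field:feature:value ..."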

The training loss was very low, so I thought it had worked well, but the predicted values were almost all the same, e.g. 0.99999, 0.99998.
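For context, the training and prediction step is the standard xlearn FFM flow from the tutorial, roughly like this (paths and hyperparameters here are illustrative):

ffm_model = xl.create_ffm()
ffm_model.setTrain("./model_out/train_ffm.txt")
ffm_model.setValidate("./model_out/test_ffm.txt")
param = {'task': 'binary', 'lr': 0.2, 'lambda': 0.002, 'metric': 'acc'}
ffm_model.fit(param, './model_out/model.out')
ffm_model.setTest("./model_out/test_ffm.txt")
ffm_model.setSigmoid()   # map raw scores to (0, 1) probabilities
ffm_model.predict("./model_out/model.out", "./model_out/output.txt")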

I tried to figure out why this happened and finally found that the processed .txt files included the row index from the original pandas DataFrame.
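Because pandas writes the index as an extra leading column by default, every line starts with "<index>," glued to the label. With the hypothetical values from the toy example above (and e.g. row index 123), a line in train_ffm.txt comes out like

123,1 1:0:1 0:5:4583

instead of the libffm line xlearn expects:

1 1:0:1 0:5:4583

so the first token xlearn sees is not a clean 0/1 label.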

I added the index=False argument to the final to_csv calls, and that fixed it.
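In other words, the last two lines become (depending on your pandas version you may also need header=False so that no header row is written):

ffm_train_data.to_csv("./model_out/train_ffm.txt", index=False)
ffm_test_data.to_csv("./model_out/test_ffm.txt", index=False)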

Hope this helps someone.

myeonghak · Nov 26 '20 08:11