xlearn
xlearn copied to clipboard
Binary classifier model returns the same (or nearly identical) output for every input
Hi! I'd like to share a problem I ran into while using an xlearn binary classifier model.
I modified the code from this tutorial. https://www.analyticsvidhya.com/blog/2018/01/factorization-machines/
import pandas as pd
import xlearn as xl
train = pd.read_csv('ffm_train.csv')
import warnings
warnings.filterwarnings('ignore')
cols = ['Education','ApplicantIncome','Loan_Status','Credit_History']
train_sub = train[cols]
train_sub['Credit_History'].fillna(0, inplace = True)
dict_ls = {'Y':1, 'N':0}
train_sub['Loan_Status'].replace(dict_ls, inplace = True)
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(train_sub, test_size = 0.3, random_state = 5)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.datasets import make_classification
class FFMFormatPandas:
def __init__(self):
self.field_index_ = None
self.feature_index_ = None
self.y = None
def fit(self, df, y=None):
self.y = y
df_ffm = df[df.columns.difference([self.y])]
if self.field_index_ is None:
self.field_index_ = {col: i for i, col in enumerate(df_ffm)}
if self.feature_index_ is not None:
last_idx = max(list(self.feature_index_.values()))
if self.feature_index_ is None:
self.feature_index_ = dict()
last_idx = 0
for col in df.columns:
vals = df[col].unique()
for val in vals:
if pd.isnull(val):
continue
name = '{}_{}'.format(col, val)
if name not in self.feature_index_:
self.feature_index_[name] = last_idx
last_idx += 1
self.feature_index_[col] = last_idx
last_idx += 1
return self
def fit_transform(self, df, y=None):
self.fit(df, y)
return self.transform(df)
def transform_row_(self, row, t):
ffm = []
if self.y != None:
ffm.append(str(row.loc[row.index == self.y][0]))
if self.y is None:
ffm.append(str(0))
for col, val in row.loc[row.index != self.y].to_dict().items():
col_type = t[col]
name = '{}_{}'.format(col, val)
if col_type.kind == 'O':
ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
elif col_type.kind == 'i':
ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
return ' '.join(ffm)
def transform(self, df):
t = df.dtypes.to_dict()
return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
########################### Lets build some data and test ############################
###
ffm_train = FFMFormatPandas()
ffm_train_data = ffm_train.fit_transform(X_train, y='Loan_Status')
ffm_test = FFMFormatPandas()
ffm_test_data = ffm_train.fit_transform(X_test, y='Loan_Status')
ffm_train_data.to_csv("./model_out/train_ffm.txt") # these snippets
ffm_test_data.to_csv("./model_out/test_ffm.txt") #
The training loss was very low, so I thought the model worked well — but the problem was that the predicted values were almost all identical, e.g. 0.99999, 0.99998, and so on.
I tried to figure out why this happened and finally found that my processed .txt data included "indices" from the original pandas dataframe.
I added the index=False argument to the final to_csv calls, and that fixed it.
hope this can help someone.