sfguide-getting-started-machine-learning
sfguide-getting-started-machine-learning copied to clipboard
Issue with XGBoost Solution in 4_1_SOLUTION_additional_models_xgboost.ipynb
When running the example solution for the XGBoost enhancement (4_1_SOLUTION_additional_models_xgboost.ipynb), I get the following error in cell 19:
```python
%%time
scored_sdf = test_sdf.with_column('PREDICTION', udf_score_logistic_xgboost_model_vec_cached(*feature_cols))
scored_sdf.write.save_as_table(table_name='CREDIT_RISK_PREPARED_BALANCED_TEST_SCORED', mode='overwrite')
```
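When this cell is executed, it fails with a feature-name mismatch error raised inside the UDF udf_score_logistic_xgboost_model_vec_cached: the model expects the named feature columns it was trained on, but the input it receives inside the UDF has columns named '0' through '47':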
Failed to execute query [queryID: ] CREATE OR REPLACE TABLE CREDIT_RISK_PREPARED_BALANCED_TEST_SCORED AS SELECT * FROM ( SELECT "FLAG_WORK_PHONE", "FLAG_PHONE", "FLAG_EMAIL", "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "CNT_CHILDREN_IND_0", "CNT_CHILDREN_IND_1", "CNT_CHILDREN_IND_2MORE", "AMT_INCOME_TOTAL", "AMT_INCOME_TOTAL_BUCKET_HIGH", "AMT_INCOME_TOTAL_BUCKET_LOW", "AMT_INCOME_TOTAL_BUCKET_MEDIUM", "AGE_BUCKET_HIGH", "AGE_BUCKET_HIGHEST", "AGE_BUCKET_LOW", "AGE_BUCKET_LOWEST", "AGE_BUCKET_MEDIUM", "WORKYEAR_BUCKET_HIGH", "WORKYEAR_BUCKET_HIGHEST", "WORKYEAR_BUCKET_LOW", "WORKYEAR_BUCKET_LOWEST", "WORKYEAR_BUCKET_MEDIUM", "CNT_FAM_MEMBERS_IND_1", "CNT_FAM_MEMBERS_IND_2", "CNT_FAM_MEMBERS_IND_3MORE", "NAME_INCOME_TYPE_COMMERCIAL_ASSOCIATE", "NAME_INCOME_TYPE_STATE_SERVANT", "NAME_INCOME_TYPE_WORKING", "OCCUPATION_TYPE_HIGHTECHWORK", "OCCUPATION_TYPE_LABOURWORK", "OCCUPATION_TYPE_OFFICEWORK", "OCCUPATION_TYPE_OTHER", "NAME_HOUSING_TYPE_CO_OP_APARTMENT", "NAME_HOUSING_TYPE_HOUSE_APARTMENT", "NAME_HOUSING_TYPE_MUNICIPAL_APARTMENT", "NAME_HOUSING_TYPE_OFFICE_APARTMENT", "NAME_HOUSING_TYPE_RENTED_APARTMENT", "NAME_HOUSING_TYPE_WITH_PARENTS", "NAME_EDUCATION_TYPE_HIGHER_EDUCATION", "NAME_EDUCATION_TYPE_INCOMPLETE_HIGHER", "NAME_EDUCATION_TYPE_LOWER_SECONDARY", "NAME_EDUCATION_TYPE_SECONDARY_SECONDARY_SPECIAL", "NAME_FAMILY_STATUS_CIVIL_MARRIAGE", "NAME_FAMILY_STATUS_MARRIED", "NAME_FAMILY_STATUS_SEPARATED", "NAME_FAMILY_STATUS_SINGLE_NOT_MARRIED", "NAME_FAMILY_STATUS_WIDOW", "TARGET", udf_score_logistic_xgboost_model_vec_cached("FLAG_WORK_PHONE", "FLAG_PHONE", "FLAG_EMAIL", "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "CNT_CHILDREN_IND_0", "CNT_CHILDREN_IND_1", "CNT_CHILDREN_IND_2MORE", "AMT_INCOME_TOTAL", "AMT_INCOME_TOTAL_BUCKET_HIGH", "AMT_INCOME_TOTAL_BUCKET_LOW", "AMT_INCOME_TOTAL_BUCKET_MEDIUM", "AGE_BUCKET_HIGH", "AGE_BUCKET_HIGHEST", "AGE_BUCKET_LOW", "AGE_BUCKET_LOWEST", "AGE_BUCKET_MEDIUM", "WORKYEAR_BUCKET_HIGH", "WORKYEAR_BUCKET_HIGHEST", 
"WORKYEAR_BUCKET_LOW", "WORKYEAR_BUCKET_LOWEST", "WORKYEAR_BUCKET_MEDIUM", "CNT_FAM_MEMBERS_IND_1", "CNT_FAM_MEMBERS_IND_2", "CNT_FAM_MEMBERS_IND_3MORE", "NAME_INCOME_TYPE_COMMERCIAL_ASSOCIATE", "NAME_INCOME_TYPE_STATE_SERVANT", "NAME_INCOME_TYPE_WORKING", "OCCUPATION_TYPE_HIGHTECHWORK", "OCCUPATION_TYPE_LABOURWORK", "OCCUPATION_TYPE_OFFICEWORK", "OCCUPATION_TYPE_OTHER", "NAME_HOUSING_TYPE_CO_OP_APARTMENT", "NAME_HOUSING_TYPE_HOUSE_APARTMENT", "NAME_HOUSING_TYPE_MUNICIPAL_APARTMENT", "NAME_HOUSING_TYPE_OFFICE_APARTMENT", "NAME_HOUSING_TYPE_RENTED_APARTMENT", "NAME_HOUSING_TYPE_WITH_PARENTS", "NAME_EDUCATION_TYPE_HIGHER_EDUCATION", "NAME_EDUCATION_TYPE_INCOMPLETE_HIGHER", "NAME_EDUCATION_TYPE_LOWER_SECONDARY", "NAME_EDUCATION_TYPE_SECONDARY_SECONDARY_SPECIAL", "NAME_FAMILY_STATUS_CIVIL_MARRIAGE", "NAME_FAMILY_STATUS_MARRIED", "NAME_FAMILY_STATUS_SEPARATED", "NAME_FAMILY_STATUS_SINGLE_NOT_MARRIED", "NAME_FAMILY_STATUS_WIDOW") AS "PREDICTION" FROM ( SELECT * FROM (CREDIT_RISK_PREPARED_BALANCED_TEST))) 100357 (P0000): Python Interpreter Error: Traceback (most recent call last): File "_udf_code.py", line 52, in compute File "_udf_code.py", line 41, in wrapper File "C:\Users\astifora.BISONTRANSPORT\AppData\Local\Temp\ipykernel_16188\4020036755.py", line 19, in udf_score_logistic_xgboost_model_vec_cached File "/usr/lib/python_udf/c40e74599dd9deffee89cef7cfe7e0d86d5de0207a8dc18efb524438272a3cb2/lib/python3.8/site-packages/xgboost/sklearn.py", line 1525, in predict class_probs = super().predict( File "/usr/lib/python_udf/c40e74599dd9deffee89cef7cfe7e0d86d5de0207a8dc18efb524438272a3cb2/lib/python3.8/site-packages/xgboost/sklearn.py", line 1114, in predict predts = self.get_booster().inplace_predict( File "/usr/lib/python_udf/c40e74599dd9deffee89cef7cfe7e0d86d5de0207a8dc18efb524438272a3cb2/lib/python3.8/site-packages/xgboost/core.py", line 2285, in inplace_predict self._validate_features(fns) File 
"/usr/lib/python_udf/c40e74599dd9deffee89cef7cfe7e0d86d5de0207a8dc18efb524438272a3cb2/lib/python3.8/site-packages/xgboost/core.py", line 2779, in _validate_features raise ValueError(msg.format(self.feature_names, feature_names)) ValueError: feature_names mismatch: ['AGE_BUCKET_HIGH', 'AGE_BUCKET_HIGHEST', 'AGE_BUCKET_LOW', 'AGE_BUCKET_LOWEST', 'AGE_BUCKET_MEDIUM', 'AMT_INCOME_TOTAL', 'AMT_INCOME_TOTAL_BUCKET_HIGH', 'AMT_INCOME_TOTAL_BUCKET_LOW', 'AMT_INCOME_TOTAL_BUCKET_MEDIUM', 'CNT_CHILDREN_IND_0', 'CNT_CHILDREN_IND_1', 'CNT_CHILDREN_IND_2MORE', 'CNT_FAM_MEMBERS_IND_1', 'CNT_FAM_MEMBERS_IND_2', 'CNT_FAM_MEMBERS_IND_3MORE', 'CODE_GENDER', 'FLAG_EMAIL', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_PHONE', 'FLAG_WORK_PHONE', 'NAME_EDUCATION_TYPE_HIGHER_EDUCATION', 'NAME_EDUCATION_TYPE_INCOMPLETE_HIGHER', 'NAME_EDUCATION_TYPE_LOWER_SECONDARY', 'NAME_EDUCATION_TYPE_SECONDARY_SECONDARY_SPECIAL', 'NAME_FAMILY_STATUS_CIVIL_MARRIAGE', 'NAME_FAMILY_STATUS_MARRIED', 'NAME_FAMILY_STATUS_SEPARATED', 'NAME_FAMILY_STATUS_SINGLE_NOT_MARRIED', 'NAME_FAMILY_STATUS_WIDOW', 'NAME_HOUSING_TYPE_CO_OP_APARTMENT', 'NAME_HOUSING_TYPE_HOUSE_APARTMENT', 'NAME_HOUSING_TYPE_MUNICIPAL_APARTMENT', 'NAME_HOUSING_TYPE_OFFICE_APARTMENT', 'NAME_HOUSING_TYPE_RENTED_APARTMENT', 'NAME_HOUSING_TYPE_WITH_PARENTS', 'NAME_INCOME_TYPE_COMMERCIAL_ASSOCIATE', 'NAME_INCOME_TYPE_STATE_SERVANT', 'NAME_INCOME_TYPE_WORKING', 'OCCUPATION_TYPE_HIGHTECHWORK', 'OCCUPATION_TYPE_LABOURWORK', 'OCCUPATION_TYPE_OFFICEWORK', 'OCCUPATION_TYPE_OTHER', 'WORKYEAR_BUCKET_HIGH', 'WORKYEAR_BUCKET_HIGHEST', 'WORKYEAR_BUCKET_LOW', 'WORKYEAR_BUCKET_LOWEST', 'WORKYEAR_BUCKET_MEDIUM'] ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47'] expected CNT_FAM_MEMBERS_IND_1, NAME_INCOME_TYPE_STATE_SERVANT, 
NAME_FAMILY_STATUS_SEPARATED, NAME_EDUCATION_TYPE_HIGHER_EDUCATION, CNT_CHILDREN_IND_1, NAME_HOUSING_TYPE_CO_OP_APARTMENT, NAME_HOUSING_TYPE_RENTED_APARTMENT, WORKYEAR_BUCKET_LOWEST, AMT_INCOME_TOTAL_BUCKET_LOW, NAME_EDUCATION_TYPE_INCOMPLETE_HIGHER, AGE_BUCKET_HIGH, FLAG_OWN_CAR, WORKYEAR_BUCKET_HIGH, AMT_INCOME_TOTAL, NAME_FAMILY_STATUS_SINGLE_NOT_MARRIED, OCCUPATION_TYPE_OTHER, WORKYEAR_BUCKET_MEDIUM, NAME_FAMILY_STATUS_CIVIL_MARRIAGE, NAME_HOUSING_TYPE_WITH_PARENTS, OCCUPATION_TYPE_OFFICEWORK, CODE_GENDER, NAME_INCOME_TYPE_WORKING, NAME_HOUSING_TYPE_MUNICIPAL_APARTMENT, CNT_FAM_MEMBERS_IND_3MORE, AGE_BUCKET_LOW, NAME_HOUSING_TYPE_HOUSE_APARTMENT, CNT_FAM_MEMBERS_IND_2, CNT_CHILDREN_IND_0, CNT_CHILDREN_IND_2MORE, OCCUPATION_TYPE_HIGHTECHWORK, OCCUPATION_TYPE_LABOURWORK, WORKYEAR_BUCKET_LOW, FLAG_EMAIL, AGE_BUCKET_LOWEST, AMT_INCOME_TOTAL_BUCKET_MEDIUM, NAME_EDUCATION_TYPE_LOWER_SECONDARY, AMT_INCOME_TOTAL_BUCKET_HIGH, FLAG_WORK_PHONE, NAME_INCOME_TYPE_COMMERCIAL_ASSOCIATE, AGE_BUCKET_HIGHEST, FLAG_OWN_REALTY, NAME_HOUSING_TYPE_OFFICE_APARTMENT, NAME_FAMILY_STATUS_MARRIED, WORKYEAR_BUCKET_HIGHEST, NAME_EDUCATION_TYPE_SECONDARY_SECONDARY_SPECIAL, NAME_FAMILY_STATUS_WIDOW, AGE_BUCKET_MEDIUM, FLAG_PHONE in input data training data did not have the following fields: 31, 25, 28, 9, 4, 39, 1, 11, 19, 3, 40, 7, 14, 6, 5, 35, 18, 15, 46, 17, 22, 13, 0, 21, 29, 43, 20, 30, 24, 42, 33, 37, 12, 8, 16, 23, 45, 26, 10, 41, 2, 27, 38, 44, 32, 47, 34, 36 in function UDF_SCORE_LOGISTIC_XGBOOST_MODEL_VEC_CACHED with handler compute