Default Credit Card Client Predictor

Author: Jacqueline Chong, Junrong Zhu, Lianna Hovhannisyan, Macy Chan

Imports

1. Introduction


In this mini project, we tackle the classification problem of predicting whether a credit card client will default or not. For this problem, the Default of Credit Card Clients Dataset is used. The data set contains 30,000 examples and 24 features, and the goal is to estimate whether a person will default on (fail to pay) their credit card bills; this column is labeled "default.payment.next.month" in the data. The rest of the columns can be used as features. For additional information about the data set, an associated research paper is available through the UBC library.




The dataset is based on Taiwan's credit card client default cases from April to September. It has 30,000 examples, and each example represents a particular client's information. Each example has 24 features with respective values such as gender, age, marital status, the last 6 months' bill statements, the last 6 months' payments, etc., including the target column, default payment next month: labeled 1 (the client will default) and 0 (the client will not default). A detailed description of each feature can be found here.

As seen above, the data set does not contain any missing values.

We decided not to change the feature names, as we find them sufficiently self-explanatory.



2. Data splitting


  1. Split the data into train and test portions.

We decided to put 30% of the observations in the test set and 70% in the train set. Overall, the data set has 30,000 observations, so the test set should have enough examples to provide a reliable assessment of the model: more precisely, the train set will have 21,000 observations and the test set 9,000.
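
A minimal sketch of this split, assuming the raw data is loaded into a pandas DataFrame named `credit_df` (the variable name, random seed, and stratification are illustrative choices; the target column name comes from the data set):

```python
from sklearn.model_selection import train_test_split

# 70/30 train/test split, stratified on the target to preserve the class ratio
train_df, test_df = train_test_split(
    credit_df,
    test_size=0.3,
    random_state=123,
    stratify=credit_df["default.payment.next.month"],
)
```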



3. EDA


  1. The count, as well as the percentage distribution of the classes, indicates that there is an imbalance between the No (0) and Yes (1) classes. Overall, we are more interested in minimizing false negatives (predicting no default payment when in reality the client defaults the following month) than false positives (predicting a default payment when in reality no default is made by the client). For creditors, it is of utmost importance to have a model that correctly predicts an account's status next month, especially if the client is going to default. Correct predictions help creditors plan their risk and budget management better and take steps before the situation gets out of control.
  1. Therefore, as we have class imbalance, accuracy will not be used to evaluate the model. The chosen scoring metrics are F1 score, recall, and average precision. The F1 score is the harmonic mean of precision and recall and gives a good sense of both, so it will be our primary scoring metric throughout the evaluation; a short sketch of how these metrics are passed to scikit-learn follows this list.
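
The sketch below only bundles the chosen metrics using scikit-learn's built-in scorer names; the cross-validation call is shown as a commented usage example because the models are defined in later sections:

```python
from sklearn.model_selection import cross_validate

# Scoring metrics chosen for the imbalanced target
scoring_metrics = ["f1", "recall", "average_precision"]

# Used later as, for example:
# cross_validate(model, X_train, y_train, cv=5, scoring=scoring_metrics, return_train_score=True)
```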

Let's examine the relation between the amount of given credit and defaulting accounts. Since this analysis is done from the perspective of credit companies, the amount of credit granted to a person should reflect the trust the company has in that individual. Thus, the relation between defaults and the granted credit amount is of utmost interest for our analysis.

Indeed, we can see from the plot that as the amount of given credit increases, fewer accounts tend to default.

Looking at the relation between bill statements and the frequency of defaults, we see the same pattern: higher bill statements are associated with fewer default accounts.

We should be very careful when examining sensitive data such as gender. As shown in the table below, we have more females than males in our data set.

However, we can see from the diagram and the summary statistics table that males are more likely to make a default payment than females. This is an interesting observation; we acknowledge, however, that we are working with a limited amount of data and that other factors may have influenced this statistic. Sensitive data such as gender should be handled with the utmost care, but as our model does not cause direct harm to a particular group, we decided to keep it among our features.



4. Feature engineering


We decided to add the features 'avg_bill_amt' and 'avg_pay_amt'. In addition to the individual payment/bill columns (PAY_AMT1 to PAY_AMT6 and BILL_AMT1 to BILL_AMT6), the average payment column ('avg_pay_amt') and the average bill column ('avg_bill_amt') better reflect the overall credit card payment/spending pattern and may provide useful information for training the model.
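
A minimal sketch of adding the two averaged features, assuming `train_df`/`test_df` from the split above and that the averages are taken over the BILL_AMT and PAY_AMT columns:

```python
bill_cols = [f"BILL_AMT{i}" for i in range(1, 7)]    # BILL_AMT1 ... BILL_AMT6
pay_amt_cols = [f"PAY_AMT{i}" for i in range(1, 7)]  # PAY_AMT1 ... PAY_AMT6

for split in (train_df, test_df):
    # Row-wise averages of the six monthly bill statements and payment amounts
    split["avg_bill_amt"] = split[bill_cols].mean(axis=1)
    split["avg_pay_amt"] = split[pay_amt_cols].mean(axis=1)

# Feature/target split used in the sections below
X_train = train_df.drop(columns=["default.payment.next.month"])
y_train = train_df["default.payment.next.month"]
X_test = test_df.drop(columns=["default.payment.next.month"])
y_test = test_df["default.payment.next.month"]
```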



5. Preprocessing and transformations


| Type | Reason | Features |
|------|--------|----------|
| category | All of these features have a fixed number of categories. MARRIAGE has 4 classes, and we treat it as categorical rather than ordinal because we did not want to introduce bias by ranking the different statuses in any particular order; similar reasoning applies to SEX. For EDUCATION, some values are labeled "unknown" or "others"; since their meanings are vague, we do not want to group them together, as they may each carry unique patterns, so we treat this feature as categorical as well. | SEX, MARRIAGE, EDUCATION |
| ordinal | Sequential values ranging from -2 to 8 (-2 as the best, 8 as the worst). | PAY_0, PAY_2, PAY_3, PAY_4, PAY_5, PAY_6 |
| numeric | Numeric columns need standardization. | LIMIT_BAL, AGE, BILL_AMT1, BILL_AMT2, BILL_AMT3, BILL_AMT4, BILL_AMT5, BILL_AMT6, PAY_AMT1, PAY_AMT2, PAY_AMT3, PAY_AMT4, PAY_AMT5, PAY_AMT6, avg_bill_amt, avg_pay_amt |
| drop | Unique identifier for every record in the data set (would not help model training). | ID |
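
A sketch of the corresponding preprocessor; the specific transformer classes are assumptions consistent with the table above:

```python
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

categorical_features = ["SEX", "MARRIAGE", "EDUCATION"]
ordinal_features = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
drop_features = ["ID"]
numeric_features = [
    "LIMIT_BAL", "AGE",
    "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
    "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6",
    "avg_bill_amt", "avg_pay_amt",
]

preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), categorical_features),
    (OrdinalEncoder(), ordinal_features),  # PAY_* values are already ordered (-2 ... 8)
    (StandardScaler(), numeric_features),
    ("drop", drop_features),
)
```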





6. Baseline model
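
A minimal baseline sketch, assuming a most-frequent-class DummyClassifier scored with the metrics chosen above (`X_train`, `y_train`, and `scoring_metrics` come from the earlier sketches):

```python
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate

dummy = DummyClassifier(strategy="most_frequent")

# The baseline predicts the majority class ("no default") for every example, so
# recall and f1 for the default class are 0 and average precision equals the
# prevalence of the positive class.
dummy_results = cross_validate(
    dummy, X_train, y_train, cv=5, scoring=scoring_metrics, return_train_score=True
)
```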




7. Linear models


We will carry out hyperparameter optimization: C controls the regularization strength, and the class_weight hyperparameter helps tackle the class imbalance.
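
A sketch of this search; the use of RandomizedSearchCV and the search ranges are illustrative assumptions, while `preprocessor`, `X_train`, and `y_train` come from the earlier sketches:

```python
from scipy.stats import loguniform
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline

lr_pipe = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))

# Search over regularization strength and class weighting (bounds are illustrative)
lr_param_dist = {
    "logisticregression__C": loguniform(1e-3, 1e3),
    "logisticregression__class_weight": [None, "balanced"],
}

lr_search = RandomizedSearchCV(
    lr_pipe, lr_param_dist, n_iter=20, scoring="f1", cv=5, n_jobs=-1, random_state=123
)
lr_search.fit(X_train, y_train)
```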

We can see that with optimized hyperparameters, Logistic Regression is doing much better. However, it is clear that we are dealing with underfitting (there is almost no gap between the train and validation scores). The standard deviation is very small, within about ±0.01.

From the confusion matrix, we can see that we have many more false positives than false negatives, which is also reflected in the average precision and f1 scores. Our overall goal is to maximize the f1 score, so in the later sections we will look at non-linear models to see if we can beat this score.
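
For reference, the confusion matrix discussed above can be obtained from cross-validated predictions, for example (a sketch using the tuned search object from above):

```python
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# Cross-validated predictions, so the matrix is not computed on data the model was fit on
y_pred_cv = cross_val_predict(lr_search.best_estimator_, X_train, y_train, cv=5)
print(confusion_matrix(y_train, y_pred_cv))
```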



8. Different models


Default parameters reasoning:
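
As a sketch of how the compared models could be set up and cross-validated with their default parameters (library imports and the fixed random seeds are assumptions; `preprocessor`, `scoring_metrics`, and the data splits come from earlier sketches):

```python
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier

models = {
    "RandomForest": RandomForestClassifier(random_state=123),
    "XGBoost": XGBClassifier(random_state=123),
    "LGBM": LGBMClassifier(random_state=123),
    "CatBoost": CatBoostClassifier(random_state=123, verbose=0),
}

results = {}
for name, model in models.items():
    # Each model sits behind the same preprocessing pipeline
    pipe = make_pipeline(preprocessor, model)
    results[name] = cross_validate(
        pipe, X_train, y_train, cv=5, scoring=scoring_metrics, return_train_score=True
    )
```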

All the non-linear models are overfitting, but compared to RandomForest and XGBoost, LGBM and CatBoost overfit the training data less, as the gap between their train and validation scores is smaller. The worst model in terms of overfitting is RandomForest, with an almost perfect score on the train set and scores lower than 0.5 on the cross-validation set (for the f1 score). Logistic Regression is underfitting: its training score is low and the gap between its train and validation scores is very small. Similarly, the dummy classifier underfits.

The LGBM Classifier has the lowest fit time, and compared to the other non-linear models the difference is quite large. Random Forest and CatBoost have the longest fit times, with the latter being the slowest overall.

Score time is fast for all classifiers except random forest. It is still fast, but compared to the other models we can notice a difference of around 0.1 s.

The scores are more or less stable, with standard deviations of around 0.01.

RandomForest gives better recall and average precision cross-validation scores than XGBoost; however, its f1 score is lower than that of the other non-linear models.

The best model among the non-linear models is the LGBM Classifier. Despite overfitting, it gives us the best f1 and average precision scores, as well as the lowest fit and score times. The worst models are Random Forest and XGBoost, as they have the lowest f1 scores among the non-linear models and take a long time to fit.

As for Logistic Regression, it gives us the best recall and f1 scores (though not the best average precision). Regarding recall, f1 score, and speed, LGBM is the only non-linear model comparable to Logistic Regression.

Thus, we cannot conclude that non-linear models beat linear models.



9. Feature selection


After including feature selection, the cv scores did not improve compared to our best non-linear model, LGBM: all three cv scores (f1, average precision, and recall) decreased. Even though overfitting is reduced (the gap between the train and cv scores is smaller), we do not see an improvement in overall performance from including feature selection.
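
The report does not spell out the selection method here, so the following is only one possible sketch, using recursive feature elimination (RFE) in front of the LGBM classifier; the selector and the number of features kept are illustrative assumptions:

```python
from lightgbm import LGBMClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline

# Feature-selection step placed between the preprocessor and the classifier
fs_pipe = make_pipeline(
    preprocessor,
    RFE(LGBMClassifier(random_state=123), n_features_to_select=15),
    LGBMClassifier(random_state=123),
)

fs_results = cross_validate(
    fs_pipe, X_train, y_train, cv=5, scoring=scoring_metrics, return_train_score=True
)
```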



10. Hyperparameter optimization


We will start with the RandomForest Classifier and optimize three of its hyperparameters: n_estimators, which determines the number of trees in the forest; max_depth, which determines the number of levels in each tree; and class_weight, which is used to tackle class imbalance (it controls the weights associated with the classes).
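
A sketch of this search; the use of RandomizedSearchCV and the distribution bounds are illustrative assumptions:

```python
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline

rf_pipe = make_pipeline(preprocessor, RandomForestClassifier(random_state=123))

rf_param_dist = {
    "randomforestclassifier__n_estimators": randint(100, 500),
    "randomforestclassifier__max_depth": randint(3, 20),
    "randomforestclassifier__class_weight": [None, "balanced"],
}

rf_search = RandomizedSearchCV(
    rf_pipe, rf_param_dist, n_iter=20, scoring="f1", cv=5, n_jobs=-1, random_state=123
)
rf_search.fit(X_train, y_train)
```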

Next, we will optimize a hyperparameter for XGBoost: max_depth, which determines the maximum depth of a tree (more depth means a more complex model).

Next, we will optimize hyperparameters for the LGBM Classifier: max_depth, which determines the depth of each tree; num_leaves, which determines the number of leaves for the base learners; and class_weight for tackling class imbalance (it controls the weights associated with the classes).

Lastly, we will perform hyperparameter optimization for the CatBoost Classifier: max_depth, which determines the depth of each tree; learning_rate, which controls the gradient step size (smaller steps, more iterations) and can help minimize the error associated with the loss function; and auto_class_weights, which determines the weight given to each class.
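
A compact sketch of the search spaces for the three boosting models; the hyperparameter names match the paragraphs above, while the distribution bounds are illustrative assumptions:

```python
from scipy.stats import loguniform, randint

boosting_param_dists = {
    "XGBoost": {"xgbclassifier__max_depth": randint(2, 10)},
    "LGBM": {
        "lgbmclassifier__max_depth": randint(2, 10),
        "lgbmclassifier__num_leaves": randint(10, 200),
        "lgbmclassifier__class_weight": [None, "balanced"],
    },
    "CatBoost": {
        "catboostclassifier__max_depth": randint(2, 10),
        "catboostclassifier__learning_rate": loguniform(1e-3, 0.3),
        "catboostclassifier__auto_class_weights": [None, "Balanced"],
    },
}
```

Each of these spaces would be passed to a RandomizedSearchCV over the corresponding pipeline, exactly as in the RandomForest sketch above.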

We are going to group the classifiers with optimized hyperparameters in a dictionary and compute the mean cross-validation scores for each, summarizing them in the results dictionary. They will appear alongside the 'unoptimized' versions, making them easier to compare.
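
A sketch of that summary step; the search objects for the boosting models (`xgb_search`, `lgbm_search`, `cat_search`) are assumed to have been fit analogously to `rf_search` above, and their names are illustrative:

```python
import pandas as pd
from sklearn.model_selection import cross_validate

optimized_models = {
    "RandomForest_opt": rf_search.best_estimator_,
    "XGBoost_opt": xgb_search.best_estimator_,
    "LightGBM_opt": lgbm_search.best_estimator_,
    "CatBoost_opt": cat_search.best_estimator_,
}

summary = {}
for name, model in optimized_models.items():
    cv = cross_validate(
        model, X_train, y_train, cv=5, scoring=scoring_metrics, return_train_score=True
    )
    # Keep only the mean of each metric for a compact comparison table
    summary[name] = {metric: scores.mean() for metric, scores in cv.items()}

pd.DataFrame(summary).T
```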

We notice that after optimization, we get better cross-validation scores for all non-linear models.

Overfitting is reduced in comparison with the "unoptimized" models; however, all models still seem to overfit. RandomForest gives much better results: the gap between its cv and train scores is much smaller, and all its other scores are up. In fact, its scores became competitive with those of the other non-linear models. After optimization, CatBoost seems to overfit more.

The standard deviation is low for all models.

The best model after hyperparameter optimization is the LGBM Classifier, which gives us the highest cv f1 score and is the fastest among all models. Even though Logistic Regression has a higher recall score than LGBM, in overall performance we prioritize the f1 score. Also, the difference in score and fit time between LGBM and Logistic Regression is not very large, so both models can be considered quick. Thus we are choosing LGBM as our final model, with hyperparameters as follows: 'lgbmclassifier__num_leaves': 100, 'lgbmclassifier__max_depth': 5, 'lgbmclassifier__class_weight': 'balanced', and an f1 score of 0.546.
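
A sketch of refitting the final model with the selected hyperparameters (the hyperparameter values come from the search above; the random seed is an illustrative choice):

```python
from lightgbm import LGBMClassifier
from sklearn.pipeline import make_pipeline

# Final pipeline with the selected hyperparameters
final_lgbm = make_pipeline(
    preprocessor,
    LGBMClassifier(
        num_leaves=100, max_depth=5, class_weight="balanced", random_state=123
    ),
)
final_lgbm.fit(X_train, y_train)
```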



11. Interpretation and feature importances


We examined the most important features of the LGBMClassifier with both the SHAP and eli5 methods. The results from these two methods are somewhat similar, as 4 of the top 5 features from eli5 are also the top 4 features from SHAP (namely PAY_0, LIMIT_BAL, BILL_AMT1, and PAY_AMT2).
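
A sketch of the SHAP side of this analysis, assuming the `final_lgbm` pipeline from the previous section (step names follow scikit-learn's make_pipeline naming convention):

```python
import shap

# Pull the fitted classifier and the transformed training features out of the pipeline
column_transformer = final_lgbm.named_steps["columntransformer"]
lgbm_model = final_lgbm.named_steps["lgbmclassifier"]

X_train_enc = column_transformer.transform(X_train)
feature_names = column_transformer.get_feature_names_out()

explainer = shap.TreeExplainer(lgbm_model)
shap_values = explainer.shap_values(X_train_enc)
shap.summary_plot(shap_values, X_train_enc, feature_names=feature_names)
```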

From the SHAP summary plot, it is easy to spot that PAY_0 has a significantly larger mean absolute SHAP value compared to all other features. Similarly, PAY_0 has a much larger weight than any other feature in the eli5 output table.

On the other hand, some features that eli5 ranks as roughly equally important are ranked differently by SHAP.

An interesting aspect of this result is that treating PAY_0 as the most important feature implies that the repayment status in September is the most essential variable of all, which does not necessarily align with reality.



12. Results on the test set


The test set scores were slightly lower than the validation scores. As the training and test sets are rather large (21,000 and 9,000 observations respectively), we trust our results. Optimization bias occurs when the training set is small and many models are evaluated during hyperparameter optimization, which might cause us to get a good validation score "by chance". However, since the validation and test scores are comparable, we do not think we had issues with optimization bias.
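
A sketch of how the final pipeline is scored on the held-out test set with the chosen metrics (`final_lgbm`, `X_test`, and `y_test` come from the earlier sketches):

```python
from sklearn.metrics import average_precision_score, f1_score, recall_score

# Final evaluation on the held-out test set
y_pred = final_lgbm.predict(X_test)
y_proba = final_lgbm.predict_proba(X_test)[:, 1]

print("f1:               ", f1_score(y_test, y_pred))
print("recall:           ", recall_score(y_test, y_pred))
print("average precision:", average_precision_score(y_test, y_proba))
```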

Part 3

Splitting the classes into two separate lists. One for no_default and the other for yes_default:
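
The exact code is not shown here, so the following is only one possible reading of this step (splitting the test examples by their true class; splitting on predictions would be the other option):

```python
# Test examples grouped by their true class label
no_default = X_test[y_test == 0]
yes_default = X_test[y_test == 1]
```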



13. Summary of results


Amongst the models, it is clear that the model we have chosen, LightGBM_opt, is optimal in terms of the total time required to fit and score, and has the best f1, recall, and average precision scores. Moreover, the differences in scores between the test set and the validation set are minimal, suggesting that there is minimal optimization bias. Based on our test scores, which are highlighted in part 12 of this lab, our model is able to identify about 60.9% of the true default cases and correctly identifies 54.0% of all default cases.

First, it is important to highlight that the discussion of the scores and the model should not be misinterpreted. For instance, in the feature importance interpretation, our model suggested that PAY_0 carries much more weight than the other repayment-status features (PAY_2, PAY_3, etc.). In reality, customer credit is assessed in a longer and more consistent manner, meaning payments made in every month should be treated more equally. Since the model suggests that the payment status of one particular month matters far more than the remaining months, we should not be overly confident when interpreting the model's top important features.

Moreover, as discussed in the EDA, a gender imbalance is present in the dataset. To avoid further debate and possible ethical misinterpretation, we believe it would be better to include an analysis of results broken down by group in later studies.

There is definitely room for improvement in our project, both in terms of performance (CV scores) and interpretability.

In terms of the choice of models, we could have attempted various ensemble methods, such as averaging and stacking, to improve the performance of our model. This would allow us to combine different models and potentially get better results. However, it would decrease interpretability, since randomness is injected, and also reduce code maintainability, as each model needs to be optimized individually.

In terms of the actual dataset, although we attempted to create new features, we did not test for correlation between pairs of the explanatory variables. We could have calculated the variance inflation factor and manually removed several features. While this is not a foolproof method and could lead to underfitting, we could have explored that option.