XGBoost

Introduction

Example

Training XGBoost with the credit-bias dataset.

import pandas as pd

data = pd.read_csv("data/credit-bias-train.zip")
data.head()
   NewCreditCustomer  Amount  Interest  LoanDuration  Education  \
0              False  2125.0     20.97            60        4.0
1              False  3000.0     17.12            60        5.0
2               True  9100.0     13.67            60        4.0
3               True   635.0     42.66            60        2.0
4              False  5000.0     24.52            60        4.0

   NrOfDependants  EmploymentDurationCurrentEmployer  \
0             0.0                                6.0
1             0.0                                6.0
2             1.0                                3.0
3             0.0                                1.0
4             1.0                                5.0

   IncomeFromPrincipalEmployer  IncomeFromPension  IncomeFromFamilyAllowance  \
0                          0.0              301.0                        0.0
1                        900.0                0.0                        0.0
2                        600.0                0.0                        0.0
3                        745.0                0.0                        0.0
4                       1000.0                0.0                        0.0

   ...  Mortgage  Other  Owner  Owner_with_encumbrance  Tenant  Entrepreneur  \
0  ...         0      0      1                       0       0             0
1  ...         0      0      1                       0       0             1
2  ...         1      0      0                       0       0             1
3  ...         0      0      0                       0       1             0
4  ...         0      0      0                       0       0             0

   Fully  Partially  Retiree  Self_employed
0      0          0        1              0
1      0          0        0              0
2      0          0        0              0
3      1          0        0              0
4      1          0        0              0

[5 rows x 40 columns]
X_df = data.drop('PaidLoan', axis=1)
y_df = data['PaidLoan']
y_df.describe()
count     58003
unique        2
top        True
freq      29219
Name: PaidLoan, dtype: object
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X_df, y_df, test_size=0.25, random_state=42)
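
The scale_pos_weight parameter used in the grid search below is derived from the class balance of the training target, so it can help to inspect that balance first. A minimal sketch, not part of the original pipeline:

# Roughly half of the 58,003 loans have PaidLoan == True (29,219), but
# scale_pos_weight below still accounts for the exact negative/positive ratio.
print(train_y.value_counts(normalize=True))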

Runs a grid search to find the tuning parameters that maximise the area under the curve (AUC). train_x is the training dataframe with loan details and train_y is the default target column for training. The method returns the best parameters and the corresponding AUC score.

from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
from typing import Tuple

def find_best_xgboost_model(train_x: pd.DataFrame, train_y: pd.Series) -> Tuple[dict, float]:
    # Ratio of negative to positive examples, used to counter class imbalance.
    scale_pos_weight = (len(train_y) - train_y.sum()) / train_y.sum()

    param_test = {
            'max_depth': [1, 2, 4, 8],
            'learning_rate': [0.05, 0.06, 0.07],
            'n_estimators': [10, 100, 200]
        }

    gsearch = GridSearchCV(estimator=XGBClassifier(
        objective='binary:logistic',
        scale_pos_weight=scale_pos_weight,
        seed=27),
        param_grid=param_test, scoring='roc_auc', n_jobs=-1, cv=8)

    gsearch.fit(train_x, train_y)

    return gsearch.best_params_, gsearch.best_score_
best_params, best_score = find_best_xgboost_model(train_x, train_y)
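
A quick way to inspect the outcome of the search (a minimal sketch; the exact values depend on the data and the grid above):

print("Best parameters: {}".format(best_params))
print("Best cross-validated AUC: {:.3f}".format(best_score))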

Using the best XGBoost parameters found by the grid search, it trains a model and predicts the probabilities of defaulting.

  • best_params_, best tuning parameters
  • train_x, training dataframe with loan details
  • train_y, default target column for training
  • test_x, testing dataframe with loan details
  • test_y, default target column for testing

The result is a series of probabilities indicating whether each loan entry will default or not, together with the model's AUC score.

from sklearn.metrics import roc_auc_score

def xgboost_predict(best_params_: dict, train_x: pd.DataFrame, train_y: pd.Series, test_x: pd.DataFrame,
                    test_y: pd.Series) -> Tuple[list, float]:
    scale_pos_weight = (len(train_y) - train_y.sum()) / train_y.sum()
    xgb_model = XGBClassifier(objective='binary:logistic',
                              scale_pos_weight=scale_pos_weight,
                              seed=27,
                              max_depth=best_params_['max_depth'],
                              learning_rate=best_params_['learning_rate'],
                              n_estimators=best_params_['n_estimators']
                              )

    # Train with the tuned parameters, then predict the probability of the
    # positive class for each loan in the test set and score it with AUC.
    xgb_model.fit(train_x, train_y)
    predicted_probabilities_ = xgb_model.predict_proba(test_x)[:, 1]
    auc_ = roc_auc_score(test_y, predicted_probabilities_)

    return predicted_probabilities_, auc_
predicted_probabilities, auc = xgboost_predict(best_params, train_x, train_y, test_x, test_y)
print("AUC: {}".format(auc))

Filters the original loan dataframe to include just the loans from the test dataframe and then adds the predicted probabilities.

  • loans_df_, original loan dataframe
  • test_index, indices from the test dataframes
  • predicted_probabilities_, the probabilities forecasted by the XGBoost model

Returns the loans dataframe with predictions

import numpy as np

def prepare_test_with_predictions(loans_df_: pd.DataFrame, test_index: pd.Index,
                                  predicted_probabilities_: np.ndarray) -> pd.DataFrame:
    # Copy the test rows so the original dataframe is left untouched, then
    # attach the model's predicted probabilities as a new column.
    loan_test_df = loans_df_.loc[test_index].copy()
    loan_test_df['predicted_probabilities'] = predicted_probabilities_
    return loan_test_df
loans_with_predictions_df = prepare_test_with_predictions(data, test_x.index, predicted_probabilities)
loans_with_predictions_df.head()
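
Sorting by the predicted probabilities is a simple way to inspect the loans the model is most and least confident about (a sketch, not part of the original pipeline):

# Loans with the lowest predicted probabilities come first; use
# ascending=False to see the highest ones instead.
loans_with_predictions_df.sort_values('predicted_probabilities').head()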

Visualisation

import seaborn as sns

sns.histplot(loans_with_predictions_df['predicted_probabilities'], stat='density')
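
A useful variation (a sketch using the same seaborn API) is to split the distribution by the actual PaidLoan outcome, which shows how well the predicted probabilities separate the two classes:

sns.histplot(data=loans_with_predictions_df, x='predicted_probabilities',
             hue='PaidLoan', stat='density', common_norm=False)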

ROC and AUC

Based on the actual and predicted values, it calculates the false positive rate (fpr) and the true positive rate (tpr). It also returns the corresponding thresholds as well as the value for the area under the curve.

  • actuals, series of actual values indicating whether the loan defaulted or not
  • predicted_probabilities, series of predicted probabilities of the loan defaulting

Returns a unique series of false and true positive rates with the corresponding series of thresholds and the value for the total area under the curve.

from sklearn.metrics import roc_curve, auc

def get_roc_auc_data(actuals: pd.Series, predicted_probabilities: pd.Series) -> \
        Tuple[np.ndarray, np.ndarray, np.ndarray, float]:
    # roc_curve returns the false/true positive rates at each threshold;
    # auc integrates the resulting curve into a single score.
    fpr, tpr, thresholds = roc_curve(actuals, predicted_probabilities, pos_label=1)
    auc_score = auc(fpr, tpr)
    return fpr, tpr, thresholds, auc_score
fpr, tpr, thresholds, auc_score = get_roc_auc_data(loans_with_predictions_df['PaidLoan'], loans_with_predictions_df['predicted_probabilities'])
sns.lineplot(x=fpr, y=tpr)
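
For a fuller version of the plot above, the curve can be annotated with the random-classifier diagonal, axis labels, and the AUC score computed earlier (a sketch using matplotlib, on which seaborn is built):

import matplotlib.pyplot as plt

ax = sns.lineplot(x=fpr, y=tpr)
# Dashed diagonal marks the performance of a random classifier.
ax.plot([0, 1], [0, 1], linestyle='--', color='grey')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve (AUC = {:.3f})'.format(auc_score))
plt.show()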