import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
from sklearn.model_selection import train_test_split
%matplotlib inline
%matplotlib widget
pd.set_option('display.max_columns', 50)


__author__ = "Frederick Emile Bondzie-Arthur"
__email__ = "Frederickauthur@hotmail.com"

DISCOVER

train_data= pd.read_csv("data/train_values.csv")
train_data_label= pd.read_csv("data/train_labels.csv")

final= pd.read_csv('data/test_values.csv')

train_data.head()

train_data_label.head()

print(train_data.shape)
print(train_data_label.shape)

(500000, 22)
(500000, 2)

data= train_data.merge(train_data_label, on='row_id')
data.head()

data.dtypes

row_id                              int64
loan_type                           int64
property_type                       int64
loan_purpose                        int64
occupancy                           int64
loan_amount                       float64
preapproval                         int64
msa_md                              int64
state_code                          int64
county_code                         int64
applicant_ethnicity                 int64
applicant_race                      int64
applicant_sex                       int64
applicant_income                  float64
population                        float64
minority_population_pct           float64
ffiecmedian_family_income         float64
tract_to_msa_md_income_pct        float64
number_of_owner-occupied_units    float64
number_of_1_to_4_family_units     float64
lender                              int64
co_applicant                         bool
accepted                            int64
dtype: object

print((data.astype(np.object) == '?').any())

row_id                            False
loan_type                         False
property_type                     False
loan_purpose                      False
occupancy                         False
loan_amount                       False
preapproval                       False
msa_md                            False
state_code                        False
county_code                       False
applicant_ethnicity               False
applicant_race                    False
applicant_sex                     False
applicant_income                  False
population                        False
minority_population_pct           False
ffiecmedian_family_income         False
tract_to_msa_md_income_pct        False
number_of_owner-occupied_units    False
number_of_1_to_4_family_units     False
lender                            False
co_applicant                      False
accepted                          False
dtype: bool

print((data.astype(np.object).isnull()).any())

row_id                            False
loan_type                         False
property_type                     False
loan_purpose                      False
occupancy                         False
loan_amount                       False
preapproval                       False
msa_md                            False
state_code                        False
county_code                       False
applicant_ethnicity               False
applicant_race                    False
applicant_sex                     False
applicant_income                   True
population                         True
minority_population_pct            True
ffiecmedian_family_income          True
tract_to_msa_md_income_pct         True
number_of_owner-occupied_units     True
number_of_1_to_4_family_units      True
lender                            False
co_applicant                      False
accepted                          False
dtype: bool

(data.isnull().sum()/ data.row_id.unique().shape[0] * 100).round(2)

row_id                            0.00
loan_type                         0.00
property_type                     0.00
loan_purpose                      0.00
occupancy                         0.00
loan_amount                       0.00
preapproval                       0.00
msa_md                            0.00
state_code                        0.00
county_code                       0.00
applicant_ethnicity               0.00
applicant_race                    0.00
applicant_sex                     0.00
applicant_income                  7.99
population                        4.49
minority_population_pct           4.49
ffiecmedian_family_income         4.49
tract_to_msa_md_income_pct        4.50
number_of_owner-occupied_units    4.51
number_of_1_to_4_family_units     4.51
lender                            0.00
co_applicant                      0.00
accepted                          0.00
dtype: float64

(final.isnull().sum()/ final.row_id.unique().shape[0] * 100).round(2)

row_id                            0.00
loan_type                         0.00
property_type                     0.00
loan_purpose                      0.00
occupancy                         0.00
loan_amount                       0.00
preapproval                       0.00
msa_md                            0.00
state_code                        0.00
county_code                       0.00
applicant_ethnicity               0.00
applicant_race                    0.00
applicant_sex                     0.00
applicant_income                  8.03
population                        4.50
minority_population_pct           4.50
ffiecmedian_family_income         4.49
tract_to_msa_md_income_pct        4.50
number_of_owner-occupied_units    4.51
number_of_1_to_4_family_units     4.51
lender                            0.00
co_applicant                      0.00
dtype: float64

data.isnull().sum()

row_id                                0
loan_type                             0
property_type                         0
loan_purpose                          0
occupancy                             0
loan_amount                           0
preapproval                           0
msa_md                                0
state_code                            0
county_code                           0
applicant_ethnicity                   0
applicant_race                        0
applicant_sex                         0
applicant_income                  39948
population                        22465
minority_population_pct           22466
ffiecmedian_family_income         22440
tract_to_msa_md_income_pct        22514
number_of_owner-occupied_units    22565
number_of_1_to_4_family_units     22530
lender                                0
co_applicant                          0
accepted                              0
dtype: int64

final.isnull().sum()

row_id                                0
loan_type                             0
property_type                         0
loan_purpose                          0
occupancy                             0
loan_amount                           0
preapproval                           0
msa_md                                0
state_code                            0
county_code                           0
applicant_ethnicity                   0
applicant_race                        0
applicant_sex                         0
applicant_income                  40141
population                        22480
minority_population_pct           22482
ffiecmedian_family_income         22453
tract_to_msa_md_income_pct        22517
number_of_owner-occupied_units    22574
number_of_1_to_4_family_units     22550
lender                                0
co_applicant                          0
dtype: int64

filter1 = data["msa_md"].isin([-1]) 
filter2 = data["county_code"].isin([-1]) 
filter3 = data["state_code"].isin([-1]) 

# displaying dataframe  with all filter applied and mandatory  
data[filter1 | filter2| filter3].head()

print(data.msa_md[filter1].count())
print(data.county_code[filter2].count())
print(data.state_code[filter3].count())

76982
20466
19132

print(round((data.msa_md[filter1].count()/data.row_id.unique().shape[0] * 100),2))
print(round((data.county_code[filter2].count()/data.row_id.unique().shape[0] * 100),2))
print(round((data.state_code[filter3].count()/data.row_id.unique().shape[0] * 100),2))

15.4
4.09
3.83

print(data.shape)
print(data.row_id.unique().shape)

(500000, 23)
(500000,)

data.describe().round(2)

Since the exist some missing data. We use the median of the data to fill the missing values.

data_median= data.median()
data_median

row_id                            249999.500
loan_type                              1.000
property_type                          1.000
loan_purpose                           2.000
occupancy                              1.000
loan_amount                          162.000
preapproval                            3.000
msa_md                               192.000
state_code                            26.000
county_code                          131.000
applicant_ethnicity                    2.000
applicant_race                         5.000
applicant_sex                          1.000
applicant_income                      74.000
population                          4975.000
minority_population_pct               22.901
ffiecmedian_family_income          67526.000
tract_to_msa_md_income_pct           100.000
number_of_owner-occupied_units      1327.000
number_of_1_to_4_family_units       1753.000
lender                              3731.000
co_applicant                           0.000
accepted                               1.000
dtype: float64

final_median= final.median()
final_median

row_id                            249999.500
loan_type                              1.000
property_type                          1.000
loan_purpose                           2.000
occupancy                              1.000
loan_amount                          162.000
preapproval                            3.000
msa_md                               192.000
state_code                            26.000
county_code                          131.000
applicant_ethnicity                    2.000
applicant_race                         5.000
applicant_sex                          1.000
applicant_income                      74.000
population                          4975.000
minority_population_pct               22.955
ffiecmedian_family_income          67514.000
tract_to_msa_md_income_pct           100.000
number_of_owner-occupied_units      1326.000
number_of_1_to_4_family_units       1753.000
lender                              3713.000
co_applicant                           0.000
dtype: float64

data.fillna(data_median,inplace=True)
data.shape

(500000, 23)

final.fillna(data_median,inplace=True)
final.shape

(500000, 22)

(data.isnull().sum()/ data.row_id.unique().shape[0] * 100).round(2)

row_id                            0.0
loan_type                         0.0
property_type                     0.0
loan_purpose                      0.0
occupancy                         0.0
loan_amount                       0.0
preapproval                       0.0
msa_md                            0.0
state_code                        0.0
county_code                       0.0
applicant_ethnicity               0.0
applicant_race                    0.0
applicant_sex                     0.0
applicant_income                  0.0
population                        0.0
minority_population_pct           0.0
ffiecmedian_family_income         0.0
tract_to_msa_md_income_pct        0.0
number_of_owner-occupied_units    0.0
number_of_1_to_4_family_units     0.0
lender                            0.0
co_applicant                      0.0
accepted                          0.0
dtype: float64

(final.isnull().sum()/ final.row_id.unique().shape[0] * 100).round(2)

row_id                            0.0
loan_type                         0.0
property_type                     0.0
loan_purpose                      0.0
occupancy                         0.0
loan_amount                       0.0
preapproval                       0.0
msa_md                            0.0
state_code                        0.0
county_code                       0.0
applicant_ethnicity               0.0
applicant_race                    0.0
applicant_sex                     0.0
applicant_income                  0.0
population                        0.0
minority_population_pct           0.0
ffiecmedian_family_income         0.0
tract_to_msa_md_income_pct        0.0
number_of_owner-occupied_units    0.0
number_of_1_to_4_family_units     0.0
lender                            0.0
co_applicant                      0.0
dtype: float64

data.shape

(500000, 23)

accepted_rate= data.accepted.value_counts()/data.shape[0]
accepted_rate

1    0.500228
0    0.499772
Name: accepted, dtype: float64

data.describe().round(2)

accepted_Summary= data.groupby('accepted')
accepted_Summary.mean()

corr=data.drop(['row_id','county_code','state_code'], axis=1).corr(method='spearman').round(2)
fig= plt.figure(figsize=(20,10))
colormap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, cmap=colormap, annot=True)
plt.xticks(rotation=45)
plt.xticks(range(len(corr.columns)), corr.columns)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.title('Spearman Correlation Heatmap')
plt.show()
plt.savefig('image1.png')

<Figure size 432x288 with 0 Axes>

corr.style.background_gradient().set_precision(2)

corr_with_acc=data.drop(['row_id','county_code','state_code'], axis=1).corr(method='spearman')['accepted'].sort_values(ascending=False)
plt.figure(figsize=(14,6))
corr_with_acc.drop("accepted").plot.bar()
plt.show()
plt.savefig('image8.png')

<Figure size 432x288 with 0 Axes>

cat_vars=['loan_type','property_type','loan_purpose','occupancy','preapproval','applicant_sex','co_applicant',
         'applicant_sex','applicant_race','applicant_ethnicity','msa_md','state_code','county_code']
num_vars=['loan_amount','population','applicant_income','minority_population_pct','ffiecmedian_family_income',
          'tract_to_msa_md_income_pct','number_of_owner-occupied_units','number_of_1_to_4_family_units']

def plot_voilin(combined, cols, col_x= 'accepted'):
    fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(30, 10))
    for col, subplot in zip(cols, ax.flatten()):
        sns.set_style("whitegrid")
        sns.violinplot(col_x, col, data=combined,ax=subplot)
        for label in subplot.get_xticklabels():
            label.set_rotation(90)
        

#voilin plot for numerical variable 
plot_voilin(data, num_vars)
plt.savefig('image1.png')

def plot_box(combined, cols, col_x= 'accepted'):
    fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(30, 10))
    for col, subplot in zip(cols, ax.flatten()):
        sns.set_style("whitegrid")
        sns.boxplot(col_x, col, data=combined,ax=subplot)
        for label in subplot.get_xticklabels():
            label.set_rotation(90)
        
#voilin plot for numerical variable 
plot_box(data, num_vars)
plt.savefig('image2.png')

def plot_den_hist(combined, cols, bins=10, hist= False):
    fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(20, 25))
    for col, subplot in zip(cols, ax.flatten()):
        sns.distplot(combined[col], bins= bins, rug=True, hist=hist, ax=subplot)
        for label in subplot.get_xticklabels():
            label.set_rotation(0)

#KDE plot for numerical variable, histogram not enabled
plot_den_hist(data, num_vars) 
plt.savefig('image3.png')

plot_den_hist(data, num_vars, hist=True)
plt.savefig('image3b.png')

def plot_bar(cat_cols):
    fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(30, 10))
    for col, subplot in zip(cat_cols, ax.flatten()):
        sns.countplot(data[col],hue=data['accepted'],ax=subplot)
        for label in subplot.get_xticklabels():
            label.set_rotation(90)

# plotting bar graph for categorical variables 
plot_bar(cat_vars)   
plt.savefig('image5.png')

data[num_vars].hist(bins=25, figsize=(20, 10), layout=(4, 4));
plt.savefig('image6.png')

From the graph, it can be seen that all data features are skwed except the ffiecmedian_family_income. To fix this issue we apply log to the data features skewed.

Skewness is the measure of symmetry of a distrubution. For a normal distubution skewness=0 and thus it is symmetrical. When data is skewed towards the right, then it is a postive skew otherwise is viceversa.

Skewness between 0 to $\pm$ 5= acceptable
Skewness between $\pm$ 0.5 to $\pm$ 1= a problem
skewness between $\pm$ 1 or more= utmost

data.skew(axis=0)

row_id                            1.286588e-17
loan_type                         1.864712e+00
property_type                     5.196600e+00
loan_purpose                     -1.333652e-01
occupancy                         2.871840e+00
loan_amount                       7.655279e+01
preapproval                      -2.242003e+00
msa_md                            1.353241e-02
state_code                       -5.974008e-02
county_code                       2.309361e-01
applicant_ethnicity               5.802958e-01
applicant_race                   -1.583676e+00
applicant_sex                     1.370674e+00
applicant_income                  2.317498e+01
population                        2.947782e+00
minority_population_pct           1.068839e+00
ffiecmedian_family_income         8.063549e-01
tract_to_msa_md_income_pct       -2.035543e+00
number_of_owner-occupied_units    1.942059e+00
number_of_1_to_4_family_units     2.080321e+00
lender                           -2.196283e-01
co_applicant                      4.080284e-01
accepted                         -9.120028e-04
dtype: float64

import scipy.stats as ss
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x,y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
    rcorr = r-((r-1)**2)/(n-1)
    kcorr = k-((k-1)**2)/(n-1)
    return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1)))

# function to print categorical variables after running it through the cramers_v dunction
def print_crammer_values(data, cat_features, cats_x= 'accepted'):
    for cat_ in cat_features:
        print(cat_+ ": "+ str(cramers_v(data[cat_],data[cats_x]).round(2)))
        
# calculating for correlation between categorical variable and target variable        
print_crammer_values(data, cat_vars)

loan_type: 0.02
property_type: 0.11
loan_purpose: 0.17
occupancy: 0.03
preapproval: 0.15
applicant_sex: 0.09
co_applicant: 0.1
applicant_sex: 0.09
applicant_race: 0.15
applicant_ethnicity: 0.11
msa_md: 0.17
state_code: 0.21
county_code: 0.2

# measuring kurtosis 
data.kurtosis(axis=0)

row_id                              -1.200000
loan_type                            2.707997
property_type                       29.022540
loan_purpose                        -1.874865
occupancy                            7.561584
loan_amount                       9385.071465
preapproval                          3.913370
msa_md                              -1.491417
state_code                          -1.361784
county_code                         -1.234460
applicant_ethnicity                  2.691639
applicant_race                       2.661800
applicant_sex                        1.370864
applicant_income                  1062.740924
population                          18.060147
minority_population_pct              0.179010
ffiecmedian_family_income            1.302638
tract_to_msa_md_income_pct           3.749475
number_of_owner-occupied_units       9.540237
number_of_1_to_4_family_units       12.029859
lender                              -1.105622
co_applicant                        -1.833520
accepted                            -2.000007
dtype: float64

data.columns

Index(['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy',
       'loan_amount', 'preapproval', 'msa_md', 'state_code', 'county_code',
       'applicant_ethnicity', 'applicant_race', 'applicant_sex',
       'applicant_income', 'population', 'minority_population_pct',
       'ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
       'number_of_owner-occupied_units', 'number_of_1_to_4_family_units',
       'lender', 'co_applicant', 'accepted'],
      dtype='object')

sns.lmplot(x='applicant_income',y='loan_amount', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8e1cc8f110>

sns.lmplot(x='population',y='loan_amount', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d78decd10>

sns.lmplot(x='lender',y='loan_amount', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d9cebc2d0>

sns.lmplot(x='minority_population_pct',y='loan_amount', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8e1cc28dd0>

sns.lmplot(x='ffiecmedian_family_income',y='loan_amount', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8e00f7ca10>

sns.lmplot(x='tract_to_msa_md_income_pct',y='loan_amount', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8e1cf2a210>

sns.lmplot(x='number_of_owner-occupied_units',y='loan_amount', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8ddddb7510>

sns.lmplot(x='number_of_1_to_4_family_units',y='loan_amount', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8e2444bb90>

sns.lmplot(x='applicant_income',y='population', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8e1cf30f10>

sns.lmplot(x='minority_population_pct',y='population', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d8ca7a3d0>

sns.lmplot(x='ffiecmedian_family_income',y='population', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8e1cc90ed0>

sns.lmplot(x='tract_to_msa_md_income_pct',y='population', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d8c5bce90>

sns.lmplot(x='number_of_owner-occupied_units',y='population', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d8ca4cf10>

sns.lmplot(x='number_of_1_to_4_family_units',y='population', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d8c675290>

sns.lmplot(x='minority_population_pct',y='applicant_income', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d8c6da490>

sns.lmplot(x='ffiecmedian_family_income',y='applicant_income', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d8c791410>

sns.lmplot(x='tract_to_msa_md_income_pct',y='applicant_income', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d9ce91990>

sns.lmplot(x='number_of_owner-occupied_units',y='applicant_income', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d9ce98750>

sns.lmplot(x='number_of_1_to_4_family_units',y='applicant_income', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8e0a287c90>

sns.lmplot(x='ffiecmedian_family_income',y='minority_population_pct', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8e0a28a710>

sns.lmplot(x='tract_to_msa_md_income_pct',y='minority_population_pct', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d8c4a8310>

sns.lmplot(x='number_of_1_to_4_family_units',y='minority_population_pct', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d790034d0>

sns.lmplot(x='tract_to_msa_md_income_pct',y='ffiecmedian_family_income', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d79047e90>

sns.lmplot(x='number_of_owner-occupied_units',y='ffiecmedian_family_income', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8e0a295d90>

sns.lmplot(x='number_of_1_to_4_family_units',y='ffiecmedian_family_income', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d8c85d310>

sns.lmplot(x='number_of_owner-occupied_units',y='tract_to_msa_md_income_pct', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8dcd9f5510>

sns.lmplot(x='number_of_1_to_4_family_units',y='tract_to_msa_md_income_pct', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8dbd682710>

sns.lmplot(x='number_of_1_to_4_family_units',y='number_of_owner-occupied_units', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d68a169d0>

sns.lmplot(x='number_of_owner-occupied_units',y='minority_population_pct', data= data, fit_reg=False, hue='accepted')

<seaborn.axisgrid.FacetGrid at 0x7f8d893a72d0>

Feature Engineering

From EDA we identified some of the features are higher skewed. This can affect our classifcation model.

Here we extract some features from the dataset and add to the datasets

#caculating the loan amount per applicant ratio LAPDR 
data['LDPR']= (data['applicant_income'])/ (data['loan_amount'])
#caculating the lenders x loantype 
data['LLT']= (data['lender'])* (data['loan_type'])
# caculating the property x loan purpose
data['PTLP']= (data['property_type'])* (data['loan_purpose'])

#caculating the loan amount per applicant ratio LAPDR
final['LDPR']= (final['applicant_income'])/ (final['loan_amount'])
#caculating the lenders x loantype
final['LLT']= (final['lender'])* (final['loan_type'])
# caculating the property x loan purpose
final['PTLP']= (final['property_type'])* (final['loan_purpose'])

#ie.msd_md acceptance Rate Mean on test data

MSDARM= pd.DataFrame(data.groupby(["msa_md"])["accepted"].mean()) 
MSDARM.shape

(409, 1)

#train dataset
LARM= pd.DataFrame(data.groupby(['lender'])['accepted'].mean())
data= pd.merge(data,LARM,how='left', on='lender')

final= pd.merge(final,LARM,how='left', on='lender')

# Renaming certain columns for better readability
data.columns = ['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy','loan_amount', 
                'preapproval', 'msa_md', 'state_code', 'county_code','applicant_ethnicity',
                'applicant_race', 'applicant_sex','applicant_income', 'population', 
                'minority_population_pct','ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
                'number_of_owner-occupied_units', 'number_of_1_to_4_family_units','lender',
                'co_applicant', 'accepted', 'LDPR','LLT','PTLP','LARM']

# Renaming certain columns for better readability
final.columns = ['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy','loan_amount', 
                'preapproval', 'msa_md', 'state_code', 'county_code','applicant_ethnicity',
                'applicant_race', 'applicant_sex','applicant_income', 'population', 
                'minority_population_pct','ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
                'number_of_owner-occupied_units', 'number_of_1_to_4_family_units','lender',
                'co_applicant', 'LDPR','LLT','PTLP','LARM']

data= pd.merge(data,MSDARM,how='left', on='msa_md')

final= pd.merge(final,MSDARM,how='left', on='msa_md')

# Renaming certain columns for better readability
data.columns = ['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy','loan_amount', 
                'preapproval', 'msa_md', 'state_code', 'county_code','applicant_ethnicity',
                'applicant_race', 'applicant_sex','applicant_income', 'population', 
                'minority_population_pct','ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
                'number_of_owner-occupied_units', 'number_of_1_to_4_family_units','lender',
                'co_applicant', 'accepted', 'LDPR','LLT','PTLP','LARM','MSDARM']

# Renaming certain columns for better readability
final.columns = ['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy','loan_amount', 
                'preapproval', 'msa_md', 'state_code', 'county_code','applicant_ethnicity',
                'applicant_race', 'applicant_sex','applicant_income', 'population', 
                'minority_population_pct','ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
                'number_of_owner-occupied_units', 'number_of_1_to_4_family_units','lender',
                'co_applicant', 'LDPR','LLT','PTLP','LARM','MSDARM']

cat_vars=['loan_type','property_type','loan_purpose','occupancy','preapproval','applicant_sex','co_applicant',
         'applicant_sex','applicant_race','applicant_ethnicity','msa_md','state_code','county_code']
num_vars=['loan_amount','population','applicant_income','minority_population_pct','ffiecmedian_family_income',
          'tract_to_msa_md_income_pct','number_of_owner-occupied_units','number_of_1_to_4_family_units',
         'LDPR','LLT','PTLP','LARM','MSDARM']

Class Balance

Mortgage acceptance rate: 50%

data.accepted.value_counts(1)*100

1    50.0228
0    49.9772
Name: accepted, dtype: float64

is_loan_accepted= data.accepted== 1
loan_accepted= data[is_loan_accepted]
loan_is_not_accepted= data.accepted== 0
loan_not_accepted= data[loan_is_not_accepted]

print(loan_accepted.shape)
print(loan_not_accepted.shape)

(250114, 28)
(249886, 28)

sns.countplot('accepted', data = data)
plt.title('Distribution of Loan Applicant')
plt.savefig('image4.png')

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 28 columns):
row_id                            500000 non-null int64
loan_type                         500000 non-null int64
property_type                     500000 non-null int64
loan_purpose                      500000 non-null int64
occupancy                         500000 non-null int64
loan_amount                       500000 non-null float64
preapproval                       500000 non-null int64
msa_md                            500000 non-null int64
state_code                        500000 non-null int64
county_code                       500000 non-null int64
applicant_ethnicity               500000 non-null int64
applicant_race                    500000 non-null int64
applicant_sex                     500000 non-null int64
applicant_income                  500000 non-null float64
population                        500000 non-null float64
minority_population_pct           500000 non-null float64
ffiecmedian_family_income         500000 non-null float64
tract_to_msa_md_income_pct        500000 non-null float64
number_of_owner-occupied_units    500000 non-null float64
number_of_1_to_4_family_units     500000 non-null float64
lender                            500000 non-null int64
co_applicant                      500000 non-null bool
accepted                          500000 non-null int64
LDPR                              500000 non-null float64
LLT                               500000 non-null int64
PTLP                              500000 non-null int64
LARM                              500000 non-null float64
MSDARM                            500000 non-null float64
dtypes: bool(1), float64(11), int64(16)
memory usage: 127.3 MB

final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 27 columns):
row_id                            500000 non-null int64
loan_type                         500000 non-null int64
property_type                     500000 non-null int64
loan_purpose                      500000 non-null int64
occupancy                         500000 non-null int64
loan_amount                       500000 non-null float64
preapproval                       500000 non-null int64
msa_md                            500000 non-null int64
state_code                        500000 non-null int64
county_code                       500000 non-null int64
applicant_ethnicity               500000 non-null int64
applicant_race                    500000 non-null int64
applicant_sex                     500000 non-null int64
applicant_income                  500000 non-null float64
population                        500000 non-null float64
minority_population_pct           500000 non-null float64
ffiecmedian_family_income         500000 non-null float64
tract_to_msa_md_income_pct        500000 non-null float64
number_of_owner-occupied_units    500000 non-null float64
number_of_1_to_4_family_units     500000 non-null float64
lender                            500000 non-null int64
co_applicant                      500000 non-null bool
LDPR                              500000 non-null float64
LLT                               500000 non-null int64
PTLP                              500000 non-null int64
LARM                              499278 non-null float64
MSDARM                            500000 non-null float64
dtypes: bool(1), float64(11), int64(15)
memory usage: 103.5 MB

Split Train/Test Set

Let's split our data into a train and test set. We'll fit our model with the train set and leave our test set for our last evaluation.

# Create the X and y set
X = data.drop('accepted', axis=1)
y = data.accepted

categorical_features_indices= np.where(X.dtypes != np.float)[0]
categorical_features_indices

array([ 0,  1,  2,  3,  4,  6,  7,  8,  9, 10, 11, 12, 20, 21, 23, 24])

# Define train and test
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size = 0.3, random_state = 42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(350000, 27) (350000,)
(150000, 27) (150000,)

y_train.head()

226114    1
435187    1
294452    0
427864    0
188822    1
Name: accepted, dtype: int64

X_train.head()

x_predict= final
x_predict.head()

Model Measurement

Metrics used for measurement include following:

Accuracy
AUC
Macro Precision
Macro Recall
F1 Score

import sklearn.model_selection as ms
import sklearn.metrics as sklm

def score_model(probs, threshold):
    return np.array([1 if x> threshold else 0 for x in probs[:, 1]])

def print_metrics(labels, probs, threshold):
    scores = score_model(probs, threshold)
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion Matrix')
    print('                 Score Positive    Score Negative')
    print('Actual Positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual Negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    print('Macro Precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro Recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num Case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])

def plot_auc(labels, probs, threshold):
    ## compute the false postive rate, true positive rate  and threshold  along with the AUC 
    pl.style.use('ggplot')
    scores = score_model(probs, threshold)
    accuracy= sklm.accuracy_score(labels, scores)
    fpr, tpr, threshold = sklm.roc_curve(labels, probs[:,1]) 
    auc = sklm.auc(fpr, tpr)

    ## plot the result 
    plt.title('Reciever Operating Charateristic')
    plt.plot(fpr, tpr, color = 'orange', label = 'AUC = %0.2f' %auc)
    plt.legend(loc = 'lower right')
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.ylabel('True Positive Rate') 
    plt.xlabel('False Positive Rate')
    plt.title("Recieve Operating Characteristic (Accuracy= %0.2f)" %accuracy)
    plt.show()

Train Three Models

Catboost Classifier
Xgboost Classifier
Decision Tree

import catboost as cb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn import tree
from sklearn.model_selection import cross_val_score
import matplotlib.pylab as pl

Choose Which Sampling Technique to Use For Model

We all models listed above using a 5-fold cross-validation.

labels= data.accepted
features= X
fig= plt.figure(figsize=(12,10))
models= [CatBoostClassifier(depth=10,logging_level='Silent', random_seed=42),
         XGBClassifier(n_estimators=10, max_depth=4),
         tree.DecisionTreeClassifier(random_state= 42)]
CV= 5
cv_df= pd.DataFrame(index= range(CV * len(models)))
entries= []
for model in models:
  model_name= model.__class__.__name__
  accuracies= cross_val_score(model, features, labels, scoring= 'accuracy', cv= CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df= pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

sns.boxplot(x= 'model_name', y= 'accuracy', data= cv_df)
sns.stripplot(x= 'model_name', y= 'accuracy', data= cv_df, 
              size= 8, jitter= True, edgecolor= "gray", linewidth= 2)
plt.show()

print(cv_df.groupby('model_name').accuracy.mean())

model_name
CatBoostClassifier        0.726908
DecisionTreeClassifier    0.631850
XGBClassifier             0.708636
Name: accuracy, dtype: float64

From the results above we can see catboost classifier out performs the other models. We move further to tune the catboost classifier model.

models= [CatBoostClassifier(depth=10,logging_level='Silent', random_seed=42),
         XGBClassifier(n_estimators=10, max_depth=4),
         tree.DecisionTreeClassifier(random_state= 42)]

result_df= pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])

for model in models:
    model_name= model.__class__.__name__
    model_fit=model.fit(X_train,y_train)
    y_prob= model_fit.predict_proba(X_test)[::,1]
    
    fpr, tpr, threshold = sklm.roc_curve(y_test, y_prob) 
    auc = sklm.auc(fpr, tpr)
    
    result_df= result_df.append({'model_name':model_name,'fpr':fpr, 'tpr':tpr,'auc':auc}, ignore_index=True)

result_df.set_index('model_name', inplace=True)


fig= plt.figure(figsize=(+8,6))
pl.style.use('ggplot')

for i in result_df.index:
    plt.plot(result_df.loc[i]['fpr'],
            result_df.loc[i]['tpr'],
            label="{}, AUC={:.2f}".format(i, result_df.loc[i]['auc']))

plt.plot([0,1],[0,1], color='red',linestyle='--')
plt.xticks(np.arange(0.0,1.1,step=0.1))
plt.xlabel("False Postive Rate", fontsize=15)
plt.yticks(np.arange(0.0,1.1,step=0.1))
plt.ylabel("True Positive Rate",fontsize=15)
plt.title("Reciever Operating Charateristic")
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

From the results above we can see catboost classifier out performs the other models in terms of AUC We move further to tune the catboost classifier model. With this and the results above we then to tune and improve the catboost classifier model.

Normalizing the features that are highly skewed to make it normally distributed and running the model to see its accuracy and auc

data_norm= data.copy()
final_norm= final.copy()

data_norm.head()

final_norm.head()

num_vars=['loan_amount','population','applicant_income','minority_population_pct','ffiecmedian_family_income',
          'tract_to_msa_md_income_pct','number_of_owner-occupied_units','number_of_1_to_4_family_units','LDPR',
         'LLT','PTLP','LARM','MSDARM']

#plotting histogram numerical variables
data_norm[num_vars].hist(bins=25, figsize=(30, 20), layout=(7, 3));

data_norm.skew(axis=0)

row_id                            1.286588e-17
loan_type                         1.864712e+00
property_type                     5.196600e+00
loan_purpose                     -1.333652e-01
occupancy                         2.871840e+00
loan_amount                       7.655279e+01
preapproval                      -2.242003e+00
msa_md                            1.353241e-02
state_code                       -5.974008e-02
county_code                       2.309361e-01
applicant_ethnicity               5.802958e-01
applicant_race                   -1.583676e+00
applicant_sex                     1.370674e+00
applicant_income                  2.317498e+01
population                        2.947782e+00
minority_population_pct           1.068839e+00
ffiecmedian_family_income         8.063549e-01
tract_to_msa_md_income_pct       -2.035543e+00
number_of_owner-occupied_units    1.942059e+00
number_of_1_to_4_family_units     2.080321e+00
lender                           -2.196283e-01
co_applicant                      4.080284e-01
accepted                         -9.120028e-04
LDPR                              8.941249e+01
LLT                               1.676016e+00
PTLP                              6.904792e-01
LARM                             -9.479196e-02
MSDARM                           -5.590501e-01
dtype: float64

# We apply log(x+1)
data_norm['log_loan_amount']= np.log(data_norm['loan_amount']+1)
data_norm['log_LDPR']= np.log(data_norm['LDPR']+1)
data_norm['log_PTLP']= np.log(data_norm['PTLP']+1)
data_norm['log_applicant_income']= np.log(data_norm['applicant_income']+1)
data_norm['log_population']= np.log(data_norm['population']+1)
data_norm['log_minority_population_pct']= np.log(data_norm['minority_population_pct']+1)
data_norm['log_ffiecmedian_family_income']= np.log(data_norm['ffiecmedian_family_income']+1)
data_norm['log_number_of_owner_occupied_units']= np.log(data_norm['number_of_owner-occupied_units']+1)
data_norm['log_number_of_1_to_4_family_units']= np.log(data_norm['number_of_1_to_4_family_units'])
data_norm['pwr_tract_to_msa_md_income_pct'] = np.power(data_norm['tract_to_msa_md_income_pct'],10)
data_norm['pwr_LLT']= np.power(data_norm['LLT'],10)

num_vars_log= ['log_loan_amount','log_LDPR','log_PTLP','log_applicant_income','log_population','log_minority_population_pct',
              'log_ffiecmedian_family_income','log_number_of_owner_occupied_units','log_number_of_1_to_4_family_units',
              'pwr_tract_to_msa_md_income_pct','pwr_LLT',]
data_norm[num_vars_log].hist(bins=25, figsize=(30, 20), layout=(6, 2));

data_norm.skew(axis=0)

row_id                                1.286588e-17
loan_type                             1.864712e+00
property_type                         5.196600e+00
loan_purpose                         -1.333652e-01
occupancy                             2.871840e+00
loan_amount                           7.655279e+01
preapproval                          -2.242003e+00
msa_md                                1.353241e-02
state_code                           -5.974008e-02
county_code                           2.309361e-01
applicant_ethnicity                   5.802958e-01
applicant_race                       -1.583676e+00
applicant_sex                         1.370674e+00
applicant_income                      2.317498e+01
population                            2.947782e+00
minority_population_pct               1.068839e+00
ffiecmedian_family_income             8.063549e-01
tract_to_msa_md_income_pct           -2.035543e+00
number_of_owner-occupied_units        1.942059e+00
number_of_1_to_4_family_units         2.080321e+00
lender                               -2.196283e-01
co_applicant                          4.080284e-01
accepted                             -9.120028e-04
LDPR                                  8.941249e+01
LLT                                   1.676016e+00
PTLP                                  6.904792e-01
LARM                                 -9.479196e-02
MSDARM                               -5.590501e-01
log_loan_amount                      -1.045326e+00
log_LDPR                              3.109443e+00
log_PTLP                             -8.455082e-02
log_applicant_income                  1.644426e-01
log_population                       -1.581668e-01
log_minority_population_pct          -2.711568e-01
log_ffiecmedian_family_income        -2.891652e-01
log_number_of_owner_occupied_units   -1.100173e+00
log_number_of_1_to_4_family_units    -1.615771e+00
pwr_tract_to_msa_md_income_pct       -8.940395e-01
pwr_LLT                               8.320281e-02
dtype: float64

# Create the X and y set
X_norm = data_norm.drop('accepted', axis=1)
y_norm = data_norm.accepted

# Define train and test
X_train_norm, X_test_norm, y_train_norm, y_test_norm= train_test_split(X_norm,y_norm, test_size = 0.3, random_state = 42)
print(X_train_norm.shape,y_train_norm.shape)
print(X_test_norm.shape,y_test_norm.shape)

(350000, 38) (350000,)
(150000, 38) (150000,)

labels= data_norm.accepted
features= X_norm
fig= plt.figure(figsize=(12,10))
models= [CatBoostClassifier(depth=10,logging_level='Silent', random_seed=42),
         XGBClassifier(n_estimators=10, max_depth=4),
         tree.DecisionTreeClassifier(random_state= 42)]
CV= 5
cv_df= pd.DataFrame(index= range(CV * len(models)))
entries= []
for model in models:
  model_name= model.__class__.__name__
  accuracies= cross_val_score(model, features, labels, scoring= 'accuracy', cv= CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df= pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

sns.boxplot(x= 'model_name', y= 'accuracy', data= cv_df)
sns.stripplot(x= 'model_name', y= 'accuracy', data= cv_df, 
              size= 8, jitter= True, edgecolor= "gray", linewidth= 2)
plt.show()

print(cv_df.groupby('model_name').accuracy.mean())

model_name
CatBoostClassifier        0.726900
DecisionTreeClassifier    0.632066
XGBClassifier             0.708804
Name: accuracy, dtype: float64

models= [CatBoostClassifier(depth=10,logging_level='Silent', random_seed=42),
         XGBClassifier(n_estimators=10, max_depth=4),
         tree.DecisionTreeClassifier(random_state= 42)]

result_df= pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])

for model in models:
    model_name= model.__class__.__name__
    model_fit=model.fit(X_train_norm,y_train_norm)
    y_prob_norm= model_fit.predict_proba(X_test_norm)[::,1]
    
    fpr, tpr, threshold = sklm.roc_curve(y_test_norm, y_prob_norm) 
    auc = sklm.auc(fpr, tpr)
    
    result_df= result_df.append({'model_name':model_name,'fpr':fpr, 'tpr':tpr,'auc':auc}, ignore_index=True)

result_df.set_index('model_name', inplace=True)


fig= plt.figure(figsize=(+8,6))
pl.style.use('ggplot')

for i in result_df.index:
    plt.plot(result_df.loc[i]['fpr'],
            result_df.loc[i]['tpr'],
            label="{}, AUC={:.2f}".format(i, result_df.loc[i]['auc']))

plt.plot([0,1],[0,1], color='red',linestyle='--')
plt.xticks(np.arange(0.0,1.1,step=0.1))
plt.xlabel("False Postive Rate", fontsize=15)
plt.yticks(np.arange(0.0,1.1,step=0.1))
plt.ylabel("True Positive Rate",fontsize=15)
plt.title("Reciever Operating Charateristic")
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

From Comparing results from Non normalized features and normalized features. We see that the accuracy of non normalized features is slightly higher than normalized features. With results we preceed to using non-normalized features for tunning and feature training.

Tunning of Catboost model

import hyperopt
import sys
from frozendict import frozendict
import shap
shap.initjs()

class UAClassifierObjective(object):
    def __init__(self, dataset, const_params, fold_count):
        self._dataset = dataset
        self._const_params = const_params.copy()
        self._fold_count = fold_count
        self._evaluated_count = 0
        
    def _to_catboost_params(self, hyper_params):
        return {
            'learning_rate': hyper_params['learning_rate'],
            'depth': hyper_params['depth'],
            'l2_leaf_reg': hyper_params['l2_leaf_reg']}
    
    # hyperopt optimizes an objective using `__call__` method (e.g. by doing 
    # `foo(hyper_params)`), so we provide one
    def __call__(self, hyper_params):
        # join hyper-parameters provided by hyperopt with hyper-parameters 
        # provided by the user
        params = self._to_catboost_params(hyper_params)
        params.update(self._const_params)
        
        print('evaluating params={}'.format(params), file=sys.stdout)
        sys.stdout.flush()
        
        # we use cross-validation for objective evaluation, to avoid overfitting
        scores = cb.cv(
            pool=self._dataset,
            params=params,
            fold_count=self._fold_count,
            partition_random_seed=42,
            verbose=False)
        
        # scores returns a dictionary with mean and std (per-fold) of metric 
        # value for each cv iteration, we choose minimal value of objective 
        # mean (though it will be better to choose minimal value among all folds)
        # because noise is additive
        min_mean_auc = np.min(scores['test-AUC-mean'])
        print('evaluated score={}'.format(min_mean_auc), file=sys.stdout)
        
        self._evaluated_count += 1
        print('evaluated {} times'.format(self._evaluated_count), file=sys.stdout)
        
        # negate because hyperopt minimizes the objective
        return {'loss': -min_mean_auc, 'status': hyperopt.STATUS_OK}

def find_best_hyper_params(dataset, const_params, max_evals=100):    
    # we are going to optimize these three parameters, though there are a lot more of them (see CatBoost docs)
    parameter_space = {
        'learning_rate': hyperopt.hp.uniform('learning_rate', 0.2, 1.0),
        'depth': hyperopt.hp.randint('depth', 7),
        'l2_leaf_reg': hyperopt.hp.uniform('l2_leaf_reg', 1, 10)}
    objective = UAClassifierObjective(dataset=dataset, const_params=const_params, fold_count=6)
    trials = hyperopt.Trials()
    best = hyperopt.fmin(
        fn=objective,
        space=parameter_space,
        algo=hyperopt.rand.suggest,
        max_evals=max_evals,
        rstate=np.random.RandomState(seed=42))
    return best

def train_best_model(X, y, const_params, max_evals=100, use_default=False):
    # convert pandas.DataFrame to catboost.Pool to avoid converting it on each 
    # iteration of hyper-parameters optimization
    dataset = cb.Pool(X, y, cat_features=categorical_features_indices)
    
    if use_default:
        # pretrained optimal parameters
        best = {
            'learning_rate': 0.4234185321620083, 
            'depth': 5, 
            'l2_leaf_reg': 9.464266235679002}
    else:
        best = find_best_hyper_params(dataset, const_params, max_evals=max_evals)
            
    # merge subset of hyper-parameters provided by hyperopt with hyper-parameters 
    # provided by the user
    hyper_params = best.copy()
    hyper_params.update(const_params)
    
    # drop `use_best_model` because we are going to use entire dataset for 
    # training of the final model
    hyper_params.pop('use_best_model', None)
    
    model = cb.CatBoostClassifier(**hyper_params)
    model.fit(dataset, verbose=False)
    
    return model, hyper_params

import time
start=time.time()

have_gpu = False
# skip hyper-parameter optimization and just use provided optimal parameters
use_optimal_pretrained_params = False
# number of iterations of hyper-parameter search
hyperopt_iterations = 50

const_params = frozendict({
    'task_type': 'GPU' if have_gpu else 'CPU',
    'loss_function': 'Logloss',
    'eval_metric': 'AUC', 
    'custom_metric': ['AUC'],
    'iterations': 100,
    'random_seed': 42})

model, params = train_best_model(
    X_train, y_train, 
    const_params, 
    max_evals=hyperopt_iterations, 
    use_default=use_optimal_pretrained_params)
print('best params are {}'.format(params), file=sys.stdout)
end = time.time()
print(end-start)

evaluating params={'learning_rate': 0.5637608417770977, 'depth': 4, 'l2_leaf_reg': 8.493688290834637, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.749250579565882
evaluated 1 times
evaluating params={'learning_rate': 0.528083167082651, 'depth': 3, 'l2_leaf_reg': 7.549531688595925, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7310254089012167
evaluated 2 times
evaluating params={'learning_rate': 0.8699106844426274, 'depth': 1, 'l2_leaf_reg': 8.949837496427758, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 3 times
evaluating params={'learning_rate': 0.5558660098409214, 'depth': 2, 'l2_leaf_reg': 9.268502695024392, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 4 times
evaluating params={'learning_rate': 0.8499167906858907, 'depth': 6, 'l2_leaf_reg': 2.546844052569046, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7734748121140296
evaluated 5 times
evaluating params={'learning_rate': 0.8396427532857385, 'depth': 1, 'l2_leaf_reg': 4.942262677968309, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 6 times
evaluating params={'learning_rate': 0.7872224143884547, 'depth': 2, 'l2_leaf_reg': 9.454327638424944, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 7 times
evaluating params={'learning_rate': 0.693663486801853, 'depth': 0, 'l2_leaf_reg': 7.978279409450941, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 8 times
evaluating params={'learning_rate': 0.63472245415225, 'depth': 5, 'l2_leaf_reg': 9.280083037935846, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7623741421835207
evaluated 9 times
evaluating params={'learning_rate': 0.9643823890479426, 'depth': 6, 'l2_leaf_reg': 7.3055930015922925, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.773471890147901
evaluated 10 times
evaluating params={'learning_rate': 0.2029042458037946, 'depth': 4, 'l2_leaf_reg': 8.360470176973763, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.749250579565882
evaluated 11 times
evaluating params={'learning_rate': 0.9033087097584362, 'depth': 2, 'l2_leaf_reg': 4.226333703160277, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 12 times
evaluating params={'learning_rate': 0.905182734106353, 'depth': 2, 'l2_leaf_reg': 3.7653234749986546, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 13 times
evaluating params={'learning_rate': 0.6216878461972857, 'depth': 6, 'l2_leaf_reg': 1.1186473207406844, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7734841160115428
evaluated 14 times
evaluating params={'learning_rate': 0.7096272251393012, 'depth': 0, 'l2_leaf_reg': 9.288835555293193, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 15 times
evaluating params={'learning_rate': 0.6666954285134632, 'depth': 0, 'l2_leaf_reg': 6.760090014166223, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 16 times
evaluating params={'learning_rate': 0.6126656895443174, 'depth': 2, 'l2_leaf_reg': 2.122305933450838, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 17 times
evaluating params={'learning_rate': 0.7536883616570151, 'depth': 0, 'l2_leaf_reg': 7.129352121946425, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 18 times
evaluating params={'learning_rate': 0.4305107282251851, 'depth': 0, 'l2_leaf_reg': 7.252206709468439, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 19 times
evaluating params={'learning_rate': 0.5182286954655402, 'depth': 5, 'l2_leaf_reg': 8.87600708641634, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7623741421835207
evaluated 20 times
evaluating params={'learning_rate': 0.2782680588629646, 'depth': 3, 'l2_leaf_reg': 7.606937685716839, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7310254089012167
evaluated 21 times
evaluating params={'learning_rate': 0.5596670546087271, 'depth': 6, 'l2_leaf_reg': 2.0726141256143755, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7734748121140296
evaluated 22 times
evaluating params={'learning_rate': 0.46449369289712517, 'depth': 3, 'l2_leaf_reg': 6.339387901117264, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7310254089012167
evaluated 23 times
evaluating params={'learning_rate': 0.8314498467716105, 'depth': 6, 'l2_leaf_reg': 2.3992161107439527, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7734748121140296
evaluated 24 times
evaluating params={'learning_rate': 0.94205821260823, 'depth': 2, 'l2_leaf_reg': 6.774791231932208, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 25 times
evaluating params={'learning_rate': 0.4643443332702141, 'depth': 3, 'l2_leaf_reg': 3.7415118170244104, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7310275811755531
evaluated 26 times
evaluating params={'learning_rate': 0.8642659638603969, 'depth': 0, 'l2_leaf_reg': 7.341252418608859, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 27 times
evaluating params={'learning_rate': 0.7294189282659973, 'depth': 5, 'l2_leaf_reg': 1.0626887985889903, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7623793574052168
evaluated 28 times
evaluating params={'learning_rate': 0.48343522785711396, 'depth': 4, 'l2_leaf_reg': 6.710418002699299, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.749250579565882
evaluated 29 times
evaluating params={'learning_rate': 0.7210562330475281, 'depth': 5, 'l2_leaf_reg': 6.850395695432215, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7623741421835207
evaluated 30 times
evaluating params={'learning_rate': 0.5401179224561183, 'depth': 4, 'l2_leaf_reg': 2.184936328120133, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7492519266208718
evaluated 31 times
evaluating params={'learning_rate': 0.6308365875061976, 'depth': 0, 'l2_leaf_reg': 3.850918241301665, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 32 times
evaluating params={'learning_rate': 0.30502207676284404, 'depth': 5, 'l2_leaf_reg': 6.181142547861167, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7623741421835207
evaluated 33 times
evaluating params={'learning_rate': 0.6292005093922117, 'depth': 4, 'l2_leaf_reg': 6.62131414290977, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.749250579565882
evaluated 34 times
evaluating params={'learning_rate': 0.4610454641190919, 'depth': 2, 'l2_leaf_reg': 6.43044823439896, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 35 times
evaluating params={'learning_rate': 0.6292458327974482, 'depth': 4, 'l2_leaf_reg': 2.6565212671416507, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7492519266208718
evaluated 36 times
evaluating params={'learning_rate': 0.5464785404218941, 'depth': 1, 'l2_leaf_reg': 5.469164797061436, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 37 times
evaluating params={'learning_rate': 0.8815589660398113, 'depth': 3, 'l2_leaf_reg': 7.762603832174364, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7310254089012167
evaluated 38 times
evaluating params={'learning_rate': 0.7673527934998763, 'depth': 5, 'l2_leaf_reg': 1.3354940643295945, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7623793574052168
evaluated 39 times
evaluating params={'learning_rate': 0.9509122562293137, 'depth': 1, 'l2_leaf_reg': 8.157368633221372, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 40 times
evaluating params={'learning_rate': 0.37343651242935033, 'depth': 0, 'l2_leaf_reg': 9.120965626738386, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 41 times
evaluating params={'learning_rate': 0.5825104357671131, 'depth': 4, 'l2_leaf_reg': 7.375442989938085, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.749250579565882
evaluated 42 times
evaluating params={'learning_rate': 0.6370940109439119, 'depth': 1, 'l2_leaf_reg': 3.310896149885864, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 43 times
evaluating params={'learning_rate': 0.9703050878380732, 'depth': 3, 'l2_leaf_reg': 4.42644166211115, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7310254089012167
evaluated 44 times
evaluating params={'learning_rate': 0.6909652662226076, 'depth': 6, 'l2_leaf_reg': 8.067461444538466, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.773471890147901
evaluated 45 times
evaluating params={'learning_rate': 0.44765747767757863, 'depth': 4, 'l2_leaf_reg': 9.72685862525973, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.749250579565882
evaluated 46 times
evaluating params={'learning_rate': 0.4380272549876373, 'depth': 1, 'l2_leaf_reg': 3.2977280073078528, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 47 times
evaluating params={'learning_rate': 0.2033304608226356, 'depth': 1, 'l2_leaf_reg': 9.813523901250528, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 48 times
evaluating params={'learning_rate': 0.9619808422033007, 'depth': 1, 'l2_leaf_reg': 8.88997567856084, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 49 times
evaluating params={'learning_rate': 0.990250545808629, 'depth': 0, 'l2_leaf_reg': 4.183924523770489, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 50 times
100%|██████████| 50/50 [2:56:50<00:00, 171.83s/it, best loss: -0.7734841160115428]

Warning: Custom metrics will not be evaluated because there are no test datasets

best params are {'depth': 6, 'l2_leaf_reg': 1.1186473207406844, 'learning_rate': 0.6216878461972857, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
10644.765050649643

probabilities = model.predict_proba(data=X_test) 
print_metrics(y_test, probabilities, 0.51)

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive     51767             22864
Actual Negative     17783             57586

Accuracy        0.73
AUC             0.81
Macro Precision 0.73
Macro Recall    0.73
 
           Positive      Negative
Num Case    74631         75369
Precision    0.74          0.72
Recall       0.69          0.76
F1           0.72          0.74

plot_auc(y_test, probabilities, 0.50)

from catboost import CatBoostClassifier
clf_cb= CatBoostClassifier(iterations=2500, depth=10,logging_level='Silent',
                           learning_rate=0.01,eval_metric='Accuracy',use_best_model=True, random_seed=42)
clf_cb.fit(X_test, y_test, cat_features= categorical_features_indices, eval_set=(X_test,y_test))

<catboost.core.CatBoostClassifier at 0x7f8deae5cf10>

probabilities= clf_cb.predict_proba(data= X_test) 
print_metrics(y_test, probabilities, 0.51)

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive     54448             20183
Actual Negative     13663             61706

Accuracy        0.77
AUC             0.86
Macro Precision 0.78
Macro Recall    0.77
 
           Positive      Negative
Num Case    74631         75369
Precision    0.80          0.75
Recall       0.73          0.82
F1           0.76          0.78

plot_auc(y_test, probabilities, 0.51)

from catboost import CatBoostClassifier
clf_cb= CatBoostClassifier(iterations=2500, depth=6,logging_level='Silent',
                           learning_rate=0.3548362548720143,eval_metric='Accuracy',l2_leaf_reg=2.683829844728577,
                          use_best_model=True, random_seed=42)
clf_cb.fit(X_test, y_test, cat_features= categorical_features_indices, eval_set=(X_test,y_test))

<catboost.core.CatBoostClassifier at 0x7f8e296d41d0>

probabilities= clf_cb.predict_proba(data= X_test) 
print_metrics(y_test, probabilities, 0.51)

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive     58500             16131
Actual Negative     12622             62747

Accuracy        0.81
AUC             0.89
Macro Precision 0.81
Macro Recall    0.81
 
           Positive      Negative
Num Case    74631         75369
Precision    0.82          0.80
Recall       0.78          0.83
F1           0.80          0.81

plot_auc(y_test, probabilities, 0.50)

tunning and optimizing catboost algorithm

final_score = clf_cb.predict(data=x_predict)
final_score= final_score.astype(np.int)
submit= pd.DataFrame({'row_id':x_predict['row_id'],'accepted':final_score})
submit.to_csv('submission.csv', index=False)

Catboost Classifier Feature Importance

clf_cb.get_feature_importance(prettified=True)

shap_values = clf_cb.get_feature_importance(cb.Pool(X, y, cat_features=categorical_features_indices), type='ShapValues')

expected_value = shap_values[0,-1]
shap_values = shap_values[:,:-1]

shap.summary_plot(shap_values, X)

# across the whole dataset
cat_features= data[cat_vars]

def plot_shap(cat_cols):
    for col in cat_cols:
        shap.dependence_plot(col, shap_values, X)

# plotting data for categorical variables 
plot_shap(cat_features)

# (sum of SHAP value magnitudes over the validation dataset)
top_inds = np.argsort(-np.sum(np.abs(shap_values), 0))

# make SHAP plots of the three most important features
for i in range(20):
    shap.dependence_plot(top_inds[i], shap_values, X)

shap_values = clf_cb.get_feature_importance(cb.Pool(X, y, cat_features=categorical_features_indices), type='ShapValues')

expected_value = shap_values[0,-1]
shap_values = shap_values[:,:-1]

shap.summary_plot(shap_values, X)

Conclusion

Binary Classification: Approved / Denied

Need for application: Help customer and financial institution know if customers are eligible for mortgage approval or not.

Optimizing model: After training the model we sort out to optimize our model. Using Bayesian methods the model improved, optimal parameters were found to be depth=6, l2_leaf_reg=1.119 and learning rate= 0.622. These parameters provided an AUC-ROC of 0.81 and an accuracy of 0.73 at 100 iterations. Increasing the iteration to 2500, we achieved an AUC-ROC of 0.89 and accuracy of 0.81.

In conclusion, we can see that mortgage loan approvals can be done using data from traditional loan application without having key industry features such as credit history, debt to income ratio, etc at an accuracy of 81%. Also, we identified geographical features such as state, country and Metropolitan Statistical Area/ Metropolitan Division codes for the property tract has higher feature importance for our model. Other features that have high importance include lender, applicant income and applicant race. Lastly, a few bits of census information such as the percentage of minorities in the population for that tract and the FFIEC median family income for the MSA/MD which the tract is located had some feature importance to the model.

	row_id	loan_type	property_type	loan_purpose	occupancy	loan_amount	preapproval	msa_md	state_code	county_code	applicant_ethnicity	applicant_race	applicant_sex	applicant_income	population	minority_population_pct	ffiecmedian_family_income	tract_to_msa_md_income_pct	number_of_owner-occupied_units	number_of_1_to_4_family_units	lender	co_applicant
accepted
0	249936.842896	1.353433	1.066310	2.191667	1.102399	194.352733	2.755372	170.484961	22.279283	139.265501	2.031226	4.740073	1.488403	89.696290	5328.818777	33.201373	68142.082690	91.293288	1393.644762	1869.383779	3704.499764	0.350492
1	250062.099986	1.379107	1.029007	1.942066	1.116775	249.128605	2.774063	192.718844	25.173245	149.813813	2.041225	4.833056	1.436369	110.536831	5465.083798	29.251766	70174.743021	93.106654	1452.674053	1890.901325	3735.728684	0.449567

	loan_type	property_type	loan_purpose	occupancy	loan_amount	preapproval	msa_md	applicant_ethnicity	applicant_race	applicant_sex	applicant_income	population	minority_population_pct	ffiecmedian_family_income	tract_to_msa_md_income_pct	number_of_owner-occupied_units	number_of_1_to_4_family_units	lender	co_applicant	accepted
loan_type	1	-0.07	-0.11	-0.18	0.06	-0.14	-0.01	-0.06	-0.03	-0.05	-0.17	0.05	0.05	-0.06	-0.04	0.03	0.05	-0.03	-0.03	0.02
property_type	-0.07	1	-0.13	0.04	-0.18	0.05	-0.1	0.07	0.04	0.05	-0.16	-0.01	-0.03	-0.11	-0.06	-0.01	0.03	-0.05	-0.02	-0.1
loan_purpose	-0.11	-0.13	1	0	0.02	0.52	0.08	0.03	0.03	0.01	0.05	-0	0.03	0.05	-0.02	-0.01	-0.02	0.05	0.01	-0.13
occupancy	-0.18	0.04	0	1	-0.03	0.04	-0.01	0.09	0.06	0.05	0.14	-0.06	0.04	-0.03	-0.08	-0.1	-0.03	0	-0.01	0.02
loan_amount	0.06	-0.18	0.02	-0.03	1	-0.07	0.1	0.04	-0	-0.1	0.54	0.06	0.05	0.31	0.25	0.04	-0.06	0.03	0.16	0.17
preapproval	-0.14	0.05	0.52	0.04	-0.07	1	0.11	0.02	0.02	0.02	0.02	-0.01	0.02	0	-0.07	-0.02	-0.01	-0	0.01	-0.02
msa_md	-0.01	-0.1	0.08	-0.01	0.1	0.11	1	-0.02	-0.02	0	0.09	0.06	0.12	0.31	-0.05	0.03	-0.06	0.02	-0.01	0.09
applicant_ethnicity	-0.06	0.07	0.03	0.09	0.04	0.02	-0.02	1	0.41	0.34	0.08	-0.04	-0.16	0.07	0.06	0.03	0.02	0.01	-0.12	-0
applicant_race	-0.03	0.04	0.03	0.06	-0	0.02	-0.02	0.41	1	0.28	0.03	-0.02	-0.16	-0.04	0.04	0.02	0.02	0.01	-0.08	0.02
applicant_sex	-0.05	0.05	0.01	0.05	-0.1	0.02	0	0.34	0.28	1	-0.13	-0.02	0.06	0.02	-0.05	-0.04	-0.03	0.02	-0.29	-0.05
applicant_income	-0.17	-0.16	0.05	0.14	0.54	0.02	0.09	0.08	0.03	-0.13	1	0.03	-0.04	0.23	0.24	0.05	-0.03	0.02	0.3	0.18
population	0.05	-0.01	-0	-0.06	0.06	-0.01	0.06	-0.04	-0.02	-0.02	0.03	1	0.11	0.02	0.14	0.79	0.78	0	0.02	0.02
minority_population_pct	0.05	-0.03	0.03	0.04	0.05	0.02	0.12	-0.16	-0.16	0.06	-0.04	0.11	1	0.05	-0.3	-0.24	-0.19	0.02	-0.1	-0.07
ffiecmedian_family_income	-0.06	-0.11	0.05	-0.03	0.31	0	0.31	0.07	-0.04	0.02	0.23	0.02	0.05	1	-0.02	0	-0.15	0.02	0.02	0.07
tract_to_msa_md_income_pct	-0.04	-0.06	-0.02	-0.08	0.25	-0.07	-0.05	0.06	0.04	-0.05	0.24	0.14	-0.3	-0.02	1	0.36	0.18	0.01	0.09	0.06
number_of_owner-occupied_units	0.03	-0.01	-0.01	-0.1	0.04	-0.02	0.03	0.03	0.02	-0.04	0.05	0.79	-0.24	0	0.36	1	0.87	-0	0.05	0.04
number_of_1_to_4_family_units	0.05	0.03	-0.02	-0.03	-0.06	-0.01	-0.06	0.02	0.02	-0.03	-0.03	0.78	-0.19	-0.15	0.18	0.87	1	-0.01	0.03	0
lender	-0.03	-0.05	0.05	0	0.03	-0	0.02	0.01	0.01	0.02	0.02	0	0.02	0.02	0.01	-0	-0.01	1	0	0.01
co_applicant	-0.03	-0.02	0.01	-0.01	0.16	0.01	-0.01	-0.12	-0.08	-0.29	0.3	0.02	-0.1	0.02	0.09	0.05	0.03	0	1	0.1
accepted	0.02	-0.1	-0.13	0.02	0.17	-0.02	0.09	-0	0.02	-0.05	0.18	0.02	-0.07	0.07	0.06	0.04	0	0.01	0.1	1

	row_id	loan_type	property_type	loan_purpose	occupancy	loan_amount	preapproval	msa_md	state_code	county_code	applicant_ethnicity	applicant_race	applicant_sex	applicant_income	population	minority_population_pct	ffiecmedian_family_income	tract_to_msa_md_income_pct	number_of_owner-occupied_units	number_of_1_to_4_family_units	lender	co_applicant	accepted	LDPR	LLT	PTLP	LARM	MSDARM
0	0	3	1	1	1	70.0	3	18	37	246	2	5	1	24.0	6203.0	44.230	60588.0	50.933	716.0	2642.0	4536	False	1	0.342857	13608	1	0.837209	0.508505
1	1	1	1	3	1	178.0	3	369	52	299	1	5	1	57.0	5774.0	15.905	54821.0	100.000	1622.0	2108.0	2458	False	0	0.320225	2458	3	0.168919	0.543021
2	2	2	1	3	1	163.0	3	16	10	306	2	5	1	67.0	6094.0	61.270	67719.0	100.000	760.0	1048.0	5710	False	1	0.411043	11420	3	0.489632	0.508886
3	3	1	1	1	1	155.0	1	305	47	180	2	5	1	105.0	6667.0	6.246	78439.0	100.000	2025.0	2299.0	5888	True	1	0.677419	5888	1	0.691964	0.544391
4	4	1	1	1	1	305.0	3	24	37	20	2	3	2	71.0	6732.0	100.000	63075.0	82.200	1464.0	1847.0	289	False	1	0.232787	289	1	0.542994	0.524821

	row_id	loan_type	property_type	loan_purpose	occupancy	loan_amount	preapproval	msa_md	state_code	county_code	applicant_ethnicity	applicant_race	applicant_sex	applicant_income	population	minority_population_pct	ffiecmedian_family_income	tract_to_msa_md_income_pct	number_of_owner-occupied_units	number_of_1_to_4_family_units	lender	co_applicant	accepted
17	17	2	2	3	1	138.0	3	-1	37	59	2	5	1	NaN	4193.0	14.996	57774.0	74.411	1247.0	1998.0	2566	True	1
26	26	1	1	1	1	113.0	1	-1	-1	-1	2	5	2	54.0	NaN	NaN	NaN	NaN	NaN	NaN	2839	False	0
35	35	1	1	3	1	168.0	3	-1	36	151	2	5	2	65.0	3195.0	20.700	47253.0	100.000	339.0	814.0	2597	False	1
38	38	1	1	1	2	88.0	1	-1	-1	-1	2	5	2	104.0	NaN	NaN	NaN	NaN	NaN	NaN	788	True	0
45	45	1	2	1	1	106.0	3	-1	42	136	3	1	1	48.0	NaN	NaN	NaN	NaN	NaN	NaN	2318	True	0

	row_id	loan_type	property_type	loan_purpose	occupancy	loan_amount	preapproval	msa_md	state_code	county_code	applicant_ethnicity	applicant_race	applicant_sex	applicant_income	population	minority_population_pct	ffiecmedian_family_income	tract_to_msa_md_income_pct	number_of_owner-occupied_units	number_of_1_to_4_family_units	lender	accepted
count	500000.00	500000.00	500000.00	500000.00	500000.00	500000.00	500000.00	500000.00	500000.00	500000.00	500000.00	500000.00	500000.00	460052.00	477535.00	477534.00	477560.00	477486.00	477435.00	477470.00	500000.00	500000.0
mean	249999.50	1.37	1.05	2.07	1.11	221.75	2.76	181.61	23.73	144.54	2.04	4.79	1.46	102.39	5416.83	31.62	69235.60	91.83	1427.72	1886.15	3720.12	0.5
std	144337.71	0.69	0.23	0.95	0.33	590.64	0.54	138.46	15.98	100.24	0.51	1.02	0.68	153.53	2728.14	26.33	14810.06	14.21	737.56	914.12	1838.31	0.5
min	0.00	1.00	1.00	1.00	1.00	1.00	1.00	-1.00	-1.00	-1.00	1.00	1.00	1.00	1.00	14.00	0.53	17858.00	3.98	4.00	1.00	0.00	0.0
25%	124999.75	1.00	1.00	1.00	1.00	93.00	3.00	25.00	6.00	57.00	2.00	5.00	1.00	47.00	3744.00	10.70	59731.00	88.07	944.00	1301.00	2442.00	0.0
50%	249999.50	1.00	1.00	2.00	1.00	162.00	3.00	192.00	26.00	131.00	2.00	5.00	1.00	74.00	4975.00	22.90	67526.00	100.00	1327.00	1753.00	3731.00	1.0
75%	374999.25	2.00	1.00	3.00	1.00	266.00	3.00	314.00	37.00	246.00	2.00	5.00	2.00	117.00	6467.00	46.02	75351.00	100.00	1780.00	2309.00	5436.00	1.0
max	499999.00	4.00	3.00	3.00	3.00	100878.00	3.00	408.00	52.00	324.00	4.00	7.00	4.00	10139.00	37097.00	100.00	125248.00	100.00	8771.00	13623.00	6508.00	1.0

	row_id	loan_type	property_type	loan_purpose	occupancy	loan_amount	preapproval	msa_md	state_code	county_code	applicant_ethnicity	applicant_race	applicant_sex	applicant_income	population	minority_population_pct	ffiecmedian_family_income	tract_to_msa_md_income_pct	number_of_owner-occupied_units	number_of_1_to_4_family_units	lender	co_applicant	LDPR	LLT	PTLP	LARM	MSDARM
226114	226114	1	1	1	1	52.0	3	383	25	49	2	5	1	40.0	4101.0	6.760	66693.0	88.103	1293.0	1647.0	2612	False	0.769231	2612	1	0.935484	0.464900
435187	435187	1	1	3	1	291.0	3	358	32	259	2	5	1	195.0	3668.0	34.298	109805.0	99.295	645.0	642.0	3873	True	0.670103	3873	3	0.803419	0.523775
294452	294452	1	1	3	1	692.0	3	350	38	233	2	5	1	312.0	5201.0	9.346	85651.0	100.000	1642.0	1766.0	5316	False	0.450867	5316	3	0.610101	0.627844
427864	427864	1	1	2	1	49.0	3	205	2	124	2	5	1	40.0	2342.0	11.994	58003.0	100.000	700.0	1366.0	878	True	0.816327	878	2	0.428756	0.462359
188822	188822	1	1	1	1	212.0	2	305	47	68	2	5	1	88.0	4786.0	17.743	75595.0	100.000	1480.0	1632.0	4791	True	0.415094	4791	1	0.806499	0.544391

	row_id	loan_type	property_type	loan_purpose	occupancy	loan_amount	preapproval	msa_md	state_code	county_code	applicant_ethnicity	applicant_race	applicant_sex	applicant_income	population	minority_population_pct	ffiecmedian_family_income	tract_to_msa_md_income_pct	number_of_owner-occupied_units	number_of_1_to_4_family_units	lender	co_applicant	LDPR	LLT	PTLP	LARM	MSDARM
0	0	2	1	3	1	115.0	3	101	16	276	2	5	1	74.0	6329.0	59.536	69889.0	85.78	1874.0	2410.0	3791	True	0.643478	7582	3	0.785178	0.495576
1	1	1	1	1	1	252.0	2	87	20	68	2	5	1	107.0	2473.0	8.050	65313.0	100.00	947.0	1214.0	2839	True	0.424603	2839	1	0.371434	0.607497
2	2	1	1	1	1	270.0	1	-1	-1	-1	2	1	2	119.0	4975.0	22.901	67526.0	100.00	1327.0	1753.0	4701	False	0.440741	4701	1	0.195064	0.338949
3	3	2	1	1	1	179.0	2	376	20	11	2	2	2	44.0	4795.0	29.676	57766.0	100.00	1426.0	1765.0	2153	True	0.245810	4306	1	0.817891	0.527134
4	4	2	1	1	1	36.0	2	254	48	156	3	6	3	32.0	5246.0	5.110	63332.0	100.00	1452.0	2092.0	5710	False	0.888889	11420	1	0.489632	0.547771

	Feature Index	Importances
0	county_code	10.394392
1	LARM	9.530536
2	applicant_income	6.364388
3	lender	5.648429
4	state_code	5.553903
5	LDPR	5.433531
6	minority_population_pct	5.136094
7	LLT	5.083558
8	loan_amount	5.001768
9	ffiecmedian_family_income	4.999677
10	population	4.665573
11	number_of_owner-occupied_units	4.295008
12	number_of_1_to_4_family_units	4.156638
13	msa_md	3.210549
14	MSDARM	3.206498
15	PTLP	2.540563
16	tract_to_msa_md_income_pct	2.470535
17	applicant_race	2.337708
18	preapproval	1.836922
19	loan_purpose	1.757151
20	loan_type	1.600887
21	applicant_sex	1.437460
22	applicant_ethnicity	1.344018
23	occupancy	1.038224
24	co_applicant	0.607123
25	property_type	0.347833
26	row_id	0.001035