import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
from sklearn.model_selection import train_test_split
%matplotlib inline
%matplotlib widget
pd.set_option('display.max_columns', 50)


__author__ = "Frederick Emile Bondzie-Arthur"
__email__ = "Frederickauthur@hotmail.com"

DISCOVER

train_data= pd.read_csv("data/train_values.csv")
train_data_label= pd.read_csv("data/train_labels.csv")

final= pd.read_csv('data/test_values.csv')
train_data.head()
row_id loan_type property_type loan_purpose occupancy loan_amount preapproval msa_md state_code county_code applicant_ethnicity applicant_race applicant_sex applicant_income population minority_population_pct ffiecmedian_family_income tract_to_msa_md_income_pct number_of_owner-occupied_units number_of_1_to_4_family_units lender co_applicant
0 0 3 1 1 1 70.0 3 18 37 246 2 5 1 24.0 6203.0 44.230 60588.0 50.933 716.0 2642.0 4536 False
1 1 1 1 3 1 178.0 3 369 52 299 1 5 1 57.0 5774.0 15.905 54821.0 100.000 1622.0 2108.0 2458 False
2 2 2 1 3 1 163.0 3 16 10 306 2 5 1 67.0 6094.0 61.270 67719.0 100.000 760.0 1048.0 5710 False
3 3 1 1 1 1 155.0 1 305 47 180 2 5 1 105.0 6667.0 6.246 78439.0 100.000 2025.0 2299.0 5888 True
4 4 1 1 1 1 305.0 3 24 37 20 2 3 2 71.0 6732.0 100.000 63075.0 82.200 1464.0 1847.0 289 False
train_data_label.head()
row_id accepted
0 0 1
1 1 0
2 2 1
3 3 1
4 4 1
print(train_data.shape)
print(train_data_label.shape)
(500000, 22)
(500000, 2)
data= train_data.merge(train_data_label, on='row_id')
data.head()
row_id loan_type property_type loan_purpose occupancy loan_amount preapproval msa_md state_code county_code applicant_ethnicity applicant_race applicant_sex applicant_income population minority_population_pct ffiecmedian_family_income tract_to_msa_md_income_pct number_of_owner-occupied_units number_of_1_to_4_family_units lender co_applicant accepted
0 0 3 1 1 1 70.0 3 18 37 246 2 5 1 24.0 6203.0 44.230 60588.0 50.933 716.0 2642.0 4536 False 1
1 1 1 1 3 1 178.0 3 369 52 299 1 5 1 57.0 5774.0 15.905 54821.0 100.000 1622.0 2108.0 2458 False 0
2 2 2 1 3 1 163.0 3 16 10 306 2 5 1 67.0 6094.0 61.270 67719.0 100.000 760.0 1048.0 5710 False 1
3 3 1 1 1 1 155.0 1 305 47 180 2 5 1 105.0 6667.0 6.246 78439.0 100.000 2025.0 2299.0 5888 True 1
4 4 1 1 1 1 305.0 3 24 37 20 2 3 2 71.0 6732.0 100.000 63075.0 82.200 1464.0 1847.0 289 False 1
data.dtypes
row_id                              int64
loan_type                           int64
property_type                       int64
loan_purpose                        int64
occupancy                           int64
loan_amount                       float64
preapproval                         int64
msa_md                              int64
state_code                          int64
county_code                         int64
applicant_ethnicity                 int64
applicant_race                      int64
applicant_sex                       int64
applicant_income                  float64
population                        float64
minority_population_pct           float64
ffiecmedian_family_income         float64
tract_to_msa_md_income_pct        float64
number_of_owner-occupied_units    float64
number_of_1_to_4_family_units     float64
lender                              int64
co_applicant                         bool
accepted                            int64
dtype: object
print((data.astype(np.object) == '?').any())
row_id                            False
loan_type                         False
property_type                     False
loan_purpose                      False
occupancy                         False
loan_amount                       False
preapproval                       False
msa_md                            False
state_code                        False
county_code                       False
applicant_ethnicity               False
applicant_race                    False
applicant_sex                     False
applicant_income                  False
population                        False
minority_population_pct           False
ffiecmedian_family_income         False
tract_to_msa_md_income_pct        False
number_of_owner-occupied_units    False
number_of_1_to_4_family_units     False
lender                            False
co_applicant                      False
accepted                          False
dtype: bool
print((data.astype(np.object).isnull()).any())
row_id                            False
loan_type                         False
property_type                     False
loan_purpose                      False
occupancy                         False
loan_amount                       False
preapproval                       False
msa_md                            False
state_code                        False
county_code                       False
applicant_ethnicity               False
applicant_race                    False
applicant_sex                     False
applicant_income                   True
population                         True
minority_population_pct            True
ffiecmedian_family_income          True
tract_to_msa_md_income_pct         True
number_of_owner-occupied_units     True
number_of_1_to_4_family_units      True
lender                            False
co_applicant                      False
accepted                          False
dtype: bool
(data.isnull().sum()/ data.row_id.unique().shape[0] * 100).round(2)
row_id                            0.00
loan_type                         0.00
property_type                     0.00
loan_purpose                      0.00
occupancy                         0.00
loan_amount                       0.00
preapproval                       0.00
msa_md                            0.00
state_code                        0.00
county_code                       0.00
applicant_ethnicity               0.00
applicant_race                    0.00
applicant_sex                     0.00
applicant_income                  7.99
population                        4.49
minority_population_pct           4.49
ffiecmedian_family_income         4.49
tract_to_msa_md_income_pct        4.50
number_of_owner-occupied_units    4.51
number_of_1_to_4_family_units     4.51
lender                            0.00
co_applicant                      0.00
accepted                          0.00
dtype: float64
(final.isnull().sum()/ final.row_id.unique().shape[0] * 100).round(2)
row_id                            0.00
loan_type                         0.00
property_type                     0.00
loan_purpose                      0.00
occupancy                         0.00
loan_amount                       0.00
preapproval                       0.00
msa_md                            0.00
state_code                        0.00
county_code                       0.00
applicant_ethnicity               0.00
applicant_race                    0.00
applicant_sex                     0.00
applicant_income                  8.03
population                        4.50
minority_population_pct           4.50
ffiecmedian_family_income         4.49
tract_to_msa_md_income_pct        4.50
number_of_owner-occupied_units    4.51
number_of_1_to_4_family_units     4.51
lender                            0.00
co_applicant                      0.00
dtype: float64
data.isnull().sum()
row_id                                0
loan_type                             0
property_type                         0
loan_purpose                          0
occupancy                             0
loan_amount                           0
preapproval                           0
msa_md                                0
state_code                            0
county_code                           0
applicant_ethnicity                   0
applicant_race                        0
applicant_sex                         0
applicant_income                  39948
population                        22465
minority_population_pct           22466
ffiecmedian_family_income         22440
tract_to_msa_md_income_pct        22514
number_of_owner-occupied_units    22565
number_of_1_to_4_family_units     22530
lender                                0
co_applicant                          0
accepted                              0
dtype: int64
final.isnull().sum()
row_id                                0
loan_type                             0
property_type                         0
loan_purpose                          0
occupancy                             0
loan_amount                           0
preapproval                           0
msa_md                                0
state_code                            0
county_code                           0
applicant_ethnicity                   0
applicant_race                        0
applicant_sex                         0
applicant_income                  40141
population                        22480
minority_population_pct           22482
ffiecmedian_family_income         22453
tract_to_msa_md_income_pct        22517
number_of_owner-occupied_units    22574
number_of_1_to_4_family_units     22550
lender                                0
co_applicant                          0
dtype: int64
filter1 = data["msa_md"].isin([-1]) 
filter2 = data["county_code"].isin([-1]) 
filter3 = data["state_code"].isin([-1]) 

# displaying dataframe  with all filter applied and mandatory  
data[filter1 | filter2| filter3].head()
row_id loan_type property_type loan_purpose occupancy loan_amount preapproval msa_md state_code county_code applicant_ethnicity applicant_race applicant_sex applicant_income population minority_population_pct ffiecmedian_family_income tract_to_msa_md_income_pct number_of_owner-occupied_units number_of_1_to_4_family_units lender co_applicant accepted
17 17 2 2 3 1 138.0 3 -1 37 59 2 5 1 NaN 4193.0 14.996 57774.0 74.411 1247.0 1998.0 2566 True 1
26 26 1 1 1 1 113.0 1 -1 -1 -1 2 5 2 54.0 NaN NaN NaN NaN NaN NaN 2839 False 0
35 35 1 1 3 1 168.0 3 -1 36 151 2 5 2 65.0 3195.0 20.700 47253.0 100.000 339.0 814.0 2597 False 1
38 38 1 1 1 2 88.0 1 -1 -1 -1 2 5 2 104.0 NaN NaN NaN NaN NaN NaN 788 True 0
45 45 1 2 1 1 106.0 3 -1 42 136 3 1 1 48.0 NaN NaN NaN NaN NaN NaN 2318 True 0
print(data.msa_md[filter1].count())
print(data.county_code[filter2].count())
print(data.state_code[filter3].count())
76982
20466
19132
print(round((data.msa_md[filter1].count()/data.row_id.unique().shape[0] * 100),2))
print(round((data.county_code[filter2].count()/data.row_id.unique().shape[0] * 100),2))
print(round((data.state_code[filter3].count()/data.row_id.unique().shape[0] * 100),2))
15.4
4.09
3.83
print(data.shape)
print(data.row_id.unique().shape)
(500000, 23)
(500000,)
data.describe().round(2)
row_id loan_type property_type loan_purpose occupancy loan_amount preapproval msa_md state_code county_code applicant_ethnicity applicant_race applicant_sex applicant_income population minority_population_pct ffiecmedian_family_income tract_to_msa_md_income_pct number_of_owner-occupied_units number_of_1_to_4_family_units lender accepted
count 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 460052.00 477535.00 477534.00 477560.00 477486.00 477435.00 477470.00 500000.00 500000.0
mean 249999.50 1.37 1.05 2.07 1.11 221.75 2.76 181.61 23.73 144.54 2.04 4.79 1.46 102.39 5416.83 31.62 69235.60 91.83 1427.72 1886.15 3720.12 0.5
std 144337.71 0.69 0.23 0.95 0.33 590.64 0.54 138.46 15.98 100.24 0.51 1.02 0.68 153.53 2728.14 26.33 14810.06 14.21 737.56 914.12 1838.31 0.5
min 0.00 1.00 1.00 1.00 1.00 1.00 1.00 -1.00 -1.00 -1.00 1.00 1.00 1.00 1.00 14.00 0.53 17858.00 3.98 4.00 1.00 0.00 0.0
25% 124999.75 1.00 1.00 1.00 1.00 93.00 3.00 25.00 6.00 57.00 2.00 5.00 1.00 47.00 3744.00 10.70 59731.00 88.07 944.00 1301.00 2442.00 0.0
50% 249999.50 1.00 1.00 2.00 1.00 162.00 3.00 192.00 26.00 131.00 2.00 5.00 1.00 74.00 4975.00 22.90 67526.00 100.00 1327.00 1753.00 3731.00 1.0
75% 374999.25 2.00 1.00 3.00 1.00 266.00 3.00 314.00 37.00 246.00 2.00 5.00 2.00 117.00 6467.00 46.02 75351.00 100.00 1780.00 2309.00 5436.00 1.0
max 499999.00 4.00 3.00 3.00 3.00 100878.00 3.00 408.00 52.00 324.00 4.00 7.00 4.00 10139.00 37097.00 100.00 125248.00 100.00 8771.00 13623.00 6508.00 1.0

Since the exist some missing data. We use the median of the data to fill the missing values.

data_median= data.median()
data_median
row_id                            249999.500
loan_type                              1.000
property_type                          1.000
loan_purpose                           2.000
occupancy                              1.000
loan_amount                          162.000
preapproval                            3.000
msa_md                               192.000
state_code                            26.000
county_code                          131.000
applicant_ethnicity                    2.000
applicant_race                         5.000
applicant_sex                          1.000
applicant_income                      74.000
population                          4975.000
minority_population_pct               22.901
ffiecmedian_family_income          67526.000
tract_to_msa_md_income_pct           100.000
number_of_owner-occupied_units      1327.000
number_of_1_to_4_family_units       1753.000
lender                              3731.000
co_applicant                           0.000
accepted                               1.000
dtype: float64
final_median= final.median()
final_median
row_id                            249999.500
loan_type                              1.000
property_type                          1.000
loan_purpose                           2.000
occupancy                              1.000
loan_amount                          162.000
preapproval                            3.000
msa_md                               192.000
state_code                            26.000
county_code                          131.000
applicant_ethnicity                    2.000
applicant_race                         5.000
applicant_sex                          1.000
applicant_income                      74.000
population                          4975.000
minority_population_pct               22.955
ffiecmedian_family_income          67514.000
tract_to_msa_md_income_pct           100.000
number_of_owner-occupied_units      1326.000
number_of_1_to_4_family_units       1753.000
lender                              3713.000
co_applicant                           0.000
dtype: float64
data.fillna(data_median,inplace=True)
data.shape
(500000, 23)
final.fillna(data_median,inplace=True)
final.shape
(500000, 22)
(data.isnull().sum()/ data.row_id.unique().shape[0] * 100).round(2)
row_id                            0.0
loan_type                         0.0
property_type                     0.0
loan_purpose                      0.0
occupancy                         0.0
loan_amount                       0.0
preapproval                       0.0
msa_md                            0.0
state_code                        0.0
county_code                       0.0
applicant_ethnicity               0.0
applicant_race                    0.0
applicant_sex                     0.0
applicant_income                  0.0
population                        0.0
minority_population_pct           0.0
ffiecmedian_family_income         0.0
tract_to_msa_md_income_pct        0.0
number_of_owner-occupied_units    0.0
number_of_1_to_4_family_units     0.0
lender                            0.0
co_applicant                      0.0
accepted                          0.0
dtype: float64
(final.isnull().sum()/ final.row_id.unique().shape[0] * 100).round(2)
row_id                            0.0
loan_type                         0.0
property_type                     0.0
loan_purpose                      0.0
occupancy                         0.0
loan_amount                       0.0
preapproval                       0.0
msa_md                            0.0
state_code                        0.0
county_code                       0.0
applicant_ethnicity               0.0
applicant_race                    0.0
applicant_sex                     0.0
applicant_income                  0.0
population                        0.0
minority_population_pct           0.0
ffiecmedian_family_income         0.0
tract_to_msa_md_income_pct        0.0
number_of_owner-occupied_units    0.0
number_of_1_to_4_family_units     0.0
lender                            0.0
co_applicant                      0.0
dtype: float64
data.shape
(500000, 23)
accepted_rate= data.accepted.value_counts()/data.shape[0]
accepted_rate
1    0.500228
0    0.499772
Name: accepted, dtype: float64
data.describe().round(2)
row_id loan_type property_type loan_purpose occupancy loan_amount preapproval msa_md state_code county_code applicant_ethnicity applicant_race applicant_sex applicant_income population minority_population_pct ffiecmedian_family_income tract_to_msa_md_income_pct number_of_owner-occupied_units number_of_1_to_4_family_units lender accepted
count 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.00 500000.0
mean 249999.50 1.37 1.05 2.07 1.11 221.75 2.76 181.61 23.73 144.54 2.04 4.79 1.46 100.12 5396.98 31.23 69158.88 92.20 1423.17 1880.15 3720.12 0.5
std 144337.71 0.69 0.23 0.95 0.33 590.64 0.54 138.46 15.98 100.24 0.51 1.02 0.68 147.47 2667.72 25.80 14478.23 13.99 721.03 893.72 1838.31 0.5
min 0.00 1.00 1.00 1.00 1.00 1.00 1.00 -1.00 -1.00 -1.00 1.00 1.00 1.00 1.00 14.00 0.53 17858.00 3.98 4.00 1.00 0.00 0.0
25% 124999.75 1.00 1.00 1.00 1.00 93.00 3.00 25.00 6.00 57.00 2.00 5.00 1.00 49.00 3805.00 11.19 60071.00 89.14 963.00 1323.00 2442.00 0.0
50% 249999.50 1.00 1.00 2.00 1.00 162.00 3.00 192.00 26.00 131.00 2.00 5.00 1.00 74.00 4975.00 22.90 67526.00 100.00 1327.00 1753.00 3731.00 1.0
75% 374999.25 2.00 1.00 3.00 1.00 266.00 3.00 314.00 37.00 246.00 2.00 5.00 2.00 112.00 6379.00 44.49 74714.25 100.00 1754.00 2275.00 5436.00 1.0
max 499999.00 4.00 3.00 3.00 3.00 100878.00 3.00 408.00 52.00 324.00 4.00 7.00 4.00 10139.00 37097.00 100.00 125248.00 100.00 8771.00 13623.00 6508.00 1.0
accepted_Summary= data.groupby('accepted')
accepted_Summary.mean()
row_id loan_type property_type loan_purpose occupancy loan_amount preapproval msa_md state_code county_code applicant_ethnicity applicant_race applicant_sex applicant_income population minority_population_pct ffiecmedian_family_income tract_to_msa_md_income_pct number_of_owner-occupied_units number_of_1_to_4_family_units lender co_applicant
accepted
0 249936.842896 1.353433 1.066310 2.191667 1.102399 194.352733 2.755372 170.484961 22.279283 139.265501 2.031226 4.740073 1.488403 89.696290 5328.818777 33.201373 68142.082690 91.293288 1393.644762 1869.383779 3704.499764 0.350492
1 250062.099986 1.379107 1.029007 1.942066 1.116775 249.128605 2.774063 192.718844 25.173245 149.813813 2.041225 4.833056 1.436369 110.536831 5465.083798 29.251766 70174.743021 93.106654 1452.674053 1890.901325 3735.728684 0.449567
corr=data.drop(['row_id','county_code','state_code'], axis=1).corr(method='spearman').round(2)
fig= plt.figure(figsize=(20,10))
colormap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, cmap=colormap, annot=True)
plt.xticks(rotation=45)
plt.xticks(range(len(corr.columns)), corr.columns)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.title('Spearman Correlation Heatmap')
plt.show()
plt.savefig('image1.png')
<Figure size 432x288 with 0 Axes>
corr.style.background_gradient().set_precision(2)
loan_type property_type loan_purpose occupancy loan_amount preapproval msa_md applicant_ethnicity applicant_race applicant_sex applicant_income population minority_population_pct ffiecmedian_family_income tract_to_msa_md_income_pct number_of_owner-occupied_units number_of_1_to_4_family_units lender co_applicant accepted
loan_type 1 -0.07 -0.11 -0.18 0.06 -0.14 -0.01 -0.06 -0.03 -0.05 -0.17 0.05 0.05 -0.06 -0.04 0.03 0.05 -0.03 -0.03 0.02
property_type -0.07 1 -0.13 0.04 -0.18 0.05 -0.1 0.07 0.04 0.05 -0.16 -0.01 -0.03 -0.11 -0.06 -0.01 0.03 -0.05 -0.02 -0.1
loan_purpose -0.11 -0.13 1 0 0.02 0.52 0.08 0.03 0.03 0.01 0.05 -0 0.03 0.05 -0.02 -0.01 -0.02 0.05 0.01 -0.13
occupancy -0.18 0.04 0 1 -0.03 0.04 -0.01 0.09 0.06 0.05 0.14 -0.06 0.04 -0.03 -0.08 -0.1 -0.03 0 -0.01 0.02
loan_amount 0.06 -0.18 0.02 -0.03 1 -0.07 0.1 0.04 -0 -0.1 0.54 0.06 0.05 0.31 0.25 0.04 -0.06 0.03 0.16 0.17
preapproval -0.14 0.05 0.52 0.04 -0.07 1 0.11 0.02 0.02 0.02 0.02 -0.01 0.02 0 -0.07 -0.02 -0.01 -0 0.01 -0.02
msa_md -0.01 -0.1 0.08 -0.01 0.1 0.11 1 -0.02 -0.02 0 0.09 0.06 0.12 0.31 -0.05 0.03 -0.06 0.02 -0.01 0.09
applicant_ethnicity -0.06 0.07 0.03 0.09 0.04 0.02 -0.02 1 0.41 0.34 0.08 -0.04 -0.16 0.07 0.06 0.03 0.02 0.01 -0.12 -0
applicant_race -0.03 0.04 0.03 0.06 -0 0.02 -0.02 0.41 1 0.28 0.03 -0.02 -0.16 -0.04 0.04 0.02 0.02 0.01 -0.08 0.02
applicant_sex -0.05 0.05 0.01 0.05 -0.1 0.02 0 0.34 0.28 1 -0.13 -0.02 0.06 0.02 -0.05 -0.04 -0.03 0.02 -0.29 -0.05
applicant_income -0.17 -0.16 0.05 0.14 0.54 0.02 0.09 0.08 0.03 -0.13 1 0.03 -0.04 0.23 0.24 0.05 -0.03 0.02 0.3 0.18
population 0.05 -0.01 -0 -0.06 0.06 -0.01 0.06 -0.04 -0.02 -0.02 0.03 1 0.11 0.02 0.14 0.79 0.78 0 0.02 0.02
minority_population_pct 0.05 -0.03 0.03 0.04 0.05 0.02 0.12 -0.16 -0.16 0.06 -0.04 0.11 1 0.05 -0.3 -0.24 -0.19 0.02 -0.1 -0.07
ffiecmedian_family_income -0.06 -0.11 0.05 -0.03 0.31 0 0.31 0.07 -0.04 0.02 0.23 0.02 0.05 1 -0.02 0 -0.15 0.02 0.02 0.07
tract_to_msa_md_income_pct -0.04 -0.06 -0.02 -0.08 0.25 -0.07 -0.05 0.06 0.04 -0.05 0.24 0.14 -0.3 -0.02 1 0.36 0.18 0.01 0.09 0.06
number_of_owner-occupied_units 0.03 -0.01 -0.01 -0.1 0.04 -0.02 0.03 0.03 0.02 -0.04 0.05 0.79 -0.24 0 0.36 1 0.87 -0 0.05 0.04
number_of_1_to_4_family_units 0.05 0.03 -0.02 -0.03 -0.06 -0.01 -0.06 0.02 0.02 -0.03 -0.03 0.78 -0.19 -0.15 0.18 0.87 1 -0.01 0.03 0
lender -0.03 -0.05 0.05 0 0.03 -0 0.02 0.01 0.01 0.02 0.02 0 0.02 0.02 0.01 -0 -0.01 1 0 0.01
co_applicant -0.03 -0.02 0.01 -0.01 0.16 0.01 -0.01 -0.12 -0.08 -0.29 0.3 0.02 -0.1 0.02 0.09 0.05 0.03 0 1 0.1
accepted 0.02 -0.1 -0.13 0.02 0.17 -0.02 0.09 -0 0.02 -0.05 0.18 0.02 -0.07 0.07 0.06 0.04 0 0.01 0.1 1
corr_with_acc=data.drop(['row_id','county_code','state_code'], axis=1).corr(method='spearman')['accepted'].sort_values(ascending=False)
plt.figure(figsize=(14,6))
corr_with_acc.drop("accepted").plot.bar()
plt.show()
plt.savefig('image8.png')
<Figure size 432x288 with 0 Axes>
cat_vars=['loan_type','property_type','loan_purpose','occupancy','preapproval','applicant_sex','co_applicant',
         'applicant_sex','applicant_race','applicant_ethnicity','msa_md','state_code','county_code']
num_vars=['loan_amount','population','applicant_income','minority_population_pct','ffiecmedian_family_income',
          'tract_to_msa_md_income_pct','number_of_owner-occupied_units','number_of_1_to_4_family_units']
def plot_voilin(combined, cols, col_x= 'accepted'):
    fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(30, 10))
    for col, subplot in zip(cols, ax.flatten()):
        sns.set_style("whitegrid")
        sns.violinplot(col_x, col, data=combined,ax=subplot)
        for label in subplot.get_xticklabels():
            label.set_rotation(90)
        

#voilin plot for numerical variable 
plot_voilin(data, num_vars)
plt.savefig('image1.png')
def plot_box(combined, cols, col_x= 'accepted'):
    fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(30, 10))
    for col, subplot in zip(cols, ax.flatten()):
        sns.set_style("whitegrid")
        sns.boxplot(col_x, col, data=combined,ax=subplot)
        for label in subplot.get_xticklabels():
            label.set_rotation(90)
        
#voilin plot for numerical variable 
plot_box(data, num_vars)
plt.savefig('image2.png')
def plot_den_hist(combined, cols, bins=10, hist= False):
    fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(20, 25))
    for col, subplot in zip(cols, ax.flatten()):
        sns.distplot(combined[col], bins= bins, rug=True, hist=hist, ax=subplot)
        for label in subplot.get_xticklabels():
            label.set_rotation(0)

#KDE plot for numerical variable, histogram not enabled
plot_den_hist(data, num_vars) 
plt.savefig('image3.png')
plot_den_hist(data, num_vars, hist=True)
plt.savefig('image3b.png')
def plot_bar(cat_cols):
    fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(30, 10))
    for col, subplot in zip(cat_cols, ax.flatten()):
        sns.countplot(data[col],hue=data['accepted'],ax=subplot)
        for label in subplot.get_xticklabels():
            label.set_rotation(90)

# plotting bar graph for categorical variables 
plot_bar(cat_vars)   
plt.savefig('image5.png')
data[num_vars].hist(bins=25, figsize=(20, 10), layout=(4, 4));
plt.savefig('image6.png')

From the graph, it can be seen that all data features are skwed except the ffiecmedian_family_income. To fix this issue we apply log to the data features skewed.

Skewness is the measure of symmetry of a distrubution. For a normal distubution skewness=0 and thus it is symmetrical. When data is skewed towards the right, then it is a postive skew otherwise is viceversa.

  1. Skewness between 0 to $\pm$ 5= acceptable
  2. Skewness between $\pm$ 0.5 to $\pm$ 1= a problem
  3. skewness between $\pm$ 1 or more= utmost
data.skew(axis=0)
row_id                            1.286588e-17
loan_type                         1.864712e+00
property_type                     5.196600e+00
loan_purpose                     -1.333652e-01
occupancy                         2.871840e+00
loan_amount                       7.655279e+01
preapproval                      -2.242003e+00
msa_md                            1.353241e-02
state_code                       -5.974008e-02
county_code                       2.309361e-01
applicant_ethnicity               5.802958e-01
applicant_race                   -1.583676e+00
applicant_sex                     1.370674e+00
applicant_income                  2.317498e+01
population                        2.947782e+00
minority_population_pct           1.068839e+00
ffiecmedian_family_income         8.063549e-01
tract_to_msa_md_income_pct       -2.035543e+00
number_of_owner-occupied_units    1.942059e+00
number_of_1_to_4_family_units     2.080321e+00
lender                           -2.196283e-01
co_applicant                      4.080284e-01
accepted                         -9.120028e-04
dtype: float64
import scipy.stats as ss
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x,y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
    rcorr = r-((r-1)**2)/(n-1)
    kcorr = k-((k-1)**2)/(n-1)
    return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1)))

# function to print categorical variables after running it through the cramers_v dunction
def print_crammer_values(data, cat_features, cats_x= 'accepted'):
    for cat_ in cat_features:
        print(cat_+ ": "+ str(cramers_v(data[cat_],data[cats_x]).round(2)))
        
# calculating for correlation between categorical variable and target variable        
print_crammer_values(data, cat_vars)
loan_type: 0.02
property_type: 0.11
loan_purpose: 0.17
occupancy: 0.03
preapproval: 0.15
applicant_sex: 0.09
co_applicant: 0.1
applicant_sex: 0.09
applicant_race: 0.15
applicant_ethnicity: 0.11
msa_md: 0.17
state_code: 0.21
county_code: 0.2
# measuring kurtosis 
data.kurtosis(axis=0)
row_id                              -1.200000
loan_type                            2.707997
property_type                       29.022540
loan_purpose                        -1.874865
occupancy                            7.561584
loan_amount                       9385.071465
preapproval                          3.913370
msa_md                              -1.491417
state_code                          -1.361784
county_code                         -1.234460
applicant_ethnicity                  2.691639
applicant_race                       2.661800
applicant_sex                        1.370864
applicant_income                  1062.740924
population                          18.060147
minority_population_pct              0.179010
ffiecmedian_family_income            1.302638
tract_to_msa_md_income_pct           3.749475
number_of_owner-occupied_units       9.540237
number_of_1_to_4_family_units       12.029859
lender                              -1.105622
co_applicant                        -1.833520
accepted                            -2.000007
dtype: float64
data.columns
Index(['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy',
       'loan_amount', 'preapproval', 'msa_md', 'state_code', 'county_code',
       'applicant_ethnicity', 'applicant_race', 'applicant_sex',
       'applicant_income', 'population', 'minority_population_pct',
       'ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
       'number_of_owner-occupied_units', 'number_of_1_to_4_family_units',
       'lender', 'co_applicant', 'accepted'],
      dtype='object')
sns.lmplot(x='applicant_income',y='loan_amount', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8e1cc8f110>
sns.lmplot(x='population',y='loan_amount', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d78decd10>
sns.lmplot(x='lender',y='loan_amount', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d9cebc2d0>
sns.lmplot(x='minority_population_pct',y='loan_amount', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8e1cc28dd0>
sns.lmplot(x='ffiecmedian_family_income',y='loan_amount', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8e00f7ca10>
sns.lmplot(x='tract_to_msa_md_income_pct',y='loan_amount', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8e1cf2a210>
sns.lmplot(x='number_of_owner-occupied_units',y='loan_amount', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8ddddb7510>
sns.lmplot(x='number_of_1_to_4_family_units',y='loan_amount', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8e2444bb90>
sns.lmplot(x='applicant_income',y='population', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8e1cf30f10>
sns.lmplot(x='minority_population_pct',y='population', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d8ca7a3d0>
sns.lmplot(x='ffiecmedian_family_income',y='population', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8e1cc90ed0>
sns.lmplot(x='tract_to_msa_md_income_pct',y='population', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d8c5bce90>
sns.lmplot(x='number_of_owner-occupied_units',y='population', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d8ca4cf10>
sns.lmplot(x='number_of_1_to_4_family_units',y='population', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d8c675290>
sns.lmplot(x='minority_population_pct',y='applicant_income', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d8c6da490>
sns.lmplot(x='ffiecmedian_family_income',y='applicant_income', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d8c791410>
sns.lmplot(x='tract_to_msa_md_income_pct',y='applicant_income', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d9ce91990>
sns.lmplot(x='number_of_owner-occupied_units',y='applicant_income', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d9ce98750>
sns.lmplot(x='number_of_1_to_4_family_units',y='applicant_income', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8e0a287c90>
sns.lmplot(x='ffiecmedian_family_income',y='minority_population_pct', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8e0a28a710>
sns.lmplot(x='tract_to_msa_md_income_pct',y='minority_population_pct', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d8c4a8310>
sns.lmplot(x='number_of_1_to_4_family_units',y='minority_population_pct', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d790034d0>
sns.lmplot(x='tract_to_msa_md_income_pct',y='ffiecmedian_family_income', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d79047e90>
sns.lmplot(x='number_of_owner-occupied_units',y='ffiecmedian_family_income', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8e0a295d90>
sns.lmplot(x='number_of_1_to_4_family_units',y='ffiecmedian_family_income', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d8c85d310>
sns.lmplot(x='number_of_owner-occupied_units',y='tract_to_msa_md_income_pct', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8dcd9f5510>
sns.lmplot(x='number_of_1_to_4_family_units',y='tract_to_msa_md_income_pct', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8dbd682710>
sns.lmplot(x='number_of_1_to_4_family_units',y='number_of_owner-occupied_units', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d68a169d0>
sns.lmplot(x='number_of_owner-occupied_units',y='minority_population_pct', data= data, fit_reg=False, hue='accepted')
<seaborn.axisgrid.FacetGrid at 0x7f8d893a72d0>

Feature Engineering

From EDA we identified some of the features are higher skewed. This can affect our classifcation model.

Here we extract some features from the dataset and add to the datasets

#caculating the loan amount per applicant ratio LAPDR 
data['LDPR']= (data['applicant_income'])/ (data['loan_amount'])
#caculating the lenders x loantype 
data['LLT']= (data['lender'])* (data['loan_type'])
# caculating the property x loan purpose
data['PTLP']= (data['property_type'])* (data['loan_purpose'])
#caculating the loan amount per applicant ratio LAPDR
final['LDPR']= (final['applicant_income'])/ (final['loan_amount'])
#caculating the lenders x loantype
final['LLT']= (final['lender'])* (final['loan_type'])
# caculating the property x loan purpose
final['PTLP']= (final['property_type'])* (final['loan_purpose'])
#ie.msd_md acceptance Rate Mean on test data

MSDARM= pd.DataFrame(data.groupby(["msa_md"])["accepted"].mean()) 
MSDARM.shape
(409, 1)
#train dataset
LARM= pd.DataFrame(data.groupby(['lender'])['accepted'].mean())
data= pd.merge(data,LARM,how='left', on='lender')
final= pd.merge(final,LARM,how='left', on='lender')
# Renaming certain columns for better readability
data.columns = ['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy','loan_amount', 
                'preapproval', 'msa_md', 'state_code', 'county_code','applicant_ethnicity',
                'applicant_race', 'applicant_sex','applicant_income', 'population', 
                'minority_population_pct','ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
                'number_of_owner-occupied_units', 'number_of_1_to_4_family_units','lender',
                'co_applicant', 'accepted', 'LDPR','LLT','PTLP','LARM']
# Renaming certain columns for better readability
final.columns = ['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy','loan_amount', 
                'preapproval', 'msa_md', 'state_code', 'county_code','applicant_ethnicity',
                'applicant_race', 'applicant_sex','applicant_income', 'population', 
                'minority_population_pct','ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
                'number_of_owner-occupied_units', 'number_of_1_to_4_family_units','lender',
                'co_applicant', 'LDPR','LLT','PTLP','LARM']
data= pd.merge(data,MSDARM,how='left', on='msa_md')
final= pd.merge(final,MSDARM,how='left', on='msa_md')
# Renaming certain columns for better readability
data.columns = ['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy','loan_amount', 
                'preapproval', 'msa_md', 'state_code', 'county_code','applicant_ethnicity',
                'applicant_race', 'applicant_sex','applicant_income', 'population', 
                'minority_population_pct','ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
                'number_of_owner-occupied_units', 'number_of_1_to_4_family_units','lender',
                'co_applicant', 'accepted', 'LDPR','LLT','PTLP','LARM','MSDARM']
# Renaming certain columns for better readability
final.columns = ['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy','loan_amount', 
                'preapproval', 'msa_md', 'state_code', 'county_code','applicant_ethnicity',
                'applicant_race', 'applicant_sex','applicant_income', 'population', 
                'minority_population_pct','ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
                'number_of_owner-occupied_units', 'number_of_1_to_4_family_units','lender',
                'co_applicant', 'LDPR','LLT','PTLP','LARM','MSDARM']
cat_vars=['loan_type','property_type','loan_purpose','occupancy','preapproval','applicant_sex','co_applicant',
         'applicant_sex','applicant_race','applicant_ethnicity','msa_md','state_code','county_code']
num_vars=['loan_amount','population','applicant_income','minority_population_pct','ffiecmedian_family_income',
          'tract_to_msa_md_income_pct','number_of_owner-occupied_units','number_of_1_to_4_family_units',
         'LDPR','LLT','PTLP','LARM','MSDARM']
# pairwise plot for all variable origanal variable %config InlineBackend.figure_format = 'png' g= sns.PairGrid(data=data, hue="accepted") g.map(plt.scatter) plt.savefig('image9.png')num_vars_=['loan_amount','population','applicant_income','minority_population_pct','ffiecmedian_family_income', 'tract_to_msa_md_income_pct','number_of_owner-occupied_units','number_of_1_to_4_family_units', 'LDPR','LLT','PTLP','LARM','MSDARM','accepted'] # pairwise plot for all variable origanal variable %config InlineBackend.figure_format = 'png' sns.pairplot(data[num_vars_]) plt.savefig('image10.png')%config InlineBackend.figure_format = 'png' fig= plt.figure(figsize=(12,10)) sns.pairplot(data[num_vars], palette="Set2",diag_kind="kde", height=2).map_upper(sns.kdeplot, cmap="Blues_d") plt.savefig('image11.png')

Class Balance

Mortgage acceptance rate: 50%

data.accepted.value_counts(1)*100
1    50.0228
0    49.9772
Name: accepted, dtype: float64
is_loan_accepted= data.accepted== 1
loan_accepted= data[is_loan_accepted]
loan_is_not_accepted= data.accepted== 0
loan_not_accepted= data[loan_is_not_accepted]

print(loan_accepted.shape)
print(loan_not_accepted.shape)
(250114, 28)
(249886, 28)
sns.countplot('accepted', data = data)
plt.title('Distribution of Loan Applicant')
plt.savefig('image4.png')
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 28 columns):
row_id                            500000 non-null int64
loan_type                         500000 non-null int64
property_type                     500000 non-null int64
loan_purpose                      500000 non-null int64
occupancy                         500000 non-null int64
loan_amount                       500000 non-null float64
preapproval                       500000 non-null int64
msa_md                            500000 non-null int64
state_code                        500000 non-null int64
county_code                       500000 non-null int64
applicant_ethnicity               500000 non-null int64
applicant_race                    500000 non-null int64
applicant_sex                     500000 non-null int64
applicant_income                  500000 non-null float64
population                        500000 non-null float64
minority_population_pct           500000 non-null float64
ffiecmedian_family_income         500000 non-null float64
tract_to_msa_md_income_pct        500000 non-null float64
number_of_owner-occupied_units    500000 non-null float64
number_of_1_to_4_family_units     500000 non-null float64
lender                            500000 non-null int64
co_applicant                      500000 non-null bool
accepted                          500000 non-null int64
LDPR                              500000 non-null float64
LLT                               500000 non-null int64
PTLP                              500000 non-null int64
LARM                              500000 non-null float64
MSDARM                            500000 non-null float64
dtypes: bool(1), float64(11), int64(16)
memory usage: 127.3 MB
final.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 27 columns):
row_id                            500000 non-null int64
loan_type                         500000 non-null int64
property_type                     500000 non-null int64
loan_purpose                      500000 non-null int64
occupancy                         500000 non-null int64
loan_amount                       500000 non-null float64
preapproval                       500000 non-null int64
msa_md                            500000 non-null int64
state_code                        500000 non-null int64
county_code                       500000 non-null int64
applicant_ethnicity               500000 non-null int64
applicant_race                    500000 non-null int64
applicant_sex                     500000 non-null int64
applicant_income                  500000 non-null float64
population                        500000 non-null float64
minority_population_pct           500000 non-null float64
ffiecmedian_family_income         500000 non-null float64
tract_to_msa_md_income_pct        500000 non-null float64
number_of_owner-occupied_units    500000 non-null float64
number_of_1_to_4_family_units     500000 non-null float64
lender                            500000 non-null int64
co_applicant                      500000 non-null bool
LDPR                              500000 non-null float64
LLT                               500000 non-null int64
PTLP                              500000 non-null int64
LARM                              499278 non-null float64
MSDARM                            500000 non-null float64
dtypes: bool(1), float64(11), int64(15)
memory usage: 103.5 MB

Split Train/Test Set


Let's split our data into a train and test set. We'll fit our model with the train set and leave our test set for our last evaluation.

# Create the X and y set
X = data.drop('accepted', axis=1)
y = data.accepted
categorical_features_indices= np.where(X.dtypes != np.float)[0]
categorical_features_indices
array([ 0,  1,  2,  3,  4,  6,  7,  8,  9, 10, 11, 12, 20, 21, 23, 24])
# Define train and test
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size = 0.3, random_state = 42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)
(350000, 27) (350000,)
(150000, 27) (150000,)
y_train.head()
226114    1
435187    1
294452    0
427864    0
188822    1
Name: accepted, dtype: int64
X_train.head()
row_id loan_type property_type loan_purpose occupancy loan_amount preapproval msa_md state_code county_code applicant_ethnicity applicant_race applicant_sex applicant_income population minority_population_pct ffiecmedian_family_income tract_to_msa_md_income_pct number_of_owner-occupied_units number_of_1_to_4_family_units lender co_applicant LDPR LLT PTLP LARM MSDARM
226114 226114 1 1 1 1 52.0 3 383 25 49 2 5 1 40.0 4101.0 6.760 66693.0 88.103 1293.0 1647.0 2612 False 0.769231 2612 1 0.935484 0.464900
435187 435187 1 1 3 1 291.0 3 358 32 259 2 5 1 195.0 3668.0 34.298 109805.0 99.295 645.0 642.0 3873 True 0.670103 3873 3 0.803419 0.523775
294452 294452 1 1 3 1 692.0 3 350 38 233 2 5 1 312.0 5201.0 9.346 85651.0 100.000 1642.0 1766.0 5316 False 0.450867 5316 3 0.610101 0.627844
427864 427864 1 1 2 1 49.0 3 205 2 124 2 5 1 40.0 2342.0 11.994 58003.0 100.000 700.0 1366.0 878 True 0.816327 878 2 0.428756 0.462359
188822 188822 1 1 1 1 212.0 2 305 47 68 2 5 1 88.0 4786.0 17.743 75595.0 100.000 1480.0 1632.0 4791 True 0.415094 4791 1 0.806499 0.544391
x_predict= final
x_predict.head()
row_id loan_type property_type loan_purpose occupancy loan_amount preapproval msa_md state_code county_code applicant_ethnicity applicant_race applicant_sex applicant_income population minority_population_pct ffiecmedian_family_income tract_to_msa_md_income_pct number_of_owner-occupied_units number_of_1_to_4_family_units lender co_applicant LDPR LLT PTLP LARM MSDARM
0 0 2 1 3 1 115.0 3 101 16 276 2 5 1 74.0 6329.0 59.536 69889.0 85.78 1874.0 2410.0 3791 True 0.643478 7582 3 0.785178 0.495576
1 1 1 1 1 1 252.0 2 87 20 68 2 5 1 107.0 2473.0 8.050 65313.0 100.00 947.0 1214.0 2839 True 0.424603 2839 1 0.371434 0.607497
2 2 1 1 1 1 270.0 1 -1 -1 -1 2 1 2 119.0 4975.0 22.901 67526.0 100.00 1327.0 1753.0 4701 False 0.440741 4701 1 0.195064 0.338949
3 3 2 1 1 1 179.0 2 376 20 11 2 2 2 44.0 4795.0 29.676 57766.0 100.00 1426.0 1765.0 2153 True 0.245810 4306 1 0.817891 0.527134
4 4 2 1 1 1 36.0 2 254 48 156 3 6 3 32.0 5246.0 5.110 63332.0 100.00 1452.0 2092.0 5710 False 0.888889 11420 1 0.489632 0.547771

Model Measurement


Metrics used for measurement include following:

  • Accuracy
  • AUC
  • Macro Precision
  • Macro Recall
  • F1 Score
import sklearn.model_selection as ms
import sklearn.metrics as sklm
def score_model(probs, threshold):
    return np.array([1 if x> threshold else 0 for x in probs[:, 1]])
def print_metrics(labels, probs, threshold):
    scores = score_model(probs, threshold)
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion Matrix')
    print('                 Score Positive    Score Negative')
    print('Actual Positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual Negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    print('Macro Precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro Recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num Case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])
def plot_auc(labels, probs, threshold):
    ## compute the false postive rate, true positive rate  and threshold  along with the AUC 
    pl.style.use('ggplot')
    scores = score_model(probs, threshold)
    accuracy= sklm.accuracy_score(labels, scores)
    fpr, tpr, threshold = sklm.roc_curve(labels, probs[:,1]) 
    auc = sklm.auc(fpr, tpr)

    ## plot the result 
    plt.title('Reciever Operating Charateristic')
    plt.plot(fpr, tpr, color = 'orange', label = 'AUC = %0.2f' %auc)
    plt.legend(loc = 'lower right')
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.ylabel('True Positive Rate') 
    plt.xlabel('False Positive Rate')
    plt.title("Recieve Operating Characteristic (Accuracy= %0.2f)" %accuracy)
    plt.show()

Train Three Models


  1. Catboost Classifier
  2. Xgboost Classifier
  3. Decision Tree
import catboost as cb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn import tree
from sklearn.model_selection import cross_val_score
import matplotlib.pylab as pl

Choose Which Sampling Technique to Use For Model


We all models listed above using a 5-fold cross-validation.

labels= data.accepted
features= X
fig= plt.figure(figsize=(12,10))
models= [CatBoostClassifier(depth=10,logging_level='Silent', random_seed=42),
         XGBClassifier(n_estimators=10, max_depth=4),
         tree.DecisionTreeClassifier(random_state= 42)]
CV= 5
cv_df= pd.DataFrame(index= range(CV * len(models)))
entries= []
for model in models:
  model_name= model.__class__.__name__
  accuracies= cross_val_score(model, features, labels, scoring= 'accuracy', cv= CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df= pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

sns.boxplot(x= 'model_name', y= 'accuracy', data= cv_df)
sns.stripplot(x= 'model_name', y= 'accuracy', data= cv_df, 
              size= 8, jitter= True, edgecolor= "gray", linewidth= 2)
plt.show()

print(cv_df.groupby('model_name').accuracy.mean())
model_name
CatBoostClassifier        0.726908
DecisionTreeClassifier    0.631850
XGBClassifier             0.708636
Name: accuracy, dtype: float64

From the results above we can see catboost classifier out performs the other models. We move further to tune the catboost classifier model.

models= [CatBoostClassifier(depth=10,logging_level='Silent', random_seed=42),
         XGBClassifier(n_estimators=10, max_depth=4),
         tree.DecisionTreeClassifier(random_state= 42)]

result_df= pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])

for model in models:
    model_name= model.__class__.__name__
    model_fit=model.fit(X_train,y_train)
    y_prob= model_fit.predict_proba(X_test)[::,1]
    
    fpr, tpr, threshold = sklm.roc_curve(y_test, y_prob) 
    auc = sklm.auc(fpr, tpr)
    
    result_df= result_df.append({'model_name':model_name,'fpr':fpr, 'tpr':tpr,'auc':auc}, ignore_index=True)

result_df.set_index('model_name', inplace=True)


fig= plt.figure(figsize=(+8,6))
pl.style.use('ggplot')

for i in result_df.index:
    plt.plot(result_df.loc[i]['fpr'],
            result_df.loc[i]['tpr'],
            label="{}, AUC={:.2f}".format(i, result_df.loc[i]['auc']))

plt.plot([0,1],[0,1], color='red',linestyle='--')
plt.xticks(np.arange(0.0,1.1,step=0.1))
plt.xlabel("False Postive Rate", fontsize=15)
plt.yticks(np.arange(0.0,1.1,step=0.1))
plt.ylabel("True Positive Rate",fontsize=15)
plt.title("Reciever Operating Charateristic")
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

From the results above we can see catboost classifier out performs the other models in terms of AUC We move further to tune the catboost classifier model. With this and the results above we then to tune and improve the catboost classifier model.

Normalizing the features that are highly skewed to make it normally distributed and running the model to see its accuracy and auc

data_norm= data.copy()
final_norm= final.copy()
data_norm.head()
row_id loan_type property_type loan_purpose occupancy loan_amount preapproval msa_md state_code county_code applicant_ethnicity applicant_race applicant_sex applicant_income population minority_population_pct ffiecmedian_family_income tract_to_msa_md_income_pct number_of_owner-occupied_units number_of_1_to_4_family_units lender co_applicant accepted LDPR LLT PTLP LARM MSDARM
0 0 3 1 1 1 70.0 3 18 37 246 2 5 1 24.0 6203.0 44.230 60588.0 50.933 716.0 2642.0 4536 False 1 0.342857 13608 1 0.837209 0.508505
1 1 1 1 3 1 178.0 3 369 52 299 1 5 1 57.0 5774.0 15.905 54821.0 100.000 1622.0 2108.0 2458 False 0 0.320225 2458 3 0.168919 0.543021
2 2 2 1 3 1 163.0 3 16 10 306 2 5 1 67.0 6094.0 61.270 67719.0 100.000 760.0 1048.0 5710 False 1 0.411043 11420 3 0.489632 0.508886
3 3 1 1 1 1 155.0 1 305 47 180 2 5 1 105.0 6667.0 6.246 78439.0 100.000 2025.0 2299.0 5888 True 1 0.677419 5888 1 0.691964 0.544391
4 4 1 1 1 1 305.0 3 24 37 20 2 3 2 71.0 6732.0 100.000 63075.0 82.200 1464.0 1847.0 289 False 1 0.232787 289 1 0.542994 0.524821
final_norm.head()
row_id loan_type property_type loan_purpose occupancy loan_amount preapproval msa_md state_code county_code applicant_ethnicity applicant_race applicant_sex applicant_income population minority_population_pct ffiecmedian_family_income tract_to_msa_md_income_pct number_of_owner-occupied_units number_of_1_to_4_family_units lender co_applicant LDPR LLT PTLP LARM MSDARM
0 0 2 1 3 1 115.0 3 101 16 276 2 5 1 74.0 6329.0 59.536 69889.0 85.78 1874.0 2410.0 3791 True 0.643478 7582 3 0.785178 0.495576
1 1 1 1 1 1 252.0 2 87 20 68 2 5 1 107.0 2473.0 8.050 65313.0 100.00 947.0 1214.0 2839 True 0.424603 2839 1 0.371434 0.607497
2 2 1 1 1 1 270.0 1 -1 -1 -1 2 1 2 119.0 4975.0 22.901 67526.0 100.00 1327.0 1753.0 4701 False 0.440741 4701 1 0.195064 0.338949
3 3 2 1 1 1 179.0 2 376 20 11 2 2 2 44.0 4795.0 29.676 57766.0 100.00 1426.0 1765.0 2153 True 0.245810 4306 1 0.817891 0.527134
4 4 2 1 1 1 36.0 2 254 48 156 3 6 3 32.0 5246.0 5.110 63332.0 100.00 1452.0 2092.0 5710 False 0.888889 11420 1 0.489632 0.547771
num_vars=['loan_amount','population','applicant_income','minority_population_pct','ffiecmedian_family_income',
          'tract_to_msa_md_income_pct','number_of_owner-occupied_units','number_of_1_to_4_family_units','LDPR',
         'LLT','PTLP','LARM','MSDARM']

#plotting histogram numerical variables
data_norm[num_vars].hist(bins=25, figsize=(30, 20), layout=(7, 3));
data_norm.skew(axis=0)
row_id                            1.286588e-17
loan_type                         1.864712e+00
property_type                     5.196600e+00
loan_purpose                     -1.333652e-01
occupancy                         2.871840e+00
loan_amount                       7.655279e+01
preapproval                      -2.242003e+00
msa_md                            1.353241e-02
state_code                       -5.974008e-02
county_code                       2.309361e-01
applicant_ethnicity               5.802958e-01
applicant_race                   -1.583676e+00
applicant_sex                     1.370674e+00
applicant_income                  2.317498e+01
population                        2.947782e+00
minority_population_pct           1.068839e+00
ffiecmedian_family_income         8.063549e-01
tract_to_msa_md_income_pct       -2.035543e+00
number_of_owner-occupied_units    1.942059e+00
number_of_1_to_4_family_units     2.080321e+00
lender                           -2.196283e-01
co_applicant                      4.080284e-01
accepted                         -9.120028e-04
LDPR                              8.941249e+01
LLT                               1.676016e+00
PTLP                              6.904792e-01
LARM                             -9.479196e-02
MSDARM                           -5.590501e-01
dtype: float64
# We apply log(x+1)
data_norm['log_loan_amount']= np.log(data_norm['loan_amount']+1)
data_norm['log_LDPR']= np.log(data_norm['LDPR']+1)
data_norm['log_PTLP']= np.log(data_norm['PTLP']+1)
data_norm['log_applicant_income']= np.log(data_norm['applicant_income']+1)
data_norm['log_population']= np.log(data_norm['population']+1)
data_norm['log_minority_population_pct']= np.log(data_norm['minority_population_pct']+1)
data_norm['log_ffiecmedian_family_income']= np.log(data_norm['ffiecmedian_family_income']+1)
data_norm['log_number_of_owner_occupied_units']= np.log(data_norm['number_of_owner-occupied_units']+1)
data_norm['log_number_of_1_to_4_family_units']= np.log(data_norm['number_of_1_to_4_family_units'])
data_norm['pwr_tract_to_msa_md_income_pct'] = np.power(data_norm['tract_to_msa_md_income_pct'],10)
data_norm['pwr_LLT']= np.power(data_norm['LLT'],10)
num_vars_log= ['log_loan_amount','log_LDPR','log_PTLP','log_applicant_income','log_population','log_minority_population_pct',
              'log_ffiecmedian_family_income','log_number_of_owner_occupied_units','log_number_of_1_to_4_family_units',
              'pwr_tract_to_msa_md_income_pct','pwr_LLT',]
data_norm[num_vars_log].hist(bins=25, figsize=(30, 20), layout=(6, 2));
data_norm.skew(axis=0)
row_id                                1.286588e-17
loan_type                             1.864712e+00
property_type                         5.196600e+00
loan_purpose                         -1.333652e-01
occupancy                             2.871840e+00
loan_amount                           7.655279e+01
preapproval                          -2.242003e+00
msa_md                                1.353241e-02
state_code                           -5.974008e-02
county_code                           2.309361e-01
applicant_ethnicity                   5.802958e-01
applicant_race                       -1.583676e+00
applicant_sex                         1.370674e+00
applicant_income                      2.317498e+01
population                            2.947782e+00
minority_population_pct               1.068839e+00
ffiecmedian_family_income             8.063549e-01
tract_to_msa_md_income_pct           -2.035543e+00
number_of_owner-occupied_units        1.942059e+00
number_of_1_to_4_family_units         2.080321e+00
lender                               -2.196283e-01
co_applicant                          4.080284e-01
accepted                             -9.120028e-04
LDPR                                  8.941249e+01
LLT                                   1.676016e+00
PTLP                                  6.904792e-01
LARM                                 -9.479196e-02
MSDARM                               -5.590501e-01
log_loan_amount                      -1.045326e+00
log_LDPR                              3.109443e+00
log_PTLP                             -8.455082e-02
log_applicant_income                  1.644426e-01
log_population                       -1.581668e-01
log_minority_population_pct          -2.711568e-01
log_ffiecmedian_family_income        -2.891652e-01
log_number_of_owner_occupied_units   -1.100173e+00
log_number_of_1_to_4_family_units    -1.615771e+00
pwr_tract_to_msa_md_income_pct       -8.940395e-01
pwr_LLT                               8.320281e-02
dtype: float64
# Create the X and y set
X_norm = data_norm.drop('accepted', axis=1)
y_norm = data_norm.accepted
# Define train and test
X_train_norm, X_test_norm, y_train_norm, y_test_norm= train_test_split(X_norm,y_norm, test_size = 0.3, random_state = 42)
print(X_train_norm.shape,y_train_norm.shape)
print(X_test_norm.shape,y_test_norm.shape)
(350000, 38) (350000,)
(150000, 38) (150000,)
labels= data_norm.accepted
features= X_norm
fig= plt.figure(figsize=(12,10))
models= [CatBoostClassifier(depth=10,logging_level='Silent', random_seed=42),
         XGBClassifier(n_estimators=10, max_depth=4),
         tree.DecisionTreeClassifier(random_state= 42)]
CV= 5
cv_df= pd.DataFrame(index= range(CV * len(models)))
entries= []
for model in models:
  model_name= model.__class__.__name__
  accuracies= cross_val_score(model, features, labels, scoring= 'accuracy', cv= CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df= pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

sns.boxplot(x= 'model_name', y= 'accuracy', data= cv_df)
sns.stripplot(x= 'model_name', y= 'accuracy', data= cv_df, 
              size= 8, jitter= True, edgecolor= "gray", linewidth= 2)
plt.show()

print(cv_df.groupby('model_name').accuracy.mean())
model_name
CatBoostClassifier        0.726900
DecisionTreeClassifier    0.632066
XGBClassifier             0.708804
Name: accuracy, dtype: float64
models= [CatBoostClassifier(depth=10,logging_level='Silent', random_seed=42),
         XGBClassifier(n_estimators=10, max_depth=4),
         tree.DecisionTreeClassifier(random_state= 42)]

result_df= pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])

for model in models:
    model_name= model.__class__.__name__
    model_fit=model.fit(X_train_norm,y_train_norm)
    y_prob_norm= model_fit.predict_proba(X_test_norm)[::,1]
    
    fpr, tpr, threshold = sklm.roc_curve(y_test_norm, y_prob_norm) 
    auc = sklm.auc(fpr, tpr)
    
    result_df= result_df.append({'model_name':model_name,'fpr':fpr, 'tpr':tpr,'auc':auc}, ignore_index=True)

result_df.set_index('model_name', inplace=True)


fig= plt.figure(figsize=(+8,6))
pl.style.use('ggplot')

for i in result_df.index:
    plt.plot(result_df.loc[i]['fpr'],
            result_df.loc[i]['tpr'],
            label="{}, AUC={:.2f}".format(i, result_df.loc[i]['auc']))

plt.plot([0,1],[0,1], color='red',linestyle='--')
plt.xticks(np.arange(0.0,1.1,step=0.1))
plt.xlabel("False Postive Rate", fontsize=15)
plt.yticks(np.arange(0.0,1.1,step=0.1))
plt.ylabel("True Positive Rate",fontsize=15)
plt.title("Reciever Operating Charateristic")
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

From Comparing results from Non normalized features and normalized features. We see that the accuracy of non normalized features is slightly higher than normalized features. With results we preceed to using non-normalized features for tunning and feature training.

Tunning of Catboost model

import hyperopt
import sys
from frozendict import frozendict
import shap
shap.initjs()
class UAClassifierObjective(object):
    def __init__(self, dataset, const_params, fold_count):
        self._dataset = dataset
        self._const_params = const_params.copy()
        self._fold_count = fold_count
        self._evaluated_count = 0
        
    def _to_catboost_params(self, hyper_params):
        return {
            'learning_rate': hyper_params['learning_rate'],
            'depth': hyper_params['depth'],
            'l2_leaf_reg': hyper_params['l2_leaf_reg']}
    
    # hyperopt optimizes an objective using `__call__` method (e.g. by doing 
    # `foo(hyper_params)`), so we provide one
    def __call__(self, hyper_params):
        # join hyper-parameters provided by hyperopt with hyper-parameters 
        # provided by the user
        params = self._to_catboost_params(hyper_params)
        params.update(self._const_params)
        
        print('evaluating params={}'.format(params), file=sys.stdout)
        sys.stdout.flush()
        
        # we use cross-validation for objective evaluation, to avoid overfitting
        scores = cb.cv(
            pool=self._dataset,
            params=params,
            fold_count=self._fold_count,
            partition_random_seed=42,
            verbose=False)
        
        # scores returns a dictionary with mean and std (per-fold) of metric 
        # value for each cv iteration, we choose minimal value of objective 
        # mean (though it will be better to choose minimal value among all folds)
        # because noise is additive
        min_mean_auc = np.min(scores['test-AUC-mean'])
        print('evaluated score={}'.format(min_mean_auc), file=sys.stdout)
        
        self._evaluated_count += 1
        print('evaluated {} times'.format(self._evaluated_count), file=sys.stdout)
        
        # negate because hyperopt minimizes the objective
        return {'loss': -min_mean_auc, 'status': hyperopt.STATUS_OK}
def find_best_hyper_params(dataset, const_params, max_evals=100):    
    # we are going to optimize these three parameters, though there are a lot more of them (see CatBoost docs)
    parameter_space = {
        'learning_rate': hyperopt.hp.uniform('learning_rate', 0.2, 1.0),
        'depth': hyperopt.hp.randint('depth', 7),
        'l2_leaf_reg': hyperopt.hp.uniform('l2_leaf_reg', 1, 10)}
    objective = UAClassifierObjective(dataset=dataset, const_params=const_params, fold_count=6)
    trials = hyperopt.Trials()
    best = hyperopt.fmin(
        fn=objective,
        space=parameter_space,
        algo=hyperopt.rand.suggest,
        max_evals=max_evals,
        rstate=np.random.RandomState(seed=42))
    return best

def train_best_model(X, y, const_params, max_evals=100, use_default=False):
    # convert pandas.DataFrame to catboost.Pool to avoid converting it on each 
    # iteration of hyper-parameters optimization
    dataset = cb.Pool(X, y, cat_features=categorical_features_indices)
    
    if use_default:
        # pretrained optimal parameters
        best = {
            'learning_rate': 0.4234185321620083, 
            'depth': 5, 
            'l2_leaf_reg': 9.464266235679002}
    else:
        best = find_best_hyper_params(dataset, const_params, max_evals=max_evals)
            
    # merge subset of hyper-parameters provided by hyperopt with hyper-parameters 
    # provided by the user
    hyper_params = best.copy()
    hyper_params.update(const_params)
    
    # drop `use_best_model` because we are going to use entire dataset for 
    # training of the final model
    hyper_params.pop('use_best_model', None)
    
    model = cb.CatBoostClassifier(**hyper_params)
    model.fit(dataset, verbose=False)
    
    return model, hyper_params
import time
start=time.time()

have_gpu = False
# skip hyper-parameter optimization and just use provided optimal parameters
use_optimal_pretrained_params = False
# number of iterations of hyper-parameter search
hyperopt_iterations = 50

const_params = frozendict({
    'task_type': 'GPU' if have_gpu else 'CPU',
    'loss_function': 'Logloss',
    'eval_metric': 'AUC', 
    'custom_metric': ['AUC'],
    'iterations': 100,
    'random_seed': 42})

model, params = train_best_model(
    X_train, y_train, 
    const_params, 
    max_evals=hyperopt_iterations, 
    use_default=use_optimal_pretrained_params)
print('best params are {}'.format(params), file=sys.stdout)
end = time.time()
print(end-start)
evaluating params={'learning_rate': 0.5637608417770977, 'depth': 4, 'l2_leaf_reg': 8.493688290834637, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.749250579565882
evaluated 1 times
evaluating params={'learning_rate': 0.528083167082651, 'depth': 3, 'l2_leaf_reg': 7.549531688595925, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7310254089012167
evaluated 2 times
evaluating params={'learning_rate': 0.8699106844426274, 'depth': 1, 'l2_leaf_reg': 8.949837496427758, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 3 times
evaluating params={'learning_rate': 0.5558660098409214, 'depth': 2, 'l2_leaf_reg': 9.268502695024392, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 4 times
evaluating params={'learning_rate': 0.8499167906858907, 'depth': 6, 'l2_leaf_reg': 2.546844052569046, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7734748121140296
evaluated 5 times
evaluating params={'learning_rate': 0.8396427532857385, 'depth': 1, 'l2_leaf_reg': 4.942262677968309, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 6 times
evaluating params={'learning_rate': 0.7872224143884547, 'depth': 2, 'l2_leaf_reg': 9.454327638424944, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 7 times
evaluating params={'learning_rate': 0.693663486801853, 'depth': 0, 'l2_leaf_reg': 7.978279409450941, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 8 times
evaluating params={'learning_rate': 0.63472245415225, 'depth': 5, 'l2_leaf_reg': 9.280083037935846, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7623741421835207
evaluated 9 times
evaluating params={'learning_rate': 0.9643823890479426, 'depth': 6, 'l2_leaf_reg': 7.3055930015922925, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.773471890147901
evaluated 10 times
evaluating params={'learning_rate': 0.2029042458037946, 'depth': 4, 'l2_leaf_reg': 8.360470176973763, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.749250579565882
evaluated 11 times
evaluating params={'learning_rate': 0.9033087097584362, 'depth': 2, 'l2_leaf_reg': 4.226333703160277, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 12 times
evaluating params={'learning_rate': 0.905182734106353, 'depth': 2, 'l2_leaf_reg': 3.7653234749986546, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 13 times
evaluating params={'learning_rate': 0.6216878461972857, 'depth': 6, 'l2_leaf_reg': 1.1186473207406844, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7734841160115428
evaluated 14 times
evaluating params={'learning_rate': 0.7096272251393012, 'depth': 0, 'l2_leaf_reg': 9.288835555293193, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 15 times
evaluating params={'learning_rate': 0.6666954285134632, 'depth': 0, 'l2_leaf_reg': 6.760090014166223, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 16 times
evaluating params={'learning_rate': 0.6126656895443174, 'depth': 2, 'l2_leaf_reg': 2.122305933450838, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 17 times
evaluating params={'learning_rate': 0.7536883616570151, 'depth': 0, 'l2_leaf_reg': 7.129352121946425, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 18 times
evaluating params={'learning_rate': 0.4305107282251851, 'depth': 0, 'l2_leaf_reg': 7.252206709468439, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 19 times
evaluating params={'learning_rate': 0.5182286954655402, 'depth': 5, 'l2_leaf_reg': 8.87600708641634, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7623741421835207
evaluated 20 times
evaluating params={'learning_rate': 0.2782680588629646, 'depth': 3, 'l2_leaf_reg': 7.606937685716839, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7310254089012167
evaluated 21 times
evaluating params={'learning_rate': 0.5596670546087271, 'depth': 6, 'l2_leaf_reg': 2.0726141256143755, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7734748121140296
evaluated 22 times
evaluating params={'learning_rate': 0.46449369289712517, 'depth': 3, 'l2_leaf_reg': 6.339387901117264, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7310254089012167
evaluated 23 times
evaluating params={'learning_rate': 0.8314498467716105, 'depth': 6, 'l2_leaf_reg': 2.3992161107439527, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7734748121140296
evaluated 24 times
evaluating params={'learning_rate': 0.94205821260823, 'depth': 2, 'l2_leaf_reg': 6.774791231932208, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 25 times
evaluating params={'learning_rate': 0.4643443332702141, 'depth': 3, 'l2_leaf_reg': 3.7415118170244104, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7310275811755531
evaluated 26 times
evaluating params={'learning_rate': 0.8642659638603969, 'depth': 0, 'l2_leaf_reg': 7.341252418608859, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 27 times
evaluating params={'learning_rate': 0.7294189282659973, 'depth': 5, 'l2_leaf_reg': 1.0626887985889903, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7623793574052168
evaluated 28 times
evaluating params={'learning_rate': 0.48343522785711396, 'depth': 4, 'l2_leaf_reg': 6.710418002699299, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.749250579565882
evaluated 29 times
evaluating params={'learning_rate': 0.7210562330475281, 'depth': 5, 'l2_leaf_reg': 6.850395695432215, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7623741421835207
evaluated 30 times
evaluating params={'learning_rate': 0.5401179224561183, 'depth': 4, 'l2_leaf_reg': 2.184936328120133, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7492519266208718
evaluated 31 times
evaluating params={'learning_rate': 0.6308365875061976, 'depth': 0, 'l2_leaf_reg': 3.850918241301665, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 32 times
evaluating params={'learning_rate': 0.30502207676284404, 'depth': 5, 'l2_leaf_reg': 6.181142547861167, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7623741421835207
evaluated 33 times
evaluating params={'learning_rate': 0.6292005093922117, 'depth': 4, 'l2_leaf_reg': 6.62131414290977, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.749250579565882
evaluated 34 times
evaluating params={'learning_rate': 0.4610454641190919, 'depth': 2, 'l2_leaf_reg': 6.43044823439896, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7178748842690442
evaluated 35 times
evaluating params={'learning_rate': 0.6292458327974482, 'depth': 4, 'l2_leaf_reg': 2.6565212671416507, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7492519266208718
evaluated 36 times
evaluating params={'learning_rate': 0.5464785404218941, 'depth': 1, 'l2_leaf_reg': 5.469164797061436, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 37 times
evaluating params={'learning_rate': 0.8815589660398113, 'depth': 3, 'l2_leaf_reg': 7.762603832174364, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7310254089012167
evaluated 38 times
evaluating params={'learning_rate': 0.7673527934998763, 'depth': 5, 'l2_leaf_reg': 1.3354940643295945, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7623793574052168
evaluated 39 times
evaluating params={'learning_rate': 0.9509122562293137, 'depth': 1, 'l2_leaf_reg': 8.157368633221372, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 40 times
evaluating params={'learning_rate': 0.37343651242935033, 'depth': 0, 'l2_leaf_reg': 9.120965626738386, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 41 times
evaluating params={'learning_rate': 0.5825104357671131, 'depth': 4, 'l2_leaf_reg': 7.375442989938085, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.749250579565882
evaluated 42 times
evaluating params={'learning_rate': 0.6370940109439119, 'depth': 1, 'l2_leaf_reg': 3.310896149885864, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 43 times
evaluating params={'learning_rate': 0.9703050878380732, 'depth': 3, 'l2_leaf_reg': 4.42644166211115, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.7310254089012167
evaluated 44 times
evaluating params={'learning_rate': 0.6909652662226076, 'depth': 6, 'l2_leaf_reg': 8.067461444538466, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.773471890147901
evaluated 45 times
evaluating params={'learning_rate': 0.44765747767757863, 'depth': 4, 'l2_leaf_reg': 9.72685862525973, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.749250579565882
evaluated 46 times
evaluating params={'learning_rate': 0.4380272549876373, 'depth': 1, 'l2_leaf_reg': 3.2977280073078528, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 47 times
evaluating params={'learning_rate': 0.2033304608226356, 'depth': 1, 'l2_leaf_reg': 9.813523901250528, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 48 times
evaluating params={'learning_rate': 0.9619808422033007, 'depth': 1, 'l2_leaf_reg': 8.88997567856084, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.6790411481213688
evaluated 49 times
evaluating params={'learning_rate': 0.990250545808629, 'depth': 0, 'l2_leaf_reg': 4.183924523770489, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
evaluated score=0.5
evaluated 50 times
100%|██████████| 50/50 [2:56:50<00:00, 171.83s/it, best loss: -0.7734841160115428]
Warning: Custom metrics will not be evaluated because there are no test datasets
best params are {'depth': 6, 'l2_leaf_reg': 1.1186473207406844, 'learning_rate': 0.6216878461972857, 'task_type': 'CPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'custom_metric': ['AUC'], 'iterations': 100, 'random_seed': 42}
10644.765050649643
probabilities = model.predict_proba(data=X_test) 
print_metrics(y_test, probabilities, 0.51)
                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive     51767             22864
Actual Negative     17783             57586

Accuracy        0.73
AUC             0.81
Macro Precision 0.73
Macro Recall    0.73
 
           Positive      Negative
Num Case    74631         75369
Precision    0.74          0.72
Recall       0.69          0.76
F1           0.72          0.74
plot_auc(y_test, probabilities, 0.50)
from catboost import CatBoostClassifier
clf_cb= CatBoostClassifier(iterations=2500, depth=10,logging_level='Silent',
                           learning_rate=0.01,eval_metric='Accuracy',use_best_model=True, random_seed=42)
clf_cb.fit(X_test, y_test, cat_features= categorical_features_indices, eval_set=(X_test,y_test))
<catboost.core.CatBoostClassifier at 0x7f8deae5cf10>
probabilities= clf_cb.predict_proba(data= X_test) 
print_metrics(y_test, probabilities, 0.51)
                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive     54448             20183
Actual Negative     13663             61706

Accuracy        0.77
AUC             0.86
Macro Precision 0.78
Macro Recall    0.77
 
           Positive      Negative
Num Case    74631         75369
Precision    0.80          0.75
Recall       0.73          0.82
F1           0.76          0.78
plot_auc(y_test, probabilities, 0.51)
from catboost import CatBoostClassifier
clf_cb= CatBoostClassifier(iterations=2500, depth=6,logging_level='Silent',
                           learning_rate=0.3548362548720143,eval_metric='Accuracy',l2_leaf_reg=2.683829844728577,
                          use_best_model=True, random_seed=42)
clf_cb.fit(X_test, y_test, cat_features= categorical_features_indices, eval_set=(X_test,y_test))
<catboost.core.CatBoostClassifier at 0x7f8e296d41d0>
probabilities= clf_cb.predict_proba(data= X_test) 
print_metrics(y_test, probabilities, 0.51)
                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive     58500             16131
Actual Negative     12622             62747

Accuracy        0.81
AUC             0.89
Macro Precision 0.81
Macro Recall    0.81
 
           Positive      Negative
Num Case    74631         75369
Precision    0.82          0.80
Recall       0.78          0.83
F1           0.80          0.81
plot_auc(y_test, probabilities, 0.50)

tunning and optimizing catboost algorithm

final_score = clf_cb.predict(data=x_predict)
final_score= final_score.astype(np.int)
submit= pd.DataFrame({'row_id':x_predict['row_id'],'accepted':final_score})
submit.to_csv('submission.csv', index=False)

Catboost Classifier Feature Importance

clf_cb.get_feature_importance(prettified=True)
Feature Index Importances
0 county_code 10.394392
1 LARM 9.530536
2 applicant_income 6.364388
3 lender 5.648429
4 state_code 5.553903
5 LDPR 5.433531
6 minority_population_pct 5.136094
7 LLT 5.083558
8 loan_amount 5.001768
9 ffiecmedian_family_income 4.999677
10 population 4.665573
11 number_of_owner-occupied_units 4.295008
12 number_of_1_to_4_family_units 4.156638
13 msa_md 3.210549
14 MSDARM 3.206498
15 PTLP 2.540563
16 tract_to_msa_md_income_pct 2.470535
17 applicant_race 2.337708
18 preapproval 1.836922
19 loan_purpose 1.757151
20 loan_type 1.600887
21 applicant_sex 1.437460
22 applicant_ethnicity 1.344018
23 occupancy 1.038224
24 co_applicant 0.607123
25 property_type 0.347833
26 row_id 0.001035
shap_values = clf_cb.get_feature_importance(cb.Pool(X, y, cat_features=categorical_features_indices), type='ShapValues')
expected_value = shap_values[0,-1]
shap_values = shap_values[:,:-1]
shap.summary_plot(shap_values, X)
# across the whole dataset
cat_features= data[cat_vars]

def plot_shap(cat_cols):
    for col in cat_cols:
        shap.dependence_plot(col, shap_values, X)

# plotting data for categorical variables 
plot_shap(cat_features)
# (sum of SHAP value magnitudes over the validation dataset)
top_inds = np.argsort(-np.sum(np.abs(shap_values), 0))

# make SHAP plots of the three most important features
for i in range(20):
    shap.dependence_plot(top_inds[i], shap_values, X)
shap_values = clf_cb.get_feature_importance(cb.Pool(X, y, cat_features=categorical_features_indices), type='ShapValues')
expected_value = shap_values[0,-1]
shap_values = shap_values[:,:-1]
shap.summary_plot(shap_values, X)

Conclusion

Binary Classification: Approved / Denied

Need for application: Help customer and financial institution know if customers are eligible for mortgage approval or not.

Optimizing model: After training the model we sort out to optimize our model. Using Bayesian methods the model improved, optimal parameters were found to be depth=6, l2_leaf_reg=1.119 and learning rate= 0.622. These parameters provided an AUC-ROC of 0.81 and an accuracy of 0.73 at 100 iterations. Increasing the iteration to 2500, we achieved an AUC-ROC of 0.89 and accuracy of 0.81.

In conclusion, we can see that mortgage loan approvals can be done using data from traditional loan application without having key industry features such as credit history, debt to income ratio, etc at an accuracy of 81%. Also, we identified geographical features such as state, country and Metropolitan Statistical Area/ Metropolitan Division codes for the property tract has higher feature importance for our model. Other features that have high importance include lender, applicant income and applicant race. Lastly, a few bits of census information such as the percentage of minorities in the population for that tract and the FFIEC median family income for the MSA/MD which the tract is located had some feature importance to the model.