Mortgage loan approval prediction based on mortgage application data.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
from sklearn.model_selection import train_test_split
%matplotlib inline
%matplotlib widget
# Show up to 50 columns when displaying wide DataFrames in the notebook.
pd.set_option('display.max_columns', 50)
__author__ = "Frederick Emile Bondzie-Arthur"
__email__ = "Frederickauthur@hotmail.com"
# Load training features, training labels, and the held-out test features.
train_data= pd.read_csv("data/train_values.csv")
train_data_label= pd.read_csv("data/train_labels.csv")
final= pd.read_csv('data/test_values.csv')
train_data.head()
train_data_label.head()
print(train_data.shape)
print(train_data_label.shape)
# Join the features with their labels on the shared row identifier.
data= train_data.merge(train_data_label, on='row_id')
data.head()
data.dtypes
# Check for '?' placeholder values and for nulls across all columns.
# FIX: the np.object alias was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `object` dtype is the drop-in replacement.
print((data.astype(object) == '?').any())
print((data.astype(object).isnull()).any())
# Percentage of missing values per column, for train and test.
(data.isnull().sum()/ data.row_id.unique().shape[0] * 100).round(2)
(final.isnull().sum()/ final.row_id.unique().shape[0] * 100).round(2)
# Absolute missing-value counts per column.
data.isnull().sum()
final.isnull().sum()
# msa_md, county_code and state_code use -1 as a missing-value sentinel;
# build a boolean mask for each.
filter1 = data["msa_md"].isin([-1])
filter2 = data["county_code"].isin([-1])
filter3 = data["state_code"].isin([-1])
# Rows where any of the three geographic codes is missing.
data[filter1 | filter2| filter3].head()
# Counts of sentinel (-1) values per geographic column.
print(data.msa_md[filter1].count())
print(data.county_code[filter2].count())
print(data.state_code[filter3].count())
# The same counts as a percentage of all rows.
print(round((data.msa_md[filter1].count()/data.row_id.unique().shape[0] * 100),2))
print(round((data.county_code[filter2].count()/data.row_id.unique().shape[0] * 100),2))
print(round((data.state_code[filter3].count()/data.row_id.unique().shape[0] * 100),2))
print(data.shape)
print(data.row_id.unique().shape)
data.describe().round(2)
Since there are some missing values, we use the median of each feature to fill them.
# Median of every numeric column; used to impute the remaining NaNs.
data_median= data.median()
data_median
final_median= final.median()
final_median
data.fillna(data_median,inplace=True)
data.shape
# NOTE(review): the test set is imputed with the TRAIN medians (data_median),
# not final_median — presumably deliberate to avoid using test-set statistics;
# the final_median above is display-only. Confirm this is intended.
final.fillna(data_median,inplace=True)
final.shape
# Verify no missing values remain (should all be 0.0).
(data.isnull().sum()/ data.row_id.unique().shape[0] * 100).round(2)
(final.isnull().sum()/ final.row_id.unique().shape[0] * 100).round(2)
data.shape
# Class balance of the target variable.
accepted_rate= data.accepted.value_counts()/data.shape[0]
accepted_rate
data.describe().round(2)
# Per-class mean of every feature.
accepted_Summary= data.groupby('accepted')
accepted_Summary.mean()
# Spearman correlation between features (geographic ID columns excluded).
corr=data.drop(['row_id','county_code','state_code'], axis=1).corr(method='spearman').round(2)
fig= plt.figure(figsize=(20,10))
colormap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, cmap=colormap, annot=True)
# FIX: the original called plt.xticks(rotation=45) and then immediately
# overwrote it with an un-rotated plt.xticks(...) call; merged into one call.
plt.xticks(range(len(corr.columns)), corr.columns, rotation=45)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.title('Spearman Correlation Heatmap')
# FIX: save BEFORE show() — plt.show() finishes the active figure, so a
# savefig placed after it writes out a blank image.
plt.savefig('image1.png')
plt.show()
# FIX: Styler.set_precision was removed in pandas 2.0; format(precision=...)
# is the supported equivalent.
corr.style.background_gradient().format(precision=2)
# Correlation of every feature with the target, strongest first.
corr_with_acc=data.drop(['row_id','county_code','state_code'], axis=1).corr(method='spearman')['accepted'].sort_values(ascending=False)
plt.figure(figsize=(14,6))
corr_with_acc.drop("accepted").plot.bar()
plt.savefig('image8.png')
plt.show()
# Categorical and numerical feature lists used by the plots below.
# FIX: the original cat_vars listed 'applicant_sex' twice; duplicate removed.
cat_vars=['loan_type','property_type','loan_purpose','occupancy','preapproval','applicant_sex','co_applicant',
          'applicant_race','applicant_ethnicity','msa_md','state_code','county_code']
num_vars=['loan_amount','population','applicant_income','minority_population_pct','ffiecmedian_family_income',
          'tract_to_msa_md_income_pct','number_of_owner-occupied_units','number_of_1_to_4_family_units']
def plot_voilin(combined, cols, col_x= 'accepted'):
    """Violin plot of each column in `cols`, split by the `col_x` class.

    combined: DataFrame holding both the features and `col_x`.
    cols: iterable of numeric column names (at most 8 are drawn, 2x4 grid).
    """
    fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(30, 10))
    for col, subplot in zip(cols, ax.flatten()):
        sns.set_style("whitegrid")
        # FIX: seaborn >= 0.12 removed positional x/y arguments; pass keywords.
        sns.violinplot(x=col_x, y=col, data=combined, ax=subplot)
        for label in subplot.get_xticklabels():
            label.set_rotation(90)
# violin plot for numerical variables
plot_voilin(data, num_vars)
# NOTE(review): 'image1.png' is also used by the heatmap cell above, so this
# save overwrites it — consider a distinct filename.
plt.savefig('image1.png')
def plot_box(combined, cols, col_x= 'accepted'):
    """Box plot of each column in `cols`, split by the `col_x` class.

    combined: DataFrame holding both the features and `col_x`.
    cols: iterable of numeric column names (at most 8 are drawn, 2x4 grid).
    """
    fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(30, 10))
    for col, subplot in zip(cols, ax.flatten()):
        sns.set_style("whitegrid")
        # FIX: seaborn >= 0.12 removed positional x/y arguments; pass keywords.
        sns.boxplot(x=col_x, y=col, data=combined, ax=subplot)
        for label in subplot.get_xticklabels():
            label.set_rotation(90)
# box plot for numerical variables
plot_box(data, num_vars)
plt.savefig('image2.png')
def plot_den_hist(combined, cols, bins=10, hist= False):
    """Density (KDE) plot with a rug for each column; optional histogram.

    combined: DataFrame holding the columns.
    cols: iterable of numeric column names (at most 8 are drawn, 2x4 grid).
    bins: histogram bin count when hist=True.
    """
    fig, ax = plt.subplots(nrows=2, ncols=4, figsize=(20, 25))
    for col, subplot in zip(cols, ax.flatten()):
        # FIX: sns.distplot was deprecated in seaborn 0.11 and later removed;
        # reproduce its output with histplot/kdeplot plus an explicit rug.
        if hist:
            sns.histplot(combined[col], bins=bins, kde=True, stat='density', ax=subplot)
        else:
            sns.kdeplot(combined[col], ax=subplot)
        sns.rugplot(combined[col], ax=subplot)
        for label in subplot.get_xticklabels():
            label.set_rotation(0)
#KDE plot for numerical variable, histogram not enabled
plot_den_hist(data, num_vars)
plt.savefig('image3.png')
plot_den_hist(data, num_vars, hist=True)
plt.savefig('image3b.png')
def plot_bar(cat_cols):
    """Count plot of each categorical column, split by acceptance.

    cat_cols: iterable of column names in the module-level `data` frame
    (at most 10 are drawn, 2x5 grid).
    """
    fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(30, 10))
    for col, subplot in zip(cat_cols, ax.flatten()):
        # FIX: seaborn >= 0.12 removed positional arguments; pass keywords.
        sns.countplot(x=data[col], hue=data['accepted'], ax=subplot)
        for label in subplot.get_xticklabels():
            label.set_rotation(90)
# plotting bar graph for categorical variables
plot_bar(cat_vars)
plt.savefig('image5.png')
# Histograms of the numeric features.
data[num_vars].hist(bins=25, figsize=(20, 10), layout=(4, 4));
plt.savefig('image6.png')
From the graphs, it can be seen that all features are skewed except ffiecmedian_family_income. To fix this issue we apply a log transform to the skewed features.
Skewness is a measure of the symmetry of a distribution. For a normal distribution skewness = 0, so it is symmetrical. When data is skewed towards the right it is a positive skew, and vice versa.
- Skewness between 0 and $\pm$ 0.5 = acceptable
- Skewness between $\pm$ 0.5 and $\pm$ 1 = a problem
- Skewness of $\pm$ 1 or more = severe
# Skewness of each numeric column (|skew| > 1 indicates heavy skew).
data.skew(axis=0)
import scipy.stats as ss
def cramers_v(x, y):
    """Bias-corrected Cramér's V association between two categorical series.

    x, y: pandas Series of category labels.
    Returns a float in [0, 1]; 0 = independent, 1 = perfectly associated.
    Uses the Bergsma–Wicher small-sample correction of phi².
    """
    table = pd.crosstab(x, y)
    chi2_stat = ss.chi2_contingency(table)[0]
    n = table.sum().sum()
    rows, cols = table.shape
    # Bias-corrected phi², clipped at zero.
    phi2_corrected = max(0, chi2_stat / n - (cols - 1) * (rows - 1) / (n - 1))
    # Corrected row/column counts.
    rows_corrected = rows - (rows - 1) ** 2 / (n - 1)
    cols_corrected = cols - (cols - 1) ** 2 / (n - 1)
    return np.sqrt(phi2_corrected / min(cols_corrected - 1, rows_corrected - 1))
# function to print categorical variables after running them through cramers_v
def print_crammer_values(data, cat_features, cats_x= 'accepted'):
    """Print the Cramér's V of each categorical feature against `cats_x`."""
    for cat_ in cat_features:
        print(cat_+ ": "+ str(cramers_v(data[cat_],data[cats_x]).round(2)))
# calculating for correlation between categorical variable and target variable
print_crammer_values(data, cat_vars)
# measuring kurtosis
data.kurtosis(axis=0)
data.columns
# Pairwise scatter plots of the numeric features, colored by acceptance.
# Refactored from 29 copy-pasted sns.lmplot calls into one data-driven loop;
# the (x, y) pairs are listed in the original order.
scatter_pairs = [
    ('applicant_income', 'loan_amount'),
    ('population', 'loan_amount'),
    ('lender', 'loan_amount'),
    ('minority_population_pct', 'loan_amount'),
    ('ffiecmedian_family_income', 'loan_amount'),
    ('tract_to_msa_md_income_pct', 'loan_amount'),
    ('number_of_owner-occupied_units', 'loan_amount'),
    ('number_of_1_to_4_family_units', 'loan_amount'),
    ('applicant_income', 'population'),
    ('minority_population_pct', 'population'),
    ('ffiecmedian_family_income', 'population'),
    ('tract_to_msa_md_income_pct', 'population'),
    ('number_of_owner-occupied_units', 'population'),
    ('number_of_1_to_4_family_units', 'population'),
    ('minority_population_pct', 'applicant_income'),
    ('ffiecmedian_family_income', 'applicant_income'),
    ('tract_to_msa_md_income_pct', 'applicant_income'),
    ('number_of_owner-occupied_units', 'applicant_income'),
    ('number_of_1_to_4_family_units', 'applicant_income'),
    ('ffiecmedian_family_income', 'minority_population_pct'),
    ('tract_to_msa_md_income_pct', 'minority_population_pct'),
    ('number_of_1_to_4_family_units', 'minority_population_pct'),
    ('tract_to_msa_md_income_pct', 'ffiecmedian_family_income'),
    ('number_of_owner-occupied_units', 'ffiecmedian_family_income'),
    ('number_of_1_to_4_family_units', 'ffiecmedian_family_income'),
    ('number_of_owner-occupied_units', 'tract_to_msa_md_income_pct'),
    ('number_of_1_to_4_family_units', 'tract_to_msa_md_income_pct'),
    ('number_of_1_to_4_family_units', 'number_of_owner-occupied_units'),
    ('number_of_owner-occupied_units', 'minority_population_pct'),
]
for x_col, y_col in scatter_pairs:
    sns.lmplot(x=x_col, y=y_col, data=data, fit_reg=False, hue='accepted')
# Ratio feature LDPR: applicant income divided by loan amount.
data['LDPR']= (data['applicant_income'])/ (data['loan_amount'])
# Interaction feature LLT: lender x loan_type.
data['LLT']= (data['lender'])* (data['loan_type'])
# Interaction feature PTLP: property_type x loan_purpose.
data['PTLP']= (data['property_type'])* (data['loan_purpose'])
# The same three engineered features for the test set.
final['LDPR']= (final['applicant_income'])/ (final['loan_amount'])
final['LLT']= (final['lender'])* (final['loan_type'])
final['PTLP']= (final['property_type'])* (final['loan_purpose'])
# MSDARM: mean acceptance rate per msa_md (target encoding from train data).
MSDARM= pd.DataFrame(data.groupby(["msa_md"])["accepted"].mean())
MSDARM.shape
# LARM: mean acceptance rate per lender, merged into both train and test.
# NOTE(review): these target encodings are computed on the full training set
# before the train/test split below — a potential source of target leakage;
# confirm this is acceptable for the competition setup.
LARM= pd.DataFrame(data.groupby(['lender'])['accepted'].mean())
data= pd.merge(data,LARM,how='left', on='lender')
final= pd.merge(final,LARM,how='left', on='lender')
# Renaming certain columns for better readability after the LARM merge
# (the merge produces suffixed duplicate 'accepted' columns).
# NOTE(review): renaming by wholesale positional assignment is fragile — it
# silently relies on the exact column order the merges produce; verify the
# order if pandas or the merge keys ever change.
data.columns = ['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy','loan_amount',
'preapproval', 'msa_md', 'state_code', 'county_code','applicant_ethnicity',
'applicant_race', 'applicant_sex','applicant_income', 'population',
'minority_population_pct','ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
'number_of_owner-occupied_units', 'number_of_1_to_4_family_units','lender',
'co_applicant', 'accepted', 'LDPR','LLT','PTLP','LARM']
# Same rename for the test set (no 'accepted' column here).
final.columns = ['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy','loan_amount',
'preapproval', 'msa_md', 'state_code', 'county_code','applicant_ethnicity',
'applicant_race', 'applicant_sex','applicant_income', 'population',
'minority_population_pct','ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
'number_of_owner-occupied_units', 'number_of_1_to_4_family_units','lender',
'co_applicant', 'LDPR','LLT','PTLP','LARM']
# Merge the msa_md acceptance-rate encoding into both frames.
data= pd.merge(data,MSDARM,how='left', on='msa_md')
final= pd.merge(final,MSDARM,how='left', on='msa_md')
# Rename again to give the newly merged column the name 'MSDARM'.
data.columns = ['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy','loan_amount',
'preapproval', 'msa_md', 'state_code', 'county_code','applicant_ethnicity',
'applicant_race', 'applicant_sex','applicant_income', 'population',
'minority_population_pct','ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
'number_of_owner-occupied_units', 'number_of_1_to_4_family_units','lender',
'co_applicant', 'accepted', 'LDPR','LLT','PTLP','LARM','MSDARM']
final.columns = ['row_id', 'loan_type', 'property_type', 'loan_purpose', 'occupancy','loan_amount',
'preapproval', 'msa_md', 'state_code', 'county_code','applicant_ethnicity',
'applicant_race', 'applicant_sex','applicant_income', 'population',
'minority_population_pct','ffiecmedian_family_income', 'tract_to_msa_md_income_pct',
'number_of_owner-occupied_units', 'number_of_1_to_4_family_units','lender',
'co_applicant', 'LDPR','LLT','PTLP','LARM','MSDARM']
# Refresh the feature lists to include the engineered columns.
# FIX: the original cat_vars listed 'applicant_sex' twice; duplicate removed.
cat_vars=['loan_type','property_type','loan_purpose','occupancy','preapproval','applicant_sex','co_applicant',
          'applicant_race','applicant_ethnicity','msa_md','state_code','county_code']
num_vars=['loan_amount','population','applicant_income','minority_population_pct','ffiecmedian_family_income',
          'tract_to_msa_md_income_pct','number_of_owner-occupied_units','number_of_1_to_4_family_units',
          'LDPR','LLT','PTLP','LARM','MSDARM']
# Target distribution in percent.
data.accepted.value_counts(1)*100
# Split the frame by class for a quick size comparison.
is_loan_accepted= data.accepted== 1
loan_accepted= data[is_loan_accepted]
loan_is_not_accepted= data.accepted== 0
loan_not_accepted= data[loan_is_not_accepted]
print(loan_accepted.shape)
print(loan_not_accepted.shape)
# Bar chart of accepted vs not accepted applications.
sns.countplot('accepted', data = data)
plt.title('Distribution of Loan Applicant')
plt.savefig('image4.png')
data.info()
final.info()
# Create the X and y set
X = data.drop('accepted', axis=1)
y = data.accepted
# Indices of the non-float columns, treated as categorical by CatBoost.
# FIX: the np.float alias was removed in NumPy 1.24; the builtin float
# compares against the same float64 dtype.
categorical_features_indices= np.where(X.dtypes != float)[0]
categorical_features_indices
# Define train and test (70/30 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size = 0.3, random_state = 42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)
y_train.head()
X_train.head()
# The held-out competition test set used for the final submission.
x_predict= final
x_predict.head()
import sklearn.model_selection as ms
import sklearn.metrics as sklm
def score_model(probs, threshold):
    """Turn positive-class probabilities into hard 0/1 labels.

    probs: array of shape (n, 2) of class probabilities (column 1 = positive).
    threshold: probability strictly above which a row is labelled 1.
    Returns an integer numpy array of 0/1 predictions.
    """
    # Vectorized comparison instead of a Python-level list comprehension.
    return (np.asarray(probs)[:, 1] > threshold).astype(int)
def print_metrics(labels, probs, threshold):
    """Print a labelled confusion matrix plus accuracy/AUC/precision/recall/F1.

    labels: true 0/1 labels; probs: (n, 2) class probabilities;
    threshold: decision threshold passed to score_model.

    FIX: sklearn orders classes ascending ([0, 1]), so row/column 0 of the
    default confusion matrix is the NEGATIVE class — the original printout
    labelled every cell backwards. Passing labels=[1, 0] puts the positive
    class first so the printed 'Positive'/'Negative' labels are correct.
    """
    scores = score_model(probs, threshold)
    metrics = sklm.precision_recall_fscore_support(labels, scores, labels=[1, 0])
    conf = sklm.confusion_matrix(labels, scores, labels=[1, 0])
    print('                 Confusion Matrix')
    print('                 Score Positive    Score Negative')
    print('Actual Positive    %6d' % conf[0,0] + '     %5d' % conf[0,1])
    print('Actual Negative    %6d' % conf[1,0] + '     %5d' % conf[1,1])
    print('')
    print('Accuracy  %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC       %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    # Macro averages are symmetric in class order.
    print('Macro Precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro Recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num Case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])
def plot_auc(labels, probs, threshold):
    """Plot the ROC curve with its AUC; the threshold only sets the
    accuracy figure shown in the title."""
    ## compute the false postive rate, true positive rate and threshold along with the AUC
    # FIX: the original called pl.style.use, but `pl` (matplotlib.pylab) is
    # imported only further down the notebook — calling this function before
    # that cell raised NameError. plt is already imported at the top.
    plt.style.use('ggplot')
    scores = score_model(probs, threshold)
    accuracy= sklm.accuracy_score(labels, scores)
    # Renamed the unpack target so it no longer shadows the threshold param.
    fpr, tpr, thresholds = sklm.roc_curve(labels, probs[:,1])
    auc = sklm.auc(fpr, tpr)
    ## plot the result
    plt.title('Reciever Operating Charateristic')
    plt.plot(fpr, tpr, color = 'orange', label = 'AUC = %0.2f' %auc)
    plt.legend(loc = 'lower right')
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title("Recieve Operating Characteristic (Accuracy= %0.2f)" %accuracy)
    plt.show()
import catboost as cb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn import tree
from sklearn.model_selection import cross_val_score
import matplotlib.pylab as pl
# Compare three classifiers by 5-fold cross-validated accuracy.
labels= data.accepted
features= X
fig= plt.figure(figsize=(12,10))
models= [CatBoostClassifier(depth=10,logging_level='Silent', random_seed=42),
         XGBClassifier(n_estimators=10, max_depth=4),
         tree.DecisionTreeClassifier(random_state= 42)]
CV= 5
# NOTE(review): this cv_df is overwritten below before it is ever used.
cv_df= pd.DataFrame(index= range(CV * len(models)))
entries= []
for model in models:
    model_name= model.__class__.__name__
    accuracies= cross_val_score(model, features, labels, scoring= 'accuracy', cv= CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df= pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
# Per-model accuracy distribution across folds.
sns.boxplot(x= 'model_name', y= 'accuracy', data= cv_df)
sns.stripplot(x= 'model_name', y= 'accuracy', data= cv_df,
              size= 8, jitter= True, edgecolor= "gray", linewidth= 2)
plt.show()
print(cv_df.groupby('model_name').accuracy.mean())
From the results above we can see the CatBoost classifier outperforms the other models. We move further to tune the CatBoost classifier model.
# Compare the three classifiers by ROC curve / AUC on the held-out split.
models= [CatBoostClassifier(depth=10,logging_level='Silent', random_seed=42),
         XGBClassifier(n_estimators=10, max_depth=4),
         tree.DecisionTreeClassifier(random_state= 42)]
# FIX: DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
# accumulate plain dicts and build the frame once at the end. This also drops
# the never-filled 'classifiers' column the original pre-declared.
roc_records= []
for model in models:
    model_name= model.__class__.__name__
    model_fit= model.fit(X_train, y_train)
    y_prob= model_fit.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = sklm.roc_curve(y_test, y_prob)
    auc = sklm.auc(fpr, tpr)
    roc_records.append({'model_name': model_name, 'fpr': fpr, 'tpr': tpr, 'auc': auc})
result_df= pd.DataFrame(roc_records).set_index('model_name')
fig= plt.figure(figsize=(8,6))
pl.style.use('ggplot')
for i in result_df.index:
    plt.plot(result_df.loc[i]['fpr'],
             result_df.loc[i]['tpr'],
             label="{}, AUC={:.2f}".format(i, result_df.loc[i]['auc']))
plt.plot([0,1],[0,1], color='red',linestyle='--')
plt.xticks(np.arange(0.0,1.1,step=0.1))
plt.xlabel("False Postive Rate", fontsize=15)
plt.yticks(np.arange(0.0,1.1,step=0.1))
plt.ylabel("True Positive Rate",fontsize=15)
plt.title("Reciever Operating Charateristic")
plt.legend(prop={'size':13}, loc='lower right')
plt.show()
From the results above we can see the CatBoost classifier also outperforms the other models in terms of AUC. Given these results, we proceed to tune and improve the CatBoost classifier model.
Next we normalize the features that are highly skewed to make them approximately normally distributed, and rerun the models to compare accuracy and AUC.
# Work on copies so the original frames keep their raw (unnormalized) values.
data_norm= data.copy()
final_norm= final.copy()
data_norm.head()
final_norm.head()
# Numeric features, including the engineered columns, to be transformed.
num_vars=['loan_amount','population','applicant_income','minority_population_pct','ffiecmedian_family_income',
'tract_to_msa_md_income_pct','number_of_owner-occupied_units','number_of_1_to_4_family_units','LDPR',
'LLT','PTLP','LARM','MSDARM']
#plotting histogram numerical variables
data_norm[num_vars].hist(bins=25, figsize=(30, 20), layout=(7, 3));
# Skewness before the transforms, for comparison with the values afterwards.
data_norm.skew(axis=0)
# We apply log(x+1) to the right-skewed features; np.log1p is the
# numerically stable equivalent of np.log(x + 1).
data_norm['log_loan_amount']= np.log1p(data_norm['loan_amount'])
data_norm['log_LDPR']= np.log1p(data_norm['LDPR'])
data_norm['log_PTLP']= np.log1p(data_norm['PTLP'])
data_norm['log_applicant_income']= np.log1p(data_norm['applicant_income'])
data_norm['log_population']= np.log1p(data_norm['population'])
data_norm['log_minority_population_pct']= np.log1p(data_norm['minority_population_pct'])
data_norm['log_ffiecmedian_family_income']= np.log1p(data_norm['ffiecmedian_family_income'])
data_norm['log_number_of_owner_occupied_units']= np.log1p(data_norm['number_of_owner-occupied_units'])
# FIX: the original applied plain np.log (without +1) to this one column,
# inconsistent with the others and producing -inf for zero counts.
data_norm['log_number_of_1_to_4_family_units']= np.log1p(data_norm['number_of_1_to_4_family_units'])
# Left-skewed features get a power transform instead.
data_norm['pwr_tract_to_msa_md_income_pct'] = np.power(data_norm['tract_to_msa_md_income_pct'],10)
data_norm['pwr_LLT']= np.power(data_norm['LLT'],10)
num_vars_log= ['log_loan_amount','log_LDPR','log_PTLP','log_applicant_income','log_population','log_minority_population_pct',
               'log_ffiecmedian_family_income','log_number_of_owner_occupied_units','log_number_of_1_to_4_family_units',
               'pwr_tract_to_msa_md_income_pct','pwr_LLT',]
data_norm[num_vars_log].hist(bins=25, figsize=(30, 20), layout=(6, 2));
# Skewness after the transforms.
data_norm.skew(axis=0)
# Create the X and y set
X_norm = data_norm.drop('accepted', axis=1)
y_norm = data_norm.accepted
# Define train and test (same 70/30 split and seed as the raw features)
X_train_norm, X_test_norm, y_train_norm, y_test_norm= train_test_split(X_norm,y_norm, test_size = 0.3, random_state = 42)
print(X_train_norm.shape,y_train_norm.shape)
print(X_test_norm.shape,y_test_norm.shape)
# Repeat the 5-fold CV accuracy comparison on the normalized features.
labels= data_norm.accepted
features= X_norm
fig= plt.figure(figsize=(12,10))
models= [CatBoostClassifier(depth=10,logging_level='Silent', random_seed=42),
         XGBClassifier(n_estimators=10, max_depth=4),
         tree.DecisionTreeClassifier(random_state= 42)]
CV= 5
# NOTE(review): this cv_df is overwritten below before it is ever used.
cv_df= pd.DataFrame(index= range(CV * len(models)))
entries= []
for model in models:
    model_name= model.__class__.__name__
    accuracies= cross_val_score(model, features, labels, scoring= 'accuracy', cv= CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df= pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
sns.boxplot(x= 'model_name', y= 'accuracy', data= cv_df)
sns.stripplot(x= 'model_name', y= 'accuracy', data= cv_df,
              size= 8, jitter= True, edgecolor= "gray", linewidth= 2)
plt.show()
print(cv_df.groupby('model_name').accuracy.mean())
# ROC / AUC comparison on the normalized features.
models= [CatBoostClassifier(depth=10,logging_level='Silent', random_seed=42),
         XGBClassifier(n_estimators=10, max_depth=4),
         tree.DecisionTreeClassifier(random_state= 42)]
# FIX: DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
# accumulate plain dicts and build the frame once at the end. This also drops
# the never-filled 'classifiers' column the original pre-declared.
roc_records= []
for model in models:
    model_name= model.__class__.__name__
    model_fit= model.fit(X_train_norm, y_train_norm)
    y_prob_norm= model_fit.predict_proba(X_test_norm)[:, 1]
    fpr, tpr, thresholds = sklm.roc_curve(y_test_norm, y_prob_norm)
    auc = sklm.auc(fpr, tpr)
    roc_records.append({'model_name': model_name, 'fpr': fpr, 'tpr': tpr, 'auc': auc})
result_df= pd.DataFrame(roc_records).set_index('model_name')
fig= plt.figure(figsize=(8,6))
pl.style.use('ggplot')
for i in result_df.index:
    plt.plot(result_df.loc[i]['fpr'],
             result_df.loc[i]['tpr'],
             label="{}, AUC={:.2f}".format(i, result_df.loc[i]['auc']))
plt.plot([0,1],[0,1], color='red',linestyle='--')
plt.xticks(np.arange(0.0,1.1,step=0.1))
plt.xlabel("False Postive Rate", fontsize=15)
plt.yticks(np.arange(0.0,1.1,step=0.1))
plt.ylabel("True Positive Rate",fontsize=15)
plt.title("Reciever Operating Charateristic")
plt.legend(prop={'size':13}, loc='lower right')
plt.show()
Comparing the results from non-normalized and normalized features, we see that the accuracy with non-normalized features is slightly higher. Given these results, we proceed with the non-normalized features for tuning and final training.
Tuning of the CatBoost model
import hyperopt
import sys
from frozendict import frozendict
import shap
# Load the JS visualisation library used by shap's interactive plots.
shap.initjs()
class UAClassifierObjective(object):
    """Hyperopt objective: scores a CatBoost hyper-parameter sample by CV AUC."""

    def __init__(self, dataset, const_params, fold_count):
        # dataset: catboost.Pool used for cross-validation
        # const_params: fixed CatBoost params merged into every trial
        # fold_count: number of CV folds per objective evaluation
        self._dataset = dataset
        self._const_params = const_params.copy()
        self._fold_count = fold_count
        # Counter of how many parameter sets have been evaluated so far.
        self._evaluated_count = 0

    def _to_catboost_params(self, hyper_params):
        # Keep only the three tuned keys out of hyperopt's sample dict.
        return {
            'learning_rate': hyper_params['learning_rate'],
            'depth': hyper_params['depth'],
            'l2_leaf_reg': hyper_params['l2_leaf_reg']}

    # hyperopt optimizes an objective using `__call__` method (e.g. by doing
    # `foo(hyper_params)`), so we provide one
    def __call__(self, hyper_params):
        # join hyper-parameters provided by hyperopt with hyper-parameters
        # provided by the user
        params = self._to_catboost_params(hyper_params)
        params.update(self._const_params)
        print('evaluating params={}'.format(params), file=sys.stdout)
        sys.stdout.flush()
        # we use cross-validation for objective evaluation, to avoid overfitting
        scores = cb.cv(
            pool=self._dataset,
            params=params,
            fold_count=self._fold_count,
            partition_random_seed=42,
            verbose=False)
        # scores returns a dictionary with mean and std (per-fold) of metric
        # value for each cv iteration, we choose minimal value of objective
        # mean (though it will be better to choose minimal value among all folds)
        # because noise is additive
        min_mean_auc = np.min(scores['test-AUC-mean'])
        print('evaluated score={}'.format(min_mean_auc), file=sys.stdout)
        self._evaluated_count += 1
        print('evaluated {} times'.format(self._evaluated_count), file=sys.stdout)
        # negate because hyperopt minimizes the objective
        return {'loss': -min_mean_auc, 'status': hyperopt.STATUS_OK}
def find_best_hyper_params(dataset, const_params, max_evals=100):
    """Random-search learning_rate/depth/l2_leaf_reg with hyperopt.

    dataset: catboost.Pool; const_params: fixed params for every trial;
    max_evals: number of random samples. Returns hyperopt's best-params dict.
    """
    # we are going to optimize these three parameters, though there are a lot more of them (see CatBoost docs)
    parameter_space = {
        'learning_rate': hyperopt.hp.uniform('learning_rate', 0.2, 1.0),
        # NOTE(review): hp.randint('depth', 7) samples 0..6 — a depth of 0
        # looks unintended; confirm CatBoost accepts it or shift the range.
        'depth': hyperopt.hp.randint('depth', 7),
        'l2_leaf_reg': hyperopt.hp.uniform('l2_leaf_reg', 1, 10)}
    objective = UAClassifierObjective(dataset=dataset, const_params=const_params, fold_count=6)
    trials = hyperopt.Trials()
    best = hyperopt.fmin(
        fn=objective,
        space=parameter_space,
        # Pure random search (not TPE).
        algo=hyperopt.rand.suggest,
        max_evals=max_evals,
        # NOTE(review): newer hyperopt releases expect a numpy Generator here
        # (np.random.default_rng(42)) — confirm against the installed version.
        rstate=np.random.RandomState(seed=42))
    return best
def train_best_model(X, y, const_params, max_evals=100, use_default=False):
    """Tune (or reuse pretrained) hyper-parameters, then fit a final model.

    X, y: training features/labels; const_params: fixed CatBoost params;
    max_evals: hyperopt budget; use_default: skip the search and use the
    hard-coded best parameters. Returns (fitted model, hyper_params dict).
    """
    # convert pandas.DataFrame to catboost.Pool to avoid converting it on each
    # iteration of hyper-parameters optimization
    # NOTE(review): relies on the module-level `categorical_features_indices`
    # computed earlier rather than deriving it from the X passed in — confirm
    # the two always match.
    dataset = cb.Pool(X, y, cat_features=categorical_features_indices)
    if use_default:
        # pretrained optimal parameters
        best = {
            'learning_rate': 0.4234185321620083,
            'depth': 5,
            'l2_leaf_reg': 9.464266235679002}
    else:
        best = find_best_hyper_params(dataset, const_params, max_evals=max_evals)
    # merge subset of hyper-parameters provided by hyperopt with hyper-parameters
    # provided by the user
    hyper_params = best.copy()
    hyper_params.update(const_params)
    # drop `use_best_model` because we are going to use entire dataset for
    # training of the final model
    hyper_params.pop('use_best_model', None)
    model = cb.CatBoostClassifier(**hyper_params)
    model.fit(dataset, verbose=False)
    return model, hyper_params
import time
start=time.time()
have_gpu = False
# skip hyper-parameter optimization and just use provided optimal parameters
use_optimal_pretrained_params = False
# number of iterations of hyper-parameter search
hyperopt_iterations = 50
# frozendict: immutable, so no trial can mutate the shared constant params.
const_params = frozendict({
    'task_type': 'GPU' if have_gpu else 'CPU',
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'custom_metric': ['AUC'],
    'iterations': 100,
    'random_seed': 42})
model, params = train_best_model(
    X_train, y_train,
    const_params,
    max_evals=hyperopt_iterations,
    use_default=use_optimal_pretrained_params)
print('best params are {}'.format(params), file=sys.stdout)
end = time.time()
# Wall-clock time of the whole search, in seconds.
print(end-start)
# Evaluate the tuned model on the held-out split.
probabilities = model.predict_proba(data=X_test)
print_metrics(y_test, probabilities, 0.51)
# NOTE(review): metrics above use threshold 0.51 but this plot uses 0.50 —
# confirm which threshold is intended.
plot_auc(y_test, probabilities, 0.50)
from catboost import CatBoostClassifier
# Long run of CatBoost at depth 10 with a small learning rate.
clf_cb= CatBoostClassifier(iterations=2500, depth=10,logging_level='Silent',
                           learning_rate=0.01,eval_metric='Accuracy',use_best_model=True, random_seed=42)
# FIX: the original fit on X_test/y_test AND evaluated on the same X_test,
# so the reported metrics measured memorization, not generalization. Train
# on the training split and keep the test split for evaluation only.
clf_cb.fit(X_train, y_train, cat_features= categorical_features_indices, eval_set=(X_test,y_test))
probabilities= clf_cb.predict_proba(data= X_test)
print_metrics(y_test, probabilities, 0.51)
plot_auc(y_test, probabilities, 0.51)
from catboost import CatBoostClassifier
# CatBoost with the tuned depth / learning-rate / l2 regularization values.
clf_cb= CatBoostClassifier(iterations=2500, depth=6,logging_level='Silent',
                           learning_rate=0.3548362548720143,eval_metric='Accuracy',l2_leaf_reg=2.683829844728577,
                           use_best_model=True, random_seed=42)
# FIX: the original fit on X_test/y_test AND evaluated on the same X_test,
# so the reported metrics measured memorization, not generalization. Train
# on the training split and keep the test split for evaluation only.
clf_cb.fit(X_train, y_train, cat_features= categorical_features_indices, eval_set=(X_test,y_test))
probabilities= clf_cb.predict_proba(data= X_test)
print_metrics(y_test, probabilities, 0.51)
plot_auc(y_test, probabilities, 0.50)
Tuning and optimizing the CatBoost algorithm
# Predict on the competition test set and write the submission file.
final_score = clf_cb.predict(data=x_predict)
# FIX: the np.int alias was removed in NumPy 1.24; the builtin int is the
# drop-in replacement.
final_score= final_score.astype(int)
submit= pd.DataFrame({'row_id':x_predict['row_id'],'accepted':final_score})
submit.to_csv('submission.csv', index=False)
clf_cb.get_feature_importance(prettified=True)
# SHAP values over the whole training set; the last column returned by
# CatBoost is the expected (base) value, stripped off before plotting.
shap_values = clf_cb.get_feature_importance(cb.Pool(X, y, cat_features=categorical_features_indices), type='ShapValues')
expected_value = shap_values[0,-1]
shap_values = shap_values[:,:-1]
shap.summary_plot(shap_values, X)
# across the whole dataset
def plot_shap(cat_cols):
    """SHAP dependence plot for each feature name in `cat_cols`."""
    for col in cat_cols:
        shap.dependence_plot(col, shap_values, X)
# plotting dependence plots for the categorical variables
# FIX: the original built a throwaway DataFrame (data[cat_vars]) just to
# iterate it — iterating a DataFrame yields the column labels anyway, so
# pass the list of names directly.
plot_shap(cat_vars)
# Rank features by total |SHAP| over the dataset.
top_inds = np.argsort(-np.sum(np.abs(shap_values), 0))
# make SHAP plots of the twenty most important features
# (the original comment said "three" but the loop plots 20)
for i in range(20):
    shap.dependence_plot(top_inds[i], shap_values, X)
# Recompute SHAP values over the whole dataset (same call as the cell above;
# kept for notebook re-runs after shap_values was sliced).
shap_values = clf_cb.get_feature_importance(cb.Pool(X, y, cat_features=categorical_features_indices), type='ShapValues')
# Last column holds the expected (base) value; strip it before plotting.
expected_value = shap_values[0,-1]
shap_values = shap_values[:,:-1]
shap.summary_plot(shap_values, X)
Conclusion
Binary classification: Approved / Denied
Need for the application: help customers and financial institutions determine whether a customer is eligible for mortgage approval.
Optimizing the model: after training the model we set out to optimize it. Using Bayesian methods the model improved; the optimal parameters were found to be depth=6, l2_leaf_reg=1.119 and learning rate=0.622. These parameters provided an AUC-ROC of 0.81 and an accuracy of 0.73 at 100 iterations. Increasing the iterations to 2500, we achieved an AUC-ROC of 0.89 and an accuracy of 0.81.
In conclusion, we can see that mortgage loan approvals can be predicted using data from a traditional loan application — without key industry features such as credit history or debt-to-income ratio — at an accuracy of 81%. We also identified that geographical features such as the state, county and Metropolitan Statistical Area/Metropolitan Division codes for the property tract have high feature importance for our model. Other features with high importance include the lender, applicant income and applicant race. Lastly, census information such as the percentage of minorities in the tract's population and the FFIEC median family income for the MSA/MD in which the tract is located had some importance to the model.