BikeBuyer Classification
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
%matplotlib inline
%matplotlib widget
# Load the three source tables: customer attributes, average monthly spend,
# and the bike-buyer flag.
customer_info = pd.read_csv('Data/AdvWorksCusts.csv')
customer_spending = pd.read_csv('Data/AW_AveMonthSpend.csv')
customer_has_bike = pd.read_csv('Data/AW_BikeBuyer.csv')

# Check every column for a literal '?' placeholder. The frames are cast to the
# builtin `object` dtype so the string comparison is element-wise on every
# column; the old `np.object` alias was removed in NumPy 1.24 and raises
# AttributeError there.
(customer_info.astype(object) == '?').any()
(customer_spending.astype(object) == '?').any()
(customer_has_bike.astype(object) == '?').any()

# Check every column for missing (null) values.
print((customer_info.astype(object).isnull()).any())
print((customer_spending.astype(object).isnull()).any())
print((customer_has_bike.astype(object).isnull()).any())
# Compare the row count of each table with its number of distinct CustomerIDs;
# a difference means some customers appear more than once.
tables = (
    ("Customer data", customer_info),
    ('\n' + "Customer Spending ", customer_spending),
    ('\n' + "Customer has bikes", customer_has_bike),
)
for heading, table in tables:
    print(heading)
    print(table.shape)
    print(table.CustomerID.unique().shape)

# Keep only the last record for each CustomerID (in place), then print the
# shape and distinct-ID count again so the two can be seen to agree.
for _, table in tables:
    table.drop_duplicates(subset='CustomerID', keep='last', inplace=True)
    print(table.shape)
    print(table.CustomerID.unique().shape)
# Rounded summary statistics of the numeric columns.
customer_info.describe().round()
customer_has_bike.describe().round()

# Class balance of the target: relative frequencies first, then raw counts.
print(customer_has_bike['BikeBuyer'].value_counts(normalize=True))
print(customer_has_bike['BikeBuyer'].value_counts())

# Attach the BikeBuyer flag to the customer attributes, keeping every
# customer row (left join on CustomerID).
combined = pd.merge(customer_info, customer_has_bike,
                    how='left', on='CustomerID')
combined.head(5)
def plot_box(combined, cols, col_x='BikeBuyer'):
    """Show one box plot per column in ``cols``, grouped by ``col_x``.

    Parameters
    ----------
    combined : DataFrame holding ``col_x`` and every column named in ``cols``.
    cols : iterable of column names to put on the y-axis, one figure each.
    col_x : name of the grouping column placed on the x-axis.
    """
    sns.set_style("whitegrid")
    for col in cols:
        # x/y are passed by keyword: seaborn 0.12+ no longer accepts them
        # positionally.
        sns.boxplot(x=col_x, y=col, data=combined)
        plt.xlabel(col_x)  # set x-axis
        plt.ylabel(col)  # set y-axis
        plt.show()
# Box plots of the numeric customer attributes against the BikeBuyer flag.
cols = ['YearlyIncome', 'NumberCarsOwned',
        'NumberChildrenAtHome', 'TotalChildren']
plot_box(combined, cols)

# Boolean masks for the two classes, and the matching row subsets.
is_bike_buyer = combined['BikeBuyer'] == 1
is_non_bike_buyer = combined['BikeBuyer'] == 0
bike_buyers = combined.loc[is_bike_buyer]
non_bike_buyers = combined.loc[is_non_bike_buyer]
print(bike_buyers.shape)
print(non_bike_buyers.shape)
def plot_bar(cat_cols):
    """Bar-plot category counts per column, once for each BikeBuyer class.

    For every column name in ``cat_cols`` two bar charts are drawn: the
    number of rows per category among non bike buyers (BikeBuyer == 0) and
    among bike buyers (BikeBuyer == 1).

    Reads the module-level ``combined`` frame and adds (or overwrites) a
    ``dummy`` column of ones on it; that column is what gets counted.
    """
    combined['dummy'] = np.ones(shape=combined.shape[0])
    class_titles = ((0, '\n non bike buyer'), (1, '\n bike buyer'))
    for col in cat_cols:
        # Rows per (BikeBuyer, category) pair, kept as ordinary columns.
        grouped = combined[['dummy', 'BikeBuyer', col]].groupby(
            ['BikeBuyer', col], as_index=False).count()
        for flag, suffix in class_titles:
            per_class = grouped[grouped['BikeBuyer'] == flag][[col, 'dummy']]
            per_class.plot.bar(x=col, y='dummy')
            plt.title('Counts for ' + col + suffix)
            plt.ylabel('count')
        plt.show()
# Count plots for the categorical attributes, split by bike-buyer class.
cols = ['Occupation', 'Gender', 'MaritalStatus']
plot_bar(cols)

# Accumulator for the per-customer 'Y'/'N' child-at-home flags.
has_child_at_home = list()
def generate_has_child_at_home(customer_info, has_child_at_home):
    """Append a 'Y'/'N' child-at-home flag for every row of ``customer_info``.

    A row gets 'Y' when its ``NumberChildrenAtHome`` value is greater than
    zero and 'N' otherwise. The flags are appended, in row order, to the
    list passed as ``has_child_at_home``; that same list is returned.
    """
    has_child_at_home.extend(
        'Y' if children > 0 else 'N'
        for children in customer_info['NumberChildrenAtHome']
    )
    return has_child_at_home
# Derive the flag from the rows of `combined` itself, so the values are
# aligned with the frame they are stored in by construction. A fresh list is
# passed on every run: reusing one module-level list makes it grow each time
# this statement is re-executed, and the assignment then fails on length.
combined['hasChildAtHome'] = generate_has_child_at_home(combined, [])
combined[['hasChildAtHome', 'NumberChildrenAtHome']].head()
from datetime import datetime
from dateutil.parser import parse
def generate_age(data, format, collect_date=None):
    """Return each customer's age in whole years at ``collect_date``.

    Parameters
    ----------
    data : DataFrame with a ``BirthDate`` column of date strings.
    format : ``datetime.strptime`` format of the ``BirthDate`` strings.
    collect_date : reference ``datetime``; defaults to 1998-01-01.

    Returns
    -------
    list of int, one age per row, in row order.

    The age is computed on the calendar (year difference, minus one if the
    birthday has not yet occurred in the reference year). Dividing the day
    difference by 365 ignores leap days and overstates the age of anyone
    whose birthday falls shortly after the reference date.
    """
    if collect_date is None:
        collect_date = datetime(1998, 1, 1, 0, 0, 0)
    reference_day = (collect_date.month, collect_date.day)

    age = []
    for birth_text in data['BirthDate']:
        born = datetime.strptime(birth_text, format)
        birthday_pending = reference_day < (born.month, born.day)
        age.append(collect_date.year - born.year - int(birthday_pending))
    return age
# Compute ages on the merged customer frame. No `data` frame is defined
# anywhere in this file, so the original reference raised NameError.
combined['Age'] = generate_age(combined, '%Y-%m-%d')
combined[['BirthDate', 'Age']].head()
# Count plot for the derived child-at-home flag.
cols = ['hasChildAtHome']
plot_bar(cols)

# Columns used as model inputs.
features_chosen = [
    'YearlyIncome', 'NumberCarsOwned', 'Occupation', 'Gender',
    'MaritalStatus', 'hasChildAtHome',
]
features = combined.loc[:, features_chosen]
features.head()
Preparing the data for scikit-learn
1. Encode the categorical variables using one-hot encoding.
2. Convert the features and labels to NumPy arrays.
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
# Target vector: the BikeBuyer flag as a NumPy array.
labels = np.array(combined['BikeBuyer'])
print(labels)
def encode_string(cat_features):
    """One-hot encode a single categorical column.

    The values are first mapped to integer codes with a ``LabelEncoder``
    and the codes are then expanded by a ``OneHotEncoder``. Both encoders
    are fitted on ``cat_features`` itself, so the output columns depend on
    the categories present in this particular input.

    Returns the encoding as a dense 2-D array with one row per input value.
    """
    codes = preprocessing.LabelEncoder().fit_transform(cat_features)
    code_column = codes.reshape(-1, 1)
    one_hot = preprocessing.OneHotEncoder().fit_transform(code_column)
    return one_hot.toarray()
def encode_cat_features(features):
    """One-hot encode the categorical columns of ``features``.

    The result is the column-wise concatenation of the encodings of
    ``Occupation``, ``Gender``, ``MaritalStatus`` and ``hasChildAtHome``,
    in that order.
    """
    ordered_columns = ['Occupation', 'Gender', 'MaritalStatus', 'hasChildAtHome']
    pieces = [encode_string(features[name]) for name in ordered_columns]
    return np.concatenate(pieces, axis=1)
# Model matrix: one-hot encoded categorical columns first, followed by the
# two numeric columns.
numeric_features = np.array(combined[['YearlyIncome', 'NumberCarsOwned']])
encoded_features = encode_cat_features(features)
features = np.hstack((encoded_features, numeric_features))
features.shape
features[3, :13]
# Randomly split the row indices, holding out 300 rows for testing.
nr.seed(9988)
indx = ms.train_test_split(range(features.shape[0]), test_size=300)
X_train, X_test = features[indx[0], :], features[indx[1], :]
y_train, y_test = np.ravel(labels[indx[0]]), np.ravel(labels[indx[1]])
X_train[:2]

# Rescale the columns from index 11 onward to [-1, 1]. The scaler is fitted
# on the training rows only and then applied to both splits.
scalar = preprocessing.MinMaxScaler(feature_range=(-1, 1))
scalar.fit(X_train[:, 11:])
X_train[:, 11:] = scalar.transform(X_train[:, 11:])
X_test[:, 11:] = scalar.transform(X_test[:, 11:])
X_train[:2]
# Due to the class imbalance between bike buyers and non bike buyers, the
# class_weight parameter is used.
logistic_mod = linear_model.LogisticRegression(class_weight='balanced')

# Separate shuffled 10-fold splitters for the inner (hyperparameter search)
# and outer (performance estimate) loops of the nested cross-validation.
nr.seed(123)
inside = ms.KFold(n_splits=10, shuffle=True)
nr.seed(321)
outside = ms.KFold(n_splits=10, shuffle=True)

nr.seed(3456)
param_grid = {"C": [0.1, 1, 10, 100, 1000]}
clf = ms.GridSearchCV(estimator=logistic_mod, param_grid=param_grid,
                      cv=inside,  # using the inside folds
                      scoring='roc_auc',
                      return_train_score=True)
# Tune on the scaled training split only. Searching on the full, unscaled
# `features` matrix would pick C on a different feature scale than the final
# model is trained on, and would let the held-out test rows influence it.
clf.fit(X_train, y_train)
clf.best_estimator_.C

nr.seed(498)
cv_estimate = ms.cross_val_score(clf, X_train, y_train,
                                 cv=outside)  # use the outside folds
print('Mean perfomance metic = %4.3f' % np.mean(cv_estimate))
print('STD of the metric = %4.3f' % np.std(cv_estimate))
print('Outcome by cv fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d %4.3f' % (i + 1, x))
# Refit a logistic model on the training split with the C value chosen by
# the grid search, then score the held-out rows.
logistic_mod = linear_model.LogisticRegression(
    class_weight='balanced',
    C=clf.best_estimator_.C,
)
logistic_mod.fit(X_train, y_train)
for fitted_param in (logistic_mod.intercept_, logistic_mod.coef_):
    print(fitted_param)
probabilities = logistic_mod.predict_proba(X_test)
print(probabilities[:15, :])
def score_model(probs, threshold):
    """Turn class probabilities into 0/1 predictions.

    Parameters
    ----------
    probs : 2-D array-like; column 1 is compared against ``threshold``.
    threshold : cut-off; a row scores 1 only when its column-1 value is
        strictly greater than this.

    Returns
    -------
    1-D integer NumPy array of 0/1 scores, one per row of ``probs``.
    """
    # Vectorised comparison; the result keeps an integer dtype even when
    # `probs` has no rows.
    return (np.asarray(probs)[:, 1] > threshold).astype(int)
# Convert the predicted probabilities to 0/1 scores, then print the first 18
# scores followed by the first 18 true labels.
threshold = 0.51
scores = score_model(probabilities, threshold)
leading_scores = np.array(scores[:18])
print(leading_scores)
print(y_test[:18])
def print_matrics(labels, scores):
    """Print a confusion matrix, accuracy and per-class metrics.

    Parameters
    ----------
    labels : true class labels (0/1).
    scores : predicted class labels (0/1).

    Prints only; returns None.

    The label order ``[1, 0]`` is requested explicitly from scikit-learn so
    that index 0 of every result is the positive class (1) and index 1 the
    negative class (0), which is what the printed headings say. With the
    default sorted order, index 0 would be class 0 and the "positive" row
    and column would actually show the negative class.
    """
    class_order = [1, 0]
    metrics = sklm.precision_recall_fscore_support(labels, scores,
                                                   labels=class_order)
    conf = sklm.confusion_matrix(labels, scores, labels=class_order)
    print(' Confusion matrix')
    print(' Score positive Score negative')
    print('Actual positive %6d' % conf[0,0] + ' %5d' % conf[0,1])
    print('Actual negative %6d' % conf[1,0] + ' %5d' % conf[1,1])
    print('')
    print('Accuracy %0.2f' % sklm.accuracy_score(labels, scores))
    print(' ')
    print(' Positive Negative')
    print('Num case %6d' % metrics[3][0] + ' %6d' % metrics[3][1])
    print('Precision %6.2f' % metrics[0][0] + ' %6.2f' % metrics[0][1])
    print('Recall %6.2f' % metrics[1][0] + ' %6.2f' % metrics[1][1])
    print('F1 %6.2f' % metrics[2][0] + ' %6.2f' % metrics[2][1])
# Print the confusion matrix and per-class metrics for the current scores
# against the held-out test labels.
print_matrics(y_test, scores)
def plot_auc(labels, probs):
    """Plot the ROC curve for ``probs[:, 1]`` against ``labels``.

    The area under the curve is shown in the legend, and a dashed red
    diagonal from (0, 0) to (1, 1) is drawn for reference.
    """
    # False/true positive rates from column 1 of the probabilities; the
    # thresholds returned alongside them are not needed here.
    fpr, tpr, _ = sklm.roc_curve(labels, probs[:, 1])
    area = sklm.auc(fpr, tpr)

    plt.title('Reciever Operating Charateristic')
    plt.plot(fpr, tpr, color='orange', label='AUC = %0.2f' % area)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
# Draw the ROC curve for the current probabilities on the held-out test split.
plot_auc(y_test, probabilities)
from sklearn.ensemble import RandomForestClassifier

# Grid-search the random forest's max_features and min_samples_leaf by ROC AUC.
param_grid = {'max_features': [2, 3, 5, 10, 13],
              'min_samples_leaf': [3, 5, 10, 20]}
nr.seed(3456)
rf_clf = RandomForestClassifier(class_weight='balanced')
nr.seed(4455)
rf_clf = ms.GridSearchCV(estimator=rf_clf, param_grid=param_grid,
                         cv=inside,  # Use the inside folds
                         scoring='roc_auc', return_train_score=True)
# Search on the training split only, so the held-out test rows do not
# influence the choice of hyperparameters they are later evaluated with.
rf_clf.fit(X_train, y_train)
print(rf_clf.best_estimator_.max_features)
print(rf_clf.best_estimator_.min_samples_leaf)

# Refit with the selected hyperparameters and evaluate on the test split.
nr.seed(1115)
rf_mod = RandomForestClassifier(
    class_weight='balanced',
    max_features=rf_clf.best_estimator_.max_features,
    min_samples_leaf=rf_clf.best_estimator_.min_samples_leaf)
rf_mod.fit(X_train, y_train)
probabilities = rf_mod.predict_proba(X_test)
scores = score_model(probabilities, 0.54)
# print_matrics prints its report itself and returns None; wrapping the call
# in print() only added a stray "None" line to the output.
print_matrics(y_test, scores)
plot_auc(y_test, probabilities)
# Linear-kernel SVM with probability estimates, evaluated on the test split.
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear', probability=True, random_state=0)
svclassifier.fit(X_train, y_train)
probabilities = svclassifier.predict_proba(X_test)
scores = score_model(probabilities, 0.54)
# print_matrics prints its report itself and returns None; wrapping the call
# in print() only added a stray "None" line to the output.
print_matrics(y_test, scores)
plot_auc(y_test, probabilities)
from sklearn.svm import SVC

svclassifier = SVC(kernel='linear', random_state=0)
svclassifier.fit(X_train, y_train)

# Two search spaces: a linear kernel over C, and an RBF kernel over C x gamma.
linear_space = {'C': [1, 10, 100, 1000], 'kernel': ['linear']}
rbf_space = {
    'C': [1, 10, 100, 1000],
    'kernel': ['rbf'],
    'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
}
param_grid = [linear_space, rbf_space]

# 3-fold search scored by accuracy, run on all available cores.
svm_clf = ms.GridSearchCV(
    estimator=svclassifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
svm_clf.fit(X_train, y_train)
for search_result in (svm_clf.best_score_, svm_clf.best_params_,
                      svm_clf.best_estimator_):
    print(search_result)
# RBF-kernel SVM with fixed C and gamma, evaluated on the test split.
# NOTE(review): C=1 / gamma=0.7 are presumably copied from the grid-search
# output; confirm they still match svm_clf.best_params_ when the data changes.
from sklearn.svm import SVC
svclassifier = SVC(kernel='rbf', C=1, probability=True, gamma=0.7)
svclassifier.fit(X_train, y_train)
probabilities = svclassifier.predict_proba(X_test)
scores = score_model(probabilities, 0.54)
# print_matrics prints its report itself and returns None; wrapping the call
# in print() only added a stray "None" line to the output.
print_matrics(y_test, scores)
plot_auc(y_test, probabilities)
Comparing the accuracy of all the models tried, we settle on the SVM with the parameters used above.
# Build the feature matrix for the unlabeled customers using the same steps
# as for the training data: derived flag, one-hot encoding, numeric columns
# appended last.
final = pd.read_csv('Data/AW_test.csv')
final['hasChildAtHome'] = generate_has_child_at_home(final, [])
final_features = final.loc[:, features_chosen]
numeric_final_features = np.array(
    final_features[['YearlyIncome', 'NumberCarsOwned']])
encoded_final_features = encode_cat_features(final_features)
final_features = np.hstack((encoded_final_features, numeric_final_features))

# Rescale columns 11 onward with the already-fitted scaler.
final_features[:, 11:] = scalar.transform(final_features[:, 11:])

# Predict with the current SVM, threshold at 0.54 and write one score per line.
probabilities = svclassifier.predict_proba(final_features)
scores = score_model(probabilities, 0.54)
print(scores)
np.savetxt('final_answer_classification.csv', scores, delimiter=',', fmt='%i')