import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math

%matplotlib inline
customer_info = pd.read_csv('Data/AdvWorksCusts.csv')
customer_spending = pd.read_csv('Data/AW_AveMonthSpend.csv')
customer_has_bike = pd.read_csv('Data/AW_BikeBuyer.csv') 
# checking for '?' placeholders used to mark missing values
print((customer_info.astype(str) == '?').any())
print((customer_spending.astype(str) == '?').any())
print((customer_has_bike.astype(str) == '?').any())
# checking for missing values
print(customer_info.isnull().any())
print(customer_spending.isnull().any())
print(customer_has_bike.isnull().any())
print( "Customer data")
print(customer_info.shape)
print(customer_info.CustomerID.unique().shape)

print('\n' + "Customer Spending ")
print(customer_spending.shape)
print(customer_spending.CustomerID.unique().shape)

print('\n' + "Customer has bikes")
print(customer_has_bike.shape)
print(customer_has_bike.CustomerID.unique().shape)
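# An equivalent duplicate check using pandas directly (an illustrative alternative,
# not part of the original pipeline):
print(customer_info.CustomerID.duplicated().sum())
print(customer_spending.CustomerID.duplicated().sum())
print(customer_has_bike.CustomerID.duplicated().sum())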
# keep only the last record for each duplicated CustomerID
customer_info.drop_duplicates(subset='CustomerID', keep='last', inplace=True)
print(customer_info.shape)
print(customer_info.CustomerID.unique().shape)
customer_spending.drop_duplicates(subset='CustomerID', keep='last', inplace=True)
print(customer_spending.shape)
print(customer_spending.CustomerID.unique().shape)
customer_has_bike.drop_duplicates(subset='CustomerID', keep='last', inplace=True)
print(customer_has_bike.shape)
print(customer_has_bike.CustomerID.unique().shape)
customer_info.describe().round()
customer_has_bike.describe().round()
# normalize=True returns the relative frequency of each class
print(customer_has_bike.BikeBuyer.value_counts(normalize=True))
print(customer_has_bike.BikeBuyer.value_counts())
combined = customer_info.merge(customer_has_bike,
                              on = 'CustomerID',
                              how='left')
combined.head(5)
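# Since CustomerID is now unique in both frames, pandas can assert the merge
# cardinality; an equivalent, self-checking variant of the merge above (a sketch):
combined = customer_info.merge(customer_has_bike, on='CustomerID',
                               how='left', validate='one_to_one')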
Running various visualizations to see which features to select for the ML model.
def plot_box(combined, cols, col_x='BikeBuyer'):
    for col in cols:
        sns.set_style("whitegrid")
        sns.boxplot(x=col_x, y=col, data=combined)
        plt.xlabel(col_x) # set x-axis label
        plt.ylabel(col) # set y-axis label
        plt.show()
cols = ['YearlyIncome', 'NumberCarsOwned',
        'NumberChildrenAtHome', 'TotalChildren']
plot_box(combined, cols)
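# A violin plot shows the same comparison with density information; a minimal
# variant of plot_box (an illustrative sketch, not part of the original pipeline):
def plot_violin(combined, cols, col_x='BikeBuyer'):
    for col in cols:
        sns.set_style("whitegrid")
        sns.violinplot(x=col_x, y=col, data=combined)
        plt.xlabel(col_x)
        plt.ylabel(col)
        plt.show()

plot_violin(combined, cols)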
# split the data into bike buyers and non bike buyers

is_bike_buyer = combined.BikeBuyer == 1
bike_buyers = combined[is_bike_buyer]
is_non_bike_buyer = combined.BikeBuyer == 0
non_bike_buyers = combined[is_non_bike_buyer]

print(bike_buyers.shape)
print(non_bike_buyers.shape)
# bar plots of category counts for bike buyers and non bike buyers

def plot_bar(cat_cols):
    # dummy column of ones so that count() gives the group sizes
    combined['dummy'] = np.ones(shape=combined.shape[0])
    for col in cat_cols:
        counts = combined[['dummy', 'BikeBuyer', col]].groupby(['BikeBuyer', col],
                                                               as_index=False).count()
        temp = counts[counts['BikeBuyer'] == 0][[col, 'dummy']]
        temp.plot.bar(x=col, y='dummy')
        plt.title('Counts for ' + col + '\n non bike buyer')
        plt.ylabel('count')
        temp = counts[counts['BikeBuyer'] == 1][[col, 'dummy']]
        temp.plot.bar(x=col, y='dummy')
        plt.title('Counts for ' + col + '\n bike buyer')
        plt.ylabel('count')
        plt.show()

cols = ['Occupation','Gender','MaritalStatus']
plot_bar(cols)
def generate_has_child_at_home(data):
    # 'Y' if the customer has one or more children at home, otherwise 'N'
    has_child_at_home = []
    for index, row in data.iterrows():
        if row.NumberChildrenAtHome > 0:
            has_child_at_home.append('Y')
        else:
            has_child_at_home.append('N')
    return has_child_at_home

combined['hasChildAtHome'] = generate_has_child_at_home(combined)
combined[['hasChildAtHome','NumberChildrenAtHome']].head()
from datetime import datetime

def generate_age(data, date_format):
    # ages are computed as of the data collection date, 1 January 1998
    collect_date = datetime(1998, 1, 1)
    age = []
    for index, row in data.iterrows():
        birth_date = datetime.strptime(row['BirthDate'], date_format)
        age.append(int((collect_date - birth_date).days / 365))
    return age

combined['Age'] = generate_age(combined, '%Y-%m-%d')
combined[['BirthDate','Age']].head()
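# A vectorized alternative using pandas datetime handling; an illustrative sketch
# that recomputes the same Age column, assuming the same 1998-01-01 collection date:
birth_dates = pd.to_datetime(combined['BirthDate'], format='%Y-%m-%d')
combined['Age'] = ((pd.Timestamp('1998-01-01') - birth_dates).dt.days // 365).astype(int)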
cols = ['hasChildAtHome']
plot_bar(cols)
features_chosen = ['YearlyIncome','NumberCarsOwned','Occupation','Gender','MaritalStatus',
                  'hasChildAtHome']
features = combined[features_chosen]
features.head()

Preparing the data for scikit-learn (an alternative encoding sketch follows the functions below):

1. Encode the categorical variables using one-hot encoding.
2. Convert the features and labels to numpy arrays.
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
labels = np.array(combined.BikeBuyer)
print(labels)
def encode_string(cat_features):
    # convert category strings to integers, then one-hot encode the integers
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()

def encode_cat_features(features):
    # one-hot encode Occupation first, then append the remaining categorical columns
    cat_features = ['Gender', 'MaritalStatus', 'hasChildAtHome']
    f = encode_string(features['Occupation'])
    for cat in cat_features:
        enc = encode_string(features[cat])
        f = np.concatenate([f, enc], 1)
    return f
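# The same one-hot expansion can be done more concisely with pandas; an
# illustrative sketch (not used by the pipeline below, column names as above):
dummies = pd.get_dummies(features[['Occupation', 'Gender', 'MaritalStatus', 'hasChildAtHome']])
print(dummies.head())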
numeric_features = np.array(combined[['YearlyIncome','NumberCarsOwned']])
encoded_features = encode_cat_features(features)
# the two numeric columns follow the one-hot encoded columns (11 of them here)
features = np.concatenate([encoded_features, numeric_features], 1)
features.shape
features[3,:13]
nr.seed(9988)
# hold out 300 samples as the test set
indx = range(features.shape[0])
indx = ms.train_test_split(indx, test_size=300)
X_train = features[indx[0],:]
y_train = np.ravel(labels[indx[0]])
X_test = features[indx[1],:]
y_test = np.ravel(labels[indx[1]])
X_train[:2]
# scale only the two numeric columns (indices 11 and 12); fit on the training set only
scaler = preprocessing.MinMaxScaler(feature_range=(-1,1)).fit(X_train[:,11:])
X_train[:,11:] = scaler.transform(X_train[:,11:])
X_test[:,11:] = scaler.transform(X_test[:,11:])
X_train[:2]
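# As a side note, a scikit-learn Pipeline can bundle scaling with the model so
# cross-validation never sees test-fold statistics; a minimal sketch (note it
# scales every column, unlike the cell above, so it is illustrative only):
from sklearn.pipeline import Pipeline
pipe = Pipeline([('scale', preprocessing.MinMaxScaler(feature_range=(-1,1))),
                 ('model', linear_model.LogisticRegression(class_weight='balanced'))])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))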
# Due to the class imbalance between bike buyers and non bike buyers, the class_weight parameter is set to 'balanced'
logistic_mod = linear_model.LogisticRegression(class_weight='balanced')
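# For reference, 'balanced' weights each class inversely to its frequency using
# n_samples / (n_classes * np.bincount(y)), per the scikit-learn docs; the same
# numbers computed by hand as an illustration:
class_counts = np.bincount(labels)
print('class weights (0, 1):', labels.shape[0] / (2 * class_counts))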
nr.seed(123)
inside = ms.KFold(n_splits=10, shuffle=True)
nr.seed(321)
outside = ms.KFold(n_splits=10, shuffle=True)
nr.seed(3456)
param_grid = {"C": [0.1,1,10,100,1000]}
clf = ms.GridSearchCV(estimator=logistic_mod, param_grid=param_grid,
                     cv=inside, # using the inside folds
                     scoring = 'roc_auc',
                     return_train_score = True) 
clf.fit(features,labels)
clf.best_estimator_.C
nr.seed(498)
cv_estimate = ms.cross_val_score(clf, features, labels, 
                                cv = outside) # use the outside folds
print('Mean performance metric = %4.3f' % np.mean(cv_estimate))
print('STD of the metric       = %4.3f' % np.std(cv_estimate))
print('Outcome by cv fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d      %4.3f' % (i+1, x))
logistic_mod = linear_model.LogisticRegression(C=clf.best_estimator_.C, class_weight='balanced')
logistic_mod.fit(X_train,y_train)
print(logistic_mod.intercept_)
print(logistic_mod.coef_)
probabilities = logistic_mod.predict_proba(X_test)
print(probabilities[:15,:])
def score_model(probs, threshold):
    # assign class 1 when the predicted probability of class 1 exceeds the threshold
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])
threshold = 0.51 
scores = score_model(probabilities, threshold) 
print(np.array(scores[:18]))
print(y_test[:18])
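# Sweeping the threshold shows the trade-off it controls; a quick illustrative check:
for t in [0.3, 0.5, 0.7]:
    preds = score_model(probabilities, t)
    print('threshold %.2f -> predicted positives: %d' % (t, preds.sum()))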
def print_metrics(labels, scores):
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    # with labels 0 and 1, row/column 0 of the confusion matrix is the negative class
    print('                 Confusion matrix')
    print('                 Score negative    Score positive')
    print('Actual negative    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual positive    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy  %0.2f' % sklm.accuracy_score(labels, scores))
    print(' ')
    print('           Negative      Positive')
    print('Num case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])
print_metrics(y_test, scores)
def plot_auc(labels, probs):
    ## compute the false positive rate, true positive rate and threshold along with the AUC
    fpr, tpr, threshold = sklm.roc_curve(labels, probs[:,1])
    auc = sklm.auc(fpr, tpr)

    ## plot the result
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, color = 'orange', label = 'AUC = %0.2f' %auc)
    plt.legend(loc = 'lower right')
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([0,1])
    plt.ylim([0,1])
    plt.ylabel('True Positive Rate') 
    plt.xlabel('False Positive Rate')
    plt.show()
    
plot_auc(y_test, probabilities)
from sklearn.ensemble import RandomForestClassifier

param_grid = {'max_features': [2,3,5,10,13], 'min_samples_leaf':[3,5,10,20]} 
nr.seed(3456) 
rf_clf = RandomForestClassifier(class_weight = 'balanced')
nr.seed(4455)
rf_clf = ms.GridSearchCV(estimator=rf_clf, param_grid=param_grid,
                        cv = inside, # Use the inside folds
                        scoring = 'roc_auc', return_train_score = True)
rf_clf.fit(features, labels)
print(rf_clf.best_estimator_.max_features)
print(rf_clf.best_estimator_.min_samples_leaf)
nr.seed(1115)
rf_mod = RandomForestClassifier(class_weight='balanced',
                               max_features = rf_clf.best_estimator_.max_features,
                               min_samples_leaf =rf_clf.best_estimator_.min_samples_leaf)
rf_mod.fit(X_train,y_train)
probabilities = rf_mod.predict_proba(X_test)
scores = score_model(probabilities,0.54)
print_metrics(y_test, scores)
plot_auc(y_test,probabilities)
from sklearn.svm import SVC 
svclassifier = SVC(kernel='linear',probability=True, random_state= 0)
svclassifier.fit(X_train,y_train)
probabilities = svclassifier.predict_proba(X_test)
scores = score_model(probabilities,0.54)
print_metrics(y_test, scores)
plot_auc(y_test, probabilities)
# grid search over linear and RBF kernels; GridSearchCV fits clones internally,
# so there is no need to fit svclassifier beforehand
svclassifier = SVC(kernel='linear', random_state=0)

param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]


svm_clf = ms.GridSearchCV(estimator= svclassifier,
                          param_grid= param_grid,
                          scoring= 'accuracy',
                          cv= 3,
                          n_jobs=-1)
svm_clf.fit(X_train,y_train)
print(svm_clf.best_score_)
print(svm_clf.best_params_)
print(svm_clf.best_estimator_)
# refit an SVM with the best parameters found by the grid search above
svclassifier = SVC(kernel='rbf', C=1, probability=True, gamma=0.7)
svclassifier.fit(X_train, y_train)
probabilities = svclassifier.predict_proba(X_test)
scores = score_model(probabilities,0.54)
print_metrics(y_test, scores)
plot_auc(y_test, probabilities)

Comparing the accuracy of all the models evaluated, we settle on the SVM with the parameters found above; a quick side-by-side check is sketched below.
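A minimal sketch of that comparison, assuming the three fitted estimators are still in scope:

models = {'logistic regression': logistic_mod, 'random forest': rf_mod, 'SVM (rbf)': svclassifier}
for name, model in models.items():
    print('%s accuracy: %0.3f' % (name, sklm.accuracy_score(y_test, model.predict(X_test))))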

final = pd.read_csv('Data/AW_test.csv')
final['hasChildAtHome'] = generate_has_child_at_home(final)
final_features = final[features_chosen]

numeric_final_features = np.array(final_features[['YearlyIncome','NumberCarsOwned']])

encoded_final_features = encode_cat_features(final_features)

final_features = np.concatenate([encoded_final_features, numeric_final_features], 1)

# apply the scaler fitted on the training data to the numeric columns
final_features[:,11:] = scaler.transform(final_features[:,11:])
probabilities = svclassifier.predict_proba(final_features)
scores = score_model(probabilities, 0.54)
print(scores)
np.savetxt('final_answer_classification.csv', scores, delimiter=',', fmt='%i')
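# If CustomerIDs are wanted alongside the predictions, a pandas variant can be
# used instead (an illustrative sketch; it assumes 'final' holds a CustomerID column):
submission = pd.DataFrame({'CustomerID': final.CustomerID, 'BikeBuyer': scores})
submission.to_csv('final_answer_classification.csv', index=False)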