BikeBuyer Regression
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
%matplotlib inline
customer_info = pd.read_csv('Data/AdvWorksCusts.csv')
customer_spending = pd.read_csv('Data/AW_AveMonthSpend.csv')
customer_has_bike = pd.read_csv('Data/AW_BikeBuyer.csv')
print("For customer_info: " + "\n")
print(customer_info.shape)
print(customer_info.CustomerID.unique().shape)
print("\n" + "For customer_spending" + "\n")
print(customer_spending.shape)
print(customer_spending.CustomerID.unique().shape)
print("\n" + "For customer_has_bike" + "\n")
print(customer_has_bike.shape)
print(customer_has_bike.CustomerID.unique().shape)
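Equivalently, the duplicated IDs can be counted directly; a quick sketch:
print(customer_info.CustomerID.duplicated().sum())
print(customer_spending.CustomerID.duplicated().sum())
print(customer_has_bike.CustomerID.duplicated().sum())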
# dropping duplicates if they exist
customer_info.drop_duplicates(subset='CustomerID',keep='last', inplace=True)
customer_spending.drop_duplicates(subset='CustomerID',keep='last',inplace=True)
customer_has_bike.drop_duplicates(subset='CustomerID',keep='last',inplace=True)
# checking that the duplicates were dropped
print("For customer_info: " + "\n")
print(customer_info.shape)
print(customer_info.CustomerID.unique().shape)
print("\n" + "For customer_spending" + "\n")
print(customer_spending.shape)
print(customer_spending.CustomerID.unique().shape)
print("\n" + "For customer_has_bike" + "\n")
print(customer_has_bike.shape)
print(customer_has_bike.CustomerID.unique().shape)
# checking each table for missing values (np.object is deprecated; plain object works the same)
print(customer_info.astype(object).isnull().any())
print(customer_spending.astype(object).isnull().any())
print(customer_has_bike.astype(object).isnull().any())
With the data cleaned, we do some exploratory analysis.
print(round(customer_info.describe(),2))
print("\n")
print(round(customer_spending.describe(),2))
print("\n")
print(round(customer_has_bike.describe(),2))
data = customer_info.merge(customer_spending, on='CustomerID', how='left')
data.head()
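Since this is a left join, it is worth confirming that every customer matched a spending record; a quick check (any nonzero count would mean missing AveMonthSpend values):
print(data.AveMonthSpend.isnull().sum())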
from datetime import datetime
from dateutil.parser import parse
def generate_age(data, date_format):
    # ages are computed relative to the data collection date, 1 Jan 1998
    collect_date = datetime(1998, 1, 1)
    age = []
    for index, row in data.iterrows():
        cust_date = datetime.strptime(row['BirthDate'], date_format)
        age.append(int((collect_date - cust_date).days / 365))
    return age
data['Age'] = generate_age(data, '%Y-%m-%d')
data[['BirthDate','Age']].head()
Generating age, since we are given the birth date.
This age function works, but it is not safe, since it only handles a single date format. The version below also computes exact calendar age rather than dividing days by 365:
from datetime import datetime
from datetime import date

def calculate_age(birth_str):
    # exact calendar age at the collection date, 1 Jan 1998
    cust_date = datetime.strptime(birth_str, "%Y-%m-%d")
    f_date = date(1998, 1, 1)
    return f_date.year - cust_date.year - ((f_date.month, f_date.day) < (cust_date.month, cust_date.day))

data['Age'] = data['BirthDate'].apply(calculate_age)
data[['BirthDate','Age']].head()
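To address the single-format concern flagged above, here is a hedged sketch of a format-agnostic variant using dateutil's parse (imported earlier); the helper name calculate_age_robust is mine, not part of the original notebook:
def calculate_age_robust(birth_str, collect=date(1998, 1, 1)):
    # parse() infers the date format, so '1966-04-08' and '4/8/1966' both work
    cust_date = parse(birth_str).date()
    return collect.year - cust_date.year - ((collect.month, collect.day) < (cust_date.month, cust_date.day))

# e.g. data['Age'] = data['BirthDate'].apply(calculate_age_robust)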
def plot_scatter(auto_prices, cols, col_y='AveMonthSpend'):
    for col in cols:
        fig = plt.figure(figsize=(7,6)) # define plot area
        ax = fig.gca() # define axis
        auto_prices.plot.scatter(x=col, y=col_y, ax=ax)
        ax.set_title('Scatter plot of ' + col_y + ' vs. ' + col) # title of the plot
        ax.set_xlabel(col) # set x axis text
        ax.set_ylabel(col_y) # set y axis text
        plt.show()
cols=['NumberChildrenAtHome','NumberCarsOwned','TotalChildren']
plot_scatter(data,cols)
cols= ['AveMonthSpend','YearlyIncome','Age']
sns.pairplot(data[cols], palette="Set2", diag_kind="kde", height=2).map_upper(sns.kdeplot, cmap="Blues_d")
def plot_box(auto_prices, cols, col_y='AveMonthSpend'):
    for col in cols:
        sns.set_style("whitegrid")
        sns.boxplot(x=col, y=col_y, data=auto_prices)
        plt.xlabel(col) # set x axis text
        plt.ylabel(col_y) # set y axis text
        plt.show()
cols= ['Occupation','Gender','MaritalStatus','HomeOwnerFlag']
plot_box(data, cols)
Based on the visualizations above, we selected the following features for the model: Gender, MaritalStatus, HomeOwnerFlag, Occupation, Age, YearlyIncome, and NumberChildrenAtHome.
categorical_features= ['Gender','MaritalStatus','HomeOwnerFlag','Occupation']
numeric_features= ['Age','YearlyIncome','NumberChildrenAtHome']
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
def encode_string(cat_features):
    # integer-encode the category labels, then one-hot encode the integers
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()
def encode_cat_features(features):
    # one-hot encode each categorical column and stack the results side by side
    categorical_features = ['Gender','MaritalStatus','HomeOwnerFlag']
    f = encode_string(features['Occupation'])
    for cat in categorical_features:
        enc = encode_string(features[cat])
        f = np.concatenate([f, enc], 1)
    return f
labels = np.array(data.AveMonthSpend)
selected = numeric_features + categorical_features
features = data[selected]
print(labels)
print(features.head())
encoded_features= encode_cat_features(features)
print(encoded_features[:,:])
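For reference, pandas can produce the same kind of one-hot encoding in a single step; a minimal sketch (column order, and therefore the scaling slice used later, may differ from encode_cat_features):
dummies = pd.get_dummies(features[categorical_features], columns=categorical_features)
print(dummies.shape)  # should match encoded_features.shape (11 columns here)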
# selecting numeric features and converting them to an array
# (a new name avoids shadowing the numeric_features column list above)
numeric_array = np.array(data[numeric_features])
print(numeric_array[:,:])
features = np.concatenate([encoded_features, numeric_array], 1)
print(features.shape)
print(features[:1,:])
nr.seed(9988)
indx= range(features.shape[0])
indx= ms.train_test_split(indx, test_size= 300)
X_train= features[indx[0],:]
y_train= np.ravel(labels[indx[0]])
X_test= features[indx[1],:]
y_test= np.ravel(labels[indx[1]])
# scaling the numeric columns so features are on comparable magnitudes
# (columns 11 and 12 are Age and YearlyIncome; NumberChildrenAtHome is a small count and is left unscaled)
#scaler = preprocessing.MinMaxScaler(feature_range=(-1,1)).fit(X_train[:,11:])
scaler = preprocessing.StandardScaler().fit(X_train[:,11:13])
X_train[:,11:13] = scaler.transform(X_train[:,11:13])
X_test[:,11:13] = scaler.transform(X_test[:,11:13])
X_train[:2]
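As an optional sanity check, the scaled columns should now have roughly zero mean and unit variance on the training set:
print(X_train[:,11:13].mean(axis=0))  # close to 0
print(X_train[:,11:13].std(axis=0))   # close to 1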
Now that the features are prepared, we can try them out on some models.
lin_mod= linear_model.Ridge(alpha = 0.05)
lin_mod.fit(X_train,y_train)
print(lin_mod.intercept_)
print(lin_mod.coef_)
alphas = np.array([0, 0.0001, 0.001, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
lin_mod= linear_model.Ridge()
linRidge_clf = ms.GridSearchCV(estimator=lin_mod, param_grid=dict(alpha=alphas))
linRidge_clf.fit(X_train,y_train)
#summarize results of grid search
print(linRidge_clf.best_score_)
print(linRidge_clf.best_estimator_.alpha)
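To see how the cross-validated score varies across the whole alpha grid rather than only the best value, the search results can be inspected; a minimal sketch:
cv_results = pd.DataFrame(linRidge_clf.cv_results_)
print(cv_results[['param_alpha','mean_test_score']].sort_values('mean_test_score', ascending=False).head())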
lin_mod= linear_model.Ridge(alpha = 3.0)
lin_mod.fit(X_train,y_train)
print(lin_mod.intercept_)
print(lin_mod.coef_)
def print_metrics(y_true, y_predicted):
    # compute R^2 and the adjusted R^2 (n and p are taken from the global X_test)
    r2 = sklm.r2_score(y_true, y_predicted)
    n = X_test.shape[0]
    p = X_test.shape[1] - 1
    r2_adj = 1 - (1 - r2) * ((n - 1) / (n - p - 1))
    ## print the usual metrics and the R^2 values
    print('Mean Square Error = ' + str(sklm.mean_squared_error(y_true, y_predicted)))
    print('Root Mean Square Error = ' + str(math.sqrt(sklm.mean_squared_error(y_true, y_predicted))))
    print('Mean Absolute Error = ' + str(sklm.mean_absolute_error(y_true, y_predicted)))
    print('Median Absolute Error = ' + str(sklm.median_absolute_error(y_true, y_predicted)))
    print('R^2 = ' + str(r2))
    print('Adjusted R^2 = ' + str(r2_adj))
def print_evalute(y_true_, y_predicted_):
    # mean absolute percentage error (MAPE); assumes y_true_ contains no zeros
    errors = abs(y_predicted_ - y_true_)
    mape_ = 100 * np.mean(errors / y_true_)
    accuracy = 100 - mape_
    print('Model Performance')
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
scores= lin_mod.predict(X_test)
print_metrics(y_test, scores)
print_evalute(y_test, scores)
def hist_residue(y_test, y_score):
    ## compute the vector of residuals
    residue = np.subtract(y_test.reshape(-1,1), y_score.reshape(-1,1))
    # plot the distribution of residuals
    sns.distplot(residue)
    plt.title('Histogram of residuals')
    plt.xlabel('Residual value')
    plt.ylabel('Count')
    plt.show()
hist_residue(y_test,scores)
def plot_residue(y_test, y_score):
    ## compute the vector of residuals
    residue = np.subtract(y_test.reshape(-1,1), y_score.reshape(-1,1))
    # plot residuals against the predicted values
    sns.regplot(x=y_score, y=residue, fit_reg=False)
    plt.title('Residuals vs Predicted values')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.show()
plot_residue(y_test,scores)
The residuals are not normally distributed as expected, and there is a visible pattern at lower predicted values. This indicates the model does not generalize as well as expected.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
poly_mod= make_pipeline(PolynomialFeatures(4),
linear_model.LinearRegression())
poly_mod.fit(X_train,y_train)
scores = poly_mod.predict(X_test)
print_metrics(y_test,scores)
print_evalute(y_test, scores)
hist_residue(y_test,scores)
plot_residue(y_test,scores)
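One caveat worth checking: a degree-4 expansion of 14 input columns generates on the order of 3,000 derived features, which makes this pipeline slow and prone to overfitting; a quick sketch to see the count:
poly = PolynomialFeatures(4).fit(X_train)
print(poly.n_output_features_)  # roughly 3,000 for 14 inputs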
Comparing the polynomial features to plain linear regression, the polynomial model performs better: the R^2 and adjusted R^2 improve, and the residual histogram looks much closer to a normal distribution. Given this, I will explore other models to see how much further the fit can be improved.
# grid search to choose the best parameters for the GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
gbrt_mod= GradientBoostingRegressor(random_state=0)
param_grid = {
    'n_estimators': [10, 20, 30, 40, 50, 100, 200, 300, 500],
    'max_features': ['auto'],
    'max_depth': [1, 2, 4, 6, 8, 10],
    'learning_rate': [0.1],
    'subsample': [1]
}
gbrt_clf= ms.GridSearchCV(estimator=gbrt_mod,
param_grid=param_grid,
n_jobs=4,
cv=5,
scoring='neg_mean_squared_error')
gbrt_clf.fit(X_train,y_train)
print(gbrt_clf.best_score_)
print(gbrt_clf.best_params_)
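Since GridSearchCV refits the best estimator on the full training set by default (refit=True), the tuned model can also be evaluated directly instead of retyping the parameters; a minimal sketch:
print_metrics(y_test, gbrt_clf.predict(X_test))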
from sklearn.ensemble import GradientBoostingRegressor
gbrt_mod= GradientBoostingRegressor(n_estimators=200,
max_depth=4)
gbrt_mod.fit(X_train,y_train)
scores= gbrt_mod.predict(X_test)
print_metrics(y_test,scores)
print_evalute(y_test, scores)
hist_residue(y_test,scores)
plot_residue(y_test,scores)
from sklearn.neural_network import MLPRegressor
regressor_mod= MLPRegressor(hidden_layer_sizes= (100,),
activation= 'tanh',
learning_rate= 'adaptive',
max_iter=1000,
random_state=9,
learning_rate_init=0.001)
regressor_mod.fit(X_train, y_train)
scores= regressor_mod.predict(X_test)
print_metrics(y_test,scores)
print_evalute(y_test, scores)
hist_residue(y_test,scores)
plot_residue(y_test,scores)
from sklearn.ensemble import RandomForestRegressor
rf_Regressor_mod= RandomForestRegressor(n_estimators=40)
rf_Regressor_mod.fit(X_train, y_train)
scores= rf_Regressor_mod.predict(X_test)
print_metrics(y_test,scores)
print_evalute(y_test, scores)
hist_residue(y_test,scores)
plot_residue(y_test,scores)
# parameters
n_estimators = [int(x) for x in np.linspace(10, 500, 10)]  # number of trees in the random forest
max_features = ['auto', 'sqrt']  # number of features to consider at every split
max_depth = [int(x) for x in np.linspace(10, 100, 10)]  # maximum number of levels in a tree
max_depth.append(None)
min_samples_split = [2, 5, 10]  # minimum number of samples required to split a node
min_samples_leaf = [1, 2, 4]  # minimum number of samples required at each leaf node
bootstrap = [True, False]  # method of selecting samples for training each tree
param_distributions= {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
rf_Regressor_mod= RandomForestRegressor()
rf_Regressor_clf= ms.RandomizedSearchCV(estimator= rf_Regressor_mod,
param_distributions= param_distributions,
n_iter= 100,
cv=3,
random_state=42,
n_jobs=-1)
rf_Regressor_clf.fit(X_train,y_train)
print('\n')
print(rf_Regressor_clf.best_score_)
print(rf_Regressor_clf.best_params_)
from sklearn.ensemble import RandomForestRegressor
rf_Regressor_mod = RandomForestRegressor(n_estimators=227,
                                         min_samples_split=5,
                                         min_samples_leaf=1,
                                         max_features='auto',
                                         max_depth=10,
                                         bootstrap=True)
rf_Regressor_mod.fit(X_train, y_train)
scores= rf_Regressor_mod.predict(X_test)
print_metrics(y_test,scores)
print_evalute(y_test, scores)
hist_residue(y_test,scores)
plot_residue(y_test,scores)
From all the models above, the MLP regressor performs best overall compared to the other models, so it is used for the final predictions.
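To back this up, a compact side-by-side comparison of test-set RMSE, assuming the fitted model objects from the cells above are still in scope:
for name, model in [('Ridge', lin_mod), ('Polynomial', poly_mod),
                    ('Gradient boosting', gbrt_mod), ('MLP', regressor_mod),
                    ('Random forest', rf_Regressor_mod)]:
    rmse = math.sqrt(sklm.mean_squared_error(y_test, model.predict(X_test)))
    print('{:18s} RMSE = {:.3f}'.format(name, rmse))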
# importing the final test data
final= pd.read_csv('Data/AW_test.csv')
# checking if there are duplicates
print(final.shape)
print(final.CustomerID.unique().shape)
final['Age'] = generate_age(final,'%m/%d/%Y')
final[['Age','BirthDate']].head()
encoded = encode_cat_features(final)
numeric_final_features = np.array(final[['Age','YearlyIncome', 'NumberChildrenAtHome']])
final_test = np.concatenate([encoded,numeric_final_features], 1)
final_test[:,11:13]= scaler.transform(final_test[:,11:13])
final_scores= regressor_mod.predict(final_test)
np.savetxt('final_answer_regression.csv', final_scores, delimiter=',',fmt='%i')
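If the submission needs a CustomerID next to each prediction (an assumption about the expected format, not something stated above), a pandas alternative to np.savetxt, written to a separate file to avoid clobbering the one above:
submission = pd.DataFrame({'CustomerID': final['CustomerID'],
                           'AveMonthSpend': final_scores})
submission.to_csv('final_answer_regression_with_ids.csv', index=False)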