Kaggle Jane Street Market Prediction Competition with Fastai + PCA + XGBoost
- Introduction
- Installing libraries and downloading datasets
- Downloading datasets from Kaggle
- Loading data and preprocessing
- NN using fastai, feeding the model's output as input to another model
- PCA and normalization
- XGBoost and hyperparameter tuning
- Kaggle submission method
- Conclusion
- References
This notebook contains my solution (no submission has been made yet due to some challenges with my submission pipeline) to Kaggle's Jane Street Market Prediction challenge.
Introduction
Jane Street Market Prediction is a competition hosted on Kaggle in which you build a model to predict trading opportunities in a financial market. It is a binary classification task: for each test row, the model predicts an action of 0 (pass) or 1 (trade).
More about the competition can be found on the competition page.
Why This Competition?
Currently, I am reading Jeremy Howard and Sylvain Gugger's Deep Learning for Coders with fastai and PyTorch, and I entered this challenge to learn more about using neural networks to solve tabular problems.
Further, the competition rules caught my attention: your model is run over roughly one million test rows within five hours, which means it has to be optimized, efficient, and fast.
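To put that budget in numbers: 1,000,000 rows / (5 × 3,600 s) ≈ 56 rows per second, which is where the 50-60 iterations/sec target mentioned later in this notebook comes from.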
My Goal:
My major goal in 2021 is to solve more data science problems and to ship code to production, and I believe participating in Kaggle challenges will help me achieve that.
Not to waste your time, let's start.
%%capture
# installing libraries
!pip install fastai==2.1.2  # pin the version this notebook was written against; upgrading afterwards would override the pin
!pip install nbdev
!pip install numba
!pip install optuna
!pip install xgboost
!pip install kaggle
import sys
from fastai.tabular.all import *
import joblib
import numpy as np
import numpy.ma as ma
from numba import njit
import seaborn as sns
import matplotlib
matplotlib.rc('image', cmap='Greys')
from IPython.display import Image, display_svg, SVG
import time
from tqdm.notebook import tqdm
#import cudf # install cudf on google colab before running this.
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
pd.options.display.max_rows = 20
pd.options.display.max_columns = 8
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.inspection import plot_partial_dependence
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import optuna
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch
creds = '{"username":"xxxxxxxx","key":"xxxxxxxxxxxxx"}'
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if not cred_path.exists():
    cred_path.parent.mkdir(exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)
from kaggle import api
path = URLs.path('jane_street_prediction')
print(path)
Path.BASE_PATH = path
if not path.exists():
    path.mkdir(parents=True)
    api.competition_download_cli('jane-street-market-prediction', path=path)
    file_extract(path/'jane-street-market-prediction.zip')
path.ls(file_type='text')
!rm -rf /root/.fastai/archive/jane_street_prediction/jane-street-market-prediction.zip
"""
%%time
train_cudf = cudf.read_csv(path/'train.csv', nrows=250000)
df = train_cudf.to_pandas()
del train_cudf
features = pd.read_csv(path/'features.csv')
example_test = pd.read_csv(path/'example_test.csv')
sample_prediction_df = pd.read_csv(path/'example_sample_submission.csv')
print ("Data is loaded!")
"""
df = pd.read_csv(path/'train.csv', nrows=200000)
features = pd.read_csv(path/'features.csv')
example_test = pd.read_csv(path/'example_test.csv')
sample_prediction_df = pd.read_csv(path/'example_sample_submission.csv')
print('train shape is {}'.format(df.shape))
print('features shape is {}'.format(features.shape))
print('example_test shape is {}'.format(example_test.shape))
print('sample_prediction_df shape is {}'.format(sample_prediction_df.shape))
df.head()
# drop zero-weight rows, which do not contribute to the utility score
df = df[df['weight'] != 0].reset_index(drop=True)
dep_var = 'action'
# action = 1 when the weighted return is positive, else 0
df[dep_var] = (((df['resp'] * df['weight']) > 0) * 1).astype('category')
# drop the resp columns, which are not available at test time
df = df.loc[:, ~df.columns.str.contains('resp')]
features_one = [col for col in list(df.columns) if 'feature' in col]
features_nn = features_one + [dep_var]
len(features_one), len(features_nn)
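As a quick sanity check (my addition, not part of the original pipeline), it is worth confirming that the derived labels are roughly balanced before training:
df[dep_var].value_counts(normalize=True)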
Due to the nature of the competition, the feature-augmentation code below was not executed in my final submission, as it decreased the number of iterations/sec. Ideally, the submission loop should run at about 50-60 iterations/sec; with the augmentation included, throughput dropped to at most 7 iterations/sec (a way to measure this yourself is sketched after the function definition below).
It is also important to note that the accuracy of our model slightly increases when the augmentation is applied.
def augment_df(df, features):
    for feature in features:
        df[f'{feature}_sq'] = np.square(df[feature].to_numpy())
        #df[f'{feature}_log'] = df[feature].apply(lambda x: 0 if x==0 else np.log(x))
        # masked log: non-positive entries come back masked instead of raising
        df[f'{feature}_log'] = ma.log(df[feature].to_numpy())
    # row-wise summary statistics over all base features
    df['feature_min'] = df[features].min(axis=1)
    df['feature_mean'] = df[features].mean(axis=1)
    df['feature_max'] = df[features].max(axis=1)
    df['feature_median'] = df[features].median(axis=1)
    df['feature_std'] = df[features].std(axis=1)
    df['feature_var'] = df[features].var(axis=1)
    df['feature_abs_mean'] = df[features].abs().mean(axis=1)
    df['feature_abs_median'] = df[features].abs().median(axis=1)
    df['feature_abs_std'] = df[features].abs().std(axis=1)
    df['feature_skew'] = df[features].skew(axis=1)
    df['feature_kurt'] = df[features].kurt(axis=1)
    # use the `features` argument rather than the global `features_one`
    df['feature_sq_kurt'] = np.square(df[features].kurt(axis=1))
    return df
%time df = augment_df(df, features_one)
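Because the submission environment feeds the model one row at a time, the per-row cost of this function is what matters. A minimal timing sketch (my own addition, not part of the original notebook) that mimics the single-row calls made in the submission loop:
single_row = df.loc[[0], features_one].copy()
n_calls = 100
start = time.time()
for _ in range(n_calls):
    _ = augment_df(single_row.copy(), features_one)
print(f"~{n_calls / (time.time() - start):.1f} augmented rows/sec")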
features_nn = [col for col in list(df.columns) if 'feature' in col] + [dep_var]
feature_nn_test = [col for col in list(df.columns) if 'feature' in col]
len(feature_nn_test), len(features_nn)
df.shape
df_median = df.median()
# replacing missing values.
df = df.fillna(df_median)
df.isnull().sum().sum()
# assign the casts back; calling .astype() alone does not modify df in place
df[feature_nn_test] = df[feature_nn_test].astype('float32')
df[dep_var] = df[dep_var].astype('int')
# splitting continuous and categorical variables from the dataset
def cont_cat_split(df, max_card=20, dep_var=None):
    "Helper function that returns column names of cont and cat variables from given `df`."
    cont_names, cat_names = [], []
    for label in df:
        if label in L(dep_var): continue
        if (pd.api.types.is_integer_dtype(df[label].dtype) and
                df[label].unique().shape[0] > max_card or
                pd.api.types.is_float_dtype(df[label].dtype)):
            cont_names.append(label)
        else: cat_names.append(label)
    return cont_names, cat_names
cont_nn,cat_nn = cont_cat_split(df.loc[:,features_nn], max_card=9000, dep_var=dep_var)
def create_train_ds(df, cat, cont, y_name, splits):
    splits = RandomSplitter(valid_pct=0.1)(range_of(df))
    procs = [Categorify, FillMissing, Normalize]
    to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits)
    xs, y = to.train.xs.astype(np.float32), to.train.y.astype(np.float32)
    valid_xs, valid_y = to.valid.xs.astype(np.float32), to.valid.y.astype(np.float32)
    print(f"length of train {len(to.train)} and length of valid {len(to.valid)}")
    return to, xs, valid_xs, y, valid_y
def m_accuracy(m, xs, y):
    return accuracy_score(y.values, m.predict(xs))
def _feature_importance(m, df):
    return pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_}).sort_values('imp', ascending=False)
def plot_feature_importance(fi):
    return fi.plot('cols', 'imp', 'barh', figsize=(15,15), legend=False)
# renamed from `roc_auc` so it is not shadowed by the RocAucBinary metric defined below
def plot_roc_auc(m, xs, valid_xs, y, valid_y):
    y_proba = m.predict_proba(valid_xs)
    auc = roc_auc_score(valid_y, m.predict(valid_xs))
    fpr, tpr, thresholds = roc_curve(valid_y, y_proba[:, 1])
    plt.figure()
    plt.plot(fpr, tpr, label='(area = %0.2f)' % auc)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
# njit is already imported above
@njit
def fillna_npwhere(array, values):
    # jit-compiled NaN fill: only rewrites the array when NaNs are present
    if np.isnan(array.sum()):
        array = np.where(np.isnan(array), values, array)
    return array
# Note: despite the shift-by-999 trick, this fills missing values with 0, not
# the medians: (x - 999) stays NaN for missing entries, the njit fill sets
# them to -999, and adding 999 back leaves NaNs at exactly 0.
def fill_nan(test_values, features):
    na_arr = np.ones((1, len(feature_nn_test)), dtype=np.float32)
    na_arr = na_arr * 999
    xar = test_values - na_arr
    #xar = np.nan_to_num(xar, nan=-999)
    xar = fillna_npwhere(xar, -999)
    xar = xar + na_arr
    test_df = torch.FloatTensor(xar)
    return test_df
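A quick sanity check (my addition) that the fill really produces zeros where the input was NaN:
demo = np.full((1, len(feature_nn_test)), np.nan, dtype=np.float32)
demo[0, 0] = 1.5
fill_nan(demo, feature_nn_test)[0, :3]  # tensor([1.5000, 0.0000, 0.0000])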
splits = RandomSplitter(valid_pct=0.1)(range_of(df))
xs, valid_xs = df.loc[splits[0], feature_nn_test], df.loc[splits[1], feature_nn_test]
y, valid_y = df.loc[splits[0], dep_var], df.loc[splits[1], dep_var]
class Fastai_dls():
    # minimal Dataset wrapper that mimics fastai's (cats, conts, y) batch layout
    def __init__(self, df, cats=None, y=None):
        df = df.copy()
        if cats is not None:
            self.dfcats = df[cats]
            self.cats = np.stack([c.values for n, c in self.dfcats.items()], axis=1).astype(np.int64)
            self.dfconts = df.drop(cats, axis=1)
            self.conts = np.stack([c.values for n, c in self.dfconts.items()], axis=1).astype(np.float32)
        else:
            self.dfconts = df.copy()
            self.conts = np.stack([c.values for n, c in self.dfconts.items()], axis=1).astype(np.float32)
            self.cats = np.zeros(len(df),).astype(np.int64)
        self.y = y.values
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return [self.cats[idx], self.conts[idx], self.y[idx]]
train_ds = Fastai_dls(df=xs, y=y)
valid_ds = Fastai_dls(df=valid_xs, y=valid_y)
train_dl = DataLoader(train_ds, batch_size = 4096, drop_last=True, shuffle=False)
valid_dl = DataLoader(valid_ds, batch_size = 2048, drop_last=True, shuffle=False)
dls = DataLoaders(train_dl, valid_dl, device='cuda')
dls.cats.shape, dls.conts.shape, dls.y.shape
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
loss_func = CrossEntropyLossFlat()
roc_auc = RocAucBinary()
cbs = [GradientAccumulation(),
EarlyStoppingCallback(monitor='accuracy', comp=np.greater, min_delta=0.01, patience=4),
SaveModelCallback(monitor='accuracy', comp=np.greater, min_delta=0.01),
ReduceLROnPlateau(monitor='accuracy', comp=np.greater, min_delta=0.01, patience=2)]
class JaneStreet(Module):
    def __init__(self):
        dropout_rate = 0.2
        hidden_size = 256
        n = len(feature_nn_test)
        self.batch_norm0 = nn.BatchNorm1d(n)
        self.dropout0 = nn.Dropout(dropout_rate)
        self.dense1 = nn.Linear(n, hidden_size)
        self.batch_norm1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dense2 = nn.Linear(hidden_size + n, hidden_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.dense3 = nn.Linear(hidden_size + hidden_size, hidden_size)
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.dense4 = nn.Linear(hidden_size + hidden_size, hidden_size)
        self.batch_norm4 = nn.BatchNorm1d(hidden_size)
        self.dropout4 = nn.Dropout(dropout_rate)
        self.dense5 = nn.Linear(hidden_size + hidden_size, 2)
        self.LeakyReLU = nn.LeakyReLU(negative_slope=0.1, inplace=True)
        # bottleneck stem that maps the n inputs through wider layers and back to n
        self.layers = nn.Sequential(
            nn.BatchNorm1d(n),
            LinBnDrop(n, 400, bn=True, p=0, act=Mish(), lin_first=False),
            LinBnDrop(400, 800, bn=True, p=0.2289, act=Mish(), lin_first=False),
            LinBnDrop(800, 400, bn=True, p=0.2289, act=Mish(), lin_first=False),
            LinBnDrop(400, n, bn=False, act=None, lin_first=False),)
    def forward(self, cat, x):
        # `cat` is the (unused) categorical tensor our batches provide
        x = self.layers(x)
        x = self.batch_norm0(x)
        x = self.dropout0(x)
        x1 = self.dense1(x)
        x1 = self.batch_norm1(x1)
        x1 = self.LeakyReLU(x1)
        x1 = self.dropout1(x1)
        # dense blocks are joined with concatenation skip connections
        x = torch.cat([x, x1], 1)
        x2 = self.dense2(x)
        x2 = self.batch_norm2(x2)
        x2 = self.LeakyReLU(x2)
        x2 = self.dropout2(x2)
        x = torch.cat([x1, x2], 1)
        # the deeper dense3/dense4 blocks are kept but disabled
        #x3 = self.dense3(x)
        #x3 = self.batch_norm3(x3)
        #x3 = self.LeakyReLU(x3)
        #x3 = self.dropout3(x3)
        #x = torch.cat([x2, x3], 1)
        #x4 = self.dense4(x)
        #x4 = self.batch_norm4(x4)
        #x4 = self.LeakyReLU(x4)
        #x4 = self.dropout4(x4)
        #x = torch.cat([x3, x4], 1)
        x = self.dense5(x)
        # return raw logits: CrossEntropyLossFlat applies log-softmax itself,
        # so an explicit F.softmax here would squash the loss gradients
        return x
model_nn = JaneStreet()
model_nn = model_nn.to(device)
learn = Learner(dls, model_nn, loss_func = loss_func, metrics=[accuracy,roc_auc]).to_fp32()
learn.lr_find()
learn.fit_one_cycle(20, 1e-2, cbs=cbs)
learn.fit_one_cycle(20, 1e-3, wd = 0.0001, cbs=cbs)
learn.fit_one_cycle(20,slice(1e-03),wd = 0.0001, cbs=cbs)
learn.model.eval()  # switch BatchNorm/Dropout to inference behavior before predicting
%time X_test = fill_nan(df.loc[:,features_nn].drop(dep_var, axis=1).values, feature_nn_test).cuda()
%time preds = learn.model(0, X_test).argmax(dim=1).detach().cpu().numpy()
df['feature_fastai'] = preds
df.head()
df.isnull().sum().sum()
features = [col for col in list(df.columns) if 'feature' in col]
len(features)
df_median = df.median()
X = df.loc[:,features]
y = df.loc[:, 'action']
xs, valid_xs, y, valid_y = train_test_split(X, y, test_size=0.1, random_state = 42)
scaler = StandardScaler()
scaler.fit(xs)
xs_norm = scaler.transform(xs)
pca = PCA()
comp = pca.fit(xs_norm)
plt.plot(np.cumsum(comp.explained_variance_ratio_))
plt.grid()
plt.xlabel('Number of PC')
plt.ylabel('Explained Variance')
sns.despine()
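If you prefer to pick the component count programmatically rather than reading it off the plot, here is a small sketch (the 95% variance target is my illustrative choice):
cum_var = np.cumsum(comp.explained_variance_ratio_)
n_components_95 = int(np.searchsorted(cum_var, 0.95) + 1)  # smallest count reaching 95%
print(n_components_95)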
# 175 components capture most of the explained variance in the plot above
pca = PCA(n_components=175).fit(xs_norm)
xs_transform = pca.transform(xs_norm)
# transform the validation set with the scaler and PCA fitted on the training set
valid_xs_transform = pca.transform(scaler.transform(valid_xs))
dtrain = xgb.DMatrix(xs_transform, label=y)
dvalid = xgb.DMatrix(valid_xs_transform, label=valid_y)
def objective(trial):
    # search space for hyperparameter tuning
    params = {'max_depth': trial.suggest_int('max_depth', 10, 20),
              'learning_rate': trial.suggest_uniform('learning_rate', 0.01, .1),
              'subsample': trial.suggest_uniform('subsample', 0.50, 1),
              'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.50, 1),
              'gamma': trial.suggest_int('gamma', 0, 10),
              'tree_method': 'gpu_hist',
              'objective': 'binary:logistic'}
    # xgb.train ignores an `n_estimators` entry in params, so pass the
    # number of boosting rounds explicitly
    num_boost_round = trial.suggest_int('n_estimators', 400, 600)
    best = xgb.train(params, dtrain, num_boost_round=num_boost_round)
    preds = np.rint(best.predict(dvalid))
    accuracy = accuracy_score(valid_y, preds)
    return accuracy
# the objective returns accuracy, so the study must maximize (the default is minimize)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)
best_params = study.best_trial.params
best_params['tree_method'] = 'gpu_hist'
best_params['objective'] = 'binary:logistic'
best_params['missing'] = -999
best_params['random_state'] = 2020
clf = xgb.XGBClassifier(**best_params)
%time clf.fit(xs_transform, y)
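Since joblib is already imported, the fitted preprocessing steps and classifier can be persisted for reuse in a submission kernel (a sketch; the file names are my own):
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(pca, 'pca.pkl')
joblib.dump(clf, 'xgb_clf.pkl')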
fig = optuna.visualization.plot_optimization_history(study)
fig.show()
fig = optuna.visualization.plot_param_importances(study)
fig.show()
m_accuracy(clf, xs_transform, y), m_accuracy(clf, valid_xs_transform, valid_y)
cm = confusion_matrix(valid_y, clf.predict(valid_xs_transform))
cm
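The ROC plotting helper defined earlier can now be used to visualize the tuned classifier on the validation set:
plot_roc_auc(clf, xs_transform, valid_xs_transform, y, valid_y)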
del dvalid, dtrain
from tqdm import tqdm
import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set
for (test_df, pred_df) in tqdm(iter_test):
    wt = test_df.iloc[0].weight
    if wt == 0:
        pred_df.action = 0
    else:
        test_df = augment_df(test_df, features_one)
        #X_test = fill_nan(test_df).cuda()
        # select the NN's feature columns so the shape matches what fill_nan expects
        X_test = fill_nan(test_df[feature_nn_test].values.astype(np.float32), feature_nn_test).cuda()
        preds = learn.model(0, X_test).argmax(dim=1).detach().cpu().numpy()
        test_df['feature_fastai'] = preds
        pred_df.action = clf.predict(pca.transform(scaler.transform(
            fillna_npwhere(test_df[features].values, df_median[features].values))), validate_features=False)
    env.predict(pred_df)
References:
- https://www.kaggle.com/ahmedelhaddad/fastai-submission-custom-ds
- Collaborative filtering tutorial: https://docs.fast.ai/tutorial.collab