Kaggle Jane Street Market Prediction Competition with Fastai + PCA + XGBoost
- Introduction
- Installing libraries and downloading datasets
- Downloading datasets from Kaggle
- Loading data and preprocessing
- NN using fastai, feeding the model's output as input to another model
- PCA and normalization
- XGBoost and hyperparameter tuning
- Kaggle submission method
- Conclusion
- References
This notebook contains my solution (no submission has been made yet due to some challenges with my submission pipeline) to Kaggle's Jane Street Market Prediction challenge.
Introduction
Jane Street Market Prediction is a competition hosted on Kaggle in which you build a model to predict trading opportunities in a financial market. It is a binary classification task: for each test row, the model predicts an action of 0 (pass) or 1 (trade).
More about the competition can be found on the competition page.
Why This Competition?
Currently, I am reading Jeremy Howard and Sylvain Gugger's Deep Learning for Coders with fastai and PyTorch, and I entered this challenge to learn more about using neural networks to solve tabular problems.
Further, the competition rules caught my attention: your model is run over roughly one million test rows within five hours, which means it has to be optimized, efficient, and fast.
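To put that budget in numbers: 1,000,000 rows / (5 × 3,600 s) ≈ 56 rows per second, which is where the 50-60 iterations/sec target mentioned later in this notebook comes from.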
My Goal:
My major goal in 2021 is to solve more data science problems and to ship code to production, and I believe participating in Kaggle challenges will help me achieve that.
Not to waste your time, let's start.
%%capture
# installing libraries
!pip install fastai==2.1.2  # pin the version this notebook was written against; upgrading afterwards would override the pin
!pip install nbdev
!pip install numba
!pip install optuna
!pip install xgboost
!pip install kaggle
import sys
from fastai.tabular.all import *
import joblib
import numpy as np
import numpy.ma as ma
from numba import njit
import seaborn as sns
import matplotlib
matplotlib.rc('image', cmap='Greys')
from IPython.display import Image, display_svg, SVG
import time
from tqdm.notebook import tqdm
#import cudf # install cudf on google colab before running this.
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
pd.options.display.max_rows = 20
pd.options.display.max_columns = 8
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.inspection import plot_partial_dependence
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import optuna
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch
creds = '{"username":"xxxxxxxx","key":"xxxxxxxxxxxxx"}'
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if not cred_path.exists():
    cred_path.parent.mkdir(exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)
from kaggle import api
path = URLs.path('jane_street_prediction')
print(path)
Path.BASE_PATH = path
if not path.exists():
    path.mkdir(parents=True)
    api.competition_download_cli('jane-street-market-prediction', path=path)
    file_extract(path/'jane-street-market-prediction.zip')
path.ls(file_type='text')
!rm -rf /root/.fastai/archive/jane_street_prediction/jane-street-market-prediction.zip
"""
%%time
train_cudf = cudf.read_csv(path/'train.csv', nrows=250000)
df = train_cudf.to_pandas()
del train_cudf
features = pd.read_csv(path/'features.csv')
example_test = pd.read_csv(path/'example_test.csv')
sample_prediction_df = pd.read_csv(path/'example_sample_submission.csv')
print ("Data is loaded!")
"""
df = pd.read_csv(path/'train.csv', nrows=200000)
features = pd.read_csv(path/'features.csv')
example_test = pd.read_csv(path/'example_test.csv')
sample_prediction_df = pd.read_csv(path/'example_sample_submission.csv')
print('train shape is {}'.format(df.shape))
print('features shape is {}'.format(features.shape))
print('example_test shape is {}'.format(example_test.shape))
print('sample_prediction_df shape is {}'.format(sample_prediction_df.shape))
df.head()
# drop zero-weight rows, which do not contribute to the utility score
df = df[df['weight'] != 0].reset_index(drop=True)
dep_var = 'action'
# action = 1 when the weighted return is positive, else 0
df[dep_var] = (((df['resp'] * df['weight']) > 0) * 1).astype('category')
# drop the resp columns, which are not available at test time
df = df.loc[:, ~df.columns.str.contains('resp')]
features_one = [col for col in list(df.columns) if 'feature' in col]
features_nn = features_one + [dep_var]
len(features_one), len(features_nn)
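As a quick sanity check (my addition, not part of the original pipeline), it is worth confirming that the derived labels are roughly balanced before training:
df[dep_var].value_counts(normalize=True)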
Due to the nature of the competition, the feature-augmentation code below was not executed in my final submission, as it decreased the number of iterations/sec. Ideally, the submission loop should run at about 50-60 iterations/sec; with the augmentation included, throughput dropped to at most 7 iterations/sec (a way to measure this yourself is sketched after the function definition below).
It is also important to note that the accuracy of our model slightly increases when the augmentation is applied.
def augment_df(df, features):
    for feature in features:
        df[f'{feature}_sq'] = np.square(df[feature].to_numpy())
        #df[f'{feature}_log'] = df[feature].apply(lambda x: 0 if x==0 else np.log(x))
        # masked log: non-positive entries come back masked instead of raising
        df[f'{feature}_log'] = ma.log(df[feature].to_numpy())
    # row-wise summary statistics over all base features
    df['feature_min'] = df[features].min(axis=1)
    df['feature_mean'] = df[features].mean(axis=1)
    df['feature_max'] = df[features].max(axis=1)
    df['feature_median'] = df[features].median(axis=1)
    df['feature_std'] = df[features].std(axis=1)
    df['feature_var'] = df[features].var(axis=1)
    df['feature_abs_mean'] = df[features].abs().mean(axis=1)
    df['feature_abs_median'] = df[features].abs().median(axis=1)
    df['feature_abs_std'] = df[features].abs().std(axis=1)
    df['feature_skew'] = df[features].skew(axis=1)
    df['feature_kurt'] = df[features].kurt(axis=1)
    # use the `features` argument rather than the global `features_one`
    df['feature_sq_kurt'] = np.square(df[features].kurt(axis=1))
    return df
%time df = augment_df(df, features_one)
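Because the submission environment feeds the model one row at a time, the per-row cost of this function is what matters. A minimal timing sketch (my own addition, not part of the original notebook) that mimics the single-row calls made in the submission loop:
single_row = df.loc[[0], features_one].copy()
n_calls = 100
start = time.time()
for _ in range(n_calls):
    _ = augment_df(single_row.copy(), features_one)
print(f"~{n_calls / (time.time() - start):.1f} augmented rows/sec")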
features_nn = [col for col in list(df.columns) if 'feature' in col] + [dep_var]
feature_nn_test = [col for col in list(df.columns) if 'feature' in col]
len(feature_nn_test), len(features_nn)
df.shape
df_median = df.median()
# replacing missing values.
df = df.fillna(df_median)
df.isnull().sum().sum()
# assign the casts back; calling .astype() alone does not modify df in place
df[feature_nn_test] = df[feature_nn_test].astype('float32')
df[dep_var] = df[dep_var].astype('int')
# splitting continuous and categorical variables from the dataset
def cont_cat_split(df, max_card=20, dep_var=None):
    "Helper function that returns column names of cont and cat variables from given `df`."
    cont_names, cat_names = [], []
    for label in df:
        if label in L(dep_var): continue
        if (pd.api.types.is_integer_dtype(df[label].dtype) and
                df[label].unique().shape[0] > max_card or
                pd.api.types.is_float_dtype(df[label].dtype)):
            cont_names.append(label)
        else: cat_names.append(label)
    return cont_names, cat_names
cont_nn,cat_nn = cont_cat_split(df.loc[:,features_nn], max_card=9000, dep_var=dep_var)
def create_train_ds(df, cat, cont, y_name, splits):
    splits = RandomSplitter(valid_pct=0.1)(range_of(df))
    procs = [Categorify, FillMissing, Normalize]
    to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits)
    xs, y = to.train.xs.astype(np.float32), to.train.y.astype(np.float32)
    valid_xs, valid_y = to.valid.xs.astype(np.float32), to.valid.y.astype(np.float32)
    print(f"length of train {len(to.train)} and length of valid {len(to.valid)}")
    return to, xs, valid_xs, y, valid_y
def m_accuracy(m, xs, y):
    return accuracy_score(y.values, m.predict(xs))
def _feature_importance(m, df):
    return pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_}).sort_values('imp', ascending=False)
def plot_feature_importance(fi):
    return fi.plot('cols', 'imp', 'barh', figsize=(15,15), legend=False)
# renamed from `roc_auc` so it is not shadowed by the RocAucBinary metric defined below
def plot_roc_auc(m, xs, valid_xs, y, valid_y):
    y_proba = m.predict_proba(valid_xs)
    auc = roc_auc_score(valid_y, m.predict(valid_xs))
    fpr, tpr, thresholds = roc_curve(valid_y, y_proba[:, 1])
    plt.figure()
    plt.plot(fpr, tpr, label='(area = %0.2f)' % auc)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
# njit is already imported above
@njit
def fillna_npwhere(array, values):
    # jit-compiled NaN fill: only rewrites the array when NaNs are present
    if np.isnan(array.sum()):
        array = np.where(np.isnan(array), values, array)
    return array
# Note: despite the shift-by-999 trick, this fills missing values with 0, not
# the medians: (x - 999) stays NaN for missing entries, the njit fill sets
# them to -999, and adding 999 back leaves NaNs at exactly 0.
def fill_nan(test_values, features):
    na_arr = np.ones((1, len(feature_nn_test)), dtype=np.float32)
    na_arr = na_arr * 999
    xar = test_values - na_arr
    #xar = np.nan_to_num(xar, nan=-999)
    xar = fillna_npwhere(xar, -999)
    xar = xar + na_arr
    test_df = torch.FloatTensor(xar)
    return test_df
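A quick sanity check (my addition) that the fill really produces zeros where the input was NaN:
demo = np.full((1, len(feature_nn_test)), np.nan, dtype=np.float32)
demo[0, 0] = 1.5
fill_nan(demo, feature_nn_test)[0, :3]  # tensor([1.5000, 0.0000, 0.0000])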
splits = RandomSplitter(valid_pct=0.1)(range_of(df))
xs, valid_xs = df.loc[splits[0], feature_nn_test], df.loc[splits[1], feature_nn_test]
y, valid_y = df.loc[splits[0], dep_var], df.loc[splits[1], dep_var]
class Fastai_dls():
    # minimal Dataset wrapper that mimics fastai's (cats, conts, y) batch layout
    def __init__(self, df, cats=None, y=None):
        df = df.copy()
        if cats is not None:
            self.dfcats = df[cats]
            self.cats = np.stack([c.values for n, c in self.dfcats.items()], axis=1).astype(np.int64)
            self.dfconts = df.drop(cats, axis=1)
            self.conts = np.stack([c.values for n, c in self.dfconts.items()], axis=1).astype(np.float32)
        else:
            self.dfconts = df.copy()
            self.conts = np.stack([c.values for n, c in self.dfconts.items()], axis=1).astype(np.float32)
            self.cats = np.zeros(len(df),).astype(np.int64)
        self.y = y.values
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return [self.cats[idx], self.conts[idx], self.y[idx]]
train_ds = Fastai_dls(df=xs, y=y)
valid_ds = Fastai_dls(df=valid_xs, y=valid_y)
train_dl = DataLoader(train_ds, batch_size = 4096, drop_last=True, shuffle=False)
valid_dl = DataLoader(valid_ds, batch_size = 2048, drop_last=True, shuffle=False)
dls = DataLoaders(train_dl, valid_dl, device='cuda')
dls.cats.shape, dls.conts.shape, dls.y.shape
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
loss_func = CrossEntropyLossFlat()
roc_auc = RocAucBinary()
cbs = [GradientAccumulation(),
EarlyStoppingCallback(monitor='accuracy', comp=np.greater, min_delta=0.01, patience=4),
SaveModelCallback(monitor='accuracy', comp=np.greater, min_delta=0.01),
ReduceLROnPlateau(monitor='accuracy', comp=np.greater, min_delta=0.01, patience=2)]
class JaneStreet(Module):
    def __init__(self):
        dropout_rate = 0.2
        hidden_size = 256
        n = len(feature_nn_test)
        self.batch_norm0 = nn.BatchNorm1d(n)
        self.dropout0 = nn.Dropout(dropout_rate)
        self.dense1 = nn.Linear(n, hidden_size)
        self.batch_norm1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dense2 = nn.Linear(hidden_size + n, hidden_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.dense3 = nn.Linear(hidden_size + hidden_size, hidden_size)
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(dropout_rate)
        self.dense4 = nn.Linear(hidden_size + hidden_size, hidden_size)
        self.batch_norm4 = nn.BatchNorm1d(hidden_size)
        self.dropout4 = nn.Dropout(dropout_rate)
        self.dense5 = nn.Linear(hidden_size + hidden_size, 2)
        self.LeakyReLU = nn.LeakyReLU(negative_slope=0.1, inplace=True)
        # bottleneck stem that maps the n inputs through wider layers and back to n
        self.layers = nn.Sequential(
            nn.BatchNorm1d(n),
            LinBnDrop(n, 400, bn=True, p=0, act=Mish(), lin_first=False),
            LinBnDrop(400, 800, bn=True, p=0.2289, act=Mish(), lin_first=False),
            LinBnDrop(800, 400, bn=True, p=0.2289, act=Mish(), lin_first=False),
            LinBnDrop(400, n, bn=False, act=None, lin_first=False),)
    def forward(self, cat, x):
        # `cat` is the (unused) categorical tensor our batches provide
        x = self.layers(x)
        x = self.batch_norm0(x)
        x = self.dropout0(x)
        x1 = self.dense1(x)
        x1 = self.batch_norm1(x1)
        x1 = self.LeakyReLU(x1)
        x1 = self.dropout1(x1)
        # dense blocks are joined with concatenation skip connections
        x = torch.cat([x, x1], 1)
        x2 = self.dense2(x)
        x2 = self.batch_norm2(x2)
        x2 = self.LeakyReLU(x2)
        x2 = self.dropout2(x2)
        x = torch.cat([x1, x2], 1)
        # the deeper dense3/dense4 blocks are kept but disabled
        #x3 = self.dense3(x)
        #x3 = self.batch_norm3(x3)
        #x3 = self.LeakyReLU(x3)
        #x3 = self.dropout3(x3)
        #x = torch.cat([x2, x3], 1)
        #x4 = self.dense4(x)
        #x4 = self.batch_norm4(x4)
        #x4 = self.LeakyReLU(x4)
        #x4 = self.dropout4(x4)
        #x = torch.cat([x3, x4], 1)
        x = self.dense5(x)
        # return raw logits: CrossEntropyLossFlat applies log-softmax itself,
        # so an explicit F.softmax here would squash the loss gradients
        return x
model_nn = JaneStreet()
model_nn = model_nn.to(device)
learn = Learner(dls, model_nn, loss_func = loss_func, metrics=[accuracy,roc_auc]).to_fp32()
learn.lr_find()
learn.fit_one_cycle(20, 1e-2, cbs=cbs)
learn.fit_one_cycle(20, 1e-3, wd = 0.0001, cbs=cbs)
learn.fit_one_cycle(20,slice(1e-03),wd = 0.0001, cbs=cbs)
learn.model.eval()  # switch BatchNorm/Dropout to inference behavior before predicting
%time X_test = fill_nan(df.loc[:,features_nn].drop(dep_var, axis=1).values, feature_nn_test).cuda()
%time preds = learn.model(0, X_test).argmax(dim=1).detach().cpu().numpy()
df['feature_fastai'] = preds
df.head()
df.isnull().sum().sum()
features = [col for col in list(df.columns) if 'feature' in col]
len(features)
df_median = df.median()
X = df.loc[:,features]
y = df.loc[:, 'action']
xs, valid_xs, y, valid_y = train_test_split(X, y, test_size=0.1, random_state = 42)
scaler = StandardScaler()
scaler.fit(xs)
xs_norm = scaler.transform(xs)
pca = PCA()
comp = pca.fit(xs_norm)
plt.plot(np.cumsum(comp.explained_variance_ratio_))
plt.grid()
plt.xlabel('Number of PC')
plt.ylabel('Explained Variance')
sns.despine()
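If you prefer to pick the component count programmatically rather than reading it off the plot, here is a small sketch (the 95% variance target is my illustrative choice):
cum_var = np.cumsum(comp.explained_variance_ratio_)
n_components_95 = int(np.searchsorted(cum_var, 0.95) + 1)  # smallest count reaching 95%
print(n_components_95)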
# 175 components capture most of the explained variance in the plot above
pca = PCA(n_components=175).fit(xs_norm)
xs_transform = pca.transform(xs_norm)
# transform the validation set with the scaler and PCA fitted on the training set
valid_xs_transform = pca.transform(scaler.transform(valid_xs))
dtrain = xgb.DMatrix(xs_transform, label=y)
dvalid = xgb.DMatrix(valid_xs_transform, label=valid_y)
def objective(trial):
    # search space for hyperparameter tuning
    params = {'max_depth': trial.suggest_int('max_depth', 10, 20),
              'learning_rate': trial.suggest_uniform('learning_rate', 0.01, .1),
              'subsample': trial.suggest_uniform('subsample', 0.50, 1),
              'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.50, 1),
              'gamma': trial.suggest_int('gamma', 0, 10),
              'tree_method': 'gpu_hist',
              'objective': 'binary:logistic'}
    # xgb.train ignores an `n_estimators` entry in params, so pass the
    # number of boosting rounds explicitly
    num_boost_round = trial.suggest_int('n_estimators', 400, 600)
    best = xgb.train(params, dtrain, num_boost_round=num_boost_round)
    preds = np.rint(best.predict(dvalid))
    accuracy = accuracy_score(valid_y, preds)
    return accuracy
# the objective returns accuracy, so the study must maximize (the default is minimize)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)
best_params = study.best_trial.params
best_params['tree_method'] = 'gpu_hist'
best_params['objective'] = 'binary:logistic'
best_params['missing'] = -999
best_params['random_state'] = 2020
clf = xgb.XGBClassifier(**best_params)
%time clf.fit(xs_transform, y)
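Since joblib is already imported, the fitted preprocessing steps and classifier can be persisted for reuse in a submission kernel (a sketch; the file names are my own):
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(pca, 'pca.pkl')
joblib.dump(clf, 'xgb_clf.pkl')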
fig = optuna.visualization.plot_optimization_history(study)
fig.show()
fig = optuna.visualization.plot_param_importances(study)
fig.show()
m_accuracy(clf, xs_transform, y), m_accuracy(clf, valid_xs_transform, valid_y)
cm = confusion_matrix(valid_y, clf.predict(valid_xs_transform))
cm
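The ROC plotting helper defined earlier can now be used to visualize the tuned classifier on the validation set:
plot_roc_auc(clf, xs_transform, valid_xs_transform, y, valid_y)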
del dvalid, dtrain
from tqdm import tqdm
import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set
for (test_df, pred_df) in tqdm(iter_test):
    wt = test_df.iloc[0].weight
    if wt == 0:
        pred_df.action = 0
    else:
        test_df = augment_df(test_df, features_one)
        #X_test = fill_nan(test_df).cuda()
        # select the NN's feature columns so the shape matches what fill_nan expects
        X_test = fill_nan(test_df[feature_nn_test].values.astype(np.float32), feature_nn_test).cuda()
        preds = learn.model(0, X_test).argmax(dim=1).detach().cpu().numpy()
        test_df['feature_fastai'] = preds
        pred_df.action = clf.predict(pca.transform(scaler.transform(
            fillna_npwhere(test_df[features].values, df_median[features].values))), validate_features=False)
    env.predict(pred_df)
References:
- https://www.kaggle.com/ahmedelhaddad/fastai-submission-custom-ds
- Collaborative filtering tutorial: https://docs.fast.ai/tutorial.collab