Hi @mdo, thank you for your reply. Yes, of course, I was referring to time-series cross-validation (otherwise there wouldn't be any issue). I will take a look at the code mentioned by @objectscience.
I made a subclass of BorutaShap that splits the time series era-wise. It runs, but I'm not sure about the results, which don't seem entirely consistent over time (unfortunately, running the selection on the whole dataset without sampling takes too long on my machine to test it deeply; I expect the results to be the same over multiple runs, although with sampling that isn't guaranteed given the random process). I changed the original code as little as possible, but I added early stopping to the gradient boosting models to improve reliability and speed (and fixed an issue in sampling that could lead to an infinite loop; I made a PR for it). If someone wants to try it, here is the code:
from BorutaShap import BorutaShap
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from scipy.stats import ks_2samp
from sklearn.ensemble import IsolationForest
class BorutaShapNumerai(BorutaShap):
"""
Subclass of BorutaShap to be used in Numerai tournament
It will split the data era wise.
To be used with train_or_test = 'test' (no matter what you choose, it will always be 'test')
Sampling should work fine, era wise, and is highly recommended
given the time and RAM needed for full training
"""
def Check_if_chose_train_or_test_and_train_model(self):
"""
Decides to fit the model to either the training data or the test/unseen data a great discussion on the
differences can be found here.
https://compstat-lmu.github.io/iml_methods_limitations/pfi-data.html#introduction-to-test-vs.training-data
"""
if self.stratify is not None and not self.classification:
            raise ValueError('Cannot take a stratified sample from a continuous variable; please bucket the variable and try again!')
if self.train_or_test.lower() == 'test':
            # keeping the same naming convention so as not to add complexity later on
self.X_boruta_train, self.X_boruta_test, self.y_train, self.y_test, self.w_train, self.w_test = self.train_test_split(self.X_boruta,
self.y,
self.sample_weight,
test_size=0.3,
random_state=self.random_state)
# Pass also X_test and y_test to allow early stopping
self.Train_model(X=self.X_boruta_train, y=self.y_train, sample_weight = self.w_train,
X_test=self.X_boruta_test, y_test=self.y_test, eval_sample_weight=self.w_test)
elif self.train_or_test.lower() == 'train':
# model will be trained and evaluated on the same data
self.Train_model(self.X_boruta, self.y, sample_weight = self.sample_weight)
self.X_boruta_test = self.X_boruta
else:
raise ValueError('The train_or_test parameter can only be "train" or "test"')
def train_test_split(self, X, y, sample_weight=None, test_size=0.3, random_state=None):
        # era-wise split: the boolean cutoff mask keeps the first ~70% of eras
        # for training and the remaining eras for testing
        X_boruta_train = X[self.cutoff]
        X_boruta_test = X[~self.cutoff]
        y_train = y[self.cutoff]
        y_test = y[~self.cutoff]
        # split the sample weights with the same mask so they stay aligned with the rows
        if sample_weight is not None:
            w_train = sample_weight[self.cutoff]
            w_test = sample_weight[~self.cutoff]
        else:
            w_train = None
            w_test = None
        return X_boruta_train, X_boruta_test, y_train, y_test, w_train, w_test
def Train_model(self, X, y, sample_weight = None, X_test=None, y_test=None, eval_sample_weight=None):
"""
Trains Model also checks to see if the model is an instance of catboost as it needs extra parameters
also the try except is for models with a verbose statement
Parameters
----------
X: Dataframe
A pandas dataframe of the features.
y: Series/ndarray
A pandas series or numpy ndarray of the target
sample_weight: Series/ndarray
A pandas series or numpy ndarray of the sample weights
Returns
----------
fitted model object
"""
# Use early stopping when possible (that is in test mode; in train mode early stopping is not available)
        if 'catboost' in str(type(self.model)).lower():
            if X_test is not None and y_test is not None:
                self.model.fit(X, y, sample_weight=sample_weight, cat_features=self.X_categorical,
                               eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=False)
            else:
                self.model.fit(X, y, sample_weight=sample_weight, cat_features=self.X_categorical, verbose=False)
        elif 'lightgbm' in str(type(self.model)).lower():
            if X_test is not None and y_test is not None:
                self.model.fit(X, y, sample_weight=sample_weight,
                               eval_set=[(X_test, y_test)], early_stopping_rounds=100,
                               eval_sample_weight=[eval_sample_weight], verbose=False)
            else:
                self.model.fit(X, y, sample_weight=sample_weight, verbose=False)
        elif 'xgboost' in str(type(self.model)).lower():
            if X_test is not None and y_test is not None:
                # XGBoost's sklearn API calls this parameter sample_weight_eval_set
                self.model.fit(X, y, sample_weight=sample_weight,
                               eval_set=[(X_test, y_test)], early_stopping_rounds=100,
                               sample_weight_eval_set=[eval_sample_weight], verbose=False)
            else:
                self.model.fit(X, y, sample_weight=sample_weight, verbose=False)
else:
            try:
                self.model.fit(X, y, sample_weight=sample_weight, verbose=False)
            except TypeError:
                # the model's fit method has no verbose argument
                self.model.fit(X, y, sample_weight=sample_weight)
@staticmethod
def create_cutoff(df):
        # Eras do not all have the same size, so to find the train/test boundary
        # we look up the era of the row at 70% of the index, add 1, and use that as our limit
        cutoff_era = int(df.iloc[int(len(df.index) * 0.7)].era + 1)
        cutoff = df.era < cutoff_era
        return cutoff
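    # For intuition (hypothetical toy data): with 10 equally sized eras 1..10,
    # the row at the 70% mark falls in era 8, so cutoff_era = 9 and eras 1-8
    # end up in the training split:
    #
    #   df = pd.DataFrame({'era': np.repeat(np.arange(1, 11), 100)})
    #   BorutaShapNumerai.create_cutoff(df).sum()  # -> 800 training rows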
def fit(self, X, y, sample_weight = None, n_trials = 20, random_state=0, sample=False,
train_or_test = 'test', normalize=True, verbose=True, stratify=None):
"""
The main body of the program this method it computes the following
1. Extend the information system by adding copies of all variables (the information system
is always extended by at least 5 shadow attributes, even if the number of attributes in
the original set is lower than 5).
2. Shuffle the added attributes to remove their correlations with the response.
3. Run a random forest classifier on the extended information system and gather the
Z scores computed.
4. Find the maximum Z score among shadow attributes (MZSA), and then assign a hit to
every attribute that scored better than MZSA.
5. For each attribute with undetermined importance perform a two-sided test of equality
with the MZSA.
6. Deem the attributes which have importance significantly lower than MZSA as ‘unimportant’
and permanently remove them from the information system.
7. Deem the attributes which have importance significantly higher than MZSA as ‘important’.
8. Remove all shadow attributes.
9. Repeat the procedure until the importance is assigned for all the attributes, or the
algorithm has reached the previously set limit of the random forest runs.
10. Stores results.
Parameters
----------
X: Dataframe
A pandas dataframe of the features.
y: Series/ndarray
A pandas series or numpy ndarray of the target
sample_weight: Series/ndarray
A pandas series or numpy ndarray of the sample weight of the observations (optional)
random_state: int
A random state for reproducibility of results
Sample: Boolean
if true then a rowise sample of the data will be used to calculate the feature importance values
sample_fraction: float
The sample fraction of the original data used in calculating the feature importance values only
used if Sample==True.
train_or_test: string
Decides whether the feature importance should be calculated on out of sample data see the dicussion here.
https://compstat-lmu.github.io/iml_methods_limitations/pfi-data.html#introduction-to-test-vs.training-data
normalize: boolean
if true the importance values will be normalized using the z-score formula
verbose: Boolean
a flag indicator to print out all the rejected or accepted features.
stratify: array
allows the train test splits to be stratified based on given values.
"""
if sample_weight is None:
sample_weight = np.ones(len(X))
np.random.seed(random_state)
self.starting_X = X.copy()
self.X = X.copy()
self.y = y.copy()
        self.cutoff = self.create_cutoff(self.X)
self.sample_weight = sample_weight.copy()
self.n_trials = n_trials
self.random_state = random_state
self.ncols = self.X.shape[1]
self.all_columns = self.X.columns.to_numpy()
self.rejected_columns = []
self.accepted_columns = []
self.check_X()
self.check_missing_values()
self.sample = sample
self.train_or_test = 'test' # train is not implemented yet
self.stratify = stratify
self.features_to_remove = []
self.hits = np.zeros(self.ncols)
self.order = self.create_mapping_between_cols_and_indices()
self.create_importance_history()
if self.sample: self.preds = self.isolation_forest(self.X, self.sample_weight)
for trial in tqdm(range(self.n_trials)):
self.remove_features_if_rejected()
self.columns = self.X.columns.to_numpy()
self.create_shadow_features()
            # stop early if every feature has been rejected
if self.X.shape[1] == 0:
break
else:
self.Check_if_chose_train_or_test_and_train_model()
self.X_feature_import, self.Shadow_feature_import = self.feature_importance(normalize=normalize)
self.update_importance_history()
hits = self.calculate_hits()
self.hits += hits
self.history_hits = np.vstack((self.history_hits, self.hits))
self.test_features(iteration=trial+1)
self.store_feature_importance()
self.calculate_rejected_accepted_tentative(verbose=verbose)
@staticmethod
def isolation_forest(df, weights=None):
        '''
        Fits an isolation forest to each era of the dataset and gives an anomaly score to every sample
        '''
print('Calculating anomaly scores')
preds = []
for _,x in tqdm(df.groupby('era', group_keys=False)):
clf = IsolationForest(n_jobs=-1, random_state=68).fit(x)
preds.append(clf.score_samples(x))
preds = pd.Series(np.hstack(preds), index=df.index, name='preds').to_frame()
preds['era'] = df.era
return preds
def find_sample(self):
        '''
        Finds a sample by comparing the distributions of the anomaly scores between the sample and the original
        distribution using the KS-test, era by era. Starts at 10% of each era (see the element variable below)
        and increases to 15%, 20%, etc. if a representative sample cannot be found
        '''
index_for_sample = []
for _,X in self.preds.groupby('era', group_keys=False):
loop = True
iteration = 0
size = self.get_5_percent_splits(X.shape[0])
element = 1 # we start at 10%, if you want to start at 5% set it to zero
while loop:
sample_indices = np.random.choice(np.arange(X.shape[0]), size=size[element], replace=False)
sample = self.preds.loc[X.iloc[sample_indices].index]['preds']
if ks_2samp(X.preds, sample).pvalue > 0.95:
index_for_sample.append(X.iloc[sample_indices].index)
break
iteration+=1
if iteration == 20:
element += 1
iteration = 0
index_for_sample = np.hstack(index_for_sample)
#print(f'Sample size is {len(index_for_sample)} about {(len(index_for_sample)/len(self.preds)*100):.2f}% of the original size')
return self.X_boruta.loc[index_for_sample]
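As a side note, here is a minimal standalone sketch (not part of the class, with made-up scores) of the acceptance test that find_sample applies within each era: a subsample is kept only when the two-sample KS test cannot distinguish its anomaly-score distribution from the full era's.
import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
scores = rng.normal(size=5000)  # stand-in for one era's anomaly scores

# redraw 10% subsamples until the KS test cannot tell the subsample apart
# from the full distribution (p-value above 0.95), which is the acceptance
# criterion used in find_sample
sample_indices = rng.choice(len(scores), size=500, replace=False)
while ks_2samp(scores, scores[sample_indices]).pvalue <= 0.95:
    sample_indices = rng.choice(len(scores), size=500, replace=False)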
Save the code as BorutaShapNumerai.py and import it as:
from BorutaShapNumerai import BorutaShapNumerai as BorutaShap
and you are ready to go.
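For example, here is a minimal usage sketch. It assumes a LightGBM regressor and a Numerai-style training dataframe with a numeric era column; the file name, column names and hyperparameters are illustrative, not prescribed:
import lightgbm as lgb
import pandas as pd
from BorutaShapNumerai import BorutaShapNumerai as BorutaShap

train_df = pd.read_parquet('train.parquet')  # hypothetical file
feature_cols = [c for c in train_df.columns if c.startswith('feature')]
X = train_df[feature_cols + ['era']]  # keep era: the subclass uses it for splitting and sampling
y = train_df['target']

# era must be numeric here, since create_cutoff adds 1 to it
model = lgb.LGBMRegressor(n_estimators=2000, learning_rate=0.01, colsample_bytree=0.1)
selector = BorutaShap(model=model, importance_measure='shap', classification=False)
selector.fit(X=X, y=y, n_trials=20, sample=True, train_or_test='test', normalize=True)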