Here’s a slightly modified version that also introduces purging. As I understand it, the embargo only makes sense for KFold CV, so simply purging the periods between the train and test sets should be enough to avoid leakage in time-series CV.
I haven’t had the chance to test it yet but I’ll post an update when I have run the tests.
import numpy as np
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
class PurgedTimeSeriesSplitGroups(_BaseKFold):
    """Group-based, forward-chaining cross-validator with a purge gap.

    The distinct values of ``groups`` are ordered with ``np.unique`` (i.e. by
    sorted label value, not by order of appearance) and cut into
    ``n_splits`` consecutive test blocks of equal size that end at the last
    group.  For every test block the training set consists of all samples
    whose group comes before the block, minus the ``purge_groups`` groups
    immediately preceding it.

    Parameters
    ----------
    n_splits : int, default=5
        Number of (train, test) pairs to generate.
    purge_groups : int, default=0
        Number of groups dropped between the end of the training set and the
        start of the test set.
    """

    def __init__(self, n_splits=5, purge_groups=0):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.purge_groups = purge_groups

    def split(self, X, y=None, groups=None):
        """Generate positional indices to split data into train and test sets.

        Parameters
        ----------
        X : array-like of shape (n_samples, ...)
            Data to split; only its number of samples is used.
        y : array-like of shape (n_samples,), default=None
            Not used by the split logic; only length-checked via ``indexable``.
        groups : array-like of shape (n_samples,)
            Group label of every sample (list, NumPy array or pandas
            Series).  Labels must sort into the intended temporal order.

        Yields
        ------
        train : ndarray
            Positional indices of the training samples for that split.
        test : ndarray
            Positional indices of the test samples for that split.

        Raises
        ------
        ValueError
            If ``groups`` is None, if ``purge_groups`` is negative, or if
            ``n_splits + 1 + purge_groups`` exceeds the number of distinct
            groups.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        if self.purge_groups < 0:
            # A negative gap would extend the training slice into the test
            # groups, which is exactly the leakage purging is meant to avoid.
            raise ValueError(
                f"purge_groups must be non-negative, got {self.purge_groups}."
            )

        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        # Work on a plain array so lists and NumPy arrays are accepted too,
        # not only pandas objects exposing ``.isin``.
        group_labels = np.asarray(groups)

        n_folds = self.n_splits + 1
        group_list = np.unique(group_labels)
        n_groups = len(group_list)
        if n_folds + self.purge_groups > n_groups:
            raise ValueError((f"Cannot have number of folds plus purged groups "
                              f"={n_folds+self.purge_groups} greater than the "
                              f"number of groups: {n_groups}."))

        indices = np.arange(n_samples)
        test_size = (n_groups - self.purge_groups) // n_folds
        # Test blocks are anchored at the end of the group list so the last
        # block always finishes on the final group.
        test_starts = [n_groups - test_size * c
                       for c in range(n_folds - 1, 0, -1)]
        for test_start in test_starts:
            train_groups = group_list[:test_start - self.purge_groups]
            test_groups = group_list[test_start:test_start + test_size]
            yield (indices[np.isin(group_labels, train_groups)],
                   indices[np.isin(group_labels, test_groups)])