Era Boosted Models

Era boosting with XGBoost.

from xgboost import XGBRegressor

def spearmanr(target, pred):
    return np.corrcoef(
        target,
        pred.rank(pct=True, method="first")
    )[0, 1]

def era_boost_train(X, y, era_col, proportion=0.5, trees_per_step=10, num_iters=200):
    model = XGBRegressor(max_depth=5, learning_rate=0.01, n_estimators=trees_per_step, n_jobs=-1, colsample_bytree=0.1)
    features = X.columns
    model.fit(X, y)
    new_df = X.copy()
    new_df["target"] = y
    new_df["era"] = era_col
    for i in range(num_iters-1):
        print(f"iteration {i}")
        # score each era
        print("predicting on train")
        preds = model.predict(X)
        new_df["pred"] = preds
        era_scores = pd.Series(index=new_df["era"].unique())
        print("getting per era scores")
        for era in new_df["era"].unique():
            era_df = new_df[new_df["era"] == era]
            era_scores[era] = spearmanr(era_df["pred"], era_df["target"])
        era_scores.sort_values(inplace=True)
        worst_eras = era_scores[era_scores <= era_scores.quantile(proportion)].index
        print(list(worst_eras))
        worst_df = new_df[new_df["era"].isin(worst_eras)]
        era_scores.sort_index(inplace=True)
        era_scores.plot(kind="bar")
        print("performance over time")
        plt.show()
        print("autocorrelation")
        print(ar1(era_scores))
        print("mean correlation")
        print(np.mean(era_scores))
        print("sharpe")
        print(np.mean(era_scores)/np.std(era_scores))
        print("smart sharpe")
        print(smart_sharpe(era_scores))
        model.n_estimators += trees_per_step
        booster = model.get_booster()
        print("fitting on worst eras")
        model.fit(worst_df[features], worst_df["target"], xgb_model=booster)
    return model

boost_model = era_boost_train(train_features, train_targets["target_kazutsugi"], era_col=train_targets["era"], proportion=0.5, trees_per_step=10, num_iters=20)
10 Likes