Era boosting with XGBoost: after an initial fit, each iteration scores every era with the current model, takes the worst-scoring proportion of eras, and fits the next block of trees only on those eras, so the ensemble keeps adding capacity where it performs worst.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

# Numerai-style score: correlation of the target with percentile-ranked predictions
def spearmanr(target, pred):
    return np.corrcoef(target, pred.rank(pct=True, method="first"))[0, 1]
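
The loop below also prints ar1 and smart_sharpe of the per-era scores, but neither helper is defined in this snippet. A minimal sketch of plausible definitions, assuming ar1 is the lag-1 autocorrelation of the era-score series and smart_sharpe is a Sharpe ratio shrunk by that autocorrelation (both are my assumptions, not the original code):

# Assumed helpers, not part of the original snippet
def ar1(x):
    # lag-1 autocorrelation of the per-era score series
    x = np.asarray(x)
    return np.corrcoef(x[:-1], x[1:])[0, 1]

def smart_sharpe(x):
    # Sharpe ratio shrunk when consecutive era scores are positively correlated
    penalty = np.sqrt(1 + 2 * max(ar1(x), 0))
    return np.mean(x) / (np.std(x) * penalty)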
def era_boost_train(X, y, era_col, proportion=0.5, trees_per_step=10, num_iters=200):
    model = XGBRegressor(max_depth=5, learning_rate=0.01, n_estimators=trees_per_step,
                         n_jobs=-1, colsample_bytree=0.1)
    features = X.columns
    model.fit(X, y)
    new_df = X.copy()
    new_df["target"] = y
    new_df["era"] = era_col
    for i in range(num_iters - 1):
        print(f"iteration {i}")
        # score each era with the current model
        print("predicting on train")
        preds = model.predict(X)
        new_df["pred"] = preds
        era_scores = pd.Series(index=new_df["era"].unique(), dtype=float)
        print("getting per era scores")
        for era in new_df["era"].unique():
            era_df = new_df[new_df["era"] == era]
            era_scores[era] = spearmanr(era_df["target"], era_df["pred"])
        era_scores.sort_values(inplace=True)
        worst_eras = era_scores[era_scores <= era_scores.quantile(proportion)].index
        print(list(worst_eras))
        worst_df = new_df[new_df["era"].isin(worst_eras)]
        era_scores.sort_index(inplace=True)
        era_scores.plot(kind="bar")
        print("performance over time")
        plt.show()
        print("autocorrelation")
        print(ar1(era_scores))
        print("mean correlation")
        print(np.mean(era_scores))
        print("sharpe")
        print(np.mean(era_scores) / np.std(era_scores))
        print("smart sharpe")
        print(smart_sharpe(era_scores))
        # add trees_per_step more trees, continuing from the current booster,
        # fit only on the worst-scoring eras
        model.n_estimators += trees_per_step
        booster = model.get_booster()
        print("fitting on worst eras")
        model.fit(worst_df[features], worst_df["target"], xgb_model=booster)
    return model
boost_model = era_boost_train(train_features, train_targets["target_kazutsugi"], era_col=train_targets["era"], proportion=0.5, trees_per_step=10, num_iters=20)
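
Once trained, boost_model behaves like any other XGBRegressor. A quick hold-out check might look like the sketch below; val_features and val_targets are hypothetical validation frames assumed to share the layout (and row alignment) of the training data:

# Hypothetical hold-out check; val_features / val_targets are assumed inputs
val_df = val_features.copy()
val_df["pred"] = boost_model.predict(val_features)
val_df["era"] = val_targets["era"]
val_df["target"] = val_targets["target_kazutsugi"]

val_scores = val_df.groupby("era").apply(lambda d: spearmanr(d["target"], d["pred"]))
print("validation mean correlation", val_scores.mean())
print("validation sharpe", val_scores.mean() / val_scores.std())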