Optimizing for FNC and TB scores

I think I came up with an implementation that should work on GPU, since it uses PyTorch. However, reading the True Contribution Details post, exposure dissimilarity seems to be relevant only when combined with FNCv3 in a multiplicative way, so it might not make sense to use it on its own.
Any feedback is more than welcome!
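
For context, that multiplicative interaction could be made explicit along these lines. This is only a minimal sketch, assuming per-era fncv3_score and dissimilarity tensors are already computed; the function name and the bare product are my assumptions, not Numerai's actual TC formula:

import torch

def combined_objective(fncv3_score: torch.Tensor, dissimilarity: torch.Tensor) -> torch.Tensor:
    # Hypothetical: reward high FNCv3 and high exposure dissimilarity jointly,
    # mirroring the multiplicative combination described in True Contribution Details.
    # Negated so the product can be minimized as a loss.
    return -(fncv3_score * dissimilarity)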

import numpy as np
import torch

# features are on a 0-1 scale, so subtracting 0.5 roughly centers
# each column around zero (the correlations below assume this)
for f in feature_cols:
    train_data[f] -= 0.5

for epoch in range(epochs):
    np.random.shuffle(era_list)
    batch_count = 0
    acc_loss_train = 0
    for era in era_list:
        batch_count += 1

        # select this era once, then build float tensors of features and target
        train_era = train_data[train_data.erano == era]
        features = torch.tensor(train_era.filter(like='feature').values, dtype=torch.float32)
        target = torch.tensor(train_era['target'].values, dtype=torch.float32)

        # zero gradient buffer and get model output
        optimizer.zero_grad()
        model.train()
        model_output = model(features).squeeze()  # flatten a possible (n, 1) output to (n,)

        # negative TB score, so minimizing the loss maximizes the TB correlation
        orig_loss = -numerair_tb(model_output, target)

        # exposure dissimilarity: per-feature correlations of the example
        # predictions (the example_preds column needs to be created beforehand)
        example_preds = torch.as_tensor(train_era['example_preds'].values, dtype=torch.float32)
        example_preds = example_preds - example_preds.mean()
        corr_example_preds = (features.T * example_preds).sum(dim=1) / ((features.T * features.T).sum(dim=1) * (example_preds * example_preds).sum()).sqrt()

        # per-feature correlations of the model's predictions
        preds = model_output - model_output.mean()
        corr_preds = (features.T * preds).sum(dim=1) / ((features.T * features.T).sum(dim=1) * (preds * preds).sum()).sqrt()

        # exposure similarity (u . e) / (e . e): exposure dissimilarity is
        # 1 minus this ratio, so adding the ratio to the loss pushes
        # dissimilarity up (pinverse is not defined for 1-D tensors, and
        # the ratio only needs dot products)
        num = torch.dot(corr_preds, corr_example_preds)
        denom = torch.dot(corr_example_preds, corr_example_preds)
        dissimilarity = num / denom

        # final loss: orig_loss is already the negative TB score, and the
        # exposure-similarity term is minimized (dissimilarity maximized)
        loss = orig_loss + dissimilarity

        acc_loss_train += loss.item()
        loss.backward()
        optimizer.step()

    loss_train = acc_loss_train / batch_count
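
numerair_tb is not defined in the snippet above. Here is a minimal sketch of what a differentiable top/bottom correlation could look like, assuming TB means scoring only the tb highest and lowest predictions; the tb=200 default and the plain Pearson correlation on that subset are my assumptions, not Numerai's exact scoring code:

import torch

def numerair_tb(pred: torch.Tensor, target: torch.Tensor, tb: int = 200) -> torch.Tensor:
    # Pearson correlation restricted to the rows where the model is most
    # confident: the tb lowest and tb highest predictions. Gradients flow
    # through the selected prediction values (not through the sort itself).
    pred, target = pred.reshape(-1), target.reshape(-1)
    idx = torch.argsort(pred)
    idx = torch.cat([idx[:tb], idx[-tb:]])
    p = pred[idx] - pred[idx].mean()
    t = target[idx] - target[idx].mean()
    return (p * t).sum() / (p.norm() * t.norm() + 1e-12)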