Hey. I’m just starting out, and I thought sharing this might be useful.
Here is the code for an enhanced read_csv function, which is a lot faster when memory='high' is set.
Peak RAM usage after loading both the training and tournament data was almost 9 GiB (including what my system was already using).
import csv
import datatable
import numpy as np
import pandas as pd
import psutil
# Read the csv file into a pandas DataFrame, storing feature/target columns
# in compact dtypes to save space.
def read_csv(file_path, memory="high"):
    """Load a CSV file into a pandas DataFrame using compact dtypes.

    Args:
        file_path: Path of the CSV file to read.
        memory: Selects the loading strategy.
            "high": parse the file with ``datatable.fread``, convert it to
            pandas and cast every column whose name starts with "feature"
            or "target" to float16. The RAM in use is printed before the
            datatable frame is released.
            "medium": let pandas parse the file with those same columns
            declared as float16, using the first column as the index.
            "low": let pandas parse the file, passing every "feature"
            value through ``np.uint8(float(value) * 4)`` and reading
            "target" columns as float16.

    Returns:
        The loaded DataFrame. Only the "medium" mode moves the first
        column into the index; the other modes keep it as a regular column.

    Raises:
        ValueError: If ``memory`` is not "high", "medium" or "low".
    """
    # Reject a bad mode before touching the file, so a typo is reported as
    # such instead of as whatever error opening or parsing the file raises.
    if memory not in ("high", "medium", "low"):
        raise ValueError('memory parameter not in ["high", "medium", "low"]')

    if memory == "high":
        csv_datatable = datatable.fread(file_path)
        dtypes = {
            x: np.float16
            for x in csv_datatable.names
            if x.startswith(("feature", "target"))
        }
        df = csv_datatable.to_pandas().astype(dtypes)
        # Report RAM while the datatable frame and the pandas frame both
        # still exist, i.e. before the datatable frame is dropped below.
        print('Top used RAM')
        print_used_ram()
        del csv_datatable
        return df

    # Only the header row is needed to decide per-column dtypes.
    # newline="" is the csv module's recommended way to open a file; the
    # [] default lets an empty file fall through to pandas' own error
    # instead of leaking a bare StopIteration.
    with open(file_path, "r", newline="") as f:
        column_names = next(csv.reader(f), [])

    if memory == "medium":
        dtypes = {
            x: np.float16
            for x in column_names
            if x.startswith(("feature", "target"))
        }
        return pd.read_csv(file_path, dtype=dtypes, index_col=0)

    # memory == "low"
    def to_uint8(value):
        # Scale by 4 before truncating to an unsigned byte.
        # NOTE(review): presumably features are multiples of 0.25 so this
        # is lossless - confirm against the data. The dtype pandas finally
        # assigns to converter output has not been verified here.
        return np.uint8(float(value) * 4)

    # Match target columns by prefix, like the other modes, so a column
    # such as "target_<name>" is not silently left at pandas' default dtype.
    dtypes = {x: np.float16 for x in column_names if x.startswith("target")}
    converters = {x: to_uint8 for x in column_names if x.startswith("feature")}
    return pd.read_csv(file_path, dtype=dtypes, converters=converters)
def print_used_ram():
    """Print the RAM currently in use and return it.

    "Used" is taken as total minus available memory, as reported by
    ``psutil.virtual_memory()``, converted from bytes to GiB.

    Returns:
        The used RAM in GiB.
    """
    memory_stats = psutil.virtual_memory()
    bytes_per_gib = 1024**3
    used_bytes = memory_stats.total - memory_stats.available
    used = used_bytes / bytes_per_gib
    print(f'Used RAM: {used} GiB')
    return used
def main(
    training_path="numerai_training_data.csv",
    tournament_path="numerai_tournament_data.csv",
):
    """Load both datasets with read_csv, reporting RAM usage along the way.

    Args:
        training_path: CSV file holding the training data.
        tournament_path: CSV file holding the tournament data.

    RAM usage is printed before loading, after the training data is
    loaded, and after the tournament data is loaded. Finally the number of
    training columns whose name starts with "feature" is printed.
    """
    print_used_ram()
    print("Loading data...")
    # The training data is used to train your model how to predict the targets.
    training_data = read_csv(training_path)
    print('Loaded training data')
    print_used_ram()
    # The tournament data is the data that Numerai uses to evaluate your model.
    # It is deliberately kept bound to a name: dropping the reference would
    # let it be freed before the RAM reading below.
    tournament_data = read_csv(tournament_path)  # noqa: F841
    print('Loaded tournament data')
    print_used_ram()
    feature_names = [f for f in training_data.columns if f.startswith("feature")]
    print(f"Loaded {len(feature_names)} features")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
I want to build a pretty serious pipeline so I can iterate properly. I’ve seen many great posts, but a link to any GitHub repo with such a pipeline would be appreciated. If nobody has one, I’ll share mine soon. Hopefully we can improve the pipeline together and focus on modeling. Cheers!