During S2E1 of OHwA I gave a short presentation about the feather file format, which is language-agnostic and can be read from both R and pandas (Python). Here is the code to store the training and tournament data as feather files, which preserve the float32 dtype of the feature columns and significantly reduce memory usage and storage space.
Step 1: create a dictionary mapping column names to dtypes, so that pandas’ read_csv imports every column of the CSV files with the correct dtype.
import pandas as pd
import numpy as np
from joblib import dump
#download the Numerai training data header only (nrows=0): just the column
#names are needed to build the dtype dictionary, so parsing every row at the
#default float64 dtype would waste time and memory
TRAINING_DATAPATH = 'https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_training_data.csv.xz'
df = pd.read_csv(TRAINING_DATAPATH, nrows=0)
#create a list of the feature columns
features = [c for c in df if c.startswith("feature")]
#create a list of the column names
col_list = ["id", "era", "data_type"]
col_list = col_list + features + ["target_kazutsugi"]
#create a list of corresponding data types to match the column name list;
#the float32 count is derived from the data (every feature plus the target)
#rather than hard-coded, because zip() silently truncates to the shorter list
#and would drop columns from the dictionary if the feature count ever changed
dtype_list_back = [np.float32] * (len(features) + 1)
dtype_list_front = [str, str, str]
dtype_list = dtype_list_front + dtype_list_back
#use Python's zip function to combine the column name list and the data type list
dtype_zip = zip(col_list, dtype_list)
#convert the combined list to a dictionary to conform to pandas convention
dtype_dict = dict(dtype_zip)
#save the dictionary as a joblib file for future use
dump(dtype_dict, 'dtype_dict.joblib')
Step 2: use the newly created dictionary to import both data files and save them in feather format
import pandas as pd
import numpy as np
from joblib import load
import pyarrow.feather as feather
#column name -> dtype mapping produced in step 1; passed to read_csv below
dtype_dict = load('dtype_dict.joblib')
#locations of the Numerai training and tournament CSV files
TRAINING_DATAPATH = 'https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_training_data.csv.xz'
TOURNAMENT_DATAPATH = 'https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_tournament_data.csv.xz'
#download both data sets as pandas dataframes using the dtypes from the dictionary
df = pd.read_csv(TRAINING_DATAPATH, dtype=dtype_dict)
df_tournament = pd.read_csv(TOURNAMENT_DATAPATH, dtype=dtype_dict)
#save each dataframe as an lz4-compressed feather file, training data first
for frame, destination in (
    (df, 'training_compressed.feather'),
    (df_tournament, 'tournament_compressed.feather'),
):
    feather.write_feather(frame, destination, compression='lz4')
Using the resulting feather files in production is very simple:
import pandas as pd
#load the compressed training data written in step 2 back into a dataframe
df = pd.read_feather(path='training_compressed.feather')