I'm a little Tpot: Tuning the Ensemble Framework


#1

To go along with the Ensemble Framework posted yesterday, here are a couple of TPOT starter scripts. TPOT can be used to search the entire model space, to tune the parameters of a single model, or to compare a small subset of models to see which one performs best. This will get you started — have fun.

Classification Models

#!/usr/bin/env python

# TPOT pipeline: evolve a classification pipeline over a restricted model space.
#
# Reads train_tpot.csv (comma-separated; first 21 columns are features, the
# last column is the class label), runs a TPOT genetic search over the models
# enabled in classifier_config_dict, prints the held-out score, and exports
# the best pipeline as a standalone Python script.

from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import numpy as np

# load data
print("Loading data...")
dataset = np.loadtxt('train_tpot.csv', delimiter=",")

# Split into features X (columns 0-20) and target y (column 21).
X = dataset[:, 0:21]
y = dataset[:, 21]

# random_state pins the split so runs are reproducible; test_size is implied
# by train_size, so only one needs to be given.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=42)

# Restrict TPOT's search space: uncomment an entry to add that model (and its
# parameter grid) to the search. n_estimators / max_depth are the only knobs
# exposed here to keep the search fast.
classifier_config_dict = {

    'sklearn.ensemble.ExtraTreesClassifier': {
        'n_estimators': [5, 10, 15, 20, 25, 50, 75, 100],
        'max_depth': [2, 3, 4, 5, 6, 7, 8]
    },

    # 'sklearn.ensemble.RandomForestClassifier': {
    #     'n_estimators': [5, 10, 15, 20, 25, 50, 75, 100],
    #     'max_depth': [2, 3, 4, 5, 6, 7, 8]
    # },

    # 'xgboost.XGBClassifier': {
    #     'n_estimators': [5, 10, 15, 20, 25, 50, 75, 100],
    #     'max_depth': [2, 3, 4, 5, 6, 7, 8]
    # },

    # 'lightgbm.LGBMClassifier': {
    #     'n_estimators': [5, 10, 15, 20, 25, 50, 75, 100],
    #     'max_depth': [2, 3, 4, 5, 6, 7, 8]
    # }
}


# random_state makes the genetic search reproducible; n_jobs=12 assumes a
# 12-core machine — set to -1 to use all available cores.
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2,
                      n_jobs=12, random_state=42,
                      config_dict=classifier_config_dict)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_classification.py')

Regression Models

#!/usr/bin/env python

# TPOT pipeline: evolve a regression pipeline over a restricted model space.
#
# Reads train_tpot.csv (comma-separated; first 21 columns are features, the
# last column is the numeric target), runs a TPOT genetic search over the
# models enabled in regressor_config_dict, prints the held-out score, and
# exports the best pipeline as a standalone Python script.

from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
import numpy as np

# load data
print("Loading data...")
dataset = np.loadtxt('train_tpot.csv', delimiter=",")

# Split into features X (columns 0-20) and target y (column 21).
X = dataset[:, 0:21]
y = dataset[:, 21]

# random_state pins the split so runs are reproducible; test_size is implied
# by train_size, so only one needs to be given.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=42)

# Restrict TPOT's search space: uncomment an entry to add that model (and its
# parameter grid) to the search. n_estimators / max_depth are the only knobs
# exposed here to keep the search fast.
regressor_config_dict = {

    # 'sklearn.ensemble.ExtraTreesRegressor': {
    #     'n_estimators': [25, 50, 75, 100, 125, 150],
    #     'max_depth': [2, 3, 4, 5, 6, 7, 8]
    # },

    'sklearn.ensemble.RandomForestRegressor': {
        'n_estimators': [25, 50, 75, 100, 125, 150],
        'max_depth': [2, 3, 4, 5, 6, 7, 8]
    },

    # 'xgboost.XGBRegressor': {
    #     'n_estimators': [25, 50, 75, 100, 125, 150],
    #     'max_depth': [2, 3, 4, 5, 6, 7, 8]
    # },

    # 'lightgbm.LGBMRegressor': {
    #     'n_estimators': [25, 50, 75, 100, 125, 150],
    #     'max_depth': [2, 3, 4, 5, 6, 7, 8]
    # },
}


# random_state makes the genetic search reproducible; n_jobs=12 assumes a
# 12-core machine — set to -1 to use all available cores.
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2,
                     n_jobs=12, random_state=42,
                     config_dict=regressor_config_dict)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Export next to the script (the original 'numerai/n_tpot.py' path fails if
# the numerai/ directory does not exist, and was inconsistent with the
# classification script's export name).
tpot.export('tpot_regression.py')