In [None]:
# HIDDEN": "remove-cell
# cell to import all dependencies to run in colab
import sys
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
  !"{sys.executable}" -m pip install -U mlxtend pycaret pandas-profiling optuna
else:
  print('Not running on CoLab')

# Model Tuning

Bij model tuning gebruiken we zoektechnieken om de hyperparameters te vinden die de beste performantie geven. Er zijn verschillende frameworks die we hiervoor kunnen gebruiken; hieronder staat een voorbeeld met Optuna. Tijdens het trainen zoeken we de parameters van een ML-model, zoals de gewichten bij lineaire regressie en de splits bij een decision tree. Bij tuning trainen we daarentegen meerdere modellen, telkens met andere beginwaarden (hyperparameters), bijvoorbeeld hoeveel leaves een decision tree mag hebben.

In [1]:
"""
Optuna example that optimizes a classifier configuration for Iris dataset using sklearn.
In this example, we optimize a classifier configuration for Iris dataset. Classifiers are from
scikit-learn. We optimize both the choice of classifier (among SVC and RandomForest) and their
hyperparameters.
"""

import optuna

import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm


# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
def objective(trial, random_state=0):
    """Score one sampled classifier configuration on the Iris dataset.

    Args:
        trial: Optuna trial used to sample the classifier type ("SVC" or
            "RandomForest") and that classifier's single hyperparameter.
        random_state: Seed forwarded to ``RandomForestClassifier`` so the same
            ``rf_max_depth`` always produces the same score. Pass ``None`` to
            get an unseeded forest.

    Returns:
        Mean score over 3-fold cross-validation (the classifier's default
        scorer, i.e. accuracy).
    """
    iris = sklearn.datasets.load_iris()
    x, y = iris.data, iris.target

    classifier_name = trial.suggest_categorical("classifier", ["SVC", "RandomForest"])
    if classifier_name == "SVC":
        # C is searched on a log scale because its useful values span many orders of magnitude.
        svc_c = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
        classifier_obj = sklearn.svm.SVC(C=svc_c, gamma="auto")
    else:
        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        # Seed the forest: without it, identical hyperparameters score differently
        # from call to call, which adds noise to the value being optimised.
        classifier_obj = sklearn.ensemble.RandomForestClassifier(
            max_depth=rf_max_depth, n_estimators=10, random_state=random_state
        )

    score = sklearn.model_selection.cross_val_score(classifier_obj, x, y, n_jobs=-1, cv=3)
    accuracy = score.mean()
    return accuracy


# Seed the sampler so its own random choices repeat between notebook runs.
# TPESampler is what Optuna uses by default for single-objective studies, so
# only the seed changes, not the search algorithm.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)
study.best_trial

[32m[I 2021-08-30 18:06:40,772][0m A new study created in memory with name: no-name-57635065-d92c-4581-a6cb-37e337ffa2e9[0m
[32m[I 2021-08-30 18:06:41,899][0m Trial 0 finished with value: 0.9666666666666667 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 11}. Best is trial 0 with value: 0.9666666666666667.[0m
[32m[I 2021-08-30 18:06:42,633][0m Trial 1 finished with value: 0.9533333333333333 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 3}. Best is trial 0 with value: 0.9666666666666667.[0m
[32m[I 2021-08-30 18:06:42,683][0m Trial 2 finished with value: 0.9533333333333333 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 9}. Best is trial 0 with value: 0.9666666666666667.[0m
[32m[I 2021-08-30 18:06:42,705][0m Trial 3 finished with value: 0.32 and parameters: {'classifier': 'SVC', 'svc_c': 1.5617287595894665e-08}. Best is trial 0 with value: 0.9666666666666667.[0m
[32m[I 2021-08-30 18:06:42,721][0m Trial 4 finished with value:

FrozenTrial(number=47, values=[0.9866666666666667], datetime_start=datetime.datetime(2021, 8, 30, 18, 6, 44, 89444), datetime_complete=datetime.datetime(2021, 8, 30, 18, 6, 44, 110203), params={'classifier': 'SVC', 'svc_c': 3.89501930288337}, distributions={'classifier': CategoricalDistribution(choices=('SVC', 'RandomForest')), 'svc_c': LogUniformDistribution(high=10000000000.0, low=1e-10)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=47, state=TrialState.COMPLETE, value=None)

PyCaret voorziet ook een tuningstap (`tune_model`) die gebaseerd is op een grid search; standaard is dat een random grid search over het opgegeven zoekrooster.

In [None]:
import numpy as np
from pycaret.datasets import get_data

# NOTE(review): the star import is kept because other cells may rely on the
# names it provides; this cell itself only needs setup, create_model and tune_model.
from pycaret.classification import *

# One seed shared by the PyCaret session and the randomly drawn grid values.
TUNING_SEED = 42

diabetes = get_data('diabetes')

# Initialise the PyCaret experiment on the loaded frame.
clf1 = setup(data = diabetes, target = 'Class variable', session_id = TUNING_SEED)

# Train a decision tree model.
dt = create_model('dt')

# Tune hyperparameters with a custom grid. The bounds are derived from the
# loaded `diabetes` frame; its column count includes the target column.
n_columns = len(diabetes.columns)
rng = np.random.default_rng(TUNING_SEED)
params = {
    # Upper bounds are exclusive; the 85% bound is truncated to an int explicitly.
    "max_depth": rng.integers(1, int(n_columns * .85), 20),
    "max_features": rng.integers(1, n_columns, 20),
    "min_samples_leaf": [2, 3, 4, 5, 6],
    "criterion": ["gini", "entropy"],
}
tuned_dt_custom = tune_model(dt, custom_grid = params)