Evaluating multiple models with GridSearchCV
Line 2 splits the data into training and test sets at a ratio of 75 to 25.
Lines 4-63 define the models to be evaluated.
When using GridSearchCV(), specify discrete, stepped parameter values rather than a continuous range.
This keeps the number of candidates, and therefore the search time, under control.
Here, Python's list(range()) is used to generate the discrete values.
Note that RandomizedSearchCV(), by contrast, also accepts continuous parameter distributions and samples candidates from them.
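For reference, here is a minimal RandomizedSearchCV() sketch (assuming scipy 1.4+ is installed; the fit call is commented out because X_train and y_train are defined further below). C and gamma are sampled from continuous log-uniform distributions instead of being listed explicitly:
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
# Sample 20 candidate settings from continuous distributions instead of a fixed grid
param_distributions = {
    'C': loguniform(1e-2, 1e2),       # continuous log-uniform range for the penalty parameter
    'gamma': loguniform(1e-4, 1e0)    # continuous range for the RBF kernel coefficient
}
random_search = RandomizedSearchCV(SVC(), param_distributions,
                                   n_iter=20, cv=5, random_state=0, n_jobs=-1)
#random_search.fit(X_train, y_train)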
Lines 68-81 train and evaluate the models with GridSearchCV() and fit().
Because GridSearchCV() is called with cv=5, each parameter combination of each model is trained and evaluated five times (5-fold cross-validation).
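For example, the total number of fits is the number of parameter combinations times the number of folds; a small sketch for the KNeighborsClassifier grid used below:
from sklearn.model_selection import ParameterGrid
knn_params = {
    'n_neighbors': list(range(3, 15, 2)),                    # 6 values
    'weights': ['uniform', 'distance'],                      # 2 values
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']   # 4 values
}
print(len(ParameterGrid(knn_params)))      # 48 parameter combinations
print(len(ParameterGrid(knn_params)) * 5)  # 240 fits with cv=5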
Lines 93-121 display the evaluation results of each model.
### Modeling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
model_params = {
'clf0': {
'model': KNeighborsClassifier(),
'params': {
'n_neighbors': list(range(3, 15, 2)),
'weights': ['uniform', 'distance'],
'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
},
'clf1': {
'model': DecisionTreeClassifier(),
'params': {
'criterion': ['gini','entropy'],
'splitter': ['best','random']
}
},
'clf2': {
'model': RandomForestClassifier(),
'params': {
'n_estimators': list(range(10, 100, 10)),
'criterion': ['gini','entropy']
}
},
'clf3': {
'model': GaussianNB(),
'params': {}
},
'clf4': {
'model': SVC(),
'params': {
'kernel': ['linear', 'poly', 'rbf'],
'gamma': ['scale','auto'],
'decision_function_shape': ['ovr','ovo']
}
},
    'clf5': {
        'model': ExtraTreeClassifier(),
        'params': {
            #'n_estimators': list(range(10, 100, 10)),  # n_estimators applies to the ensemble ExtraTreesClassifier, not this single tree
            'criterion': ['gini','entropy'],
            'max_features': ['auto','sqrt','log2'],
            'class_weight': ['balanced']  # 'balanced_subsample' is only valid for ensemble tree classifiers
        }
    },
'clf6': {
'model': GradientBoostingClassifier(),
'params': {
'n_estimators': list(range(10, 100, 10)),
'loss' : ['deviance','exponential'],
'max_features':['auto','sqrt','log2']
}
},
'clf7': {
'model': AdaBoostClassifier(),
'params': {
'n_estimators': list(range(10, 100, 10)),
'algorithm': ['SAMME.R','SAMME']
}
}
}
df = pd.DataFrame()
# Grid Search Cross-Validation
for _, mp in model_params.items():
    grid = GridSearchCV(mp['model'], mp['params'], cv=5, verbose=0,
        scoring={
            'f1_score': make_scorer(f1_score),
            'accuracy': make_scorer(accuracy_score),
            'precision': make_scorer(precision_score),
            'recall': make_scorer(recall_score),
            'ROC_AUC': make_scorer(roc_auc_score)  # scores hard predictions; the built-in 'roc_auc' scorer uses decision scores instead
        },
        n_jobs=-1,
        refit='accuracy',
        return_train_score=True)
    grid.fit(X_train, y_train)  # fit on the training split only; fitting on X, y would leak X_test into the search
    classifier_name = mp['model'].__class__.__name__
    grid_best_params = grid.best_params_
    grid_score = grid.score(X_test, y_test)
    results = grid.cv_results_
    results['classifier_name'] = classifier_name
    results['grid_score'] = grid_score
    results['grid_best_params'] = grid_best_params
    df = pd.concat([df, pd.DataFrame([results])], ignore_index=True)  # DataFrame.append() was removed in pandas 2.0
# Print results
for ix, row in df.iterrows():
    x60 = '-'*60
    cls_name = row['classifier_name']
    print(f'{x60} {ix}: {cls_name}')
    grid_score = row['grid_score'] * 100
    print(f'Grid_Score: {grid_score:.2f}')
    f1_score_min = row['mean_test_f1_score'].min() * 100
    f1_score_max = row['mean_test_f1_score'].max() * 100
    print(f'F1_Score: {f1_score_min:.2f} - {f1_score_max:.2f}')
    accuracy_min = row['mean_test_accuracy'].min() * 100
    accuracy_max = row['mean_test_accuracy'].max() * 100
    print(f'Accuracy: {accuracy_min:.2f} - {accuracy_max:.2f}')
    precision_min = row['mean_test_precision'].min() * 100
    precision_max = row['mean_test_precision'].max() * 100
    print(f'Precision: {precision_min:.2f} - {precision_max:.2f}')
    recall_min = row['mean_test_recall'].min() * 100
    recall_max = row['mean_test_recall'].max() * 100
    print(f'Recall: {recall_min:.2f} - {recall_max:.2f}')
    roc_auc_min = row['mean_test_ROC_AUC'].min() * 100
    roc_auc_max = row['mean_test_ROC_AUC'].max() * 100
    print(f'ROC_AUC: {roc_auc_min:.2f} - {roc_auc_max:.2f}')
    grid_best_params = row['grid_best_params']
    print('Grid_best_params:', grid_best_params)
Figure 3-1 shows the execution results.
The scores and best parameters of each model are displayed.
The highest score is the 92.38 achieved by DecisionTreeClassifier.
Its best parameters are {'criterion': 'gini', 'splitter': 'best'}.
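Since refit='accuracy' is specified, grid.best_estimator_ already holds the refitted winner; alternatively, the model can be rebuilt by hand from the reported best parameters (a minimal sketch assuming the X_train/X_test split from the code above):
from sklearn.tree import DecisionTreeClassifier
# Rebuild the winning model from the best parameters reported by the grid search
best_model = DecisionTreeClassifier(**{'criterion': 'gini', 'splitter': 'best'})
best_model.fit(X_train, y_train)
print(f'Test accuracy: {best_model.score(X_test, y_test) * 100:.2f}')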
The meaning and usage of F1-Score, Precision, Recall, Accuracy, and the other metrics are explained in detail in Article048.
Figure 3-2 shows the execution results.
Complete listing of the code explained here
Finally, all of the code explained above is listed below in one place for reference.
### Import the libraries
from functools import reduce
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Importing Classifier Modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Cross Validation(k-fold)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Pipeline and GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, make_scorer, f1_score, roc_auc_score
import sklearn.metrics as skm
import warnings
warnings.simplefilter('ignore')
# %%
### Load the data
train_file = 'data/csv/titanic/train_cleaned.csv'
#train_file = 'https://money-or-ikigai.com/menu/python/article/data/titanic/train_cleaned.csv'
train = pd.read_csv(train_file)
temp = train.copy()
X = temp.drop('Survived', axis=1) # Exclude Survived column
y = temp['Survived'] # Survived column only
# %%
### Modeling
#clf0 = KNeighborsClassifier() # n_neighbors=[3,5,10,15], weights={'uniform', 'distance'}, algorithm={'auto', 'ball_tree', 'kd_tree', 'brute'}
# KNeighborsClassifier(n_neighbors=5, *, weights={'uniform', 'distance'}, algorithm={'auto', 'ball_tree', 'kd_tree', 'brute'},
# leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
#clf1 = DecisionTreeClassifier() # criterion={'gini','entropy'}, splitter={'best','random'}
# DecisionTreeClassifier(*, criterion={'gini','entropy'}, splitter={'best','random'}, max_depth=None,
# min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
# max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0,
# class_weight=None, ccp_alpha=0.0)
#clf2 = RandomForestClassifier() # n_estimators=1-100, criterion={'gini','entropy'}
# RandomForestClassifier(n_estimators=100, *, criterion={'gini','entropy'},
# max_depth=None, min_samples_split=2, min_samples_leaf=1,
# min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None,
# min_impurity_decrease=0.0, bootstrap=True, oob_score=False,
# n_jobs=None, random_state=None, verbose=0, warm_start=False,
# class_weight=None, ccp_alpha=0.0, max_samples=None)
#clf3 = GaussianNB() # None
# GaussianNB(*, priors=None, var_smoothing=1e-09)
#clf4 = SVC() # kernel={'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, gamma={'scale','auto'}, decision_function_shape={'ovr','ovo'}
# SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0,
# shrinking=True, probability=False, tol=0.001, cache_size=200,
# class_weight=None, verbose=False, max_iter=-1,
# decision_function_shape='ovr', break_ties=False, random_state=None)
#clf5 = ExtraTreeClassifier() # criterion={'gini','entropy'}, max_features={'auto','sqrt','log2'}, class_weight={'balanced'}
# ExtraTreeClassifier(*, criterion='gini', splitter='random', max_depth=None,
#                     min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
#                     max_features='auto', random_state=None, max_leaf_nodes=None,
#                     min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)
#clf6 = GradientBoostingClassifier() # loss={'deviance','exponential'}, n_estimators=10-100
# GradientBoostingClassifier(*, loss='deviance', learning_rate=0.1, n_estimators=100,
# subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
# min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
# init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None,
# warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)
#clf7 = AdaBoostClassifier() # n_estimators=10-100, algorithm={'SAMME.R','SAMME'}
# AdaBoostClassifier(base_estimator=None, *, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)
# Train-test Split
# sklearn.model_selection.train_test_split(# *arrays, test_size=None, train_size=None,
# random_state=None, shuffle=True, stratify=None)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
model_params = {
'clf0': {
'model': KNeighborsClassifier(),
'params': {
'n_neighbors': list(range(3, 15, 2)),
'weights': ['uniform', 'distance'],
'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
},
'clf1': {
'model': DecisionTreeClassifier(),
'params': {
'criterion': ['gini','entropy'],
'splitter': ['best','random']
}
},
'clf2': {
'model': RandomForestClassifier(),
'params': {
'n_estimators': list(range(10, 100, 10)),
'criterion': ['gini','entropy']
}
},
'clf3': {
'model': GaussianNB(),
'params': {}
},
'clf4': {
'model': SVC(),
'params': {
'kernel': ['linear', 'poly', 'rbf'],
'gamma': ['scale','auto'],
'decision_function_shape': ['ovr','ovo']
}
},
    'clf5': {
        'model': ExtraTreeClassifier(),
        'params': {
            #'n_estimators': list(range(10, 100, 10)),  # n_estimators applies to the ensemble ExtraTreesClassifier, not this single tree
            'criterion': ['gini','entropy'],
            'max_features': ['auto','sqrt','log2'],
            'class_weight': ['balanced']  # 'balanced_subsample' is only valid for ensemble tree classifiers
        }
    },
'clf6': {
'model': GradientBoostingClassifier(),
'params': {
'n_estimators': list(range(10, 100, 10)),
'loss' : ['deviance','exponential'],
'max_features':['auto','sqrt','log2']
}
},
'clf7': {
'model': AdaBoostClassifier(),
'params': {
'n_estimators': list(range(10, 100, 10)),
'algorithm': ['SAMME.R','SAMME']
}
}
}
df = pd.DataFrame()
# Grid Search Cross-Validation
for _, mp in model_params.items():
    grid = GridSearchCV(mp['model'], mp['params'], cv=5, verbose=0,
        scoring={
            'f1_score': make_scorer(f1_score),
            'accuracy': make_scorer(accuracy_score),
            'precision': make_scorer(precision_score),
            'recall': make_scorer(recall_score),
            'ROC_AUC': make_scorer(roc_auc_score)  # scores hard predictions; the built-in 'roc_auc' scorer uses decision scores instead
        },
        n_jobs=-1,
        refit='accuracy',
        return_train_score=True)
    # GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None,
    #              refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
    #              error_score=nan, return_train_score=False)
    grid.fit(X_train, y_train)  # fit on the training split only; fitting on X, y would leak X_test into the search
    classifier_name = mp['model'].__class__.__name__
    #print('+'*60, classifier_name)
    grid_best_params = grid.best_params_
    #print(f'best_params = {grid_best_params}')
    grid_score = grid.score(X_test, y_test)
    #print(f'grid_score = {grid_score*100:.2f}')
    results = grid.cv_results_
    #print('type(results) =', type(results)) # dict type
    results['classifier_name'] = classifier_name
    results['grid_score'] = grid_score
    results['grid_best_params'] = grid_best_params
    df = pd.concat([df, pd.DataFrame([results])], ignore_index=True)  # DataFrame.append() was removed in pandas 2.0
# Print results
for ix, row in df.iterrows():
    #print(ix, row)
    x60 = '-'*60
    cls_name = row['classifier_name']
    print(f'{x60} {ix}: {cls_name}')
    grid_score = row['grid_score'] * 100
    print(f'Grid_Score: {grid_score:.2f}')
    f1_score_min = row['mean_test_f1_score'].min() * 100
    f1_score_max = row['mean_test_f1_score'].max() * 100
    print(f'F1_Score: {f1_score_min:.2f} - {f1_score_max:.2f}')
    accuracy_min = row['mean_test_accuracy'].min() * 100
    accuracy_max = row['mean_test_accuracy'].max() * 100
    print(f'Accuracy: {accuracy_min:.2f} - {accuracy_max:.2f}')
    precision_min = row['mean_test_precision'].min() * 100
    precision_max = row['mean_test_precision'].max() * 100
    print(f'Precision: {precision_min:.2f} - {precision_max:.2f}')
    recall_min = row['mean_test_recall'].min() * 100
    recall_max = row['mean_test_recall'].max() * 100
    print(f'Recall: {recall_min:.2f} - {recall_max:.2f}')
    roc_auc_min = row['mean_test_ROC_AUC'].min() * 100
    roc_auc_max = row['mean_test_ROC_AUC'].max() * 100
    print(f'ROC_AUC: {roc_auc_min:.2f} - {roc_auc_max:.2f}')
    grid_best_params = row['grid_best_params']
    print('Grid_best_params:', grid_best_params)