機械学習回帰編 ハイパーパラメーターの調整

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from skearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.simplefilter('ignore')

# Load the prepared dataset and the three previously trained models.
# Every file is read with pickle, regardless of its .csv/.text extension.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('imp_data.csv', mode='rb') as f:
    imp_data = pickle.load(f)
with open('rf_model.text', mode='rb') as f:
    rf_model = pickle.load(f)
with open('xgb_model.text', mode='rb') as f:
    xgb_model = pickle.load(f)
# Fixed: the original wrote mode'rb' (missing '='), which is a SyntaxError.
with open('gbm_model.text', mode='rb') as f:
    gbm_model = pickle.load(f)

データは拡張子 .csv、モデルは拡張子 .text のファイルから、いずれも pickle で読み込みます。

# Target is the 'price' column; every other column is an explanatory variable.
y_target = imp_data['price']
col = imp_data.columns.tolist()
col.remove('price')
x_explanatory = imp_data[col]

# Split with a fixed seed so the train/test partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_explanatory,
    y_target,
    random_state=1,
)

GridSearchCV()を利用したパラメーター調整
ランダムフォレストのパラメーター調整
XGBoostのパラメーター調整
LightGBMのパラメーター調整
最適化されたモデルによる予測・評価

GridSearchCV()を利用したパラメーター調整

使用するにはGridSearchCV(model,param_grid,cv,scoring)と表記します。

引数はそれぞれ以下の通りです。

model:機械学習モデルを指定。random_stateで学習に使用する乱数を指定できる。(今回は1で固定)
param_grid:チューニングするパラメータ名と範囲が入った辞書型orリスト型を指定(モデルにより異なる)
cv:チューニングで最適なパラメータを見つけるための交差検証の回数を指定(今回は5回で固定)
scoring:チューニング時に評価する指標を指定(今回は決定係数r2で固定)

ランダムフォレストのパラメーター調整

まずはランダムフォレストです。最初にparam_gridに入れるパラメータを指定します。
ランダムフォレストで調整するパラメータには、様々なものがあります。

max_depth : 決定木の深さの最大値。過学習を避けるためにはこれを調節するのが最も重要。デフォルトはNone(深さの制限なし)。
n_estimators : 多数決を行う決定木の数。今回はデフォルト値で固定(scikit-learn 0.22以降は100、それ以前は10)。
criterion:決定木のデータ分割基準。今回はデフォルトの平均二乗誤差で固定(scikit-learn 1.0以降の名称はsquared_error、それ以前はmse)。
今回は比較のためmax_depthのみを動かします。

パラメータは辞書型で記述し、パラメータの名前:パラメータの値で与えます。

このとき、パラメータの値にリストで複数の値を入れると、それぞれのパラメータについてモデルを試してくれます。

# Grid for the random forest search: only max_depth is varied.
params = dict(max_depth=[4, 6, 8])

今回はmax_depthに4,6,8を入れて、3種類のモデルを試します。

# 5-fold cross-validated grid search over `params` for a random forest,
# scored by R^2 and run on all available cores (n_jobs=-1).
grid_rf_model = GridSearchCV(
    estimator=RandomForestRegressor(random_state=1),
    param_grid=params,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split.
grid_rf_model.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=1), n_jobs=-1,
param_grid={'max_depth': [4, 6, 8]}, scoring='r2')

# Bare expression, kept as in the original.
grid_rf_model.best_estimator_

# Persist the best random forest found by the grid search.
with open('grid_rf_model.text', 'wb') as f:
    pickle.dump(grid_rf_model.best_estimator_, f)

XGBoostのパラメーター調整

XGBoostでは、以下の3種類のパラメータを調整します。

max_depth: 決定木の深さの最大値
learning_rate: 最適化においてどのくらいの幅で値を動かすかというパラメータ(デフォルトは0.3)
min_child_weight: 決定木の葉の重みの下限(デフォルトは1)

# Grid for the XGBoost search: 3 x 3 x 3 = 27 candidate combinations.
gridParams = dict(
    max_depth=[4, 6, 8],
    learning_rate=[0.1, 0.3, 0.5],
    min_child_weight=[0.5, 1, 1.5],
)

また、XGBoostには「アーリーストッピング」という機能があります。

丁度良い時点で学習を打ち切ってくれるのです。

このアーリーストッピングにもハイパーパラメータが関わっているので、調整しましょう。

early_stoppingパラメータ

early_stopping_rounds: 損失減少しない場合の学習打ち切りのラウンド数
eval_set: 評価するデータセット

# Extra keyword arguments forwarded to fit(): early stopping after 10 rounds,
# evaluated on the (x_test, y_test) pair.
# NOTE(review): the test split is used as the early-stopping eval set, so it
# also influences training - confirm this is intended.
fitParams = dict(
    early_stopping_rounds=10,
    eval_set=[[x_test, y_test]],
)

# 5-fold cross-validated grid search over `gridParams` for XGBoost,
# scored by R^2 and run on all available cores (n_jobs=-1).
grid_xgb_model = GridSearchCV(
    xgb.XGBRegressor(random_state=1),
    param_grid=gridParams,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)

# Fixed: the original unpacked with three stars (***fitParams), which is a
# SyntaxError; dict unpacking takes exactly two.
# NOTE(review): newer xgboost releases may no longer accept
# early_stopping_rounds in fit() - confirm against the installed version.
grid_xgb_model.fit(
    x_train,
    y_train,
    **fitParams,
    verbose=2,
)

# Bare expression, kept as in the original.
grid_xgb_model.best_estimator_

# Persist the best XGBoost model found by the grid search.
with open('grid_xgb_model.text', 'wb') as f:
    pickle.dump(grid_xgb_model.best_estimator_, f)

LightGBMのパラメーター調整

LightGBMでは、以下の4種類のパラメータを調整します。

max_depth: 決定木の深さの最大値
learning_rate: 最適化においてどのくらいの幅で値を動かすかというパラメータ
min_data_in_leaf: 決定木のノード（葉）の最小データ数(デフォルトは20)
num_leaves:決定木の葉の数(デフォルトは31)

# Grid for the LightGBM search: 3^4 = 81 candidate combinations.
gridParams = dict(
    max_depth=[4, 6, 8],
    learning_rate=[0.1, 0.3, 0.5],
    min_data_in_leaf=[15, 20, 25],
    num_leaves=[27, 31, 35],
)

# Extra keyword arguments forwarded to fit(): early stopping after 10 rounds,
# evaluated on the (x_test, y_test) pair.
fitParams = dict(
    early_stopping_rounds=10,
    eval_set=[[x_test, y_test]],
)

# LightGBM Dataset wrappers for the train and test splits.
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test)

# 5-fold cross-validated grid search over `gridParams` for LightGBM,
# scored by R^2 and run on all available cores (n_jobs=-1).
grid_gbm_model = GridSearchCV(
    estimator=lgb.LGBMRegressor(random_state=1),
    param_grid=gridParams,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)

# Run the search; fitParams supplies the early-stopping settings.
grid_gbm_model.fit(x_train, y_train, verbose=2, **fitParams)

# Bare expression, kept as in the original.
grid_gbm_model.best_estimator_

# Persist the best LightGBM model found by the grid search.
with open('grid_gbm_model.text', 'wb') as f:
    pickle.dump(grid_gbm_model.best_estimator_, f)

最適化されたモデルによる予測・評価

ランダムフォレスト

# Reload the tuned random forest saved after the grid search.
# Fixed: the original bound the model to `rf_best_mode`, so the
# `rf_best_model` used below was undefined (NameError).
with open('grid_rf_model.text', mode='rb') as f:
    rf_best_model = pickle.load(f)

# Predict on the held-out split and report MAE and R^2.
rf_pred = rf_best_model.predict(x_test)
X = mean_absolute_error(y_test, rf_pred)
print("モデルの平均絶対誤差は", X)
Y = r2_score(y_test, rf_pred)
print("モデルの決定係数は", Y)

モデルの平均絶対誤差は 1617.9484923105947
モデルの決定係数は 0.8065933202105451
決定係数は以前より下がってしまいました。

グリッドサーチの範囲によっては、デフォルトより精度が低下する場合もあります。

# Three stacked diagnostic plots for the tuned random forest.
fig = plt.figure(figsize=(8, 24))

# 1) Predicted vs. actual values with a y = x reference line.
ax1 = fig.add_subplot(3, 1, 1)
plt.scatter(y_test, rf_pred, color='blue')
# Fixed: np.arrange does not exist; the NumPy function is np.arange.
x = np.arange(0, 4000)
# Fixed: `x` was computed but never drawn; the XGBoost and LightGBM sections
# plot the same reference line.
plt.plot(x, x, color='red')
plt.title('predicted_data & test_data')
plt.xlabel('test_data')
plt.ylabel('predicted_data')

# 2) Residuals (prediction - actual) against predicted values.
ax2 = fig.add_subplot(3, 1, 2)
plt.scatter(rf_pred, rf_pred - y_test, color='blue')
plt.hlines(y=0, xmin=-2000, xmax=38000, color='black')
plt.title('Residual Plot')
plt.xlabel('Predicted Values')
# Fixed: axis label typo ('Resicuals').
plt.ylabel('Residuals')
plt.grid()

# 3) Feature importances, sorted ascending so the largest bar is on top.
ax3 = fig.add_subplot(3, 1, 3)
features = x_train.columns
importances = rf_best_model.feature_importances_
indices = np.argsort(importances)
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), features[indices])
plt.title('feature_importances_')
plt.xlabel('importance')
plt.ylabel('feature')

plt.show()

XGBoost

# Reload the tuned XGBoost model saved after the grid search.
# Fixed: the original passed the undefined name `f` as the mode argument;
# the pickle has to be opened in binary read mode.
with open('grid_xgb_model.text', mode='rb') as f:
    xgb_best_model = pickle.load(f)

# Predict on the held-out split and report MAE and R^2.
xgb_pred = xgb_best_model.predict(x_test)
X = mean_absolute_error(y_test, xgb_pred)
print('XGBモデルの平均絶対誤差は', X)
Y = r2_score(y_test, xgb_pred)
print('モデルの決定係数は', Y)

モデルの平均絶対誤差は 1141.8473923665006
モデルの決定係数は 0.8892338503360859
決定係数は前回のモデルよりも高くなりました。最適化の効果が出ていますね。

可視化してみると、前回のモデルよりも微妙に誤差が減少しているのが分かると思います。

# Three stacked diagnostic plots for the tuned XGBoost model.
fig = plt.figure(figsize=(8, 24))

# 1) Predicted vs. actual values with a y = x reference line.
ax1 = fig.add_subplot(3, 1, 1)
plt.scatter(y_test, xgb_pred, color='blue')
# Fixed: np.arrange does not exist; the NumPy function is np.arange.
x = np.arange(0, 4000)
plt.plot(x, x, color='red')
plt.title('predicted_data & test_data')
# Fixed: pyplot has no xtitle/ytitle; axis labels are set with xlabel/ylabel.
plt.xlabel('test_data')
plt.ylabel('predicted_data')

# 2) Residuals (prediction - actual) against predicted values.
ax2 = fig.add_subplot(3, 1, 2)
plt.scatter(xgb_pred, xgb_pred - y_test, color='blue')
plt.hlines(y=0, xmin=-2000, xmax=38000, color='black')
plt.title('Residual Plot')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.grid()

# 3) Feature importances, sorted ascending so the largest bar is on top.
# Fixed: add_subplot is a Figure method, not a pyplot function.
ax3 = fig.add_subplot(3, 1, 3)
feature = x_train.columns
# Fixed: use the tuned model (the original read the pre-tuning `xgb_model`,
# unlike the random forest and LightGBM sections).
importances = xgb_best_model.feature_importances_
indices = np.argsort(importances)
# Fixed: mismatched bracket (`importances[indices}`) and trailing free text
# that was not a comment.
plt.barh(range(len(indices)), importances[indices], align='center')  # bar positions, importance values
plt.yticks(range(len(indices)), feature[indices])  # tick positions, feature names
plt.title('feature_importances')
# Fixed: axis label typo ('improtances').
plt.xlabel('importances')
plt.ylabel('feature')

plt.show()

最適化したLightGBM

# Reload the tuned LightGBM model saved after the grid search.
# Fixed: the file is written as 'grid_gbm_model.text'; the original tried to
# open 'grid_gbm.model.text', which is never created.
with open('grid_gbm_model.text', mode='rb') as f:
    gbm_best_model = pickle.load(f)

# Predict on the held-out split and report MAE and R^2.
# Fixed: the estimator method is predict(), not pred().
gbm_pred = gbm_best_model.predict(x_test)
X = mean_absolute_error(y_test, gbm_pred)
print('モデルの平均絶対誤差は', X)
Y = r2_score(y_test, gbm_pred)
print('モデルの決定係数は', Y)

モデルの平均絶対誤差は 1217.8595305590084
モデルの決定係数は 0.8783477361882305
こちらは前回のモデルとあまり変わらないですね。

# Three stacked diagnostic plots for the tuned LightGBM model.
fig = plt.figure(figsize=(8, 24))

# 1) Predicted vs. actual values with a y = x reference line.
# Fixed: add_subplot is a Figure method, not a pyplot function.
ax1 = fig.add_subplot(3, 1, 1)
plt.scatter(y_test, gbm_pred, color='blue')
# Fixed: np.arrange does not exist; the NumPy function is np.arange.
x = np.arange(0, 4000)
plt.plot(x, x, color='red')
plt.title('predicted_data & test_data')
plt.xlabel('test_data')
plt.ylabel('predicted_data')

# 2) Residuals (prediction - actual) against predicted values.
ax2 = fig.add_subplot(3, 1, 2)
plt.scatter(gbm_pred, gbm_pred - y_test, color='blue')
plt.hlines(y=0, xmin=-2000, xmax=38000, color='black')
plt.title('Residual Plot')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.grid()

# 3) Feature importances, sorted ascending so the largest bar is on top.
# Fixed: trailing free text after the call was not a comment.
ax3 = fig.add_subplot(3, 1, 3)  # must be called on fig, not plt
feature = x_train.columns
importances = gbm_best_model.feature_importances_
indices = np.argsort(importances)
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), feature[indices])
plt.title('feature_importances')
# Fixed: pyplot has no xtitle/ytitle; axis labels are set with xlabel/ylabel.
plt.xlabel('importances')
plt.ylabel('feature')

plt.show()