決定木解析：過学習

ノイズを使ったモデルは、トレーニングデータにはよく当てはまりますが、テストデータにはあまり当てはまりません。
この状態が、過学習です。

モデルは複雑にするほど、過学習の傾向が出ます。
決定木では、max_depthオプションでモデルの複雑さを変更できました。

モデルの複雑さとスコア（正解率）がどう変わるか見てみましょう。

折れ線グラフで、横軸にモデルの複雑さ、縦軸にスコアを見ていきます。
青い線がトレーニングデータ、オレンジの線がテストデータです。
黒い線は、max_depthオプションを指定しない時のテストデータのスコアです。

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap


def arange2(a, num):
    """Return ``num + 1`` evenly spaced values from ``a.min()`` to ``a.max()``.

    The range between the minimum and the maximum of ``a`` is divided into
    ``num`` equal intervals, and both end points are included in the result.

    Args:
        a: numpy.ndarray whose minimum and maximum define the range.
        num: Number of intervals; the result has ``num + 1`` elements.

    Returns:
        1-D numpy.ndarray of ``num + 1`` values.
    """
    amin, amax = a.min(), a.max()
    # linspace fixes the number of points and hits the end point exactly,
    # unlike arange with a float step and an absolute tolerance on the stop
    # value; it also copes with amin == amax (a zero step).
    return np.linspace(amin, amax, num + 1)


def plot_tree(clf, X, y, num=50, colors=('red', 'blue')):
    """
    Draw the predictions of ``clf`` as filled contours and overlay the samples.

    The plane spanned by ``X.iloc[:, 0]`` and ``X.iloc[:, 1]`` is covered with
    a grid, ``clf.predict`` is evaluated on every grid point and the result is
    drawn with ``contourf``. The samples are then scattered on top, one colour
    per 0/1 label in ``y``.

    Args:
        clf: Fitted estimator exposing ``predict``.
        X: pandas.DataFrame whose first two columns are the plotted features.
        y: pandas.Series of 0/1 labels aligned with ``X``.
        num: Number of grid intervals along each axis.
        colors: Colours used for label 0 and label 1, in that order.
    """
    # Imported locally so the helper does not rely on a later cell having
    # imported pandas before it is called.
    import pandas as pd

    columns = X.columns
    X, y = X.to_numpy(), y.to_numpy()
    xx0, xx1 = np.meshgrid(arange2(X[:, 0], num), arange2(X[:, 1], num))
    # The grid is wrapped in a DataFrame carrying the original column names
    # (presumably so predict sees the same feature names as in training).
    grid = pd.DataFrame([xx0.ravel(), xx1.ravel()], index=columns).T
    Z = clf.predict(grid).reshape(xx0.shape)
    plt.contourf(xx0, xx1, Z, alpha=0.4, cmap=ListedColormap(colors))
    for label in range(2):
        mask = y == label
        plt.scatter(X[mask, 0], X[mask, 1], alpha=0.8, c=colors[label])
    plt.show()

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the data set
df = pd.read_csv('input/data14.csv')

# Separate the feature matrix X from the target variable y
X, y = df[['x0', 'x1']], df['y']

# Check the scores of a decision tree
def check_tree(X, y, max_depth):
    """Fit a decision tree on a fixed train/test split and score it.

    Args:
        X: Feature matrix.
        y: Target variable.
        max_depth: Value passed to ``DecisionTreeClassifier(max_depth=...)``.

    Returns:
        Tuple ``(fitted tree, score on the training part, score on the test part)``.
    """
    # 30% of the rows are held out; random_state=0 keeps the split identical
    # across calls so that different max_depth values are comparable.
    split = train_test_split(X, y, test_size=0.3, random_state=0)
    X_fit, X_held, y_fit, y_held = split

    clf = DecisionTreeClassifier(max_depth=max_depth, random_state=0)
    clf.fit(X_fit, y_fit)

    fit_score = clf.score(X_fit, y_fit)
    held_score = clf.score(X_held, y_held)
    return clf, fit_score, held_score

# Training and test scores of the default decision tree (no depth limit)
tree0, score_train0, score_test0 = check_tree(X, y, max_depth=None)
print('train', score_train0, 'test', score_test0)
plot_tree(clf=tree0, X=X, y=y)

train 1.0 test 0.7222222222222222

# Model complexity versus score: fit one tree per depth from 1 to 9
max_depths = list(range(1, 10))
score_trains, score_tests = [], []
for max_depth in max_depths:
    _, score_train, score_test = check_tree(X, y, max_depth=max_depth)
    print('max_depth', max_depth, 'train', score_train, 'test', score_test)
    score_trains += [score_train]
    score_tests += [score_test]

max_depth 1 train 0.746031746031746 test 0.6481481481481481
max_depth 2 train 0.753968253968254 test 0.6851851851851852
max_depth 3 train 0.8650793650793651 test 0.7592592592592593
max_depth 4 train 0.9285714285714286 test 0.7962962962962963
max_depth 5 train 0.9444444444444444 test 0.7777777777777778
max_depth 6 train 0.9603174603174603 test 0.7592592592592593
max_depth 7 train 0.9682539682539683 test 0.7592592592592593
max_depth 8 train 0.9841269841269841 test 0.7407407407407407
max_depth 9 train 1.0 test 0.7222222222222222

# Plot score against model complexity
plt.xlabel('max_depth')
plt.ylabel('score')
# Reference line: test score of the tree fitted without max_depth.
# The colour is pinned to black because the hlines default follows
# rcParams['lines.color'] on recent Matplotlib, which would make the line
# indistinguishable from the first plotted curve.
plt.hlines(score_test0, max_depths[0], max_depths[-1], colors='black')
plt.plot(max_depths, score_trains, label='train')
plt.plot(max_depths, score_tests, label='test')
plt.text(5, 0.7, 'max_depth=default')
# Trailing semicolon suppresses the Legend repr in the notebook output
plt.legend();

# The best-fitting model (max_depth=4 gave the highest test score)
tree4, score_train4, score_test4 = check_tree(X, y, max_depth=4)
print('train', score_train4, 'test', score_test4)
plot_tree(clf=tree4, X=X, y=y)

train 0.9285714285714286 test 0.7962962962962963

グラフを見ると次のことがわかります。

モデルを複雑にするほど、トレーニングのスコアが上がります。
モデルを複雑にすると、テストのスコアは、上がってから下がります。
max_depthオプションを指定しないと複雑なモデルになります。
テストのスコアが良いのが良いモデルです。
今回は、max_depthが4のときに、最も良くなっています。

モデルは単純すぎても複雑すぎても悪くなることに注意しましょう。