Support Vector Machine 機械学習

ハードマージン
ソフトマージン

ハードマージン

# Data setup.
# No import for load_iris, train_test_split, np or plt is visible in this
# file, so bring them into scope here (the file imports per cell).
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
x = iris.data[:, [2, 3]]  # columns 2 and 3: petal_length and petal_width
y = iris.target
# random_state is fixed so the train/test split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, random_state=0)

# "Hard margin" section: a linear SVM trained with LinearSVC's defaults.
# NOTE(review): C is not set here, so the library default applies and the
# fit is still penalised (soft-margin) - confirm against the LinearSVC docs.
from sklearn.svm import LinearSVC

# fit() returns the estimator, so construction and training are chained.
svml1 = LinearSVC(random_state=0).fit(X_train, Y_train)

# Evaluation: score on the held-out test split.
svml1_score = svml1.score(X_test, Y_test)
print('linear SVM score:{}'.format(svml1_score))

linear SVM score:0.8157894736842105

# Cross-validation
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Try 5-fold stratified cross-validation.
stratifiedkfold = StratifiedKFold(n_splits=5)

# Score the model on the same two petal columns, over the full data set.
svml1_scores = cross_val_score(
    svml1,
    iris.data[:, [2, 3]],
    iris.target,
    cv=stratifiedkfold,
)
print('linear SVM Cross-Validation scores:{}'.format(svml1_scores))
print('linear SVM Average score:{}'.format(svml1_scores.mean()))

linear SVM Cross-Validation scores:[0.96666667 0.96666667 0.9        0.9        0.96666667]
linear SVM Average score:0.9400000000000001

また、mlxtendというモジュールで識別境界を可視化できる。

from mlxtend.plotting import plot_decision_regions

plt.style.use('ggplot')

# Join the train and test splits again so the plot receives both.
x_combined = np.concatenate((X_train, X_test), axis=0)
y_combined = np.concatenate((Y_train, Y_test))

# Draw the decision regions of the first linear SVM over the two features.
fig = plt.figure(figsize=(13, 8))
plot_decision_regions(x_combined, y_combined, clf=svml1)
plt.xlabel('petal_length')
plt.ylabel('petal_width')
plt.show()

ソフトマージン

パラメータCの値を5種類用意して、それぞれについてモデルを作って性能を計測する。

# Parameter setup (a simple grid search over the LinearSVC parameter C).
parameter = [0.1, 1, 10, 100, 1000]
score3 = []

# Iterate over the grid itself so the loop always matches the list length.
for c_value in parameter:
    svmls = LinearSVC(C=c_value, random_state=0)
    svmls.fit(X_train, Y_train)
    score3.append(svmls.score(X_test, Y_test))

# Evaluation
print(score3)  # in the recorded output, C=10, 100 and 1000 tie for the best score
# Keep the lowest score inside the axes: a fixed lower limit of 0.8 hides
# any score below it (the recorded C=0.1 score is about 0.66).
plt.ylim((min(0.8, min(score3) - 0.05), 1.0))
# C spans four orders of magnitude, so use a logarithmic x axis.
plt.xscale('log')
plt.xlabel("parameter C")
plt.ylabel("score")
plt.plot(parameter, score3, marker='o')
plt.show()

[0.6578947368421053, 0.8157894736842105, 0.9473684210526315, 0.9473684210526315, 0.9473684210526315]

グリッドサーチの結果、C=10、C=100、C=1000の場合にテストスコアが同率で最高(約0.947)になると分かった。

C=10の場合で識別境界を可視化したものが下図である。

# Refit the linear SVM with C=10, then repeat the evaluation and the plot.
svmls = LinearSVC(C=10, random_state=0).fit(X_train, Y_train)

# Evaluation: score on the held-out test split.
svmls_score = svmls.score(X_test, Y_test)
print('linear C-SVM score:{}'.format(svmls_score))

# Cross-validate with the same splitter and feature columns as before.
svmls_scores = cross_val_score(
    svmls,
    iris.data[:, [2, 3]],
    iris.target,
    cv=stratifiedkfold,
)
print('linear C-SVM Cross-Validation scores:{}'.format(svmls_scores))
print('linear C-SVM Average score:{}'.format(svmls_scores.mean()))

plt.style.use('ggplot')

# Draw the decision regions of the C=10 model over the combined samples.
fig = plt.figure(figsize=(13, 8))
plot_decision_regions(x_combined, y_combined, clf=svmls)
plt.xlabel('petal_length')
plt.ylabel('petal_width')
plt.show()

linear C-SVM score:0.9473684210526315
linear C-SVM Cross-Validation scores:[1.         0.96666667 0.93333333 0.93333333 1.        ]
linear C-SVM Average score:0.9666666666666668