Datasets Preprocessing
SVM (Support Vector Machine) Linear Models Decision Trees Dimensionality Reduction Clustering
Accuracy & Evaluation Metrics
Datasets
List Iris Diabetes Handwritten Digits Wine Breast Cancer
make_blobs make_regression
Datasets (external link): https://scikit-learn.org/stable/datasets
・List
from sklearn import datasets
for element in dir(datasets):
    print(element, end=' | ')
・Iris
from sklearn.datasets import load_iris
data = load_iris()
print(data.keys())
print(data.data.shape, data.target.shape)
・Diabetes
from sklearn.datasets import load_diabetes
data = load_diabetes()
print(data.keys())
print(data.data.shape, data.target.shape)
・Handwritten Digits
from sklearn.datasets import load_digits
data = load_digits()
print(data.keys())
print(data.data.shape, data.target.shape)
・Wine
from sklearn.datasets import load_wine
data = load_wine()
print(data.keys())
print(data.data.shape, data.target.shape)
・Breast Cancer
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
print(data.keys())
print(data.data.shape, data.target.shape)
・make_blobs
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
plt.style.use('dark_background')
x, y = make_blobs(n_samples=100, n_features=2, centers=3, random_state=2022)
plt.scatter(x[:,0], x[:,1]);
・make_regression
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
plt.style.use('dark_background')
x, y = make_regression(n_samples=100, n_features=1, noise=3, random_state=2022)
plt.scatter(x, y);
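Synthetic regression data like this is mainly useful for smoke-testing a regressor. A minimal sketch reusing the x and y arrays from above (LinearRegression is an arbitrary model choice):
from sklearn.linear_model import LinearRegression
# Fit a line to the synthetic data and overlay it on the scatter plot
lr = LinearRegression().fit(x, y)
plt.plot(x, lr.predict(x), c='red')
print('R^2:', round(lr.score(x, y), 3))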
Preprocessing
Standardization Normalization
One-Hot Multi-Label One-Hot Encoding Categorical Variables
(Data Splitting)
train_test_split KFold StratifiedKFold
・Standardization
StandardScaler (external link): https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# Mean 0, variance 1
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
data = load_iris()
X = data.data
y = data.target
# Split the data (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
ss = StandardScaler()
# Fit on the training data, then apply the same transform to the test data
ss.fit(X_train)
X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)
print('X_train')
print(pd.DataFrame(X_train).describe()['mean':'std'].round(), '\n')
print('X_train_ss')
print(pd.DataFrame(X_train_ss).describe()['mean':'std'].round(), '\n')
print('X_test_ss')
print(pd.DataFrame(X_test_ss).describe()['mean':'std'].round())
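Under the hood the transform is just z = (x - mean) / std, computed per column from the training data. A quick sanity check reusing ss and X_train from above:
import numpy as np
# Manual standardization matches StandardScaler (np.std defaults to ddof=0, same as the scaler)
X_manual = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0)
print(np.allclose(X_manual, ss.transform(X_train)))  # True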
・Normalization
MinMaxScaler (external link): https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# Range: 0 to 1
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
data = load_iris()
X = data.data
y = data.target
# Split the data (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
mms = MinMaxScaler()
# Fit on the training data, then apply the same transform to the test data
mms.fit(X_train)
X_train_mms = mms.transform(X_train)
X_test_mms = mms.transform(X_test)
print('X_train')
print(pd.DataFrame(X_train).describe().loc[['min', 'max']], '\n')
print('X_train_mms')
print(pd.DataFrame(X_train_mms).describe().loc[['min', 'max']], '\n')
print('X_test_mms')
print(pd.DataFrame(X_test_mms).describe().loc[['min', 'max']].round(1))
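The transform here is x' = (x - min) / (max - min) per column, and it is invertible. A minimal sketch reusing mms, X_test, and X_test_mms from above:
import numpy as np
# inverse_transform maps the scaled values back to the original units
X_restored = mms.inverse_transform(X_test_mms)
print(np.allclose(X_restored, X_test))  # True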
・One-Hot
LabelBinarizer (external link): https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
from sklearn.preprocessing import LabelBinarizer
data = ['a', 'b', 'c', 'b', 'd', 'a', 'e', 'c', 'd', 'b']
print(data)
lb = LabelBinarizer()
print(lb.fit_transform(data))
print('.classes_')
print(lb.classes_)
print('.inverse_transform')
print(lb.inverse_transform(lb.transform(data)))
・Multi-Label One-Hot
MultiLabelBinarizer (external link): https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
from sklearn.preprocessing import MultiLabelBinarizer
data = [['a', 'b'], ['c', 'b'], ['d', 'a'], ['e', 'c'], ['d', 'b']]
print(data)
mb = MultiLabelBinarizer()
print(mb.fit_transform(data))
・Encoding Categorical Variables
LabelEncoder (external link): https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
from sklearn.preprocessing import LabelEncoder
data = ['a', 'b', 'c', 'b', 'c', 'a', 'a', 'c', 'b', 'b']
print(data)
le = LabelEncoder()
le.fit(["a", "b", "c", "a"])
print(le.transform(data))
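LabelEncoder and LabelBinarizer are intended for target labels; for encoding feature columns, scikit-learn provides OneHotEncoder, which works on 2-D input. A minimal sketch (note: sparse_output requires scikit-learn 1.2+; older versions use sparse=False):
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# handle_unknown='ignore' encodes unseen categories as all zeros instead of raising
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = np.array([['a'], ['b'], ['c'], ['b']])
print(ohe.fit_transform(X_cat))
print(ohe.categories_)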
・train_test_split
train_test_split (external link): https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
data = load_iris()
X = data.data
y = data.target
# train_test_split (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=0, shuffle=True)
print('(class_array, count_array)')
print('y_train:', np.unique(y_train, return_counts=True))
print('y_test:', np.unique(y_test, return_counts=True))
・KFold
KFold (external link): https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
data = load_iris()
X = data.data
y = data.target
kf = KFold(n_splits=3, random_state=0, shuffle=True)
print('(class_array, count_array)')
count = 1
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(count)
    print('y_train:', np.unique(y_train, return_counts=True))
    print('y_test:', np.unique(y_test, return_counts=True))
    count += 1
・StratifiedKFold
StratifiedKFold (external link): https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
data = load_iris()
X = data.data
y = data.target
skf = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)
print('(class_array, count_array)')
count = 1
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(count)
    print('y_train:', np.unique(y_train, return_counts=True))
    print('y_test:', np.unique(y_test, return_counts=True))
    count += 1
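In practice these splitters are usually passed to a helper such as cross_val_score rather than looped over by hand. A minimal sketch reusing X, y, and skf from above (SVC is an arbitrary model choice):
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
# One accuracy score per stratified fold
scores = cross_val_score(SVC(), X, y, cv=skf)
print(scores.round(3), 'mean:', scores.mean().round(3))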
SVM (Support Vector Machine)
SVC LinearSVC
・SVC
SVC (external link): https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
# Support Vector Classification
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
data = load_iris()
X = data.data
y = data.target
# Split the data (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
model = SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred).round(3))
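SVC's accuracy depends heavily on hyperparameters such as C and gamma. A minimal tuning sketch reusing the variables above (the grid values are illustrative, not recommendations):
from sklearn.model_selection import GridSearchCV
# Exhaustive search over a small grid with 5-fold cross-validation
params = {'C': [0.1, 1, 10], 'gamma': ['scale', 0.1, 0.01]}
gs = GridSearchCV(SVC(), params, cv=5)
gs.fit(X_train, y_train)
print(gs.best_params_, round(gs.best_score_, 3))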
・LinearSVC
LinearSVC (external link): https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
data = load_iris()
X = data.data
y = data.target
# Split the data (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
model = LinearSVC(max_iter=10000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred).round(3))
Linear Models
SGD (Stochastic Gradient Descent)
・SGD (Stochastic Gradient Descent)
SGDClassifier (external link): https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
data = load_digits()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
model = SGDClassifier(loss="log_loss")
model.fit(X_train, y_train)
print('Accuracy:', model.score(X_test, y_test).round(3))
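A key practical feature of SGDClassifier is incremental learning with partial_fit, which updates the model one mini-batch at a time (useful when the data does not fit in memory). A minimal sketch reusing the variables above; the batch size of 100 is arbitrary:
import numpy as np
model2 = SGDClassifier(loss="log_loss")
classes = np.unique(y_train)
for i in range(0, len(X_train), 100):
    # The classes argument is required on the first partial_fit call
    model2.partial_fit(X_train[i:i+100], y_train[i:i+100], classes=classes)
print('Accuracy:', round(model2.score(X_test, y_test), 3))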
Decision Trees
Decision Tree Random Forest
・Decision Tree
Classification
DecisionTreeClassifier (external link): https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
data = load_iris()
X = data.data
y = data.target
# Split the data (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
print('Accuracy:', model.score(X_test, y_test).round(3))
・Random Forest
Classification
RandomForestClassifier (external link): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
data = load_iris()
X = data.data
y = data.target
# Split the data (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
model = RandomForestClassifier()
model.fit(X_train, y_train)
print('Accuracy:', model.score(X_test, y_test).round(3))
Dimensionality Reduction
・Dimensionality Reduction
PCA (external link): https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
data = load_digits()
X = data.data
# n_components=0.95, 0.99, etc.; whiten: mean 0, variance 1
pca = PCA(n_components=0.99, whiten=True)
X_pca = pca.fit_transform(X)
print('Dimensions before reduction:', X.shape[1])
print('Dimensions after reduction:', X_pca.shape[1])
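Passing a float as n_components keeps just enough components to explain that fraction of the variance; the cumulative explained_variance_ratio_ makes the trade-off visible. A minimal sketch reusing pca from above:
import numpy as np
# Cumulative share of variance explained as components are added
print(np.cumsum(pca.explained_variance_ratio_).round(3))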
Clustering
・Clustering
KMeans (external link): https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
plt.style.use('dark_background')
# Generate cluster data
X, y = make_blobs(1000, n_features=2, centers=3)
# Plot all the data with some transparency
plt.scatter(X[:,0], X[:,1], alpha=0.3)
# Split the data (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
km = KMeans(n_clusters=3, n_init=10)
km.fit(X_train)
pred = km.predict(X_test)
# Build a DataFrame for plotting
df = pd.DataFrame({'x': X_test[:,0], 'y': X_test[:,1], 'c': pred})
# The class labels were not used for training (unsupervised learning), so check the result visually
# Plot the test-set predictions
for n in [0, 1, 2]:
    temp = df[df.c==n]
    plt.scatter(temp.x, temp.y, label=f'pred_class_{n}')
# Plot the cluster centers
for center in km.cluster_centers_:
    plt.scatter(center[0], center[1], s=50, c='lime', ec='k')
plt.legend();
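When the number of clusters is not known in advance, a common heuristic is to look for the "elbow" in inertia_ (the within-cluster sum of squared distances). A minimal sketch reusing X from above:
# Inertia drops sharply up to the true number of clusters (3 here), then flattens
for k in range(1, 7):
    km_k = KMeans(n_clusters=k, n_init=10).fit(X)
    print(k, round(km_k.inertia_, 1))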
Accuracy & Evaluation Metrics
Confusion Matrix ROC Curve PR Curve
・Confusion Matrix
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report
y_true = [1,0,1,0,1,0,1,0,1,0]
y_pred = [1,0,0,0,1,0,1,1,1,1]
print('Actual:   ', y_true)
print('Predicted:', y_pred, '\n')
cm = confusion_matrix(y_true, y_pred)
print('confusion_matrix output')
print(cm, '\n')
cm_df = pd.DataFrame({'Predicted (neg: 0)': [f'TN={cm[0,0]}', f'FN={cm[1,0]}'],
                      'Predicted (pos: 1)': [f'FP={cm[0,1]}', f'TP={cm[1,1]}']},
                     index=['Actual (neg: 0)', 'Actual (pos: 1)'])
print('Confusion Matrix')
print(cm_df, '\n')
print('TP (True Positive): correctly predicted positive')
print('TN (True Negative): correctly predicted negative')
print('FP (False Positive): incorrectly predicted positive')
print('FN (False Negative): incorrectly predicted negative', '\n')
print('Accuracy = (TP+TN) / (TP+TN+FP+FN)')
print('accuracy:', accuracy_score(y_true, y_pred).round(3))
print('Precision = TP / (TP+FP)')
print('precision:', precision_score(y_true, y_pred).round(3))
print('Recall = TP / (TP+FN)')
print('recall:', recall_score(y_true, y_pred).round(3), '\n')
print('classification_report')
print(classification_report(y_true, y_pred))
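scikit-learn can also render the confusion matrix as a heatmap (ConfusionMatrixDisplay requires scikit-learn 1.0+). A minimal sketch reusing y_true and y_pred from above:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
# Heatmap of the same confusion matrix
ConfusionMatrixDisplay.from_predictions(y_true, y_pred)
plt.show()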
・ROC Curve
roc_curve (external link): https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
# ROC (Receiver Operating Characteristic) curve
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
plt.style.use('dark_background')
data = load_breast_cancer()
X = data.data
y = data.target
# Split the data (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
model = SVC()
model.fit(X_train, y_train)
print('Accuracy:', model.score(X_test, y_test).round(3))
# Use continuous decision scores rather than hard 0/1 predictions,
# so the ROC curve is traced over many thresholds
y_score = model.decision_function(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
# AUC (Area Under the Curve)
auc = metrics.auc(fpr, tpr)
print('AUC:', auc.round(3))
plt.plot(fpr, tpr)
plt.fill_between(fpr, tpr, np.zeros(len(tpr)), alpha=0.3)
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(ls=':')
・PR Curve
precision_recall_curve (external link): https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
plt.style.use('dark_background')
data = load_breast_cancer()
X = data.data
y = data.target
# Split the data (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
model = SVC()
model.fit(X_train, y_train)
print('Accuracy:', model.score(X_test, y_test).round(3))
# Use continuous decision scores rather than hard 0/1 predictions,
# so the PR curve is traced over many thresholds
y_score = model.decision_function(X_test)
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_score)
# AUC (Area Under the Curve)
auc = metrics.auc(recall, precision)
print('AUC:', auc.round(3))
plt.plot(recall, precision)
plt.fill_between(recall, precision, np.zeros(len(precision)), alpha=0.3)
plt.title('Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.grid(ls=':')