Cheat Sheet (lab quality; use at your own risk)
scikit-learn
Datasets  Preprocessing 
SVM (Support Vector Machines)  Linear Models  Decision Trees  Dimensionality Reduction  Clustering 
Accuracy & Evaluation Metrics 

Datasets
List  Iris  Diabetes  Handwritten Digits  Wine Recognition  Breast Cancer Diagnosis 
make_blobs  make_regression 

Datasets (external link): https://scikit-learn.org/stable/datasets

・List
from sklearn import datasets
for element in dir(datasets):
    print(element, end=' | ')
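The loader names follow a convention: load_* functions return the small bundled datasets, fetch_* functions download larger ones on first use, and make_* functions generate synthetic data. A minimal sketch that groups the listing above by prefix:

from sklearn import datasets
# Group the module contents by naming convention
for prefix in ('load_', 'fetch_', 'make_'):
    names = [n for n in dir(datasets) if n.startswith(prefix)]
    print(prefix, len(names), 'functions, e.g.', names[:3])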
・Iris
from sklearn.datasets import load_iris
data = load_iris()
print(data.keys())
print(data.data.shape, data.target.shape)
・Diabetes
from sklearn.datasets import load_diabetes
data = load_diabetes()
print(data.keys())
print(data.data.shape, data.target.shape)
・Handwritten Digits
from sklearn.datasets import load_digits
data = load_digits()
print(data.keys())
print(data.data.shape, data.target.shape)
・Wine Recognition
from sklearn.datasets import load_wine
data = load_wine()
print(data.keys())
print(data.data.shape, data.target.shape)
・Breast Cancer Diagnosis
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
print(data.keys())
print(data.data.shape, data.target.shape)
・make_blobs
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
plt.style.use('dark_background')
x, y = make_blobs(n_samples=100, n_features=2, centers=3, random_state=2022)
plt.scatter(x[:,0], x[:,1], c=y);  # color the points by their generated cluster label
・make_regression
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
plt.style.use('dark_background')
x, y = make_regression(n_samples=100, n_features=1, noise=3, random_state=2022)
plt.scatter(x, y);

Preprocessing
Standardization  Normalization 
One-Hot  Multilabel One-Hot  Encoding Categorical Variables 
(Data splitting)
train_test_split  KFold  StratifiedKFold 

・Standardization

StandardScaler (external link): https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

# Mean 0, variance 1
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = load_iris()
X = data.data
y = data.target
# Data split (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

ss = StandardScaler()
# Fit on the training data only, then apply the same transform to the test data
ss.fit(X_train)
X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

print('X_train')
print(pd.DataFrame(X_train).describe()['mean':'std'].round(), '\n')
print('X_train_ss')
print(pd.DataFrame(X_train_ss).describe()['mean':'std'].round(), '\n')
print('X_test_ss')
print(pd.DataFrame(X_test_ss).describe()['mean':'std'].round())
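When fitting and transforming the same data, the two calls can be combined; the split version above is only needed because the test set must reuse the training statistics:

# Equivalent to ss.fit(X_train) followed by ss.transform(X_train)
X_train_ss = ss.fit_transform(X_train)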
・Normalization

MinMaxScaler (external link): https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

# Range: 0 to 1
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

data = load_iris()
X = data.data
y = data.target
# Data split (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

mms = MinMaxScaler()
# Fit on the training data only, then apply the same transform to the test data
mms.fit(X_train)
X_train_mms = mms.transform(X_train)
X_test_mms = mms.transform(X_test)

print('X_train')
print(pd.DataFrame(X_train).describe().loc[['min', 'max']], '\n')
print('X_train_mms')
print(pd.DataFrame(X_train_mms).describe().loc[['min', 'max']], '\n')
print('X_test_mms')
print(pd.DataFrame(X_test_mms).describe().loc[['min', 'max']].round(1))
・One-Hot

LabelBinarizer (external link): https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html

from sklearn.preprocessing import LabelBinarizer
data = ['a', 'b', 'c', 'b', 'd', 'a', 'e', 'c', 'd', 'b']
print(data)
lb = LabelBinarizer()
print(lb.fit_transform(data))
print('.classes_')
print(lb.classes_)
print('.inverse_transform')
print(lb.inverse_transform(lb.transform(data)))
・Multilabel One-Hot

MultiLabelBinarizer (external link): https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html

from sklearn.preprocessing import MultiLabelBinarizer
data = [['a', 'b'], ['c', 'b'], ['d', 'a'], ['e', 'c'], ['d', 'b']]
print(data)
mb = MultiLabelBinarizer()
print(mb.fit_transform(data))
・Encoding Categorical Variables

LabelEncoder (external link): https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

from sklearn.preprocessing import LabelEncoder
data = ['a', 'b', 'c', 'b', 'c', 'a', 'a', 'c', 'b', 'b']
print(data)
le = LabelEncoder()
le.fit(data)
print('.classes_:', le.classes_)
print(le.transform(data))
・train_test_split

train_test_split (external link): https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

data = load_iris()
X = data.data
y = data.target

# train_test_split (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=0, shuffle=True)
print('(class_array, count_array)')
print('y_train:', np.unique(y_train, return_counts=True))
print('y_test:', np.unique(y_test, return_counts=True))
・KFold

KFold (external link): https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold

data = load_iris()
X = data.data
y = data.target

kf = KFold(n_splits=3, random_state=0, shuffle=True)
print('(class_array, count_array)')
count = 1
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(count)
    print('y_train:', np.unique(y_train, return_counts=True))
    print('y_test:', np.unique(y_test, return_counts=True))
    count += 1
・StratifiedKFold

StratifiedKFold (external link): https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold

data = load_iris()
X = data.data
y = data.target

skf = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)
print('(class_array, count_array)')
count = 1
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(count)
    print('y_train:', np.unique(y_train, return_counts=True))
    print('y_test:', np.unique(y_test, return_counts=True))
    count += 1
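In practice these fold generators are usually passed to a cross-validation helper rather than looped over by hand. A minimal sketch, assuming an SVC classifier purely for illustration, that feeds the StratifiedKFold above into cross_val_score:

from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# One accuracy score per fold, using skf from above as the splitter
scores = cross_val_score(SVC(), X, y, cv=skf)
print('Fold accuracies:', scores.round(3))
print('Mean accuracy:', scores.mean().round(3))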

SVM (Support Vector Machines)
SVC  LinearSVC

・SVC

SVC (external link): https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

# Support Vector Classification
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

data = load_iris()
X = data.data
y = data.target
# Data split (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

model = SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred).round(3))
・LinearSVC

LinearSVC (external link): https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

data = load_iris()
X = data.data
y = data.target
# Data split (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

model = LinearSVC(max_iter=10000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred).round(3))

Linear Models
SGD (Stochastic Gradient Descent) 
・SGD (Stochastic Gradient Descent)

SGDClassifier (external link): https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html

from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

data = load_digits()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

model = SGDClassifier(loss="log_loss")
model.fit(X_train, y_train)
print('Accuracy:', model.score(X_test, y_test).round(3))

Decision Trees
Decision Tree  Random Forest 
・Decision Tree

Classification

DecisionTreeClassifier (external link): https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

data = load_iris()
X = data.data
y = data.target
# Data split (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
print('Accuracy:', model.score(X_test, y_test).round(3))
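The fitted tree can also be drawn, which is often the quickest sanity check of its splits. A minimal sketch using sklearn.tree.plot_tree (the figure size is an arbitrary choice):

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(12, 6))
# Visualize the splits of the fitted model, colored by majority class
plot_tree(model, feature_names=data.feature_names, filled=True);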

・Random Forest

Classification

RandomForestClassifier (external link): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

data = load_iris()
X = data.data
y = data.target
# Data split (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

model = RandomForestClassifier()
model.fit(X_train, y_train)
print('Accuracy:', model.score(X_test, y_test).round(3))
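A fitted forest exposes impurity-based feature importances, a quick hint at which inputs drive the predictions:

import pandas as pd
# One importance value per feature; they sum to 1
print(pd.Series(model.feature_importances_, index=data.feature_names).round(3))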

Dimensionality Reduction
・PCA

PCA (external link): https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

from sklearn.datasets import load_digits
from sklearn.decomposition import PCA

data = load_digits()
X = data.data

# n_components may be a variance ratio (0.95, 0.99, etc.); whiten: mean 0, variance 1
pca = PCA(n_components=0.99, whiten=True)
X_pca = pca.fit_transform(X)
print('Dimensions before reduction:', X.shape[1])
print('Dimensions after reduction:', X_pca.shape[1])
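The variance retained per component is available after fitting, so the 0.99 target can be verified directly:

import numpy as np
# Cumulative share of variance covered by the kept components (should be >= 0.99)
print('Cumulative variance ratio:', np.cumsum(pca.explained_variance_ratio_)[-1].round(3))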

Clustering
・KMeans

KMeans (external link): https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
plt.style.use('dark_background')

# Generate cluster data
X, y = make_blobs(1000, n_features=2, centers=3)
# Plot all the data with some transparency
plt.scatter(X[:,0], X[:,1], alpha=0.3)
# Data split (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

km = KMeans(n_clusters=3, n_init=10)
km.fit(X_train)
pred = km.predict(X_test)

# Build a DataFrame for plotting
df = pd.DataFrame({'x':X_test[:,0], 'y':X_test[:,1], 'c':pred})
# The true labels were never used for training (unsupervised learning), so verify visually
# Plot the predictions on the test data
for n in [0,1,2]:
    temp = df[df.c==n]
    plt.scatter(temp.x, temp.y, label=f'pred_class_{n}')
# Plot the cluster centers
for center in km.cluster_centers_:
    plt.scatter(center[0], center[1], s=50, c='lime', ec='k')
plt.legend();
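Since KMeans assigns cluster indices arbitrarily, pred cannot be compared to y_test with accuracy_score; a permutation-invariant score such as adjusted_rand_score is the usual numeric check:

from sklearn.metrics import adjusted_rand_score
# 1.0 means the clustering matches the true grouping up to relabeling
print('ARI:', round(adjusted_rand_score(y_test, pred), 3))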

Accuracy & Evaluation Metrics
Confusion Matrix  ROC Curve  PR Curve 
・Confusion Matrix
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report

y_true = [1,0,1,0,1,0,1,0,1,0]
y_pred = [1,0,0,0,1,0,1,1,1,1]
print('Actual:   ', y_true)
print('Predicted:', y_pred, '\n')

cm = confusion_matrix(y_true, y_pred)
print('confusion_matrix output')
print(cm, '\n')

cm_df = pd.DataFrame({'Predicted (neg: 0)': [f'TN={cm[0,0]}', f'FN={cm[1,0]}'],
                      'Predicted (pos: 1)': [f'FP={cm[0,1]}', f'TP={cm[1,1]}']},
                      index=['Actual (neg: 0)', 'Actual (pos: 1)'])
print('Confusion Matrix')
print(cm_df, '\n')

print('TP: True Positive')
print('TN: True Negative')
print('FP: False Positive')
print('FN: False Negative', '\n')

print('Accuracy = (TP+TN) / (TP+TN+FP+FN)')
print('accuracy:', accuracy_score(y_true, y_pred).round(3))
print('Precision = TP / (TP+FP)')
print('precision:', precision_score(y_true, y_pred).round(3))
print('Recall = TP / (TP+FN)')
print('recall:', recall_score(y_true, y_pred).round(3), '\n')

print('classification_report')
print(classification_report(y_true, y_pred))
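As a sanity check, the same metrics can be recomputed by hand from the confusion matrix entries:

# For binary input, ravel() yields TN, FP, FN, TP in that order
tn, fp, fn, tp = cm.ravel()
print('accuracy :', round((tp + tn) / (tp + tn + fp + fn), 3))
print('precision:', round(tp / (tp + fp), 3))
print('recall   :', round(tp / (tp + fn), 3))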
・ROC Curve

roc_curve (external link): https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html

# ROC (Receiver Operating Characteristic) curve
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
plt.style.use('dark_background')

data = load_breast_cancer()
X = data.data
y = data.target
# Data split (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Alternative: LogisticRegression(max_iter=1500) from sklearn.linear_model also works here
model = SVC()
model.fit(X_train, y_train)
print('Accuracy:', model.score(X_test, y_test).round(3))

# Use continuous scores rather than hard class predictions, so the curve gets one point per threshold
y_score = model.decision_function(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
# AUC(Area Under the Curve)
auc = metrics.auc(fpr, tpr)
print('AUC:', auc.round(3))

plt.plot(fpr, tpr)
plt.fill_between(fpr, tpr, np.zeros(len(tpr)), alpha=0.3)
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(ls=':')
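On scikit-learn 1.0 and later, the same plot can be produced in one call from the fitted estimator:

from sklearn.metrics import RocCurveDisplay
# Obtains continuous scores via decision_function / predict_proba internally
RocCurveDisplay.from_estimator(model, X_test, y_test);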
・PR Curve

precision_recall_curve (external link): https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
plt.style.use('dark_background')

data = load_breast_cancer()
X = data.data
y = data.target
# Data split (stratify: preserve class ratios via stratified sampling)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

model = SVC()
model.fit(X_train, y_train)
print('Accuracy:', model.score(X_test, y_test).round(3))

# Use continuous scores rather than hard class predictions
y_score = model.decision_function(X_test)

precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_score)
# AUC(Area Under the Curve)
auc = metrics.auc(recall, precision)
print('AUC:', auc.round(3))

plt.plot(recall, precision)
plt.fill_between(recall, precision, np.zeros(len(precision)), alpha=0.3)
plt.title('Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.grid(ls=':')
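Likewise, scikit-learn 1.0 and later offers a one-call equivalent for the PR curve:

from sklearn.metrics import PrecisionRecallDisplay
# Draws the curve from the fitted estimator's continuous scores
PrecisionRecallDisplay.from_estimator(model, X_test, y_test);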