# 공통
import os
from warnings import simplefilter

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import numpy as np
import pandas as pd
from scipy.io import loadmat
from sklearn.base import BaseEstimator
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
# Silence FutureWarning messages for the rest of the run.
simplefilter(action='ignore', category=FutureWarning)
# Seed NumPy's global RNG so the run is repeatable.
np.random.seed(42)

# `%matplotlib inline` is IPython-only syntax and a SyntaxError in plain
# Python; issue the same magic through the IPython API when a shell exists.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    # Not running under IPython/Jupyter: keep matplotlib's default backend.
    pass

# Use the malgun.ttf font so the Korean plot labels render. Skip the override
# when the font file is missing (e.g. on a non-Windows machine) instead of
# failing before any analysis runs.
FONT_PATH = "c:/Windows/Fonts/malgun.ttf"
if os.path.exists(FONT_PATH):
    font_name = font_manager.FontProperties(fname=FONT_PATH).get_name()
    rc('font', family=font_name)

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Folder where figures are saved.
PROJECT_ROOT_DIR = "C:/Users/Admin/Desktop/ML/"
# Alternative roots kept for other machines:
# PROJECT_ROOT_DIR = "C:/Users/sally/Desktop/ML/"
# PROJECT_ROOT_DIR = "C:/Users/User/Desktop/ML/"
# PROJECT_ROOT_DIR = "C:/Users/sally/Dropbox/2019-Fall-Semester/ML"
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as ``<IMAGES_PATH>/<fig_id>.png``.

    Args:
        fig_id: File name of the figure, without the ``.png`` extension.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
    """
    path = os.path.join(IMAGES_PATH, fig_id + ".png")
    # savefig does not create missing directories; make sure the target exists
    # so the first save on a fresh machine does not fail.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
# Reference: https://docs.scipy.org/doc/scipy-0.19.0/reference/generated/scipy.io.loadmat.html
# Load the MNIST data from a MATLAB .mat file (path is relative to the
# current working directory) and inspect what was read.
mnist_path = os.path.join("datasets","mnist","mnist-original.mat")
mnist_raw = loadmat(mnist_path)
print(type(mnist_raw))
print("")
print(mnist_raw.keys())
print("")
print(type(mnist_raw['data']))
print("")
print(mnist_raw['data'].shape)
print("")
print(mnist_raw['label'].shape)
# Repackage the raw arrays into a dict with "data"/"target" entries.
mnist = {
"data": mnist_raw["data"].T, # transposed into a 70000 x 784 matrix
"target": mnist_raw["label"][0],# first row
"COL_NAMES": ["label", "data"],
"DESCR": "mldata.org dataset: mnist-original"
}
X, y = mnist["data"], mnist["target"]
print(X.shape )
print("")
print(y.shape)
# Pick one sample to use as a running example below.
some_digit = X[36000]
some_digit.shape # 28 x 28 pixels; each feature is a pixel intensity from 0 (white) to 255 (black)
# Reshape the flat feature vector into a 2-D image for plotting.
some_digit_image = some_digit.reshape(28, 28)
# Reference: https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.imshow.html
# Show the example digit with the binary colormap and save the figure.
plt.imshow(some_digit_image, cmap = matplotlib.cm.binary,
interpolation="nearest")
save_fig("some_digit_plot")
plt.show()
# Label of the example digit.
y[36000]
# First 60000 samples form the training set, the remainder the test set.
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
# Shuffle the training set with a fixed seed so the order is reproducible.
np.random.seed(42)
shuffle_index = np.random.permutation(60000)
shuffle_index[:10]
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
y_train
# Boolean targets for the binary task "is this digit a 5?".
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
y_train_5
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
# Train an SGD classifier on the binary "5 vs. not 5" targets.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)
# The sample is wrapped in a list so predict receives a batch of one.
sgd_clf.predict([some_digit])
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
# Accuracy: the fraction of predictions that are exactly right.
correct = cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
print(correct)
# Mean and standard deviation of the per-fold accuracies.
print(correct.mean(),correct.std())
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
# Stratified sampling keeps the class ratio the same in every fold.
# random_state is omitted on purpose: it only matters together with
# shuffle=True, and recent scikit-learn releases raise ValueError when it is
# set while shuffle is left at its default of False.
skfolds = StratifiedKFold(n_splits=3)

# Manual cross-validation: fit on the training folds and print the accuracy
# on the held-out fold.
for train_index, test_index in skfolds.split(X_train, y_train_5):
    # Fit a fresh copy per fold so the already-trained sgd_clf is not
    # overwritten by a model that has seen only part of the training set.
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # np.sum counts the matches in native code instead of a Python-level loop.
    n_correct = np.sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
class Never5Classifier(BaseEstimator):
    """Baseline classifier that predicts ``False`` for every sample."""

    def fit(self, X, y=None):
        """Learn nothing from the data.

        Returns:
            ``self``, following the scikit-learn estimator convention so the
            call can be chained or used by tools that rely on the return value.
        """
        return self

    def predict(self, X):
        """Return an all-``False`` boolean array of shape ``(len(X), 1)``."""
        return np.zeros((len(X), 1), dtype=bool)
# 3-fold accuracy of the always-False baseline on the "is it a 5?" targets.
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html
# Cross-validated (3-fold) predictions of the SGD classifier for the training set.
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
y_train_pred
# Reference: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
# Confusion matrix of the true labels against the cross-validated predictions.
confusion_matrix(y_train_5, y_train_pred)
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html
# Precision and recall of the cross-validated SGD predictions.
precision_score(y_train_5, y_train_pred)
recall_score(y_train_5, y_train_pred)
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
# F1 score of the cross-validated SGD predictions.
f1_score(y_train_5, y_train_pred)
# Repeat the evaluation for the always-False baseline.
never_5_pred = cross_val_predict(never_5_clf, X_train, y_train_5, cv=3)
confusion_matrix(y_train_5, never_5_pred)
# The precision printout is left commented out (reason not stated; presumably
# because the baseline never predicts the positive class - TODO confirm):
# print("precision=", precision_score(y_train_5, never_5_pred))
print("재현율=",recall_score(y_train_5, never_5_pred))
print("F1 스코어=",f1_score(y_train_5, never_5_pred))
# Ask for decision-function scores instead of class predictions.
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
method="decision_function")
y_scores
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html
# Precision/recall pairs and the thresholds derived from the decision scores.
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
thresholds.shape
np.unique(y_scores)
# Compare the number of thresholds with the number of scores.
thresholds.shape, y_scores.shape
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw precision and recall as functions of the decision threshold.

    The last element of ``precisions`` and of ``recalls`` is dropped before
    plotting so each series is drawn against ``thresholds``.
    """
    curves = (
        (precisions, "b--", "정밀도"),
        (recalls, "g-", "재현율"),
    )
    for values, line_style, legend_text in curves:
        plt.plot(thresholds, values[:-1], line_style, label=legend_text, linewidth=2)

    plt.xlabel("임계값", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    # Both metrics live in [0, 1].
    plt.ylim([0, 1])
# Plot precision and recall against the threshold, limit the x range, and save.
plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.xlim([-700000, 700000])
save_fig("precision_recall_vs_threshold_plot")
plt.show()
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y axis) against recall (x axis) on the unit square."""
    x_values, y_values = recalls, precisions
    plt.plot(x_values, y_values, "b-", linewidth=2)

    # Label the axes with the same font size.
    for set_label, text in ((plt.xlabel, "재현율"), (plt.ylabel, "정밀도")):
        set_label(text, fontsize=16)

    plt.axis([0, 1, 0, 1])
# Plot precision against recall and save the figure.
plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)
save_fig("precision_vs_recall_plot")
plt.show()
# Classify as positive only when the decision score exceeds 70000, then
# measure precision and recall at that threshold.
# NOTE(review): the "_90" suffix presumably refers to a ~90% precision target - confirm.
y_train_pred_90 = (y_scores > 70000)
precision_score(y_train_5, y_train_pred_90)
recall_score(y_train_5, y_train_pred_90)
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
# False/true positive rates for the SGD decision scores.
# NOTE: this rebinds `thresholds`, replacing the precision/recall thresholds.
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve together with the diagonal of a random classifier.

    Args:
        fpr: False positive rates (x axis).
        tpr: True positive rates (y axis).
        label: Optional legend label for the curve.
    """
    unit_interval = [0, 1]
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # ROC curve of a random classifier.
    plt.plot(unit_interval, unit_interval, 'k--')
    plt.axis(unit_interval + unit_interval)
    plt.xlabel('거짓 양성 비율', fontsize=16)
    plt.ylabel('진짜 양성 비율', fontsize=16)
# Plot the SGD classifier's ROC curve and save the figure.
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
save_fig("roc_curve_plot")
plt.show()
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
# Area under the ROC curve for the SGD decision scores.
roc_auc_score(y_train_5, y_scores)
# Cross-validated class probabilities from a random forest on the same task.
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,
method="predict_proba")
y_probas_forest
y_scores_forest = y_probas_forest[:, 1] # score = probability of the positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)
# Overlay the SGD and random forest ROC curves for comparison.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "랜덤 포레스트")
plt.legend(loc="lower right", fontsize=16)
save_fig("roc_curve_comparison_plot")
plt.show()
# Multiclass: refit the same SGD classifier on the full digit labels.
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])
# Reference: https://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html
# Decision scores of the multiclass SGD model for the example digit.
some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores
# Position of the highest score.
np.argmax(some_digit_scores)
# Class labels known to the classifier, and the label stored at index 5.
sgd_clf.classes_
sgd_clf.classes_[5]
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsOneClassifier.html
# Wrap the SGD classifier in a one-vs-one multiclass strategy and fit it.
ovo_clf = OneVsOneClassifier(sgd_clf)
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])
ovo_clf.decision_function([some_digit])
# Number of underlying binary estimators that were trained.
len(ovo_clf.estimators_)
# Collect, for each underlying one-vs-one estimator, its decision score and
# predicted label for some_digit.
n_estimators = len(ovo_clf.estimators_)
# Decision scores are real-valued: a float buffer avoids the silent
# truncation that assigning into an integer array (np.arange) would apply.
df = np.empty(n_estimators, dtype=float)
cl = np.empty(n_estimators, dtype=int)
for i, estimator in enumerate(ovo_clf.estimators_):
    # Both calls return a length-1 array for the single sample; take the
    # scalar explicitly rather than relying on array-to-scalar conversion.
    df[i] = estimator.decision_function([some_digit])[0]
    cl[i] = estimator.predict([some_digit])[0]
# First five (score, label) rows.
np.c_[df, cl][:5]
# Recompute the first estimator's decision score by hand: x . w^T + b.
np.array([some_digit]).dot(ovo_clf.estimators_[0].coef_.T)+ovo_clf.estimators_[0].intercept_
# Fit the random forest on the multiclass digit labels and inspect its
# prediction and class probabilities for the example digit.
forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])
forest_clf.predict_proba([some_digit])
# 3-fold accuracy of each model on the multiclass task.
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")
cross_val_score(ovo_clf, X_train, y_train, cv=3, scoring="accuracy")
cross_val_score(forest_clf, X_train, y_train, cv=3, scoring="accuracy")
cross_val_score(never_5_clf, X_train, y_train, cv=3, scoring="accuracy")
# Standardize the features and re-evaluate the SGD classifier.
scaler = StandardScaler()
# uint8 : unsigned integer (the data is cast to float64 before scaling)
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
y_train_large = (y_train >= 7) # whether the digit is 7 or greater
y_train_odd = (y_train % 2 == 1) # whether the digit is odd
# Two boolean labels per sample, stacked as columns.
y_multilabel = np.c_[y_train_large, y_train_odd]
y_multilabel.shape
# Train a k-nearest-neighbours classifier on the two-label targets.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)
knn_clf.predict([some_digit])
# Cross-validated multilabel predictions, then the macro-averaged F1 score.
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3, n_jobs=-1)
f1_score(y_multilabel, y_train_knn_pred, average="macro")
# Reference: https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.randint.html
# Build a denoising task: the inputs are the images with random integer noise
# drawn from [0, 100) added to every one of the 784 pixels.
noise = np.random.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
noise = np.random.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
# The targets are the original images without noise.
y_train_mod = X_train
y_test_mod = X_test
def plot_digit(data):
    """Render ``data`` as a 28x28 image with the axes hidden."""
    side = 28
    pixels = data.reshape(side, side)
    plt.imshow(pixels, cmap=matplotlib.cm.binary, interpolation="nearest")
    plt.axis("off")
some_index = 5500
# Left: noisy input image; right: its clean target.
plt.subplot(121); plot_digit(X_test_mod[some_index])
plt.subplot(122); plot_digit(y_test_mod[some_index])
save_fig("noisy_digit_example_plot")
plt.show()
# Train k-NN to map noisy images to clean ones, then predict and plot the
# cleaned version of the sample shown above.
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[some_index]])
plot_digit(clean_digit)