3장 분류¶

Outline¶

. MNIST¶

. 이진분류기¶

. 오차행렬: 정확도, 재현율, ROC, AUC¶

. 다중 분류¶

. 다중 레이블 분류¶

. 다중 출력 분류¶

3.0 설정¶

# 공통
from warnings import simplefilter
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

from scipy.io import loadmat
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neighbors import KNeighborsClassifier

simplefilter(action='ignore', category=FutureWarning)

np.random.seed(42)

%matplotlib inline
# Register a font that can render the Korean axis labels used in the figures.
# The font file lives under c:/Windows/Fonts, so it is only present on Windows;
# when it is missing, keep matplotlib's default font instead of failing here.
font_path = "c:/Windows/Fonts/malgun.ttf"
if os.path.isfile(font_path):
    font_name = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=font_name)
else:
    font_name = None
    print("Font file not found: " + font_path + " (keeping the default matplotlib font)")

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['axes.unicode_minus'] = False

# Folder where figures are saved
PROJECT_ROOT_DIR = "C:/Users/Admin/Desktop/ML/"
# PROJECT_ROOT_DIR = "C:/Users/sally/Desktop/ML/"
# PROJECT_ROOT_DIR = "C:/Users/User/Desktop/ML/"
# PROJECT_ROOT_DIR = "C:/Users/sally/Dropbox/2019-Fall-Semester/ML"

# Chapter-specific subfolder name used inside the images folder
CHAPTER_ID = "classification"

# Resulting layout: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        File name without extension; ".png" is appended.
    tight_layout : bool, default True
        When true, call ``plt.tight_layout()`` before saving.
    """
    # Create the target folder on first use so that savefig does not fail
    # when the images directory has not been created yet.
    os.makedirs(IMAGES_PATH, exist_ok=True)
    path = os.path.join(IMAGES_PATH, fig_id + ".png")
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

3.1 MNIST (Modified National Institute of Standards and Technology Database)¶

. 고등학생과 미국 인구조사국 직원들이 손글씨로 쓴 7만 개의 작은 숫자 이미지 자료¶

Cap%202018-09-06%2021-27-43-376.jpg

. scipy.io.loadmat : 행렬 파일을 로드¶

https://docs.scipy.org/doc/scipy-0.19.0/reference/generated/scipy.io.loadmat.html

mnist_path = os.path.join("datasets","mnist","mnist-original.mat")
mnist_raw = loadmat(mnist_path)

print(type(mnist_raw))
print("")
print(mnist_raw.keys())
print("")
print(type(mnist_raw['data']))
print("")
print(mnist_raw['data'].shape)
print("")
print(mnist_raw['label'].shape)

<class 'dict'>

dict_keys(['__header__', '__version__', '__globals__', 'mldata_descr_ordering', 'data', 'label'])

<class 'numpy.ndarray'>

(784, 70000)

(1, 70000)

# Repackage the raw .mat contents into a dict with "data" and "target" entries.
mnist = {
    "data": mnist_raw["data"].T, # transposed: (784, 70000) -> 70000 x 784 matrix
    "target": mnist_raw["label"][0],# first (only) row of the (1, 70000) label array
    "COL_NAMES": ["label", "data"],
    "DESCR": "mldata.org dataset: mnist-original"
}

X, y = mnist["data"], mnist["target"]
print(X.shape )
print("")
print(y.shape)

(70000, 784)

(70000,)

some_digit = X[36000]
some_digit.shape # 28 * 28 픽셀이 있고 개체 특성은 0(흰색)~255(검은색)의 픽셀 강도

(784,)

some_digit_image = some_digit.reshape(28, 28)

. matplotlib.pyplot.imshow : 2차원 행렬로 저장된 이미지 자료 출력¶

https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.imshow.html

plt.imshow(some_digit_image, cmap = matplotlib.cm.binary,
           interpolation="nearest")
save_fig("some_digit_plot")
plt.show()

y[36000]

5.0

. 훈련 세트 : 60000개, 테스트 세트 : 10000개¶

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

. 학습 알고리즘이 훈련 샘플의 순서에 민감할 때는 ...¶

np.random.seed(42)
shuffle_index = np.random.permutation(60000)
shuffle_index[:10]

array([12628, 37730, 39991,  8525,  8279, 51012, 14871, 15127,  9366,
       33322])

X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

3.2 이진 분류기 훈련¶

. binary classifier : 5-감지기¶

y_train

array([1., 6., 6., ..., 0., 2., 9.])

y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
y_train_5

array([False, False, False, ..., False, False, False])

. sklearn.linear_model.SGDClassifier : 확률적 경사하강법에 의한 선형 분류¶

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html

# Pin max_iter/tol explicitly instead of relying on library defaults.
# The repr recorded for this cell shows both left as None, which scikit-learn
# 0.19/0.20 treats as max_iter=5, tol=None (emitting a FutureWarning); later
# releases default to max_iter=1000, tol=1e-3 and would give different scores.
sgd_clf = SGDClassifier(max_iter=5, tol=None, random_state=42)
sgd_clf.fit(X_train, y_train_5)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

sgd_clf.predict([some_digit])

array([ True])

3.3 성능 측정¶

3.3.1 교차 검증을 사용한 정확도 측정¶

. sklearn.model_selection.cross_val_score : 교차 검증에 의한 스코어 계산¶

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html

# 정확도(accuracy) : 정확히 예측한 비율
correct = cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
print(correct)

[0.9502  0.96565 0.96495]

print(correct.mean(),correct.std())

0.9602666666666666 0.007123942416636676

. 교차 검증 구현 과정 좀 더 들여다 보면 ...¶

. sklearn.model_selection.StratifiedKFold : 층화 3-겹 교차 검증 구현¶

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

# Stratified sampling keeps the class proportions the same in every fold.
# random_state is deliberately not passed: without shuffle=True the split is
# already deterministic, and newer scikit-learn releases raise a ValueError
# when random_state is set while shuffle is False.
skfolds = StratifiedKFold(n_splits=3)

# Generate indices to split data into training and test set.
for train_index, test_index in skfolds.split(X_train, y_train_5):
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    sgd_clf.fit(X_train_folds, y_train_folds)
    y_pred = sgd_clf.predict(X_test_fold)
    # Count correct predictions with NumPy; the builtin sum would walk the
    # boolean array element by element in Python.
    n_correct = np.sum(y_pred == y_test_fold)
    # Fold accuracy = fraction of correct predictions
    print(n_correct / len(y_pred))

0.9502
0.96565
0.96495

. 더미 분류기 (dummy classifier) : "5 아님 클래스"로만 분류¶

. sklearn.base.BaseEstimator : scikit-learn의 모든 추정기(estimator)가 상속하는 기본(base) 클래스¶

http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html

class Never5Classifier(BaseEstimator):
    """Baseline classifier that labels every sample as "not 5"."""

    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return the estimator.

        Returning ``self`` follows the scikit-learn ``fit`` convention, so the
        call can be chained like any other estimator.
        """
        return self

    def predict(self, X):
        """Return an all-False boolean array of shape (len(X), 1)."""
        return np.zeros((len(X), 1), dtype=bool)

never_5_clf = Never5Classifier()

cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")

array([0.909  , 0.90715, 0.9128 ])

. Why around 90%?¶

. 정확도(accuracy)는 좋은 측도인가?¶

3.3.2 오차 행렬 (confusion matrix)¶

. sklearn.model_selection.cross_val_predict : 개별 입력 자료의 교차 검증 추정값을 반환¶

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

y_train_pred

array([False, False, False, ..., False, False, False])

. sklearn.metrics.confusion_matrix : 오차 행렬¶

http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

confusion_matrix(y_train_5, y_train_pred)

array([[53272,  1307],
       [ 1077,  4344]], dtype=int64)

. 오차 행렬¶

Cap%202018-09-06%2013-22-31-984.jpg

. 정밀도(precision) = TP/(TP+FP)¶

. 재현율(recall) = TP/(TP+FN)¶

.. 민감도(sensitivity) 혹은 진짜 양성 비율(true positive rate; TPR)¶

. sklearn.metrics.precision_score : 정밀도 반환¶

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html

. sklearn.metrics.recall_score : 재현율 반환¶

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html

precision_score(y_train_5, y_train_pred)

0.7687135020350381

recall_score(y_train_5, y_train_pred)

0.801328168234643

. sklearn.metrics.f1_score : F1 점수 반환¶

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

. $F_1$ 점수=정밀도와 재현율의 조화 평균=$2\times\frac{정밀도\times재현율}{정밀도+재현율}$¶

.. 실수 $a_1,a_2,\ldots,a_n$의 조화 평균 $H=\frac{n}{1/a_1 +1/a_2+\cdots+1/a_n}$¶

f1_score(y_train_5, y_train_pred)

0.7846820809248555

never_5_pred = cross_val_predict(never_5_clf, X_train, y_train_5, cv=3)
confusion_matrix(y_train_5, never_5_pred)

array([[54579,     0],
       [ 5421,     0]], dtype=int64)

# print("정밀도=",precision_score(y_train_5, never_5_pred))
print("재현율=",recall_score(y_train_5, never_5_pred))
print("F1 스코어=",f1_score(y_train_5, never_5_pred))

재현율= 0.0
F1 스코어= 0.0

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1143: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)

3.3.4 정밀도와 재현율의 트레이드오프¶

. 결정 함수 (decision function)를 이용¶

. SGDClassifier는 이 값이 임계값보다 크면 P 클래스, 작으면 N 클래스로 분류¶

Cap%202018-09-06%2019-53-13-879.jpg

. 결정 임계값을 높이면 정밀도는 증가, 재현율은 감소¶

. 결정 임계값을 줄이면 정밀도는 감소, 재현율은 증가¶

. 적절한 임계값은?¶

# 결정 함수 반환
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
                             method="decision_function")

y_scores

array([ -434076.49813641, -1825667.15281624,  -767086.76186905, ...,
        -867191.25267994,  -565357.11420164,  -366599.16018198])

. sklearn.metrics.precision_recall_curve : 임계값에 따른 정밀도와 재현율의 변화 그림¶

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
thresholds.shape

(59698,)

np.unique(y_scores)

array([-2922518.84137436, -2820548.13505244, -2637608.46868595, ...,
         954170.04762499,   987947.69103863,  1002862.09359785])

thresholds.shape, y_scores.shape

((59698,), (60000,))

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold on the current axes.

    The last element of ``precisions`` and ``recalls`` is dropped before
    plotting so that both series are sliced the same way against ``thresholds``.
    """
    # (series, line format, legend label): precision dashed blue, recall solid green
    curves = (
        (precisions, "b--", "정밀도"),
        (recalls, "g-", "재현율"),
    )
    for series, line_format, legend_text in curves:
        plt.plot(thresholds, series[:-1], line_format, label=legend_text, linewidth=2)
    plt.xlabel("임계값", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.xlim([-700000, 700000])
save_fig("precision_recall_vs_threshold_plot")
plt.show()

def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y axis) against recall (x axis) on the current axes."""
    axis_label_style = {"fontsize": 16}
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("재현율", **axis_label_style)
    plt.ylabel("정밀도", **axis_label_style)
    # Both axes are limited to [0, 1]
    x_min, x_max, y_min, y_max = 0, 1, 0, 1
    plt.axis([x_min, x_max, y_min, y_max])

plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)
save_fig("precision_vs_recall_plot")
plt.show()

# Predict "5" only when the decision score exceeds a hand-picked threshold of 70000.
# NOTE(review): despite the "_90" suffix, the outputs recorded for the next two
# cells show about 86.6% precision and 69.9% recall at this threshold, not 90%.
y_train_pred_90 = (y_scores > 70000)

precision_score(y_train_5, y_train_pred_90)

0.8659205116491548

recall_score(y_train_5, y_train_pred_90)

0.6993174691016417

3.3.5 ROC (receiver operating characteristic) 곡선¶

. 민감도 vs. 1-특이도¶

.. 특이도 (specificity) : 진짜 음성 비율, 음성으로 정확하게 분류된 음성 샘플의 비율 (TNR)¶

.. 1-특이도 : 양성으로 잘못 분류된 음성 샘플의 비율(FPR)¶

. sklearn.metrics.roc_curve : ROC 그림¶

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    """Plot an ROC curve (TPR against FPR) plus the diagonal reference line.

    ``label`` is passed through to the curve so a later ``plt.legend`` call can
    show it.
    """
    diagonal = [0, 1]
    plt.plot(fpr, tpr, label=label, linewidth=2)
    # Dashed black diagonal: ROC curve of a purely random classifier
    plt.plot(diagonal, diagonal, 'k--')
    plt.axis([0, 1, 0, 1])
    axis_labels = (
        (plt.xlabel, '거짓 양성 비율'),
        (plt.ylabel, '진짜 양성 비율'),
    )
    for set_axis_label, text in axis_labels:
        set_axis_label(text, fontsize=16)

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
save_fig("roc_curve_plot")
plt.show()

. AUC (area under the curve)¶

.. perfect : 1, random : 0.5¶

. sklearn.metrics.roc_auc_score : AUC 계산¶

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html

roc_auc_score(y_train_5, y_scores)

0.9624496555967156

. 랜덤 포레스트 결과와 비교¶

.. decision_function 메서드 대신 predict_proba 메서드 사용¶

# n_estimators is fixed at 10, the value shown in the fitted model's recorded
# repr, so the results do not depend on the library default (which changed to
# 100 in later scikit-learn releases).
forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)

y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,
                                    method="predict_proba")
y_probas_forest

array([[1. , 0. ],
       [0.9, 0.1],
       [1. , 0. ],
       ...,
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ]])

y_scores_forest = y_probas_forest[:, 1] # score = probability of the positive class (second column)
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "랜덤 포레스트")
plt.legend(loc="lower right", fontsize=16)
save_fig("roc_curve_comparison_plot")
plt.show()

3.4 다중 분류¶

. 일대다 (one-versus-all, one-versus-the rest; OvA) 전략 vs. 일대일 (one-versus-one; OvO) 전략¶

.. OvA : 숫자별 이진 분류기를 훈련시켜 클래스가 10개인 숫자 이미지를 분류¶

... 이진 분류기 10개를 훈련¶

.. OvO : 숫자의 조합마다 이진 분류기를 훈련시켜 클래스가 10개인 숫자 이미지를 분류¶

... 0과 1을 구별, 0과 2를 구별 등과 같이 각 숫자의 조합마다 이진 분류기 45(=$\binom{10}{2}$)개를 훈련¶

... 전체 훈련 세트 중 구별할 두 클래스에 해당하는 샘플만 필요¶

. 다중 분류기 (multiclass classifier) : 둘 이상의 클래스로 구별¶

.. 랜덤 포레스트 분류기, 나이브 베이즈 분류기¶

. OvA 전략 (default)¶

. 10개의 이진 분류기를 훈련시키고 결정 점수가 가장 높은 클래스를 선택¶

sgd_clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

sgd_clf.predict([some_digit])

array([5.])

. OvA 전략 결과 직접 확인하기¶

. numpy.argmax : 최댓값의 인덱스 반환¶

https://docs.scipy.org/doc/numpy/reference/generated/numpy.argmax.html

some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores

array([[-311402.62954431, -363517.28355739, -446449.5306454 ,
        -183226.61023518, -414337.15339485,  161855.74572176,
        -452576.39616343, -471957.14962573, -518542.33997148,
        -536774.63961222]])

np.argmax(some_digit_scores)

5

sgd_clf.classes_

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

sgd_clf.classes_[5]

5.0

. OvO 전략¶

. sklearn.multiclass.OneVsOneClassifier¶

http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsOneClassifier.html

ovo_clf = OneVsOneClassifier(sgd_clf)
ovo_clf.fit(X_train, y_train)

OneVsOneClassifier(estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False),
          n_jobs=None)

ovo_clf.predict([some_digit])

array([5.])

ovo_clf.decision_function([some_digit])

array([[ 7.24259379,  3.89923485,  5.03865373,  8.31502003,  2.91600505,
         9.5       , -0.47623459,  3.98901933,  1.7598166 ,  2.81589119]])

len(ovo_clf.estimators_)

45

# Query every pairwise (one-vs-one) estimator for the sample digit.
# Decision scores are kept in a float array so their fractional part is
# preserved, and the number of estimators is read from the fitted model
# instead of being hard-coded.
n_pairs = len(ovo_clf.estimators_)
df = np.zeros(n_pairs, dtype=np.float64)  # decision score of each pairwise estimator
cl = np.zeros(n_pairs, dtype=int)         # predicted label of each pairwise estimator

for i, pair_clf in enumerate(ovo_clf.estimators_):
    # Each call returns a length-1 array for the single sample; take its scalar.
    df[i] = pair_clf.decision_function([some_digit])[0]
    cl[i] = pair_clf.predict([some_digit])[0]
np.c_[df, cl][:5]

array([[ -249385,        0],
       [ -265538,        0],
       [  408345,        1],
       [-1598651,        0],
       [  998648,        1]])

np.array([some_digit]).dot(ovo_clf.estimators_[0].coef_.T)+ovo_clf.estimators_[0].intercept_

array([[-249385.66078875]])

. 분류 결과 (도수)¶

.. 0 1 2 3 4 5 6 7 8 9 합계¶

.. 7 4 5 8 3 9 0 4 2 3 45¶

. 다중 분류기 : 랜덤 포레스트 분류기¶

forest_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

forest_clf.predict([some_digit])

array([5.])

forest_clf.predict_proba([some_digit])

array([[0.1, 0. , 0. , 0.1, 0. , 0.8, 0. , 0. , 0. , 0. ]])

. "5"일 가능성이 80%로 가장 높음. "0"과 "3"이 각 10%의 확률로 가능¶

. 분류기 (OvA, OvO, 랜덤 포레스트) 평가 : 교차 검증¶

. 랜덤 분류기 : 약 10%의 정확도 기대¶

cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

array([0.84063187, 0.84899245, 0.86652998])

cross_val_score(ovo_clf, X_train, y_train, cv=3, scoring="accuracy")

array([0.90796841, 0.91494575, 0.91383708])

cross_val_score(forest_clf, X_train, y_train, cv=3, scoring="accuracy")

array([0.94041192, 0.93879694, 0.93949092])

cross_val_score(never_5_clf, X_train, y_train, cv=3, scoring="accuracy")

array([0.10035, 0.097  , 0.0988 ])

. 스케일 조정으로 OvA 전략의 정확도 높임 : 약 91%까지¶

scaler = StandardScaler()
# uint8 : unsigned integer
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

array([0.91011798, 0.90874544, 0.906636  ])

3.6 다중 레이블 분류¶

. 여러 개의 이진 레이블을 출력하는 분류기 (multilabel classifier)¶

. eg, 얼굴 인식 시스템 : 사진에 어떤 인물들이 있는지 없는지를 구별¶

. sklearn.neighbors.KNeighborsClassifier : k-최근접 이웃 분류¶

http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

y_train_large = (y_train >= 7) # 7 이상 여부
y_train_odd = (y_train % 2 == 1) # 홀수 여부
y_multilabel = np.c_[y_train_large, y_train_odd]
y_multilabel.shape

(60000, 2)

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

knn_clf.predict([some_digit])

array([[False,  True]])

. 평가 : 각 레이블의 $F_1$ 점수의 평균¶

. 레이블에 클래스의 지지도 (support 즉 target 레이블에 속하는 샘플 수)를 가중치로 줄 수 있음. average = "weighted"라고 설정¶

경고: 다음 셀은 실행하는데 매우 오래 걸립니다.¶

y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3, n_jobs=-1)
f1_score(y_multilabel, y_train_knn_pred, average="macro")

0.97709078477525

3.7 다중 출력 분류¶

. 다중 레이블 분류에서 한 레이블이 다중 클래스가 될 수 있도록 일반화 한 것¶

. 이미지에서 노이즈를 제거하는 시스템 작성¶

.. 분류기의 출력이 다중 레이블 (픽셀당 한 레이블 784개)이고 각 레이블은 0~255까지의 값을 가진 다중 클래스(256개) $\Rightarrow$ 다중 레이블 다중 클래스 분류¶

. np.random.randint : 정수 난수 발생¶

https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.randint.html

# Build a denoising task: the inputs are the digit images with random integer
# noise added to each of the 784 pixels; the targets are the original images.
noise = np.random.randint(0, 100, (len(X_train), 784))  # integers drawn from [0, 100)
X_train_mod = X_train + noise
noise = np.random.randint(0, 100, (len(X_test), 784))  # fresh noise for the test set
X_test_mod = X_test + noise
y_train_mod = X_train  # target = clean training image
y_test_mod = X_test  # target = clean test image

def plot_digit(data):
    """Show a flattened 784-value digit as a 28x28 image with the axes hidden."""
    side = 28
    pixels = data.reshape(side, side)
    display_options = {
        "cmap": matplotlib.cm.binary,
        "interpolation": "nearest",
    }
    plt.imshow(pixels, **display_options)
    plt.axis("off")

some_index = 5500
plt.subplot(121); plot_digit(X_test_mod[some_index])
plt.subplot(122); plot_digit(y_test_mod[some_index])
save_fig("noisy_digit_example_plot")
plt.show()

. 노이즈를 제거한 깨끗한 이미지 만들기¶

knn_clf.fit(X_train_mod, y_train_mod)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

clean_digit = knn_clf.predict([X_test_mod[some_index]])
plot_digit(clean_digit)