关键词搜索

源码搜索 ×
×

机器学习第3章分类

发布2021-01-10浏览394次

详情内容

机器学习实战:基于Scikit-Learn和TensorFlow的笔记

参考:作者的Jupyter Notebook
Chapter 3 – Classification

获取MNIST数据集的代码:

def sort_by_target(mnist):
    """Sort the MNIST train and test partitions by label, in place.

    The first 60000 rows of ``mnist.data`` / ``mnist.target`` and the rows
    from index 60000 onward are reordered independently so that the labels
    are ascending inside each partition.  Rows that share a label keep their
    original relative order.  Nothing is returned.
    """
    split = 60000
    # A stable argsort yields the same permutation as sorting (label, index)
    # pairs: ties on the label are broken by the original position.
    train_order = np.argsort(mnist.target[:split], kind="stable")
    test_order = np.argsort(mnist.target[split:], kind="stable") + split
    # Both permutations are computed before anything is overwritten, so the
    # two arrays can be rearranged one after the other.
    for field in ("data", "target"):
        values = getattr(mnist, field)
        values[:split] = values[train_order]
        values[split:] = values[test_order]
# The listing uses np / plt / mpl everywhere but never imported them.
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml

# as_frame=False keeps data/target as NumPy arrays.  Recent scikit-learn
# releases return a pandas DataFrame by default for this dataset, which breaks
# the positional slicing and in-place reordering done below and in
# sort_by_target().
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)
mnist.target = mnist.target.astype(np.int8)  # fetch_openml() returns targets as strings
sort_by_target(mnist)  # fetch_openml() returns an unsorted dataset

    查看这些数组

    #print(mnist["data"], mnist["target"])
    #print(mnist.data.shape)
    X, y = mnist["data"], mnist["target"]
    #print(X.shape)
    #print(y.shape)
    
    some_digit = X[36000]
    some_digit_image = some_digit.reshape(28, 28)
    plt.imshow(some_digit_image, cmap = mpl.cm.binary,
            interpolation="nearest")
    plt.axis("off")
    #plt.show()
    #print(y[36000])
    
      12
    • 13

    MNIST数据集中的部分数字图像

    def plot_digits(instances, images_per_row=10, **options):
        """Draw a grid of 28x28 digit images on the current matplotlib axes.

        instances      -- sequence of arrays, each reshaped to 28x28.
        images_per_row -- maximum number of images per grid row; capped at
                          the number of instances.
        **options      -- forwarded unchanged to ``plt.imshow``.
        """
        side = 28
        per_row = min(len(instances), images_per_row)
        tiles = [inst.reshape(side, side) for inst in instances]
        row_count = (len(instances) - 1) // per_row + 1
        # One blank strip, as wide as the missing tiles, pads the last row so
        # every row has the same width (the strip has zero width when the
        # grid is already full and is then never picked up by a row slice).
        missing = row_count * per_row - len(instances)
        tiles.append(np.zeros((side, side * missing)))
        grid_rows = [
            np.concatenate(tiles[start:start + per_row], axis=1)
            for start in range(0, row_count * per_row, per_row)
        ]
        grid = np.concatenate(grid_rows, axis=0)
        plt.imshow(grid, cmap=mpl.cm.binary, **options)
        plt.axis("off")
    
    plt.figure(figsize=(9,9))
    example_images = np.r_[X[:12000:600], X[13000:30600:600], X[30600:60000:590]]
    plot_digits(example_images, images_per_row=10)
    #save_fig("more_digits_plot")
    #plt.show()
    
      12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20

    给数据集洗牌

    X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
    shuffle_index = np.random.permutation(60000)
    X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
    
    • 1
    • 2
    • 3

    训练一个二元分类器,为此分类任务创建目标向量:

    y_train_5 = (y_train == 5)  # True for all 5s, False for all other digits.
    y_test_5 = (y_test == 5)
    
    • 1
    • 2

    创建一个SGDClassifier(随机梯度下降分类器)并在整个训练集上进行训练:

    from sklearn.linear_model import SGDClassifier
    # np.inf is the supported spelling; the np.infty alias was removed in NumPy 2.0.
    # NOTE(review): tol=-inf presumably means the stopping criterion is never
    # met, so training runs for max_iter epochs - confirm against the
    # SGDClassifier documentation.
    sgd_clf = SGDClassifier(max_iter=5, tol=-np.inf, random_state=42)   #random_state=42
    sgd_clf.fit(X_train, y_train_5)
    #print(sgd_clf.fit(X_train, y_train_5))
    # The fitted model can now be used to detect images of the digit 5:
    sgd_clf.predict([some_digit])
    #print(sgd_clf.predict([some_digit]))
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7

    交叉验证

    from sklearn.model_selection import cross_val_score
    # Run the 3-fold evaluation once and reuse the scores; the original ran
    # the whole cross-validation twice and threw the first result away.
    sgd_cv_scores = cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
    print(sgd_cv_scores)

    # The loop below does roughly the same thing as cross_val_score() above
    # and prints the same results:
    from sklearn.model_selection import StratifiedKFold
    from sklearn.base import clone
    # random_state only has an effect together with shuffle=True; passing it
    # without shuffling raises ValueError on recent scikit-learn releases.
    # Leaving it out keeps the folds unshuffled, matching cross_val_score(cv=3).
    skfolds = StratifiedKFold(n_splits=3)
    for train_index, test_index in skfolds.split(X_train, y_train_5):
        # Train a fresh, unfitted copy of the classifier on each fold.
        clone_clf = clone(sgd_clf)
        X_train_folds = X_train[train_index]
        y_train_folds = y_train_5[train_index]
        X_test_fold = X_train[test_index]
        y_test_fold = y_train_5[test_index]

        clone_clf.fit(X_train_folds, y_train_folds)
        y_pred = clone_clf.predict(X_test_fold)
        # Fraction of correct predictions on the held-out fold (accuracy).
        n_correct = np.sum(y_pred == y_test_fold)
        print(n_correct / len(y_pred))
    
      12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19

    一个蠢笨的分类器(不是我说的),它将每张图都分类成“非5”:

    from sklearn.base import BaseEstimator
    class Never5Classifier(BaseEstimator):
        """Baseline classifier that predicts "not 5" for every input."""

        def fit(self, X, y=None):
            """Learn nothing and return ``self``.

            Returning the estimator follows the scikit-learn ``fit``
            convention, so the object can be chained or placed in a pipeline.
            """
            return self

        def predict(self, X):
            """Return an all-False boolean array of shape ``(len(X), 1)``."""
            return np.zeros((len(X), 1), dtype=bool)
    #准确度
    never_5_clf = Never5Classifier()
    print(cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy"))
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9

    混淆矩阵:评估分类器性能的更好方法是混淆矩阵。

    from sklearn.model_selection import cross_val_predict
    y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
    #cross_val_predict()函数同样执行K-fold交叉验证,但返回的不是评估分数,而是每个折叠的预测。这意味着对于每个实例都可以得到一个干净的预测
    from sklearn.metrics import confusion_matrix
    #confusion_matrix(y_train_5, y_train_pred)
    #print(confusion_matrix(y_train_5, y_train_pred))
    y_train_perfect_predictions = y_train_5
    #print(confusion_matrix(y_train_5, y_train_perfect_predictions))
    精度和召回率
    
    #精度=TP/(TP+FP):TP是真正类的数量,FP是假正类的数量。
    #召回率=TP/(TP+FN):FN是假负类的数量。
    from sklearn.metrics import precision_score, recall_score
    print(precision_score(y_train_5, y_train_pred))  #精度4344 / (4344 + 1307)
    print(recall_score(y_train_5, y_train_pred))  #召回率4344 / (4344 + 1077)
    
    #F1分数:F1=2/(1/精度+1/召回率)=TP/(TP+(FN+FP)/2)
    from sklearn.metrics import f1_score
    print(f1_score(y_train_5, y_train_pred))
    精度/召回率权衡:阈值
    
    y_scores = sgd_clf.decision_function([some_digit])
    print(y_scores)
    threshold = 0
    y_some_digit_pred = (y_scores > threshold)
    print(y_some_digit_pred)
    #提高阈值
    threshold = 200000
    y_some_digit_pred_a = (y_scores > threshold)
    print(y_some_digit_pred_a)
    决定使用什么阈值
    
    #获取训练集中所有实例的分数
    y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
    #计算所有可能的阈值的精度和召回率
    from sklearn.metrics import precision_recall_curve
    precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
    #使用Matplotlib绘制精度和召回率相对于阈值的函数图
    def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
        """Plot precision and recall as functions of the decision threshold.

        The last element of ``precisions`` and ``recalls`` is dropped before
        plotting against ``thresholds``.
        NOTE(review): this assumes both arrays are one element longer than
        ``thresholds``, as returned by precision_recall_curve - confirm.
        """
        curves = (
            (precisions, "b--", "Precision"),
            (recalls, "g-", "Recall"),
        )
        for values, line_style, name in curves:
            plt.plot(thresholds, values[:-1], line_style, label=name)
        plt.xlabel("Threshold")
        plt.legend(loc="upper left")
        plt.ylim([0, 1])
    plt.figure(figsize=(8, 4))
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    plt.xlim([-700000, 700000])
    plt.show()
    #print((y_train_pred == (y_scores > 0)).all())
    y_train_pred_90 = (y_scores > 70000)
    from sklearn.metrics import precision_score, recall_score
    print(precision_score(y_train_5, y_train_pred_90)) #精度
    print(recall_score(y_train_5, y_train_pred_90)) #召回率
    精度和召回率的函数图PR
    
    def plot_precision_vs_recall(precisions, recalls):
        """Plot precision (y axis) against recall (x axis) on the current figure."""
        label_size = 16
        plt.plot(recalls, precisions, "b-", linewidth=2)
        for set_label, text in ((plt.xlabel, "Recall"), (plt.ylabel, "Precision")):
            set_label(text, fontsize=label_size)
        # Fix both axes to the [0, 1] range.
        plt.axis([0, 1, 0, 1])
    
    plt.figure(figsize=(8, 6))
    plot_precision_vs_recall(precisions, recalls)
    plt.show()
    ROC曲线(受试者工作特征曲线):真正类率和假正类率
    
    from sklearn.metrics import roc_curve
    fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
    def plot_roc_curve(fpr, tpr, label=None):
        """Plot a ROC curve (``tpr`` against ``fpr``) with a diagonal reference line.

        ``label`` is passed to ``plt.plot`` so the curve can appear in a legend.
        """
        unit = [0, 1]
        plt.plot(fpr, tpr, linewidth=2, label=label)
        plt.plot(unit, unit, 'k--')  # dashed diagonal from (0, 0) to (1, 1)
        plt.axis(unit + unit)  # x and y both limited to [0, 1]
        plt.xlabel('False Positive Rate', fontsize=16)
        plt.ylabel('True Positive Rate', fontsize=16)
    # A stray, unterminated ''' used to sit here and turned the rest of the
    # listing into an unclosed string literal; it has been removed.
    plt.figure(figsize=(8, 6))
    plot_roc_curve(fpr, tpr)
    plt.show()
    from sklearn.metrics import roc_auc_score
    # Area under the ROC curve for the SGD classifier's decision scores.
    print(roc_auc_score(y_train_5, y_scores))
    训练一个RandomForestClassifier分类器,并比较它和SGDClassifier分类器的ROC曲线和ROC AUC分数。
    
    from sklearn.ensemble import RandomForestClassifier
    forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
    y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")
    y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
    fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
    plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
    plt.legend(loc="lower right", fontsize=16)
    plt.show()
    from sklearn.metrics import roc_auc_score
    print(roc_auc_score(y_train_5, y_scores_forest))
    多类别分类器,用SGDClassifier试试
    
    #用SGDClassifier试试:
    sgd_clf.fit(X_train, y_train)
    sgd_clf.predict([some_digit])
    #print(sgd_clf.predict([some_digit]))
    some_digit_scores = sgd_clf.decision_function([some_digit])
    #print(some_digit_scores)
    #print(np.argmax(some_digit_scores))
    #print(sgd_clf.classes_)
    #print(sgd_clf.classes_[5])
    
    # The code below uses the OvO (one-vs-one) strategy to build a multiclass
    # classifier on top of SGDClassifier:
    from sklearn.multiclass import OneVsOneClassifier
    # np.inf is the supported spelling; the np.infty alias was removed in NumPy 2.0.
    ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, tol=-np.inf, random_state=42))
    ovo_clf.fit(X_train, y_train)
    ovo_clf.predict([some_digit])
    len(ovo_clf.estimators_)
    #print(ovo_clf.predict([some_digit]))
    #print(len(ovo_clf.estimators_))
    训练RandomForestClassifier
    
    from sklearn.model_selection import cross_val_score
    forest_clf.fit(X_train, y_train)
    #print(forest_clf.predict([some_digit]))
    #print(forest_clf.predict_proba([some_digit]))  #概率列表
    #print(cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy"))  #准确率
    #将输入进行简单缩放
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
    #print(cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy"))
    使用Matplotlib的matshow()函数来查看混淆矩阵
    
    y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)
    #print(conf_mx)
    #使用Matplotlib的matshow()函数来查看混淆矩阵的图像表示
    #plt.matshow(conf_mx, cmap=plt.cm.gray)
    #save_fig("confusion_matrix_plot", tight_layout=False)
    你需要将混淆矩阵中的每个值除以相应类别中的图片数量,这样你比较的就是错误率而不是错误的绝对值
    
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums
    #用0填充对角线,只保留错误,重新绘制结果:
    np.fill_diagonal(norm_conf_mx, 0)
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
    #save_fig("confusion_matrix_errors_plot", tight_layout=False)
    看看数字3和数字5的例子:
    
    cl_a, cl_b = 3, 5
    X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
    X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
    X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
    X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]
    
    plt.figure(figsize=(8,8))
    plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
    plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
    plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
    plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
    #save_fig("error_analysis_digits_plot")
    多标签分类
    
    #这段代码会创建一个y_multilabel数组,其中包含两个数字图片的目标标签:第一个表示数字是否是大数(7、8、9),第二个表示是否为奇数。
    from sklearn.neighbors import KNeighborsClassifier
    y_train_large = (y_train >= 7)
    y_train_odd = (y_train % 2 == 1)
    y_multilabel = np.c_[y_train_large, y_train_odd]
    
    knn_clf = KNeighborsClassifier()
    knn_clf.fit(X_train, y_multilabel)
    #print(knn_clf.fit(X_train, y_multilabel))
    #下一行创建一个KNeighborsClassifier实例(它支持多标签分类,不是所有的分类器都支持),然后使用多个目标数组对它进行
    #训练。现在用它做一个预测,注意它输出的两个标签:
    knn_clf.predict([some_digit])    #数字5确实不大(False),为奇数(True)。
    #print(knn_clf.predict([some_digit]))
    下面这段代码计算所有标签的平均F1分数:
    
    from sklearn.metrics import f1_score
    y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3, n_jobs=-1)
    f1_score(y_multilabel, y_train_knn_pred, average="macro")
    #print(f1_score(y_multilabel, y_train_knn_pred, average="macro"))
    多输出分类(多输出-多类别分类)
    
    #还先从创建训练集和测试集开始,使用NumPy的randint()函数
    #为MNIST图片的像素强度增加噪声。目标是将图片还原为原始图片:
    noise = np.random.randint(0, 100, (len(X_train), 784))
    X_train_mod = X_train + noise
    noise = np.random.randint(0, 100, (len(X_test), 784))
    X_test_mod = X_test + noise
    y_train_mod = X_train
    y_test_mod = X_test
    
    some_index = 5500
    #plt.subplot(121); plot_digit(X_test_mod[some_index])
    #plt.subplot(122); plot_digit(y_test_mod[some_index])
    #save_fig("noisy_digit_example_plot")
    清洗这张图片:
    
    knn_clf.fit(X_train_mod, y_train_mod)
    clean_digit = knn_clf.predict([X_test_mod[some_index]])
    # plot_digit() and save_fig() are not defined anywhere in this listing, so
    # the predicted image is drawn directly, in the same way as the some_digit
    # plot earlier, and the save call is left commented out like the other
    # save_fig lines.
    plt.imshow(clean_digit.reshape(28, 28), cmap = mpl.cm.binary,
            interpolation="nearest")
    plt.axis("off")
    #save_fig("cleaned_digit_example_plot")
    plt.show()
    
      12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69
    • 70
    • 71
    • 72
    • 73
    • 74
    • 75
    • 76
    • 77
    • 78
    • 79
    • 80
    • 81
    • 82
    • 83
    • 84
    • 85
    • 86
    • 87
    • 88
    • 89
    • 90
    • 91
    • 92
    • 93
    • 94
    • 95
    • 96
    • 97
    • 98
    • 99
    • 100
    • 101
    • 102
    • 103
    • 104
    • 105
    • 106
    • 107
    • 108
    • 109
    • 110
    • 111
    • 112
    • 113
    • 114
    • 115
    • 116
    • 117
    • 118
    • 119
    • 120
    • 121
    • 122
    • 123
    • 124
    • 125
    • 126
    • 127
    • 128
    • 129
    • 130
    • 131
    • 132
    • 133
    • 134
    • 135
    • 136
    • 137
    • 138
    • 139
    • 140
    • 141
    • 142
    • 143
    • 144
    • 145
    • 146
    • 147
    • 148
    • 149
    • 150
    • 151
    • 152
    • 153
    • 154
    • 155
    • 156
    • 157
    • 158
    • 159
    • 160
    • 161
    • 162
    • 163
    • 164
    • 165
    • 166
    • 167
    • 168
    • 169
    • 170
    • 171
    • 172
    • 173
    • 174
    • 175
    • 176
    • 177
    • 178
    • 179
    • 180
    • 181
    • 182
    • 183
    • 184
    • 185
    • 186
    • 187
    • 188
    • 189
    • 190
    • 191
    • 192
    • 193
    • 194
    • 195
    • 196
    • 197
    • 198
    • 199

    相关技术文章

    点击QQ咨询
    开通会员
    返回顶部
    ×
    微信扫码支付
    微信扫码支付
    确定支付下载
    请使用微信扫描二维码支付
    ×

    提示信息

    ×

    选择支付方式

    • 微信支付
    • 支付宝付款
    确定支付下载