ML_with_Missing_Data/missing_data.py at master · mramshaw/ML_with_Missing_Data · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/usr/bin/env python

# pylint: disable=C0103,C0325,W0621

"""Quick missing/incomplete data exercise with NumPy and Pandas."""

import matplotlib.pyplot as plt

import numpy

from pandas import read_csv

import seaborn as sb

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


def k_fold_cross_validation(dataset, title):
    """Define the common code for k-fold cross-validation."""
    # split dataset into inputs and outputs
    values = dataset.values
    X = values[:, 0:8]
    y = values[:, 8]

    # evaluate an LDA model on the dataset using k-fold cross validation
    model = LinearDiscriminantAnalysis()
    kfold = KFold(n_splits=3, random_state=7)
    result = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    print(title)
    underline = ""
    for _ in xrange(len(title)):
        underline += "-"
    print(underline)
    print(result.mean())


if __name__ == '__main__':

    dataset = read_csv('pima-indians-diabetes.data.csv', header=None)

    # Show the shape (rows & columns) of the dataset
    print("Rows, columns = " + str(dataset.shape))
    print

    # Show the first 20 rows
    print("The first 20 observations")
    print("-------------------------")
    print(dataset.head(20))
    print

    # count the number of zero values - where zero is an anomaly
    print("Number of zero values")
    print("---------------------")
    print((dataset[[1, 2, 3, 4, 5, 6, 7]] == 0).sum())
    print

    # count the number of NaN values (using isnull) in each column
    print("Number of missing fields (original)")
    print("-----------------------------------")
    print(dataset.isnull().sum())
    print

    # Show the stats of the dataset
    print("Statistics (original)")
    print("---------------------")
    print(dataset.describe())
    print

    # Make a copy of the dataset so we can compare original & replaced
    replaced_dataset = dataset.copy()

    # mark zero values as NaN (missing)
    replaced_dataset[[1, 2, 3, 4, 5]] = \
        replaced_dataset[[1, 2, 3, 4, 5]].replace(0, numpy.NaN)

    print("Number of missing fields (zero fields flagged as NaN)")
    print("-----------------------------------------------------")
    print(replaced_dataset.isnull().sum())
    print

    # Show the stats of the dataset
    print("Statistics (pre-fill)")
    print("---------------------")
    print(replaced_dataset[[1, 2, 3, 4, 5]].describe())
    print

    # Make copies of the dataset so we can compare them
    mean_dataset = replaced_dataset.copy()
    mode_dataset = replaced_dataset.copy()
    median_dataset = replaced_dataset.copy()

    # fill missing values with mean column values
    mean_dataset.fillna(value=replaced_dataset.mean(), inplace=True)

    # count the number of NaN values in each column
    print("Number of missing fields (post-fill)")
    print("------------------------------------")
    print(mean_dataset.isnull().sum())
    print

    # Show the stats of the dataset
    print("Statistics (post-fill)")
    print("----------------------")
    print(mean_dataset[[1, 2, 3, 4, 5]].describe())
    print

    # fill missing values with column mode value
    mode_dataset.fillna(value=replaced_dataset.mode(numeric_only=True).iloc[0],
                        inplace=True)

    # fill missing values with column median value
    median_dataset.fillna(value=replaced_dataset.median(), inplace=True)

    # Use Seaborn to plot before & after graphs for columns 1 - 5
    for i in range(1, 6):
        sb.distplot(dataset[[i]], hist=False, label='Original')
        # Cannot plot datasets with NaN values
        sb.distplot(mode_dataset[[i]], hist=False, label='Mode')
        sb.distplot(median_dataset[[i]], hist=False, label='Median')
        sb.distplot(mean_dataset[[i]], hist=False, label='Mean')
        plt.suptitle('Column ' + str(i))
        plt.show()

    # split dataset into inputs and outputs
    values = replaced_dataset.values
    X = values[:, 0:8]
    y = values[:, 8]

    # evaluate an LDA model on the dataset using k-fold cross validation
    # [We know it is going to fail, so wrap it in a try/catch block.]
    print("Accuracy (with NaN values)")
    print("--------------------------")
    try:
        model = LinearDiscriminantAnalysis()
        kfold = KFold(n_splits=3, random_state=7)
        result = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
        print(result.mean())
    except ValueError as ve:
        print
        print ve
    print

    # ------------------------------
    # drop NaN entries and try again
    # ------------------------------

    # create new dataset with NaN entries removed
    dropped_dataset = replaced_dataset.dropna(inplace=False)

    # summarize the number of rows and columns in the dataset
    print("Rows, columns (NaN values dropped) = " + str(dropped_dataset.shape))
    print

    # Show the stats of the dataset
    print("Statistics (NaN values dropped)")
    print(dropped_dataset[[1, 2, 3, 4, 5]].describe())
    print

    k_fold_cross_validation(dropped_dataset,
                            "Accuracy (with NaN values dropped)")
    print

    # -------------------------------------
    # use mean-filled entries and try again
    # -------------------------------------

    k_fold_cross_validation(mean_dataset, "Accuracy (with NaN values filled)")
    print

    # Use Seaborn to plot before & after graphs for columns 1 - 5
    for i in range(1, 6):
        sb.distplot(dataset[[i]], hist=False, label='Original')
        sb.distplot(dropped_dataset[[i]], hist=False, label='NaN Dropped')
        sb.distplot(mode_dataset[[i]], hist=False, label='NaN Filled (mean)')
        plt.suptitle('Column ' + str(i))
        plt.show()