-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathEvaluate.py
More file actions
320 lines (264 loc) · 12.9 KB
/
Evaluate.py
File metadata and controls
320 lines (264 loc) · 12.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
#!/usr/bin/python3
"""
Module for evaluating machine learning models.
Arguments
----------
--model
Path to the trained models. Note: This is not the full path (ending in .pth)
but the folder path + model name. For example "folder/modelname" loads the
models "folder/modelname-0.pth", "folder/modelname-1.pth" etc.
--source
Folder path where the input JSON files are located.
May also end in .json when only a single JSON is to be used for evaluation.
Example
----------
$ python Evaluate.py --model models/final-model --source input.json
"""
import logging
import argparse
import libcst as cst
from Predict import load_model, predict, predict_single
from util.FixType import FixType
import util.IOProcessor as IOProcessor
from os import listdir
import json
# Command-line interface: both options are mandatory.
parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True, help="Path of trained model.")
parser.add_argument("--source", required=True, help="Folder path of all test files.")
def evaluate(model, test_files):
    """
    Run the full evaluation pipeline and print the results on the console.

    The models are loaded, predictions are computed for the given samples,
    and the predictions are then scored per model (evaluate_individual)
    and for the combined task (evaluate_overall).

    Parameters
    ----------
    model : str
        Path prefix of the set of models, as passed to load_model().
    test_files : str
        Path to a JSON file with samples, or to a folder containing
        several such JSON files.

    Returns
    -------
    None.
    """
    print("Running predictions.")
    loaded_models = load_model(model)
    results = predict(loaded_models, test_files)

    # Optional: persist the raw predictions for later inspection.
    # write_predictions("evaluate_out.json", results)

    evaluate_individual(results, test_files, loaded_models)
    evaluate_overall(results)
def evaluate_individual(predictions, test_files, models):
    """
    Evaluates each of the given models individually.
    Results are printed on the console.

    Three groups of numbers are reported:

    * Fix Location accuracy, bucketed by the *correct* fix type.
    * Fix Type accuracy, bucketed by the *predicted* fix type.
    * Fix Token accuracy for the insert model (models[2]) and the modify
      model (models[3]). For this, every sample is re-read from
      ``test_files`` and run through the matching token model directly,
      regardless of the fix type the pipeline predicted.

    Predictions whose ``predicted_location`` / ``predicted_type`` is None
    count as failed: they are excluded from the denominator of the
    respective overall accuracy. Any accuracy whose denominator is zero
    is reported as 0 instead of raising ZeroDivisionError.

    Parameters
    ----------
    predictions : list of Dict
        List of outputs of the predict() function from the Predict module.
    test_files : str
        File path to the JSON with the set of samples.
        May also be a folder path to a folder containing multiple JSONs.
    models : list of RNN
        List of four RNN models to be used for predictions.

    Returns
    -------
    None.
    """
    # Local import: the module itself only imports ``listdir`` from ``os``.
    import os

    def ratio(correct, total):
        # An empty denominator yields 0 (the convention the per-class
        # report already used) rather than a ZeroDivisionError.
        return correct / total if total else 0

    def report(label, correct, total, failed):
        # Per-class accuracy followed by the overall accuracy, which
        # ignores failed predictions in its denominator.
        for i in range(3):
            accuracy = ratio(correct[i], total[i])
            print(f"{label} accuracy for class {FixType(i).name}: {accuracy * 100} %")
        accuracy = ratio(sum(correct), len(predictions) - failed)
        print(f"{label} accuracy overall is {accuracy * 100} %")

    print("\nAccuracy for individual models\n")

    # Fix Location
    correct_predictions = [0, 0, 0]
    total_predictions = [0, 0, 0]
    num_failed_predictions = 0
    for prediction in predictions:
        class_index = FixType[prediction["correct_data"]["correct_type"]].value
        if prediction["correct_data"]["correct_location"] == prediction["predicted_location"]:
            correct_predictions[class_index] += 1
        if prediction["predicted_location"] is None:
            num_failed_predictions += 1
        total_predictions[class_index] += 1
    report("Fix Location", correct_predictions, total_predictions, num_failed_predictions)

    # Fix type
    correct_predictions = [0, 0, 0]
    total_predictions = [0, 0, 0]
    num_failed_predictions = 0
    for prediction in predictions:
        predicted_type = prediction["predicted_type"]
        if predicted_type is None:
            # A failed prediction has no class to be bucketed into;
            # FixType[None] would raise KeyError.
            num_failed_predictions += 1
            continue
        class_index = FixType[predicted_type].value
        if prediction["correct_data"]["correct_type"] == predicted_type:
            correct_predictions[class_index] += 1
        total_predictions[class_index] += 1
    report("Fix Type", correct_predictions, total_predictions, num_failed_predictions)

    # We repeat the predictions to evaluate the insert and modify models
    # individually, regardless of the predicted fix type.
    raw_samples = []
    if test_files.endswith(".json"):  # Single JSON file
        with open(test_files) as file:
            logging.info("Source ending in .json. Predicting on single JSON file.")
            raw_samples = json.load(file)
    else:  # Folder path
        for filename in listdir(test_files):
            # os.path.join also works when the folder path has no
            # trailing separator.
            with open(os.path.join(test_files, filename)) as file:
                raw_samples.extend(json.load(file))

    # Fix type -> index of the token model; the same index is handed to
    # IOProcessor.postprocess.
    token_model_index = {"insert": 2, "modify": 3}
    correct_tokens = dict.fromkeys(token_model_index, 0)
    total_tokens = dict.fromkeys(token_model_index, 0)
    for sample in raw_samples:
        fix_type = sample["metadata"]["fix_type"]
        model_index = token_model_index.get(fix_type)
        if model_index is None:
            continue  # Other fix types have no token model to evaluate
        actual_sample, _tokens = IOProcessor.preprocess(sample["wrong_code"])
        pred = predict_single(actual_sample, models[model_index])
        token = IOProcessor.postprocess(pred, model_index)
        expected_token = sample["metadata"]["fix_token"]
        if token == expected_token:  # Correct prediction
            correct_tokens[fix_type] += 1
        else:  # Incorrect prediction
            logging.debug("Wrong %s token: predicted %r, expected %r",
                          fix_type, token, expected_token)
        total_tokens[fix_type] += 1

    for fix_type in token_model_index:
        accuracy = ratio(correct_tokens[fix_type], total_tokens[fix_type])
        print(f"Fix Token accuracy for {fix_type} is {accuracy * 100} %")

    # The following code may be used to create a swarm plot of the erroneous predictions for fix locations
    # This does, however, require the installation of the pandas, seaborn, and matplotlib libraries.
    # import seaborn as sns
    # import matplotlib.pyplot as plt
    # import pandas as pd
    # location_distance_array = []
    # for prediction in predictions:
    #     actual_sample, tokens = IOProcessor.preprocess(prediction["correct_data"]["wrong_code"])
    #     label = get_token_index(prediction["correct_data"]["wrong_code"], tokens, prediction["correct_data"]["correct_location"])
    #     if prediction["predicted_token_location"] - label == 0:
    #         pass
    #     else:
    #         location_distance_array.append([prediction["predicted_token_location"] - label, prediction["correct_data"]["correct_type"]])
    # df = pd.DataFrame(data=location_distance_array)
    # sns.set_theme(style="whitegrid")
    # f, ax = plt.subplots(figsize=(6, 4))
    # sns.despine(bottom=True, left=True)
    # sns.swarmplot(y=0, x=1, data=df, palette="dark", size=6)
    # ax.set_xlabel('')
    # ax.set_ylabel('')
    # plt.ylim([-15, 16])
    # plt.savefig('line_plot.pdf', bbox_inches='tight', pad_inches=0)
def evaluate_overall(predictions):
    """
    Evaluates the given predictions based on the overall task.
    Results are printed on the console.

    Four metrics are reported, each per predicted fix type and overall:

    * Perfect Prediction: location and type match the correct data and,
      for "insert" / "modify", the token matches as well.
    * Exact Code Match: predicted code equals the correct code.
    * Exact Code Match (Ignore Spaces): as above, after removing all
      space characters from both sides.
    * Correct Syntax Fixes: the predicted code is accepted by
      ``cst.parse_module``.

    Predictions whose ``predicted_type`` is None count as failed. They
    cannot be assigned to a class, so they are skipped and excluded from
    the denominator of the overall figures. Any accuracy whose
    denominator is zero is reported as 0.

    Parameters
    ----------
    predictions : list of Dict
        List of outputs of the predict() function from the Predict module.

    Returns
    -------
    None.
    """
    print("\nAccuracy for the the models combined (full prediction):\n")
    perfect_predictions = [0, 0, 0]
    exact_code_match = [0, 0, 0]
    exact_code_match_ignore_spaces = [0, 0, 0]
    correct_syntax = [0, 0, 0]
    total_predictions = [0, 0, 0]
    num_failed_predictions = 0

    for prediction in predictions:
        predicted_type = prediction["predicted_type"]
        if predicted_type is None:
            # FixType[None] would raise KeyError; count the failure and
            # leave it out of every per-class bucket.
            num_failed_predictions += 1
            continue

        correct_data = prediction["correct_data"]
        predicted_code = prediction["predicted_code"]
        class_index = FixType[predicted_type].value

        # Perfect Prediction
        location_matches = correct_data["correct_location"] == prediction["predicted_location"]
        type_matches = correct_data["correct_type"] == predicted_type
        if location_matches and type_matches:
            if predicted_type in ("insert", "modify"):
                # These fix types additionally have to get the token right.
                token_matches = ("correct_token" in correct_data
                                 and correct_data["correct_token"] == prediction["predicted_token"])
            else:
                token_matches = True  # No token involved for other types
            if token_matches:
                perfect_predictions[class_index] += 1

        # Exact Code Match
        if predicted_code == correct_data["correct_code"]:
            exact_code_match[class_index] += 1

        # Exact Code Match (Ignore Spaces)
        if predicted_code.replace(" ", "") == correct_data["correct_code"].replace(" ", ""):
            exact_code_match_ignore_spaces[class_index] += 1

        # Correct Syntax Fix: only the parse call is guarded, so an error
        # in the bookkeeping cannot be mistaken for a parse failure.
        try:
            cst.parse_module(predicted_code)
        except Exception as e:
            # Happens if parsing fails
            logging.warning("%s occurred: %s", e.__class__.__name__, e)
        else:
            correct_syntax[class_index] += 1

        total_predictions[class_index] += 1

    def ratio(count, total):
        # An empty denominator yields 0 instead of a ZeroDivisionError.
        return count / total if total else 0

    def report(label, counts):
        # Per-class figure (relative to how often the class was
        # predicted), then the overall figure over all non-failed
        # predictions.
        for i in range(3):
            accuracy = ratio(counts[i], total_predictions[i])
            print(f"{label} for class {FixType(i).name}: {accuracy * 100} %")
        accuracy = ratio(sum(counts), len(predictions) - num_failed_predictions)
        print(f"{label} accuracy overall is {accuracy * 100} %")

    report("Perfect Prediction", perfect_predictions)
    report("Exact Code Match", exact_code_match)
    report("Exact Code Match (Ignore Spaces)", exact_code_match_ignore_spaces)
    report("Correct Syntax Fixes", correct_syntax)
if __name__ == "__main__":
    cli_args = parser.parse_args()
    # Set log level: only errors are emitted by the root logger.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.ERROR)
    evaluate(cli_args.model, cli_args.source)