#!/usr/bin/python3
"""
Module for running predictions with trained machine learning models.

Arguments
----------
--model
    Path to the trained models. Note: This is not the full path (ending in .pth)
    but the folder path + model name. For example, "folder/modelname" loads the
    models "folder/modelname-0.pth", "folder/modelname-1.pth", etc.
--source
    Folder path where the input JSON files are located.
    May also end in .json when only a single JSON file is to be used for predictions.
--destination
    File path where the prediction output will be saved. Should end in ".json".

Example
----------
$ python .\Predict.py --model models/final-model --source input.json --destination output.json
"""
import argparse
import json
import logging
import os

import torch

import util.IOProcessor as IOProcessor
from util.StringUtils import remove_suffix
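
# Note: util.StringUtils.remove_suffix is a project-local helper. It is assumed
# here to behave like the Python 3.9+ built-in str.removesuffix; a minimal
# sketch of that assumed behavior, for reference only:
#
#     def remove_suffix(text, suffix):
#         return text[:-len(suffix)] if suffix and text.endswith(suffix) else text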
parser = argparse.ArgumentParser()
parser.add_argument(
    '--model', help="Path of the trained models (folder path + model name, without .pth).", required=True)
parser.add_argument(
    '--source', help="Folder path of all test files, or path to a single .json file.", required=True)
parser.add_argument(
    '--destination', help="Path to the output JSON file of extracted predictions.", required=True)

# Select CPU or GPU for Pytorch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu") # Force CPU


def predict(models, test_files):
    """
    Returns a set of predictions on the given set of files using the given models.

    Parameters
    ----------
    models : list of RNN
        List of four RNN models for a full prediction, in the order
        fixlocation, fixtype, fixinsert, fixmodify.
    test_files : str
        Path to the folder containing JSON files with the samples,
        or a path to a single .json file.

    Returns
    -------
    predictions : list of Dict
        List of predictions. Each prediction consists of metadata and
        the predicted fix_location, fix_type and fix_token (if applicable).
    """
    predictions = []
    raw_training_samples = []
    # Load the samples
    if test_files.endswith(".json"):  # Single JSON file
        with open(test_files) as file:
            logging.info("Source ending in .json. Predicting on a single JSON file.")
            raw_training_samples = json.load(file)
    else:  # Folder path
        for filename in os.listdir(test_files):
            with open(os.path.join(test_files, filename)) as file:
                raw_training_samples.extend(json.load(file))
    # Retrieve the models from the input and move them to the device
    fixlocationmodel = models[0]
    fixtypemodel = models[1]
    fixinsertmodel = models[2]
    fixmodifymodel = models[3]
    fixlocationmodel.to(device)
    fixtypemodel.to(device)
    fixinsertmodel.to(device)
    fixmodifymodel.to(device)
    for sample in raw_training_samples:
        try:
            logging.info(f"Running prediction on {sample['metadata']['file']}.")
            actual_sample, tokens = IOProcessor.preprocess(sample["wrong_code"])
            with torch.no_grad():
                # Predict the fix location and fix type
                predicted_token_location = predict_single(actual_sample, fixlocationmodel)
                predicted_location = IOProcessor.postprocess(
                    predicted_token_location, 0, tokens, sample["wrong_code"])
                try:
                    predicted_token_old = tokens[predicted_token_location]
                except IndexError:  # The predicted index is outside of the token range
                    predicted_token_old = ""
                predicted_type = IOProcessor.postprocess(
                    predict_single(actual_sample, fixtypemodel), 1)
                predicted_token = ""
                # Predict the fix token, if needed
                if predicted_type == "insert":
                    predicted_token = IOProcessor.postprocess(
                        predict_single(actual_sample, fixinsertmodel), 2)
                if predicted_type == "modify":
                    predicted_token = IOProcessor.postprocess(
                        predict_single(actual_sample, fixmodifymodel), 3)
                # Build the list of predictions
                prediction = IOProcessor.buildPredictionJson(
                    sample, predicted_location, predicted_type,
                    predicted_token_location, predicted_token_old, predicted_token)
                predictions.append(prediction)
        except Exception as e:
            logging.warning(f"{e.__class__.__name__} occurred: {e}")
            logging.warning(f"Prediction failed for {sample['metadata']['id']}.")
            logging.warning("Skipping the sample.")
    return predictions
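
# The exact keys of a prediction come from IOProcessor.buildPredictionJson,
# which is not defined in this file. The shape below is inferred from how
# write_predictions consumes the dicts and is illustrative, not authoritative:
#
#     {
#         "metadata": {"file": "...", "id": "..."},
#         "correct_data": {...},          # ground-truth fields, stripped on write
#         "predicted_location": ...,
#         "predicted_type": "...",        # e.g. "insert" or "modify"
#         "predicted_token": "...",
#         "predicted_code": "...",
#     }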


def load_model(source):
    """
    Loads the four models necessary for a prediction located at the specified source path.
    Note: This is not the full path (ending in .pth) but the folder path + model name.
    Usage: load_model("folder/modelname") loads the models "folder/modelname-0.pth",
    "folder/modelname-1.pth" etc.

    Parameters
    ----------
    source : str
        File path leading to the set of models.
        Each model should be at source + "-X.pth", with X being the modeltype index.

    Returns
    -------
    models : list of RNN
        List of four models loaded from disk.
    """
    if source.endswith(".pth"):
        logging.warning("Given file path ended with .pth. Removing the suffix and attempting to load.")
        source = remove_suffix(source, ".pth")
    models = []
    for index in range(4):
        logging.info(f"Loading model {index} from disk.")
        model = torch.load(source + f"-{index}.pth", map_location=device)
        model.to(device)
        model.eval()
        models.append(model)
    return models


def write_predictions(destination, predictions):
    """
    Takes a list of predictions and writes it to a JSON file at the destination.

    Parameters
    ----------
    destination : str
        Destination file path for the JSON output.
    predictions : list of Dict
        List of predictions to be written.

    Returns
    -------
    None.
    """
    logging.info("Writing predictions to disk.")
    cleaned_predictions = []
    for prediction in predictions:
        # Remove the correct-label info from the prediction
        cleaned_prediction = {
            "metadata": {
                "file": prediction["metadata"]["file"],
                "id": prediction["metadata"]["id"],
                # "wrong_code": prediction["correct_data"]["wrong_code"],
                # "correct_code": prediction["correct_data"]["correct_code"],
                # "fix_location": prediction["correct_data"]["correct_location"],
                # "fix_type": prediction["correct_data"]["correct_type"],
            },
            "predicted_location": prediction["predicted_location"],
            "predicted_type": prediction["predicted_type"],
        }
        if prediction["predicted_type"] in ("insert", "modify"):
            cleaned_prediction["predicted_token"] = prediction["predicted_token"]
        cleaned_prediction["predicted_code"] = prediction["predicted_code"]
        cleaned_predictions.append(cleaned_prediction)
    # Write the summary JSON
    with open(destination, 'w') as file:
        json.dump(cleaned_predictions, file, indent=2)
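
# Illustrative contents of the written file (all values are hypothetical):
#
#     [
#       {
#         "metadata": {"file": "sample_1.json", "id": "sample_1"},
#         "predicted_location": 42,
#         "predicted_type": "insert",
#         "predicted_token": ":",
#         "predicted_code": "..."
#       }
#     ]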


def predict_single(actual_sample, model):
    """
    Takes a sample and returns a single prediction with the given model.

    Parameters
    ----------
    actual_sample : list of int
        List of tokens to be used for a prediction.
    model : RNN
        RNN model to be used for the prediction.

    Returns
    -------
    label_id : int
        Label predicted by the ML model.
        Depending on the modeltype, this can be
        - a token index ([0-n_tokens], fixlocation)
        - a fix type ([0-2], fixtype)
        - a fix token ([0-99], fixtoken)
    """
    hidden = model.initHidden()
    # Feed the sample through the RNN one token at a time,
    # carrying the hidden state forward
    for word_index in range(actual_sample.size()[0]):
        output, hidden = model(actual_sample[word_index], hidden)
    # Greedily pick the class with the highest score from the final output
    _, predicted = torch.max(output[0][0], 0)
    label_id = predicted.item()
    return label_id
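
# The model interface assumed by predict_single (an initHidden() method and a
# forward pass taking one token plus the previous hidden state) matches the
# classic PyTorch character-level RNN tutorial. A minimal sketch of that
# assumed interface, for orientation only (layer sizes are hypothetical):
#
#     class RNN(torch.nn.Module):
#         def __init__(self, input_size, hidden_size, output_size):
#             super().__init__()
#             self.hidden_size = hidden_size
#             self.i2h = torch.nn.Linear(input_size + hidden_size, hidden_size)
#             self.i2o = torch.nn.Linear(input_size + hidden_size, output_size)
#
#         def forward(self, input, hidden):
#             combined = torch.cat((input, hidden), 1)
#             return self.i2o(combined), self.i2h(combined)
#
#         def initHidden(self):
#             return torch.zeros(1, self.hidden_size)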


if __name__ == "__main__":
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.INFO)  # Enable info-level logs
    # Load the serialized models
    models = load_model(args.model)
    # Predict the fix location, type and token for each test example
    predictions = predict(models, args.source)
    # Write the predictions to file
    write_predictions(args.destination, predictions)