-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathevaluation.py
More file actions
99 lines (79 loc) · 3.41 KB
/
evaluation.py
File metadata and controls
99 lines (79 loc) · 3.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from AlignScore.src.alignscore import AlignScore
from BARTScore.bart_score import BARTScorer
from transformers import BertForSequenceClassification, BertTokenizer
import argparse
import json
import numpy as np
import evaluate
import torch
def init_parser():
    """Build the command-line argument parser for the evaluation script.

    Returns:
        argparse.ArgumentParser: parser exposing --input_file,
        --output_file, --batch_size, and --alignscore_ckpt options.
    """
    parser = argparse.ArgumentParser()
    # Register all options from one table: (flag, type, default).
    option_table = (
        ('--input_file', str, 'input.json'),
        ('--output_file', str, 'output.json'),
        ('--batch_size', int, 64),
        ('--alignscore_ckpt', str, '/path/to/alignscore/checkpoint'),
    )
    for flag, value_type, default_value in option_table:
        parser.add_argument(flag, type=value_type, default=default_value)
    return parser
def main(args):
    """Run factuality and similarity evaluation over model summaries.

    Reads a JSONL file (one object per line with "predicted", "gold",
    and "source" fields), computes six metrics — AlignScore, FactCC,
    BARTScore, BS-Fact, Rouge-L, BERTScore — prints the results, and
    appends them (preceded by the input file name) to ``args.output_file``.

    Args:
        args: parsed namespace with ``input_file``, ``output_file``,
            ``batch_size``, and ``alignscore_ckpt`` attributes.
    """
    input_file = args.input_file
    output_file = args.output_file
    file_name = {}
    file_name['file_name'] = input_file

    # Load JSONL: one JSON object per line.
    with open(input_file, 'r') as file:
        json_data = [json.loads(line) for line in file]

    predictions = [data["predicted"] for data in json_data]
    references = [data["gold"] for data in json_data]
    sources = [data["source"] for data in json_data]

    total_result = {}

    # 1) AlignScore — source-grounded factual-consistency score.
    align_scorer = AlignScore(model='roberta-base', batch_size=args.batch_size,
                              device='cuda:0', ckpt_path=args.alignscore_ckpt,
                              evaluation_mode='nli_sp')
    alignscore_result = align_scorer.score(contexts=sources, claims=predictions)
    total_result['AlignScore'] = 100*np.mean(alignscore_result)

    # 2) FactCC — binary factual-consistency classifier.
    model_path = 'manueldeprada/FactCC'
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path)
    device = torch.device('cuda:0')  # fixed: was an f-string with no placeholder
    model.to(device)
    model.eval()  # make inference mode explicit (disables dropout etc.)
    pred_result = []
    # fixed: run inference under no_grad — the original built autograd
    # graphs for every example, wasting GPU memory for no benefit.
    with torch.no_grad():
        for text, claim in zip(sources, predictions):
            input_dict = tokenizer(text, claim, max_length=512,
                                   padding='max_length',
                                   truncation='only_first',
                                   return_tensors='pt')
            input_dict = input_dict.to(device)
            logits = model(**input_dict).logits
            pred_result.append(logits.argmax(dim=1).item())
    # NOTE(review): this assumes label 0 == "consistent" for this
    # checkpoint (hence the 1 - mean inversion) — confirm against the
    # model card before trusting absolute numbers.
    factcc_result = 1 - np.mean(pred_result)
    total_result['FactCC'] = 100*factcc_result

    # 3) BARTScore — log-likelihood of the summary given the source.
    bart_scorer = BARTScorer(device='cuda:0', checkpoint='facebook/bart-large-cnn')
    bart_result = np.mean(bart_scorer.score(srcs=sources, tgts=predictions, batch_size=args.batch_size))
    total_result['BARTScore'] = bart_result

    # 4) BS-Fact — BERTScore precision of prediction against the SOURCE.
    bert_scorer = evaluate.load("bertscore")
    bsfact_result = bert_scorer.compute(predictions=predictions, references=sources, lang="en",batch_size=args.batch_size)
    total_result['BS-Fact'] = 100*np.mean(bsfact_result["precision"])

    # 5) Rouge-L — lexical overlap with the gold reference.
    rouge_scorer = evaluate.load('rouge')
    rouge_result = rouge_scorer.compute(predictions=predictions, references=references)
    total_result['Rouge-L'] = 100*rouge_result['rougeL']

    # 6) BERTScore — semantic similarity (F1) with the gold reference.
    bertscore_result = bert_scorer.compute(predictions=predictions, references=references, lang="en",batch_size=args.batch_size)
    total_result['BERTScore'] = 100*np.mean(bertscore_result["f1"])

    print(total_result)
    # Append-mode: one run adds a file-name line plus a results line,
    # separated from the next run by a blank line.
    with open(output_file, "a") as json_file:
        json_file.write(json.dumps(file_name))
        json_file.write("\n")
        json_file.write(json.dumps(total_result))
        json_file.write("\n")
        json_file.write("\n")
if __name__ == '__main__':
    # Parse CLI options and hand them to the evaluation pipeline.
    cli_args = init_parser().parse_args()
    main(cli_args)