-
Notifications
You must be signed in to change notification settings - Fork 116
Expand file tree
/
Copy pathcorpus_stats.py
More file actions
executable file
·102 lines (88 loc) · 3.29 KB
/
corpus_stats.py
File metadata and controls
executable file
·102 lines (88 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python3
from __future__ import print_function
import argparse
import json
import sys
from collections import Counter
def update_in_quote(in_quote, token):
    """Update the quote-tracking state after reading ``token``.

    Args:
        in_quote: Two-element mutable list ``[in_double, in_single]``; it is
            modified in place. ``in_quote[0]`` is True while inside a
            double-quoted string, ``in_quote[1]`` while inside a
            single-quoted string.
        token: One whitespace-delimited piece of a query string.

    Returns:
        None. The result is communicated by mutating ``in_quote``.
    """
    # Scan character by character so that a quote character of the *other*
    # kind inside an open string (e.g. the apostrophe in "it's") is treated
    # as ordinary text rather than as the start of a new string.
    for char in token:
        if in_quote[0]:
            if char == '"':
                in_quote[0] = False
        elif in_quote[1]:
            if char == "'":
                in_quote[1] = False
        elif char == '"':
            in_quote[0] = True
        elif char == "'":
            in_quote[1] = True
def _count_selects(sql):
    """Return how many whitespace-delimited ``SELECT`` tokens lie outside quotes."""
    selects = 0
    in_quote = [False, False]
    for token in sql.split():
        if token == 'SELECT' and not (in_quote[0] or in_quote[1]):
            selects += 1
        update_in_quote(in_quote, token)
    return selects


def _depth_and_breadth(sql):
    """Return ``(max_depth, max_breadth)`` for the SELECT nesting of ``sql``.

    The query is split on whitespace, so brackets are only recognised when
    they appear in their own token or attached to the preceding token.
    ``max_depth`` is the deepest level of nested ``SELECT`` reached.
    ``max_breadth`` is the largest number of ``SELECT`` tokens counted inside
    any bracketed sub-query that gets closed, and is never less than 1.

    Raises:
        AssertionError: carrying ``sql``, when the first token is not
            ``SELECT``, when a bracket is seen while no ``SELECT`` is open,
            or when the query does not end with exactly one open ``SELECT``
            and balanced brackets.
    """
    max_depth = 0
    max_breadth = 1
    depth = 0
    prev = None
    # One entry per open SELECT: the number of non-sub-query brackets
    # currently open inside it.
    other_bracket = []
    # One counter per nesting level: the SELECTs seen directly inside it.
    breadth = [0]
    in_quote = [False, False]
    for token in sql.split():
        if in_quote[0] or in_quote[1]:
            update_in_quote(in_quote, token)
        elif token == 'SELECT':
            depth += 1
            max_depth = max(max_depth, depth)
            other_bracket.append(0)
            breadth[-1] += 1
            breadth.append(0)
        elif prev is None or (
                not other_bracket and ('(' in prev or token == ')')):
            # Without this check the bracket bookkeeping below would fail
            # with an opaque TypeError / IndexError; report the offending
            # query the same way as the final validation does.
            raise AssertionError(sql)
        elif '(' in prev:
            # The previous token opened a bracket that does not start a
            # sub-query (this token is not SELECT).
            other_bracket[-1] += 1
            update_in_quote(in_quote, token)
        elif token == ')':
            if other_bracket[-1] == 0:
                # This bracket closes the current sub-query.
                depth -= 1
                other_bracket.pop()
                max_breadth = max(max_breadth, breadth.pop())
            else:
                other_bracket[-1] -= 1
        else:
            update_in_quote(in_quote, token)

        # A token containing both brackets (e.g. a call written as one
        # token) must not look like an opening bracket to the next token.
        if '(' in token and ')' in token:
            prev = "SQL_FUNCTION"
        else:
            prev = token

    # NOTE(review): the counters still on ``breadth`` here (the outermost
    # levels) are never folded into max_breadth -- confirm this is intended.
    # Raised explicitly rather than via ``assert`` so the validation is not
    # stripped when Python runs with -O.
    if len(other_bracket) != 1 or other_bracket[0] != 0 or depth != 1:
        raise AssertionError(sql)
    return max_depth, max_breadth


def process_query(data, stats):
    """Accumulate statistics for one query entry into ``stats``.

    Args:
        data: Mapping with a ``'sentences'`` entry (only its length is used)
            and a ``'sql'`` entry that iterates over query strings.
        stats: Counter-like mapping in which missing keys count as 0; it is
            updated in place. Keys written are ``'sentences'``,
            ``'queries'``, ``'SQL-selects-N'``, ``'SQL-depth-N'`` and
            ``'SQL-breadth-N'``.

    Raises:
        AssertionError: carrying the offending query string, when a query's
            SELECT/bracket structure cannot be analysed.
    """
    stats['sentences'] += len(data['sentences'])
    stats['queries'] += 1

    # Two separate passes keep the order in which keys are first inserted
    # into ``stats`` (all select counts before any depth/breadth counts).
    for sql in data['sql']:
        stats["SQL-selects-{}".format(_count_selects(sql))] += 1

    for sql in data['sql']:
        max_depth, max_breadth = _depth_and_breadth(sql)
        stats["SQL-depth-{}".format(max_depth)] += 1
        stats["SQL-breadth-{}".format(max_breadth)] += 1
if __name__ == '__main__':
    # Command-line entry point: read each JSON file, accumulate statistics
    # with process_query, and print them (optionally per file as well).
    parser = argparse.ArgumentParser(description='Prints stats about the specified files.')
    parser.add_argument('--per-file-stats', help='Show stats on each file as well as overall', action='store_true')
    parser.add_argument('json_files', help='File in our json format', nargs='+')
    args = parser.parse_args()

    total_stats = Counter()
    for filename in args.json_files:
        cur_stats = Counter()
        # Use a context manager so the file handle is closed as soon as the
        # JSON has been parsed instead of being left to the garbage collector.
        with open(filename) as json_file:
            data = json.load(json_file)

        # A file holds either a list of query entries or a single entry.
        queries = data if isinstance(data, list) else [data]
        for query in queries:
            if args.per_file_stats:
                process_query(query, cur_stats)
            process_query(query, total_stats)

        if args.per_file_stats:
            for stat in cur_stats:
                print(filename, stat, cur_stats[stat])

    # Label the totals only when per-file lines are printed alongside them.
    start = "Overall: " if args.per_file_stats else ''
    for stat in total_stats:
        print(start + stat, total_stats[stat])