-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathted_talk_statistic_correlation.py
More file actions
100 lines (96 loc) · 3.99 KB
/
ted_talk_statistic_correlation.py
File metadata and controls
100 lines (96 loc) · 3.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import cPickle as cp
import os
import nltk
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
from list_of_talks import all_valid_talks
from TED_data_location import ted_data_path
# Returns indices with outliers removed
def remove_outlier(alist):
    """Return the indices of the elements of ``alist`` that are not outliers.

    An element is kept when it lies within three standard deviations of the
    mean of ``alist`` (bounds inclusive). A short summary of how many
    elements were kept and dropped is printed to stdout.

    Parameters
    ----------
    alist : sequence of numbers
        The values to screen.

    Returns
    -------
    numpy.ndarray
        Integer indices into ``alist`` of the retained elements, in
        ascending order. Empty when ``alist`` is empty.
    """
    nplist = np.array(alist)
    if nplist.size:
        mean = np.mean(nplist)
        std = np.std(nplist)
        # Inclusive bounds: with strict comparisons a constant list
        # (std == 0) would have every element rejected as an "outlier".
        inside = (nplist <= mean + 3. * std) & (nplist >= mean - 3. * std)
        kept = np.where(inside)[0]
    else:
        # Nothing to screen; avoid taking the mean/std of an empty array.
        kept = np.array([], dtype=int)
    # Single-argument print calls produce the same text on Python 2 and 3.
    print('Outlier Removal')
    print('input list length: {}'.format(len(alist)))
    print('output list length: {}'.format(len(kept)))
    print('outliers: {}'.format(len(alist) - len(kept)))
    print('')
    return kept
# abs_ratcnt = Absolute Rating Count
def _load_views_and_ratings(abs_ratcnt, infolder, rating_names):
    """Collect the view count and per-category ratings of every valid talk.

    Reads ``<infolder>/<talk id>.pkl`` for each id in ``all_valid_talks``.

    Returns a tuple ``(viewlst, allratings)`` where ``viewlst`` holds one
    view count per talk and ``allratings`` maps each name in
    ``rating_names`` to a list (parallel to ``viewlst``) of either raw
    counts (``abs_ratcnt`` true) or counts divided by the talk's
    ``total_count`` (``abs_ratcnt`` false). ``allratings`` is empty when
    there are no talks.
    """
    viewlst = []
    allratings = {}
    for talk_id in all_valid_talks:
        talk_path = os.path.join(infolder, str(talk_id) + '.pkl')
        # NOTE(security): unpickling can execute arbitrary code; these files
        # are assumed to be trusted, locally generated data.
        with open(talk_path, 'rb') as talk_file:
            atalk = cp.load(talk_file)
        meta = atalk['talk_meta']
        viewlst.append(meta['totalviews'])
        ratings = meta['ratings']
        for akey in rating_names:
            value = float(ratings.get(akey, 0))
            if not abs_ratcnt:
                # 'total_count' is only consulted in relative mode.
                value = value / float(ratings['total_count'])
            allratings.setdefault(akey, []).append(value)
    return viewlst, allratings


def plot_correlation(abs_ratcnt,infolder,
    outfolder,show_scatter=False):
    """Correlate each rating category with the total view count and plot it.

    For every rating category the Pearson correlation coefficient between
    the talks' view counts and that rating is computed and printed. A
    horizontal bar chart of all coefficients (sorted ascending) is saved to
    ``outfolder``; optionally one scatter plot per category is saved too.

    Parameters
    ----------
    abs_ratcnt : bool
        If true, use the absolute rating counts. Otherwise each count is
        divided by the talk's ``total_count``.
    infolder : str
        Folder containing the ``<talk id>.pkl`` files.
    outfolder : str
        Folder into which the ``.eps`` figures are written.
    show_scatter : bool, optional
        If true, also save a view-count/rating scatter plot per category.

    Returns
    -------
    None
    """
    allrating_names=['beautiful','confusing','courageous','fascinating','funny',
        'informative','ingenious','inspiring','jaw-dropping','longwinded',
        'obnoxious','ok','persuasive','total_count','unconvincing']
    # 'total_count' is the normaliser, not a rating category of its own.
    rating_names = [name for name in allrating_names if name != 'total_count']

    viewlst, allratings = _load_views_and_ratings(
        abs_ratcnt, infolder, rating_names)
    # Fixed, explicit ordering so that labels and coefficients line up
    # without relying on dict iteration order or indexable dict.keys().
    plotted = [name for name in rating_names if name in allratings]

    # Drawing the scatter plots
    # Outliers are deliberately left in; remove_outlier() can be used to
    # filter viewlst/ratings first if extreme ratings skew the plots.
    allcorr = []
    for akey in plotted:
        # Calculate Correlation Coefficient
        z = np.corrcoef(viewlst, allratings[akey])[0, 1]
        allcorr.append(z)
        print('Correlation coefficient for rating {} and view count: {}'
              .format(akey, z))
        if show_scatter:
            plt.figure()
            plt.scatter(viewlst, allratings[akey], alpha=0.33)
            # Plot the x axis in log scale. The former nonposy='clip'
            # keyword was a y-axis option and is invalid for xscale.
            plt.xscale('log')
            plt.xlabel('Total Viewcount (log scale)')
            plt.ylabel('Percent of ratings')
            plt.title('rating-view scatter plot ' + akey +
                      ' corr: {:0.2f}'.format(z))
            plt.tight_layout()
            if abs_ratcnt:
                scatter_name = 'scatter_' + akey + 'absolute' + '.eps'
            else:
                scatter_name = 'scatter_' + akey + '.eps'
            plt.savefig(os.path.join(outfolder, scatter_name))
            # Release the figure; otherwise one stays open per category.
            plt.close()

    # Bar chart of all coefficients, sorted from lowest to highest.
    plt.figure()
    allcorr = np.array(allcorr)
    idx = np.argsort(allcorr)
    pos = list(range(len(allcorr)))
    plt.barh(pos, allcorr[idx])
    plt.yticks(pos, [plotted[i] for i in idx])
    plt.xlabel('Correlation Coefficient')
    plt.title('CorrCoef of Ratings vs. Total View')
    plt.tight_layout()
    if abs_ratcnt:
        bar_name = 'CorrCoef of Ratings vs. Total View (absolute)' + '.eps'
    else:
        bar_name = 'CorrCoef of Ratings vs. Total View' + '.eps'
    plt.savefig(os.path.join(outfolder, bar_name))
    plt.close()
if __name__ == '__main__':
    # Script entry point: analyse relative (non-absolute) rating counts,
    # reading talk pickles from <ted_data_path>/talks/ and writing the
    # figures to <ted_data_path>/TED_stats/.
    talks_dir = os.path.join(ted_data_path, 'talks/')
    stats_dir = os.path.join(ted_data_path, 'TED_stats/')
    plot_correlation(False, infolder=talks_dir, outfolder=stats_dir)