-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_coverage.py
68 lines (54 loc) · 1.95 KB
/
find_coverage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from sklearn.metrics import mutual_info_score
import numpy as np
from operator import itemgetter
import codecs, json
import sys
from pprint import pprint
"""
python find_coverage.py "comp_vis_cluster_topics.json" "comp_vis_sentence_topics.json" 5 "comp_vis_coverage_dict.json"
"""
""" Given the probability distributions of topics for each cluster and for each sentence in the
original text, find for each cluster the N sentences with the highest mutual-information score
against the cluster's topic distribution (best match first) and save the result as JSON.
"""
def topNsentences(cluster_topics, sentence_topics, N, output_path=None, score_fn=None):
    """Rank sentences against each cluster by topic-distribution similarity.

    Both input files are JSON objects mapping a name (cluster / sentence) to a
    sequence of pairs; only the second element of each pair (index 1) is used
    as that entry's weight vector.  (Presumably the pairs are
    (topic_id, probability) -- confirm against the producer of these files.)

    Args:
        cluster_topics: Path to the JSON file with per-cluster topic pairs.
        sentence_topics: Path to the JSON file with per-sentence topic pairs.
        N: Number of best-scoring sentences to keep per cluster.  Values
            <= 0 yield an empty list for every cluster.
        output_path: Where to write the resulting dictionary as JSON.  When
            omitted, falls back to the 4th command-line argument
            (``sys.argv[4]``); if that is absent too, nothing is written.
        score_fn: Callable ``score_fn(cluster_weights, sentence_weights)``
            returning a number where HIGHER means a better match.  Defaults
            to ``sklearn.metrics.mutual_info_score``.

    Returns:
        dict mapping each cluster name to a list of up to N sentence names,
        ordered from highest to lowest score.
    """
    with open(cluster_topics, 'r') as f1:
        C_topics = json.load(f1)
    with open(sentence_topics, 'r') as f2:
        S_topics = json.load(f2)
    print("Loaded topics...")

    if score_fn is None:
        # NOTE(review): mutual_info_score is defined on two label
        # assignments; confirm that feeding it raw topic weights is intended.
        score_fn = mutual_info_score
    if output_path is None and len(sys.argv) > 4:
        # Historical behaviour: the output file name comes from the CLI.
        output_path = sys.argv[4]

    # Freeze the key order once so matrix indices map back to names.
    concepts = list(C_topics)
    sentences = list(S_topics)
    # Sentence weight vectors do not depend on the cluster; build them once.
    sentence_weights = [[t[1] for t in S_topics[name]] for name in sentences]

    # MATRIX OF SCORES: rows are clusters, columns are sentences.
    scores = np.zeros((len(concepts), len(sentences)))
    for i, concept in enumerate(concepts):
        ct = [t[1] for t in C_topics[concept]]
        for j, st in enumerate(sentence_weights):
            scores[i, j] = score_fn(ct, st)
    pprint(scores)
    print("Scores matrix done...")

    coverage_dict = {}
    for i, concept in enumerate(concepts):
        if N > 0:
            # argsort is ascending: keep the last N, then flip to best-first.
            top_indices = scores[i].argsort()[-N:][::-1]
        else:
            # Guard: a slice of [-0:] would select every sentence.
            top_indices = []
        coverage_dict[concept] = [sentences[j] for j in top_indices]
    pprint(coverage_dict)
    print("Coverage dictionary done...")

    if output_path is not None:
        # Context manager guarantees the file is flushed and closed.
        with codecs.open(output_path, 'w', encoding='utf-8') as out:
            json.dump(coverage_dict, out, separators=(',', ':'), indent=4)
        print("Saved to json. Done!")
    return coverage_dict
# Script entry point. Command-line arguments:
#   argv[1] = cluster-topics JSON path, argv[2] = sentence-topics JSON path,
#   argv[3] = N (number of sentences per cluster).
# argv[4] (the output JSON path) is read inside topNsentences itself.
# Example with fixed inputs: topNsentences("cluster_topics.json", "sentence_topics.json", 5)
cli_args = sys.argv
coverage = topNsentences(cli_args[1], cli_args[2], int(cli_args[3]))