-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsetup.py
190 lines (167 loc) · 6.21 KB
/
setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# PARAMS
import sys
import os
# Shared constants: folder names, dataset names, output file names and the
# column/field names used as string keys. Everything below is a plain
# module-level binding; nothing here performs I/O.

# presumably the size of the context window kept around a mention (unit not
# visible in this file) — TODO confirm against the code that reads it
CONTEXT_RANGE = 100

# FOLDERS
# Names of the folders holding the original (unzipped) dataset annotations;
# the download script derives the archive name as "<folder name>.zip".
NEWSWCL50_FOLDER_NAME = "2019_annot"
ECBPLUS_FOLDER_NAME = "ECB+"
MEANTIME_FOLDER_NAME = "meantime_newsreader_english_oct15"
OUTPUT_FOLDER_NAME = "output_data"
SUMMARY_FOLDER = "summary"
# Absolute path of a "tmp" directory located next to this file.
TMP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")

# DATASETS
# Dataset identifiers; the download script also uses them as the names of the
# per-dataset directories under the current working directory.
NEWSWCL50 = "NewsWCL50-prep"
ECB_PLUS = "ECBplus-prep"
MEANTIME = "MEANTIME-prep"

# FILES
# File names (or file-name suffixes, for the values starting with "_").
SAMPLE_DOC_JSON = "_sample_doc.json"
SAMPLE_MENTION_JSON = "_sample_mention.json"
MENTIONS_ALL_CSV = "all_mentions.csv"
MENTIONS_EVENTS_JSON = "event_mentions.json"
MENTIONS_ENTITIES_JSON = "entities_mentions.json"
CONLL_CSV = "conll.csv"
SUMMARY_CHAINS_CSV = "summary_chains.csv"
SUMMARY_TOPICS_CSV = "summary_dataset_topics.csv"
MANUAL_REVIEW_FILE = "manual_review_needed.json"

# Coreference types
STRICT = "STRICT"
NEAR_IDENTITY = "NEAR_IDENTITY"

# doc.json fields (from news-please)
TITLE = "title"
DESCRIPTION = "description"
TEXT = "text"
SOURCE_DOMAIN = "source_domain"

# NewsWCL50 original column names (in annotated mentions)
CODE = "Code"
SEGMENT = "Segment"
DOCUMENT_NAME = "Document name"
BEGINNING = "Beginning"
TYPE = "type"

# mentions.json fields
TOPIC_ID = "topic_id"
TOPIC = "topic"
COREF_CHAIN = "coref_chain"
MENTION_FULL_TYPE = "mention_full_type"
MENTION_TYPE = "mention_type"
MENTION_NER = "mention_ner"
MENTION_HEAD_POS = "mention_head_pos"
MENTION_HEAD_LEMMA = "mention_head_lemma"
MENTION_HEAD = "mention_head"
MENTION_HEAD_ID = "mention_head_id"
DOC_ID_FULL = "doc_id_full"
DOC_ID = "doc_id"
# NOTE(review): the constant NAME is misspelled ("CONTINIOUS") while its value
# is spelled correctly. The name is left as-is because other modules may
# import it — verify usages before renaming.
IS_CONTINIOUS = "is_continuous"
IS_SINGLETON = "is_singleton"
SENTENCE = "sentence"
MENTION_ID = "mention_id"
SCORE = "score"
SENT_ID = "sent_id"
MENTION_CONTEXT = "mention_context"
TOKENS_NUMBER = "tokens_number"
TOKENS_TEXT = "tokens_text"
TOKENS_STR = "tokens_str"
TOKEN_ID = "token_id"
COREF_TYPE = "coref_type"
SUBTOPIC = "subtopic"
CONLL_DOC_KEY = "conll_doc_key"

# CoNLL fields
REFERENCE = "reference"
DOC_IDENTIFIER = "doc_identifier"
TOKEN = "token"
TOPIC_SUBTOPIC = "topic/subtopic_name"

# Summary fields
DATASET_NAME = "dataset"
TOPICS = "topics"
ENTITY = "entity"
EVENT = "event"
MENTIONS = "mentions"
PHRASING_DIVERSITY = "phrasing_diversity"
UNIQUE_LEMMAS = "unique_lemmas"
# The next four values start with an underscore — presumably suffixes that are
# appended to other summary field names; verify against the summary code.
WEIGHTED = "_weighted"
MEAN = "_mean"
ALL = "_all"
WO_SINGL = "_wo_singl"
ARTICLES = "articles"
TOKENS = "tokens"
SINGLETONS = "singletons"
AVERAGE_SIZE = "average_size"

# Support fields
CONCAT_TEXT = "concat_text"
FIRST_TOKEN = "first_token"
LAST_TOKEN = "last_token"

# Field names of the original ECB+ annotated files
T_ID = "t_id"
ID = "id"
SENT = "sent"
M_ID = "m_id"
NUM = "number"
def _parse_yes_no(answer):
    """Interpret a yes/no answer typed by the user.

    Args:
        answer: Raw text as returned by ``input()``.

    Returns:
        ``True`` for "y" and ``False`` for "n" (case-insensitive, surrounding
        whitespace ignored).

    Raises:
        ValueError: If the answer is neither "y" nor "n".
    """
    normalized = answer.strip().lower()
    if normalized not in ("y", "n"):
        raise ValueError("expected 'y' or 'n', got %r" % (answer,))
    return normalized == "y"


def _parse_dataset_choice(raw, n_datasets):
    """Validate the menu entry typed by the user.

    Valid entries are ``0 .. n_datasets - 1`` (a single dataset) and
    ``n_datasets`` itself, which the menu advertises as "all datasets".

    Args:
        raw: Raw text as returned by ``input()``.
        n_datasets: Number of individually selectable datasets.

    Returns:
        The selected menu index as an ``int``.

    Raises:
        ValueError: If ``raw`` is not an integer or lies outside the menu.
    """
    choice = int(raw)  # raises ValueError for non-numeric input
    # The upper bound is inclusive: n_datasets is the "all datasets" entry.
    if not 0 <= choice <= n_datasets:
        raise ValueError("menu entry out of range: %d" % choice)
    return choice


def _ask(prompt, parse, retry_message):
    """Prompt repeatedly until ``parse`` accepts the typed answer.

    Explicit ``ValueError`` checks are used instead of ``assert`` so that the
    validation still happens when Python runs with ``-O``.
    """
    while True:
        try:
            return parse(input(prompt))
        except ValueError:
            print(retry_message)


def _download_dataset(dataset, link, zip_path, folder):
    """Download one dataset archive and unpack it into ``folder``.

    For ECB+ the separately hosted coreference-sentences CSV is fetched too.
    """
    import zipfile

    import gdown

    print("Getting: " + dataset)
    # Make sure the target directory exists before anything is written to it.
    os.makedirs(os.path.dirname(zip_path), exist_ok=True)
    gdown.download(link, zip_path, quiet=False)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(folder)
    if dataset == ECB_PLUS:
        csv_name = "ECBplus_coreference_sentences.csv"
        csv_dir = os.path.join(os.getcwd(), ECB_PLUS, ECBPLUS_FOLDER_NAME)
        os.makedirs(csv_dir, exist_ok=True)
        gdown.download(
            "https://raw.githubusercontent.com/cltl/ecbPlus/master/ECB%2B_LREC2014/" + csv_name,
            os.path.join(csv_dir, csv_name),
            quiet=False)


def _main():
    """Interactive setup: optionally fetch spaCy models, then download datasets."""
    # Imported here (not at file level) so that importing this module for its
    # constants does not require spaCy to be installed.
    import spacy

    if _ask("Would you like to download the spacy languages? (y/n) : ",
            _parse_yes_no,
            "Oops! That input was not correct (y/n). Please retry."):
        print("Downloading spacy languages...")
        for model in ('en_core_web_sm', 'es_core_news_sm',
                      'nl_core_news_sm', 'it_core_news_sm'):
            spacy.cli.download(model)
    else:
        print("Skipping the download of languages.")

    # (dataset name, name of the zipped folder, download link) in menu order.
    sources = (
        (ECB_PLUS, ECBPLUS_FOLDER_NAME,
         "https://github.com/cltl/ecbPlus/raw/master/ECB%2B_LREC2014/ECB%2B.zip"),
        (MEANTIME, MEANTIME_FOLDER_NAME,
         "https://drive.google.com/u/0/uc?id=1K0hcWHOomyrFaKigwzrwImHugdb1pjAX&export=download"),
        (NEWSWCL50, NEWSWCL50_FOLDER_NAME,
         "https://drive.google.com/u/1/uc?id=1ZcTnDeY85iIeUX0nvg3cypnRq87tVSVo&export=download"),
    )

    menu = ["The following datasets are available for download: \n\n"]
    menu.extend(str(i) + ": " + name + "\n" for i, (name, _, _) in enumerate(sources))
    menu.append(str(len(sources)) + ": all datasets \n")
    print("".join(menu))

    choice = _ask(
        "Please enter a number to download the dataset: ",
        lambda raw: _parse_dataset_choice(raw, len(sources)),
        "Oops! Seems like the number you entered is not a number or not valid. Please retry. ")

    # The last menu entry means "everything"; any other entry picks one dataset.
    selected = sources if choice == len(sources) else (sources[choice],)
    for name, folder_name, link in selected:
        _download_dataset(
            name,
            link,
            os.path.join(os.getcwd(), name, folder_name + ".zip"),
            os.path.join(os.getcwd(), name))

    print("Setup successful.")


if __name__ == '__main__':
    _main()