-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge @parker57 data restructure (ety-python/data-restructure)
- Loading branch information
Showing
16 changed files
with
235 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
source/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
""" | ||
Filtered data is the full dataset after the following: | ||
grep -E "rel:etymology|rel:is_derived_from" etymwn.tsv > etymwn-filtered.tsv | ||
""" | ||
|
||
import csv | ||
import hashlib | ||
import os | ||
import io | ||
import json | ||
import gc | ||
|
||
import requests | ||
import six | ||
from clint.textui import progress | ||
|
||
|
||
def prepare(source_dir):
    """
    Ensure the directory used to store the downloaded source data exists.

    :param source_dir: path of the directory to create; missing parent
        directories are created as well
    :raises OSError: if the path cannot be created, including when it
        already exists but is not a directory
    """
    # exist_ok=True replaces the racy "check with os.path.exists, then
    # create" sequence: an already-existing directory is simply accepted.
    os.makedirs(source_dir, exist_ok=True)
|
||
|
||
def download_dataset(url, dl_path):
    """
    Download etymwn from jmsv.me mirror, displaying progress bar

    :param url: URL of the dataset to fetch
    :param dl_path: local file path the response body is written to
    :raises requests.HTTPError: if the server answers with an error status
    """
    chunk_size = 4096
    r = requests.get(url, stream=True)
    try:
        # Fail before dl_path is opened so that an HTTP error page is never
        # stored on disk in place of the dataset.
        r.raise_for_status()

        # The content-length header may be absent; fall back to 0 instead of
        # crashing on int(None).
        total_length = int(r.headers.get('content-length') or 0)

        with open(dl_path, 'wb') as f:
            for chunk in progress.bar(
                    r.iter_content(chunk_size=chunk_size),
                    expected_size=(total_length // chunk_size) + 1):
                if chunk:
                    f.write(chunk)
    finally:
        # Streamed responses keep their connection until explicitly closed.
        r.close()
    print('Downloaded to ' + dl_path)
|
||
|
||
def verify_local_data(url, dl_path):
    """
    Compare actual file checksum with expected served checksum

    The expected MD5 hex digest is fetched from ``<url>.checksum``. The
    request is only made when the local file could be read.

    :param url: URL of the dataset; '.checksum' is appended to it
    :param dl_path: path of the local file to verify
    :return: True if local file matches, otherwise False
    """
    digest = hashlib.md5()
    try:
        with open(dl_path, 'rb') as f:
            # Hash in fixed-size chunks so the whole file is never held in
            # memory at once.
            for chunk in iter(lambda: f.read(65536), b''):
                digest.update(chunk)
    except EnvironmentError:
        # Missing or unreadable file: treat it as "does not match".
        return False

    expected = requests.get('%s.checksum' % url).text.strip()
    return digest.hexdigest() == expected
|
||
|
||
def split_elements(compound):
    """
    Split a ``"<lang>: <word>"`` compound into its two parts.

    Only the first colon acts as the separator, so a word that itself
    contains colons is returned unchanged apart from having its surrounding
    whitespace stripped.

    :param compound: string of the form ``"lang: word"``
    :return: two-item list ``[lang, word]``; ``word`` is an empty string
        when ``compound`` contains no colon
    """
    lang, _, word = compound.partition(':')
    return [lang.strip(), word.strip()]
|
||
|
||
def generate_json(source_path, dir):
    """
    Build the nested etymology mapping from the source TSV and save it as
    ``etymologies.json`` inside ``dir``.

    Column 0 of each row is the source ``"lang: word"`` and column 2 the
    destination ``"lang: word"``. The resulting structure is::

        {source_lang: {source_word: [{dest_word: dest_lang}, ...]}}

    :param source_path: path of the tab-separated source file (UTF-8)
    :param dir: directory the JSON file is written to (the name shadows the
        builtin but is kept so existing callers keep working)
    """
    result = {}

    print('Loading source tsv')
    with io.open(source_path, 'r', newline='', encoding='utf-8') as source:
        source_rows = list(csv.reader(source, delimiter='\t'))

    print('Structuring data')
    for row in progress.bar(source_rows):
        # Blank or truncated lines do not provide both the source (column 0)
        # and destination (column 2) fields, so they are skipped.
        if len(row) < 3:
            continue

        source_lang, source_word = split_elements(row[0])
        dest_lang, dest_word = split_elements(row[2])

        words = result.setdefault(source_lang, {})
        words.setdefault(source_word, []).append({dest_word: dest_lang})

    # Save data to separate files for languages, may be required in the future
    # print('Saving language files')
    # for key in progress.bar(result):
    #     with io.open(os.path.join(dir, 'data/ety-%s.json' % key), 'w') as f:
    #         f.write(json.dumps(result[key], sort_keys=False))

    # Save data
    print('Writing etymologies file')
    # Explicit encoding keeps the output independent of the platform locale.
    with io.open(os.path.join(dir, 'etymologies.json'), 'w',
                 encoding='utf-8') as f:
        json.dump(result, f)
|
||
|
||
def main():
    """
    Define paths, download data if required, generate dataset

    Exits with status 1 when not run under Python 3 or when the source data
    still fails checksum verification after being downloaded.

    :return: None
    """
    # Named base_dir rather than dir to avoid shadowing the builtin.
    base_dir = os.path.dirname(os.path.realpath(__file__))
    source_dir = os.path.join(base_dir, 'source')
    source_path = os.path.join(source_dir, 'etymwn.tsv')
    source_url = 'https://data.jmsv.me/etymwn-filtered.tsv'

    # Exit if not Python 3
    if not six.PY3:
        print("Script should be run as Python 3, exiting")
        # SystemExit is used instead of exit(): the latter is a helper
        # injected by the site module and is not guaranteed to exist.
        raise SystemExit(1)

    prepare(source_dir)

    # (Re)download data if required
    if not verify_local_data(source_url, source_path):
        print('Downloading source data')
        download_dataset(source_url, source_path)

        # If checksum still doesn't match, exit
        if not verify_local_data(source_url, source_path):
            print('Error verifying local source data, exiting')
            raise SystemExit(1)

    print('Verified local source data')

    generate_json(source_path, base_dir)

    print('Done')
|
||
|
||
# Build the dataset only when this file is executed as a script; importing
# the module has no side effects.
if __name__ == "__main__":
    main()
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.