Skip to content

Commit

Permalink
Merge @parker57 data restructure (ety-python/data-restructure)
Browse files Browse the repository at this point in the history
Closes #24, closes #42
  • Loading branch information
jmsv committed Jun 24, 2018
1 parent 31140a0 commit ed2844c
Show file tree
Hide file tree
Showing 16 changed files with 235 additions and 39 deletions.
3 changes: 1 addition & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,4 @@ before_script:
script:
- flake8 .
- python tests.py
- timeout 10s python -c "import tests; tests.test_circular_etymology()"

- timeout 30s python -c "import tests; tests.test_circular_etymology()"
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@ install:
test:
flake8
python tests.py
timeout 10s python -c "import tests; tests.test_circular_etymology()"
timeout 30s python -c "import tests; tests.test_circular_etymology()"

data:
PYTHONIOENCODING=utf-8 python ety/data/generate.py

clean:
rm -rf build dist ety.egg-info _trial_temp __pycache__
rm -rf build dist ety.egg-info _trial_temp __pycache__ */__pycache__
rm -f *.pyc */*.pyc

dist:
Expand Down
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ six = "*"
[dev-packages]
pylint = "*"
"flake8" = "*"
requests = "*"
clint = "*"

[requires]
python_version = "3.6"
51 changes: 50 additions & 1 deletion Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions ety/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,8 @@ def tree(word, word_lang='eng'):


def random_word(lang='eng'):
row = list(filter(lambda entry: entry['a_lang'] == lang, data.etyms))
word = choice(row)['a_word']
return Word(word, lang)
w = choice(list(data.etyms[lang]))
return Word(w, lang)


def census(words):
Expand Down
2 changes: 2 additions & 0 deletions ety/data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
source/

8 changes: 4 additions & 4 deletions ety/data.py → ety/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@
from pkg_resources import resource_filename


def load_relety():
resource = resource_filename('ety', 'wn/etymwn-relety.json')
def load_etymologies():
resource = resource_filename('ety', 'data/etymologies.json')
with io.open(resource, 'r', encoding='utf-8') as f:
return json.load(f)


def load_country_codes():
resource = resource_filename('ety', 'wn/iso-639-3.json')
resource = resource_filename('ety', 'data/iso-639-3.json')
with io.open(resource, 'r', encoding='utf-8') as f:
return json.load(f)


etyms = load_relety()
etyms = load_etymologies()
langs = load_country_codes()
1 change: 1 addition & 0 deletions ety/data/etymologies.json

Large diffs are not rendered by default.

138 changes: 138 additions & 0 deletions ety/data/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""
Filtered data is the full dataset after the following:
grep -E "rel:etymology|rel:is_derived_from" etymwn.tsv > etymwn-filtered.tsv
"""

import csv
import hashlib
import os
import io
import json
import gc

import requests
import six
from clint.textui import progress


def prepare(source_dir):
    """
    Ensure the download directory exists.

    exist_ok avoids the check-then-create race of an explicit
    os.path.exists() guard; the script already requires Python 3
    (see main), so exist_ok is always available.
    """
    os.makedirs(source_dir, exist_ok=True)


def download_dataset(url, dl_path):
    """
    Download etymwn from the jmsv.me mirror, displaying a progress bar
    while streaming the response body to dl_path.

    :param url: source URL of the filtered etymwn tsv
    :param dl_path: local path to write the downloaded file to
    """
    chunk_size = 4096
    with requests.get(url, stream=True) as r:
        # Fail fast on HTTP errors instead of saving an error page as data
        r.raise_for_status()
        # content-length may be absent (e.g. chunked encoding); fall back
        # to 0 rather than crashing on int(None)
        total_length = int(r.headers.get('content-length') or 0)
        with open(dl_path, 'wb') as f:
            for chunk in progress.bar(
                    r.iter_content(chunk_size=chunk_size),
                    expected_size=(total_length / chunk_size) + 1):
                if chunk:
                    f.write(chunk)
    print('Downloaded to ' + dl_path)


def verify_local_data(url, dl_path):
    """
    Compare the local file's md5 with the expected checksum served
    alongside the dataset at '<url>.checksum'.

    :return: True if the local file exists and matches, otherwise False
    """
    md5 = hashlib.md5()
    try:
        with open(dl_path, 'rb') as f:
            # Hash in 64 KiB chunks so the (large) dataset is never
            # held fully in memory
            for block in iter(lambda: f.read(1 << 16), b''):
                md5.update(block)
    except EnvironmentError:
        # Missing or unreadable file counts as "not verified"
        return False

    expected = requests.get('%s.checksum' % url).text.strip()
    return md5.hexdigest() == expected


def split_elements(compound):
    """
    Split a 'lang:word' compound into a two-element [lang, word] list.

    The word part may itself contain colons, so only the first colon acts
    as the separator; every piece is whitespace-stripped before rejoining.
    """
    pieces = [piece.strip() for piece in compound.split(':')]
    if len(pieces) == 2:
        return pieces

    head, tail = pieces[0], pieces[1:]
    return [head, ':'.join(tail)]


def generate_json(source_path, dir):
    """
    Build the nested etymology mapping from the filtered etymwn tsv and
    write it to 'etymologies.json' in `dir`.

    Output structure:
        {source_lang: {source_word: [{dest_word: dest_lang}, ...]}}

    :param source_path: path to the filtered etymwn tsv
    :param dir: directory to write etymologies.json into
    """
    result = {}

    print('Loading source tsv')
    with io.open(source_path, 'r', newline='', encoding='utf-8') as source:
        reader = csv.reader(source, delimiter='\t')
        source_rows = list(reader)

    # Reclaim parser intermediates before building the large result dict
    gc.collect()

    print('Structuring data')
    for row in progress.bar(source_rows):
        # Column 0 is the source 'lang:word', column 2 the destination
        source_lang, source_word = split_elements(row[0])
        dest_lang, dest_word = split_elements(row[2])

        # Group roots by language, then by word
        result.setdefault(source_lang, {}) \
              .setdefault(source_word, []) \
              .append({dest_word: dest_lang})

    # Save data to separate files for languages, may be required in the future
    # print('Saving language files')
    # for key in progress.bar(result):
    #     with io.open(os.path.join(dir, 'data/ety-%s.json' % key), 'w') as f:
    #         f.write(json.dumps(result[key], sort_keys=False))

    # Save data
    print('Writing etymologies file')
    with io.open(os.path.join(dir, 'etymologies.json'), 'w') as f:
        json.dump(result, f)


def main():
    """
    Define paths, download source data if required, generate the dataset.

    Exits with status 1 when run under Python 2 or when the downloaded
    source data cannot be verified against its published checksum.
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))
    source_dir = os.path.join(base_dir, 'source')
    source_path = os.path.join(source_dir, 'etymwn.tsv')
    source_url = 'https://data.jmsv.me/etymwn-filtered.tsv'

    # The generator itself requires Python 3 (only the library supports 2)
    if not six.PY3:
        print("Script should be run as Python 3, exiting")
        raise SystemExit(1)

    prepare(source_dir)

    # (Re)download data if required
    if verify_local_data(source_url, source_path):
        print('Verified local source data')
    else:
        print('Downloading source data')
        download_dataset(source_url, source_path)

        # If the checksum still doesn't match after downloading, give up
        if not verify_local_data(source_url, source_path):
            print('Error verifying local source data, exiting')
            raise SystemExit(1)
        print('Verified local source data')

    generate_json(source_path, base_dir)

    print('Done')


if __name__ == '__main__':
main()
File renamed without changes.
14 changes: 11 additions & 3 deletions ety/wn/readme.md → ety/data/readme.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
- etymwn-eng.tsv is the etymwn.tsv dataset filtered to lines starting with "eng:"
- etymwn-relety.tsv is the dataset filtered to lines containing "rel:etymology"
- iso-639-3.json contains country codes used by the etymwn dataset
# data

- `__init__.py` is imported by ety to read data
- `generate.py` downloads (if required) and generates data

---

## Original etymwn readme

```
Etymological Wordnet 2013-02-08
Gerard de Melo
http://icsi.berkeley.edu/~demelo/etymwn/
Expand Down Expand Up @@ -37,3 +43,5 @@ In scientific works, please cite:
Gerard de Melo, Gerhard Weikum. "Towards Universal Multilingual Knowledge Bases".
In: Principles, Construction, and Applications of Multilingual Wordnets. Proceedings
of the 5th Global Wordnet Conference (GWC 2010). Narosa Publishing 2010, New Delhi India.
```

2 changes: 0 additions & 2 deletions ety/wn/.gitignore

This file was deleted.

1 change: 0 additions & 1 deletion ety/wn/etymwn-relety.json

This file was deleted.

32 changes: 15 additions & 17 deletions ety/word.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,27 +24,25 @@ def __init__(self, word, language='eng', is_source=False):
self._id = u"{}:{}".format(self.word, self.language.iso)

def origins(self, recursive=False):
search = 'recursive' if recursive else 'direct'
if self.word not in etymwn_data[self.language.iso]:
# There are no roots for this word
return []

o = self._origins[search] # Origins for direct or recursive search
roots = [Word(word, lang) for root in
etymwn_data[self.language.iso][self.word] for word, lang in
root.items()]

if o:
return o

row = list(filter(
lambda entry: entry['a_word'] == self.word and entry[
'a_lang'] == self.language.iso, etymwn_data))

o = [Word(item['b_word'], item['b_lang']) for item in row]
tracked = roots[:]

if recursive:
for origin in o:
for child in origin.origins():
for root in tracked:
for child in root.origins():
# Check word isn't already in tree before appending
if child not in o and child != self:
o.append(child)
if child not in tracked and child != self:
tracked.append(child)

return o
self._origins = tracked
return self._origins

def tree(self):
return EtyTree(self)
Expand All @@ -70,6 +68,6 @@ def __str__(self):
return self.pretty

def __repr__(self):
return u'Word({word}, language={lang})'.format(
word=self.word, lang=self.language
return u'Word({word}, {lang} [{iso}])'.format(
word=self.word, lang=self.language, iso=self.language.iso
)
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@
],
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, <4',
keywords='etymology origins english language words',
packages=['ety'],
packages=['ety', 'ety/data'],
install_requires=[
"treelib", "colorful", "six",
],
extras_require={
'dev': ['flake8'],
},
package_data={
'ety': ['wn/etymwn-relety.json', 'wn/iso-639-3.json'],
'ety': ['data/etymologies.json', 'data/iso-639-3.json'],
},
entry_points={
'console_scripts': [
Expand Down
Loading

0 comments on commit ed2844c

Please sign in to comment.