Skip to content

Commit

Permalink
Merge @parker57 data restructure (ety-python/data-restructure)
Browse files Browse the repository at this point in the history
Closes #24, closes #42
  • Loading branch information
jmsv committed Jun 24, 2018
1 parent 31140a0 commit ed2844c
Show file tree
Hide file tree
Showing 16 changed files with 235 additions and 39 deletions.
3 changes: 1 addition & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,4 @@ before_script:
script:
- flake8 .
- python tests.py
- timeout 10s python -c "import tests; tests.test_circular_etymology()"

- timeout 30s python -c "import tests; tests.test_circular_etymology()"
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@ install:
test:
flake8
python tests.py
timeout 10s python -c "import tests; tests.test_circular_etymology()"
timeout 30s python -c "import tests; tests.test_circular_etymology()"

data:
PYTHONIOENCODING=utf-8 python ety/data/generate.py

clean:
rm -rf build dist ety.egg-info _trial_temp __pycache__
rm -rf build dist ety.egg-info _trial_temp __pycache__ */__pycache__
rm -f *.pyc */*.pyc

dist:
Expand Down
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ six = "*"
[dev-packages]
pylint = "*"
"flake8" = "*"
requests = "*"
clint = "*"

[requires]
python_version = "3.6"
51 changes: 50 additions & 1 deletion Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions ety/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,8 @@ def tree(word, word_lang='eng'):


def random_word(lang='eng'):
row = list(filter(lambda entry: entry['a_lang'] == lang, data.etyms))
word = choice(row)['a_word']
return Word(word, lang)
w = choice(list(data.etyms[lang]))
return Word(w, lang)


def census(words):
Expand Down
2 changes: 2 additions & 0 deletions ety/data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
source/

8 changes: 4 additions & 4 deletions ety/data.py → ety/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,17 @@
from pkg_resources import resource_filename


def load_relety():
resource = resource_filename('ety', 'wn/etymwn-relety.json')
def load_etymologies():
resource = resource_filename('ety', 'data/etymologies.json')
with io.open(resource, 'r', encoding='utf-8') as f:
return json.load(f)


def load_country_codes():
resource = resource_filename('ety', 'wn/iso-639-3.json')
resource = resource_filename('ety', 'data/iso-639-3.json')
with io.open(resource, 'r', encoding='utf-8') as f:
return json.load(f)


etyms = load_relety()
etyms = load_etymologies()
langs = load_country_codes()
1 change: 1 addition & 0 deletions ety/data/etymologies.json

Large diffs are not rendered by default.

138 changes: 138 additions & 0 deletions ety/data/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""
Filtered data is the full dataset after the following:
grep -E "rel:etymology|rel:is_derived_from" etymwn.tsv > etymwn-filtered.tsv
"""

import csv
import hashlib
import os
import io
import json
import gc

import requests
import six
from clint.textui import progress


def prepare(source_dir):
    """
    Ensure the download directory exists.

    exist_ok avoids the check-then-create race of an explicit
    os.path.exists() guard; the script already requires Python 3
    (see main), so exist_ok is always available.
    """
    os.makedirs(source_dir, exist_ok=True)


def download_dataset(url, dl_path):
    """
    Download etymwn from the jmsv.me mirror, displaying a progress bar
    while streaming the response body to dl_path.

    :param url: source URL of the filtered etymwn tsv
    :param dl_path: local path to write the downloaded file to
    """
    chunk_size = 4096
    with requests.get(url, stream=True) as r:
        # Fail fast on HTTP errors instead of saving an error page as data
        r.raise_for_status()
        # content-length may be absent (e.g. chunked encoding); fall back
        # to 0 rather than crashing on int(None)
        total_length = int(r.headers.get('content-length') or 0)
        with open(dl_path, 'wb') as f:
            for chunk in progress.bar(
                    r.iter_content(chunk_size=chunk_size),
                    expected_size=(total_length / chunk_size) + 1):
                if chunk:
                    f.write(chunk)
    print('Downloaded to ' + dl_path)


def verify_local_data(url, dl_path):
    """
    Compare the local file's md5 with the expected checksum served
    alongside the dataset at '<url>.checksum'.

    :return: True if the local file exists and matches, otherwise False
    """
    md5 = hashlib.md5()
    try:
        with open(dl_path, 'rb') as f:
            # Hash in 64 KiB chunks so the (large) dataset is never
            # held fully in memory
            for block in iter(lambda: f.read(1 << 16), b''):
                md5.update(block)
    except EnvironmentError:
        # Missing or unreadable file counts as "not verified"
        return False

    expected = requests.get('%s.checksum' % url).text.strip()
    return md5.hexdigest() == expected


def split_elements(compound):
    """
    Split a 'lang:word' compound into a two-element [lang, word] list.

    The word part may itself contain colons, so only the first colon acts
    as the separator; every piece is whitespace-stripped before rejoining.
    """
    pieces = [piece.strip() for piece in compound.split(':')]
    if len(pieces) == 2:
        return pieces

    head, tail = pieces[0], pieces[1:]
    return [head, ':'.join(tail)]


def generate_json(source_path, dir):
    """
    Build the nested etymology mapping from the filtered etymwn tsv and
    write it to 'etymologies.json' in `dir`.

    Output structure:
        {source_lang: {source_word: [{dest_word: dest_lang}, ...]}}

    :param source_path: path to the filtered etymwn tsv
    :param dir: directory to write etymologies.json into
    """
    result = {}

    print('Loading source tsv')
    with io.open(source_path, 'r', newline='', encoding='utf-8') as source:
        reader = csv.reader(source, delimiter='\t')
        source_rows = list(reader)

    # Reclaim parser intermediates before building the large result dict
    gc.collect()

    print('Structuring data')
    for row in progress.bar(source_rows):
        # Column 0 is the source 'lang:word', column 2 the destination
        source_lang, source_word = split_elements(row[0])
        dest_lang, dest_word = split_elements(row[2])

        # Group roots by language, then by word
        result.setdefault(source_lang, {}) \
              .setdefault(source_word, []) \
              .append({dest_word: dest_lang})

    # Save data to separate files for languages, may be required in the future
    # print('Saving language files')
    # for key in progress.bar(result):
    #     with io.open(os.path.join(dir, 'data/ety-%s.json' % key), 'w') as f:
    #         f.write(json.dumps(result[key], sort_keys=False))

    # Save data
    print('Writing etymologies file')
    with io.open(os.path.join(dir, 'etymologies.json'), 'w') as f:
        json.dump(result, f)


def main():
    """
    Define paths, download source data if required, generate the dataset.

    Exits with status 1 when run under Python 2 or when the downloaded
    source data cannot be verified against its published checksum.
    """
    base_dir = os.path.dirname(os.path.realpath(__file__))
    source_dir = os.path.join(base_dir, 'source')
    source_path = os.path.join(source_dir, 'etymwn.tsv')
    source_url = 'https://data.jmsv.me/etymwn-filtered.tsv'

    # The generator itself requires Python 3 (only the library supports 2)
    if not six.PY3:
        print("Script should be run as Python 3, exiting")
        raise SystemExit(1)

    prepare(source_dir)

    # (Re)download data if required
    if verify_local_data(source_url, source_path):
        print('Verified local source data')
    else:
        print('Downloading source data')
        download_dataset(source_url, source_path)

        # If the checksum still doesn't match after downloading, give up
        if not verify_local_data(source_url, source_path):
            print('Error verifying local source data, exiting')
            raise SystemExit(1)
        print('Verified local source data')

    generate_json(source_path, base_dir)

    print('Done')


if __name__ == '__main__':
main()
File renamed without changes.
14 changes: 11 additions & 3 deletions ety/wn/readme.md → ety/data/readme.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
- etymwn-eng.tsv is the etymwn.tsv dataset filtered to lines starting with "eng:"
- etymwn-relety.tsv is the dataset filtered to lines containing "rel:etymology"
- iso-639-3.json contains country codes used by the etymwn dataset
# data

- `__init__.py` is imported by ety to read data
- `generate.py` downloads (if required) and generates data

---

## Original etymwn readme

```
Etymological Wordnet 2013-02-08
Gerard de Melo
http://icsi.berkeley.edu/~demelo/etymwn/
Expand Down Expand Up @@ -37,3 +43,5 @@ In scientific works, please cite:
Gerard de Melo, Gerhard Weikum. "Towards Universal Multilingual Knowledge Bases".
In: Principles, Construction, and Applications of Multilingual Wordnets. Proceedings
of the 5th Global Wordnet Conference (GWC 2010). Narosa Publishing 2010, New Delhi India.
```

2 changes: 0 additions & 2 deletions ety/wn/.gitignore

This file was deleted.

1 change: 0 additions & 1 deletion ety/wn/etymwn-relety.json

This file was deleted.

32 changes: 15 additions & 17 deletions ety/word.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,27 +24,25 @@ def __init__(self, word, language='eng', is_source=False):
self._id = u"{}:{}".format(self.word, self.language.iso)

def origins(self, recursive=False):
search = 'recursive' if recursive else 'direct'
if self.word not in etymwn_data[self.language.iso]:
# There are no roots for this word
return []

o = self._origins[search] # Origins for direct or recursive search
roots = [Word(word, lang) for root in
etymwn_data[self.language.iso][self.word] for word, lang in
root.items()]

if o:
return o

row = list(filter(
lambda entry: entry['a_word'] == self.word and entry[
'a_lang'] == self.language.iso, etymwn_data))

o = [Word(item['b_word'], item['b_lang']) for item in row]
tracked = roots[:]

if recursive:
for origin in o:
for child in origin.origins():
for root in tracked:
for child in root.origins():
# Check word isn't already in tree before appending
if child not in o and child != self:
o.append(child)
if child not in tracked and child != self:
tracked.append(child)

return o
self._origins = tracked
return self._origins

def tree(self):
return EtyTree(self)
Expand All @@ -70,6 +68,6 @@ def __str__(self):
return self.pretty

def __repr__(self):
return u'Word({word}, language={lang})'.format(
word=self.word, lang=self.language
return u'Word({word}, {lang} [{iso}])'.format(
word=self.word, lang=self.language, iso=self.language.iso
)
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@
],
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, <4',
keywords='etymology origins english language words',
packages=['ety'],
packages=['ety', 'ety/data'],
install_requires=[
"treelib", "colorful", "six",
],
extras_require={
'dev': ['flake8'],
},
package_data={
'ety': ['wn/etymwn-relety.json', 'wn/iso-639-3.json'],
'ety': ['data/etymologies.json', 'data/iso-639-3.json'],
},
entry_points={
'console_scripts': [
Expand Down
Loading

0 comments on commit ed2844c

Please sign in to comment.