Added indexing constants, merged functionality into export script

neuml · May 26, 2020 · e3e8f99
1 parent a924c52
commit e3e8f99
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 14 deletions.
diff --git a/src/python/cord19q/export.py b/src/python/cord19q/export.py
@@ -7,8 +7,11 @@
 import sqlite3
 import sys
 
+import regex as re
+
 # pylint: disable=E0611
 # Defined at runtime
+from .index import Index
 from .models import Models
 
 class Export(object):
@@ -31,19 +34,19 @@ def stream(dbfile, output):
             db = sqlite3.connect(dbfile)
             cur = db.cursor()
 
-            # Database query
-            cur.execute("SELECT Text FROM sections WHERE tags is not null AND design NOT IN (0, 9) AND " +
-                        "(labels is null or labels NOT IN ('FRAGMENT', 'QUESTION'))")
+            # Get all indexed text, with a detected study design, excluding modeling designs
+            cur.execute(Index.SECTION_QUERY + " AND design NOT IN (0, 9)")
 
             count = 0
-            for section in cur:
-                count += 1
-                if count % 1000 == 0:
-                    print("Streamed %d documents" % (count), end="\r")
-
-                # Write row
-                if section[0]:
-                    output.write(section[0] + "\n")
+            for _, name, text in cur:
+                if not name or not re.search(Index.SECTION_FILTER, name.lower()):
+                    count += 1
+                    if count % 1000 == 0:
+                        print("Streamed %d documents" % (count), end="\r")
+
+                    # Write row
+                    if text:
+                        output.write(text + "\n")
 
             print("Iterated over %d total rows" % (count))
 

diff --git a/src/python/cord19q/index.py b/src/python/cord19q/index.py
@@ -17,6 +17,10 @@ class Index(object):
     Methods to build a new sentence embeddings index.
     """
 
+    # Section query and filtering logic constants
+    SECTION_FILTER = r"background|(?<!.*?results.*?)discussion|introduction|reference"
+    SECTION_QUERY = "SELECT Id, Name, Text FROM sections WHERE tags is not null AND (labels is null or labels NOT IN ('FRAGMENT', 'QUESTION'))"
+
     @staticmethod
     def stream(dbfile):
         """
@@ -31,15 +35,14 @@ def stream(dbfile):
         cur = db.cursor()
 
         # Select tagged sentences without a NLP label. NLP labels are set for non-informative sentences.
-        cur.execute("SELECT Id, Name, Text FROM sections WHERE tags is not null AND " +
-                    "(labels is null or labels NOT IN ('FRAGMENT', 'QUESTION'))")
+        cur.execute(Index.SECTION_QUERY)
 
         count = 0
         for row in cur:
             # Unpack row
             uid, name, text = row
 
-            if not name or not re.search(r"background|(?<!.*?results.*?)discussion|introduction|reference", name.lower()):
+            if not name or not re.search(Index.SECTION_FILTER, name.lower()):
                 # Tokenize text
                 tokens = Tokenizer.tokenize(text)