Skip to content
This repository has been archived by the owner on Nov 20, 2022. It is now read-only.

Commit

Permalink
Added indexing constants, merged functionality into export script
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmezzetti committed May 26, 2020
1 parent a924c52 commit e3e8f99
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 14 deletions.
25 changes: 14 additions & 11 deletions src/python/cord19q/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@
import sqlite3
import sys

import regex as re

# pylint: disable=E0611
# Defined at runtime
from .index import Index
from .models import Models

class Export(object):
Expand All @@ -31,19 +34,19 @@ def stream(dbfile, output):
db = sqlite3.connect(dbfile)
cur = db.cursor()

# Database query
cur.execute("SELECT Text FROM sections WHERE tags is not null AND design NOT IN (0, 9) AND " +
"(labels is null or labels NOT IN ('FRAGMENT', 'QUESTION'))")
# Get all indexed text, with a detected study design, excluding modeling designs
cur.execute(Index.SECTION_QUERY + " AND design NOT IN (0, 9)")

count = 0
for section in cur:
count += 1
if count % 1000 == 0:
print("Streamed %d documents" % (count), end="\r")

# Write row
if section[0]:
output.write(section[0] + "\n")
for _, name, text in cur:
if not name or not re.search(Index.SECTION_FILTER, name.lower()):
count += 1
if count % 1000 == 0:
print("Streamed %d documents" % (count), end="\r")

# Write row
if text:
output.write(text + "\n")

print("Iterated over %d total rows" % (count))

Expand Down
9 changes: 6 additions & 3 deletions src/python/cord19q/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ class Index(object):
Methods to build a new sentence embeddings index.
"""

# Section query and filtering logic constants
SECTION_FILTER = r"background|(?<!.*?results.*?)discussion|introduction|reference"
SECTION_QUERY = "SELECT Id, Name, Text FROM sections WHERE tags is not null AND (labels is null or labels NOT IN ('FRAGMENT', 'QUESTION'))"

@staticmethod
def stream(dbfile):
"""
Expand All @@ -31,15 +35,14 @@ def stream(dbfile):
cur = db.cursor()

# Select tagged sentences without an NLP label. NLP labels are set for non-informative sentences.
cur.execute("SELECT Id, Name, Text FROM sections WHERE tags is not null AND " +
"(labels is null or labels NOT IN ('FRAGMENT', 'QUESTION'))")
cur.execute(Index.SECTION_QUERY)

count = 0
for row in cur:
# Unpack row
uid, name, text = row

if not name or not re.search(r"background|(?<!.*?results.*?)discussion|introduction|reference", name.lower()):
if not name or not re.search(Index.SECTION_FILTER, name.lower()):
# Tokenize text
tokens = Tokenizer.tokenize(text)

Expand Down

0 comments on commit e3e8f99

Please sign in to comment.