Skip to content

Commit

Permalink
Merge pull request #98 from lanl/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
MaksimEkin authored Mar 26, 2024
2 parents 18eebe8 + 25d7dce commit a7e1036
Show file tree
Hide file tree
Showing 78 changed files with 972 additions and 207 deletions.
2 changes: 1 addition & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ authors:
- family-names: Alexandrov
given-names: Boian
title: "Tensor Extraction of Latent Features (T-ELF)"
version: 0.0.10
version: 0.0.11
url: https://github.com/lanl/T-ELF
doi: 10.5281/zenodo.10257897
date-released: 2023-12-04
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ If you use T-ELF please cite.

**APA:**
```latex
Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.010) [Computer software]. https://doi.org/10.5281/zenodo.10257897
Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.11) [Computer software]. https://doi.org/10.5281/zenodo.10257897
```

**BibTeX:**
Expand Down
3 changes: 2 additions & 1 deletion TELF/pre_processing/Vulture/modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
from .lemmatize import LemmatizeCleaner
from .substitute import SubstitutionCleaner
from .detect_nonenglish import RemoveNonEnglishCleaner
from .ner import NEDetector
from .ner import NEDetector
from .acronym import AcronymDetector
113 changes: 113 additions & 0 deletions TELF/pre_processing/Vulture/modules/acronym.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import re
import spacy
import warnings

from TELF.pre_processing.Vulture.modules import VultureModuleBase
from TELF.pre_processing.Vulture.tokens_analysis.top_words import get_top_words

# Index of the first character of a token; used to pick each word's initial
# when assembling a candidate acronym from the words of an n-gram.
FIRST_LETTER = 0
# Index of the final token of an n-gram; also used as a slice bound to obtain
# the n-gram without that final token.
LAST_PART_INDEX = -1

class AcronymDetector(VultureModuleBase):
    """
    An operator that detects acronyms in text.

    An acronym is recognised when one edge token of an n-gram equals the
    concatenated first letters of all the other tokens of that n-gram, e.g.
    ``"example part ep"`` or ``"ep example part"``.

    Parameters
    ----------
    gram_range: iterable of int, optional
        Numbers of words an acronym's full form may consist of. For each value
        ``n`` the text is scanned with ``(n + 1)``-grams (the full form plus
        the acronym token). Defaults to ``[2, 3, 4, 5, 6, 7]``.
    replace_raw: bool, optional
        If True, the result also carries a copy of the text in which full
        forms are removed and acronyms are substituted by the underscore-joined
        full form.
    frozen: optional
        Forwarded unchanged to ``VultureModuleBase.__init__``.
    """

    def __init__(self, gram_range=None, replace_raw=False, frozen=None):
        super().__init__(frozen)
        self.module_type = "OPERATOR"
        # Build the default per instance: a list literal in the signature would
        # be shared (and mutable) across every AcronymDetector.
        self.gram_range = list(range(2, 8)) if gram_range is None else gram_range
        self.current_document_id = None
        self.replace_raw = replace_raw

    def __call__(self, document):
        return self.run(document)

    def run(self, document):
        """
        Run the acronym detection

        Parameters
        ----------
        document: tuple
            A document id, document text pair for which to perform acronym detection

        Returns
        -------
        tuple
            Tuple of document id and operation result
        """
        doc_id, doc_text = document
        # Remembered so that warnings raised by the helper can name the document.
        self.current_document_id = doc_id
        doc_operation_result = self._detect_acronym(doc_text)
        return (doc_id, doc_operation_result)

    def _detect_acronym(self, text):
        """
        Detect acronyms in a given string

        Parameters
        ----------
        text: str
            A string to detect acronyms over

        Returns
        -------
        dict
            ``{"Acronyms": {full form: acronym}, "replaced_text": str}``.
            ``"replaced_text"`` is the empty string unless ``replace_raw`` is set.
        """
        only_acronyms = {}
        for n in self.gram_range:
            # n words of full form plus one token for the acronym itself.
            cur_n_grams = get_top_words(
                [text],
                top_n=99999,
                n_gram=n + 1,
                verbose=False,
                filename=None,
            )
            only_acronyms.update(self._detect_acronym_helper(cur_n_grams))

        replaced_text = ''
        if self.replace_raw:
            replaced_text = text
            for full_form, acronym in only_acronyms.items():
                underscored = full_form.replace(" ", "_")
                # Match the acronym only as a whole token so that it is not
                # rewritten inside longer words (e.g. "ep" inside "step").
                acronym_token = r'(?<!\w)' + re.escape(acronym) + r'(?!\w)'

                # Drop the spelled-out form -- ex: "The Example Part EP" becomes "The  EP"
                replaced_text = replaced_text.replace(full_form, '')
                # Expand the acronym to the underscore-joined full form
                # -- ex: "The  EP" becomes "The  Example_Part".
                # A callable replacement keeps backslashes in the full form literal.
                replaced_text = re.sub(acronym_token, lambda _match: underscored, replaced_text)
                # Collapse the double space left behind by the removal above.
                replaced_text = replaced_text.replace('  ', ' ')

        return {"Acronyms": only_acronyms, "replaced_text": replaced_text}

    def _detect_acronym_helper(self, df):
        """
        Extract acronym definitions from a table of n-grams

        Parameters
        ----------
        df: mapping with a ``'word'`` entry
            Result of ``get_top_words``; only the ``'word'`` column (the n-gram
            strings) is read. Presumably a pandas DataFrame -- confirm against
            ``get_top_words``.

        Returns
        -------
        dict
            Mapping of space-joined full form to its acronym
        """
        acronyms = {}
        for gram in df['word']:
            gram_parts = gram.split()
            # A definition needs at least one word plus the acronym token;
            # this also protects the edge indexing below from empty grams.
            if len(gram_parts) < 2:
                continue

            # Candidate "full form ... ACRONYM"
            gram_without_end = gram_parts[:LAST_PART_INDEX]
            last_part = gram_parts[LAST_PART_INDEX]
            gram_without_end_acronym = "".join(
                [gram_part[FIRST_LETTER] for gram_part in gram_without_end]
            )

            # Candidate "ACRONYM full form ..."
            gram_without_beginning = gram_parts[1:]
            first_part = gram_parts[0]
            gram_without_beginning_acronym = "".join(
                [gram_part[FIRST_LETTER] for gram_part in gram_without_beginning]
            )

            first_is_acronym = gram_without_beginning_acronym == first_part
            last_is_acronym = gram_without_end_acronym == last_part

            if not (first_is_acronym or last_is_acronym):
                continue

            # A trailing acronym takes precedence when both edges match.
            if last_is_acronym:
                words_composing_acronym = " ".join(gram_without_end)
                acronym = last_part
            else:
                words_composing_acronym = " ".join(gram_without_beginning)
                acronym = first_part

            if words_composing_acronym in acronyms:
                # Report the detected acronym, which is not necessarily the last token.
                warnings.warn(f'The document at id="{self.current_document_id}" defines "{acronym}" as an acronym twice, using last occurance!')

            acronyms[words_composing_acronym] = acronym

        return acronyms
2 changes: 1 addition & 1 deletion TELF/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.0.10'
__version__ = '0.0.11'
Binary file added data/acronyms_documents.p
Binary file not shown.
6 changes: 3 additions & 3 deletions docs/Beaver.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.10 documentation</title>
<title>TELF.pre_processing.Beaver: Fast matrix and tensor building tool &#8212; TELF 0.0.11 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=94452c26"></script>
<script src="_static/documentation_options.js?v=2fb9ae3b"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.10 documentation</p>
<p class="title logo__title">TELF 0.0.11 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/Cheetah.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.applications.Cheetah: Advanced search by keywords and phrases &#8212; TELF 0.0.10 documentation</title>
<title>TELF.applications.Cheetah: Advanced search by keywords and phrases &#8212; TELF 0.0.11 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=94452c26"></script>
<script src="_static/documentation_options.js?v=2fb9ae3b"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.10 documentation</p>
<p class="title logo__title">TELF 0.0.11 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/NMFk.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.factorization.NMFk: Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.10 documentation</title>
<title>TELF.factorization.NMFk: Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.11 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=94452c26"></script>
<script src="_static/documentation_options.js?v=2fb9ae3b"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.10 documentation</p>
<p class="title logo__title">TELF 0.0.11 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/RESCALk.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.factorization.RESCALk: RESCAL with Automatic Model Determination &#8212; TELF 0.0.10 documentation</title>
<title>TELF.factorization.RESCALk: RESCAL with Automatic Model Determination &#8212; TELF 0.0.11 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=94452c26"></script>
<script src="_static/documentation_options.js?v=2fb9ae3b"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.10 documentation</p>
<p class="title logo__title">TELF 0.0.11 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/SymNMFk.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.factorization.SymNMFk: Symmetric Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.10 documentation</title>
<title>TELF.factorization.SymNMFk: Symmetric Non-negative Matrix Factorization with Automatic Model Determination &#8212; TELF 0.0.11 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=94452c26"></script>
<script src="_static/documentation_options.js?v=2fb9ae3b"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.10 documentation</p>
<p class="title logo__title">TELF 0.0.11 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/TELF.factorization.decompositions.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.factorization.decompositions package &#8212; TELF 0.0.10 documentation</title>
<title>TELF.factorization.decompositions package &#8212; TELF 0.0.11 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=94452c26"></script>
<script src="_static/documentation_options.js?v=2fb9ae3b"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -128,7 +128,7 @@



<p class="title logo__title">TELF 0.0.10 documentation</p>
<p class="title logo__title">TELF 0.0.11 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/TELF.factorization.decompositions.utilities.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.factorization.decompositions.utilities package &#8212; TELF 0.0.10 documentation</title>
<title>TELF.factorization.decompositions.utilities package &#8212; TELF 0.0.11 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=94452c26"></script>
<script src="_static/documentation_options.js?v=2fb9ae3b"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.10 documentation</p>
<p class="title logo__title">TELF 0.0.11 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/TELF.factorization.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.factorization package &#8212; TELF 0.0.10 documentation</title>
<title>TELF.factorization package &#8212; TELF 0.0.11 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=94452c26"></script>
<script src="_static/documentation_options.js?v=2fb9ae3b"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.10 documentation</p>
<p class="title logo__title">TELF 0.0.11 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
6 changes: 3 additions & 3 deletions docs/TELF.factorization.utilities.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />

<title>TELF.factorization.utilities package &#8212; TELF 0.0.10 documentation</title>
<title>TELF.factorization.utilities package &#8212; TELF 0.0.11 documentation</title>



Expand Down Expand Up @@ -37,7 +37,7 @@
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=365ca57ee442770a23c6" />
<script src="_static/vendor/fontawesome/6.1.2/js/all.min.js?digest=365ca57ee442770a23c6"></script>

<script src="_static/documentation_options.js?v=94452c26"></script>
<script src="_static/documentation_options.js?v=2fb9ae3b"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/scripts/sphinx-book-theme.js?digest=5a5c038af52cf7bc1a1ec88eea08e6366ee68824"></script>
Expand Down Expand Up @@ -127,7 +127,7 @@



<p class="title logo__title">TELF 0.0.10 documentation</p>
<p class="title logo__title">TELF 0.0.11 documentation</p>

</a></div>
<div class="sidebar-primary-item"><nav class="bd-links" id="bd-docs-nav" aria-label="Main">
Expand Down
Loading

0 comments on commit a7e1036

Please sign in to comment.