Merge pull request #138 from lanl/develop
Develop
MaksimEkin authored Apr 22, 2024
2 parents 2ed58e6 + 8610807 commit 3207768
Showing 81 changed files with 688 additions and 495 deletions.
2 changes: 1 addition & 1 deletion CITATION.cff
@@ -20,7 +20,7 @@ authors:
- family-names: Alexandrov
given-names: Boian
title: "Tensor Extraction of Latent Features (T-ELF)"
version: 0.0.15
version: 0.0.16
url: https://github.com/lanl/T-ELF
doi: 10.5281/zenodo.10257897
date-released: 2023-12-04
2 changes: 1 addition & 1 deletion README.md
@@ -156,7 +156,7 @@ If you use T-ELF please cite.

**APA:**
```latex
Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.15) [Computer software]. https://doi.org/10.5281/zenodo.10257897
Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.16) [Computer software]. https://doi.org/10.5281/zenodo.10257897
```

**BibTeX:**
44 changes: 41 additions & 3 deletions TELF/factorization/HNMFk.py
@@ -396,9 +396,9 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
)

#
# check if leaf node status
# check if leaf node status based on number of samples
#
if (current_node.num_samples == 1) or (self.sample_thresh > 0 and (current_node.num_samples <= self.sample_thresh)):
if (current_node.num_samples == 1):
current_node.leaf = True
pickle_path = f'{node_save_path}/node_{current_node.node_name}.p'
pickle.dump(current_node, open(pickle_path, "wb"))
@@ -420,6 +420,14 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
curr_X, save_at_node = self.generate_X_callback(current_node.original_indices)
current_node.user_node_data = save_at_node.copy()

#
# Based on number of features or samples, no separation possible
#
if min(curr_X.shape) <= 1:
current_node.leaf = True
pickle_path = f'{node_save_path}/node_{current_node.node_name}.p'
pickle.dump(current_node, open(pickle_path, "wb"))
return {"name":node_name, "target_jobs":[], "node_save_path":pickle_path}

#
# prepare the current nmfk parameters
@@ -431,6 +439,16 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
curr_nmfk_params = self.nmfk_params[select_params % len(self.nmfk_params)]
curr_nmfk_params["save_path"] = node_save_path

#
# check for K range
#
Ks = self._adjust_curr_Ks(curr_X.shape, Ks)
if len(Ks) == 0 or (len(Ks) == 1 and Ks[0] < 2):
current_node.leaf = True
pickle_path = f'{node_save_path}/node_{current_node.node_name}.p'
pickle.dump(current_node, open(pickle_path, "wb"))
return {"name":node_name, "target_jobs":[], "node_save_path":pickle_path}

#
# apply nmfk
#
@@ -461,6 +479,13 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
current_node.W = factors_data["W"]
current_node.H = factors_data["H"]
current_node.k = predict_k

# sample threshold check for leaf node determination
if self.sample_thresh > 0 and (current_node.num_samples <= self.sample_thresh):
current_node.leaf = True
pickle_path = f'{node_save_path}/node_{current_node.node_name}.p'
pickle.dump(current_node, open(pickle_path, "wb"))
return {"name":node_name, "target_jobs":[], "node_save_path":pickle_path}

#
# apply clustering
@@ -476,7 +501,7 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
# obtain the unique number of clusters that samples falls to
n_clusters = len(set(cluster_labels))

# leaf node or single cluster or all samples in same cluster
# leaf node based on depth limit or single cluster or all samples in same cluster
if ((current_node.depth >= self.depth) and self.depth > 0) or current_node.k == 1 or n_clusters == 1:
current_node.leaf = True
pickle_path = f'{node_save_path}/node_{current_node.node_name}.p'
@@ -517,6 +542,16 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name

return {"name":node_name, "target_jobs":target_jobs, "node_save_path":pickle_path}

def _adjust_curr_Ks(self, X_shape, Ks):
if max(Ks) >= min(X_shape):
try:
Ks = range(1, min(X_shape), self.Ks_deep_step)
except Exception as e:
print(e)
return []

return Ks

def _get_curr_Ks(self, node_k, num_samples):
if not self.K2:
if self.Ks_deep_max is None:
@@ -741,5 +776,8 @@ def _set_params(self, class_parameters):
def _save_checkpoint(self):
class_params = vars(self).copy()
del class_params["X"]
if self.generate_X_callback is not None:
del class_params["generate_X_callback"]

pickle.dump(class_params, open(os.path.join(
self.experiment_save_path, "checkpoint.p"), "wb"))
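
The new `_adjust_curr_Ks` guard underpins most of the added leaf-node checks: it shrinks the candidate rank range whenever a node's sub-matrix is too small, and an empty or sub-2 range marks the node as a leaf. Below is a standalone sketch of that behavior; `Ks_deep_step=1` is an assumed default, the real step comes from the HNMFk instance.

```python
# Standalone sketch of the K-range guard added to HNMFk in this commit.
# Ks_deep_step=1 is an assumed value; HNMFk reads it from self.Ks_deep_step.
def adjust_curr_Ks(X_shape, Ks, Ks_deep_step=1):
    # If any requested rank reaches the smaller matrix dimension, rebuild the
    # range with only the ranks this sub-matrix can support.
    if max(Ks) >= min(X_shape):
        try:
            Ks = range(1, min(X_shape), Ks_deep_step)
        except Exception as e:
            print(e)
            return []
    return Ks

# A 10x3 sub-matrix cannot support ranks >= 3, so the search range collapses to [1, 2].
print(list(adjust_curr_Ks((10, 3), range(1, 11))))  # [1, 2]
# A single-column node yields an empty range, which _process_node now treats as a leaf.
print(list(adjust_curr_Ks((10, 1), range(1, 11))))  # []
```

Together with the `min(curr_X.shape) <= 1` and `sample_thresh` checks added above, this keeps the hierarchy from recursing into sub-matrices that cannot be factorized.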
25 changes: 23 additions & 2 deletions TELF/pre_processing/Vulture/modules/acronym.py
@@ -8,6 +8,27 @@
FIRST_LETTER = 0
LAST_PART_INDEX = -1

def transform_acronyms_to_substitutions(old_list):
"""
Convert a list of acronym dictionaries into substitution maps. For each dictionary mapping an expanded phrase to its acronym, both the phrase and the acronym are mapped to the phrase joined by underscores, so the consolidation and substitution operators can replace either form with a single token.
Parameters
----------
old_list : list of dict
A list of dictionaries mapping expanded phrases to their acronyms.
Returns
-------
list of dict
A list of dictionaries mapping each phrase and its acronym to the underscore-joined phrase.
"""
new_list = []
for dictionary in old_list:
if dictionary:
index_dictionary = {}
for key, value in dictionary.items():
new_key = '_'.join(key.split())

index_dictionary[key] = new_key
index_dictionary[value] = new_key

new_list.append(index_dictionary)
else:
new_list.append({})

return new_list


def flatten_acronym_dict(acronym_dict):
"""
Transform the acronym operator data into the format that will work for consolidation and substitution operators.
@@ -23,7 +44,7 @@ def flatten_acronym_dict(acronym_dict):
a list of dict that contain the acronyms.
"""
acronym_dict_list = []
for id, data in acronym_dict:
for id, data in acronym_dict.items():
acronym_dict_list.append(data['Acronyms'])

return acronym_dict_list
@@ -161,4 +182,4 @@ def _detect_acronym_helper(self, df):
warnings.warn(warning_sring)
acronyms[words_composing_acronym] = acronym

return acronyms
return acronyms
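
Reading the new `transform_acronyms_to_substitutions` from the diff, it maps both an expanded phrase and its acronym to the same underscore-joined token, so downstream substitution operators can canonicalize either form. A hypothetical input/output sketch follows; the phrases are invented and the import path is assumed from the file location shown above.

```python
# Hypothetical example of the new helper; the phrases and acronyms are invented,
# and the import path is assumed from TELF/pre_processing/Vulture/modules/acronym.py.
from TELF.pre_processing.Vulture.modules.acronym import transform_acronyms_to_substitutions

detected = [
    {"machine learning": "ML", "latent dirichlet allocation": "LDA"},
    {},  # documents with no detected acronyms pass through as empty dicts
]

substitutions = transform_acronyms_to_substitutions(detected)
print(substitutions)
# [{'machine learning': 'machine_learning', 'ML': 'machine_learning',
#   'latent dirichlet allocation': 'latent_dirichlet_allocation',
#   'LDA': 'latent_dirichlet_allocation'},
#  {}]
```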
2 changes: 1 addition & 1 deletion TELF/pre_processing/Vulture/modules/simple_clean.py
@@ -244,7 +244,7 @@ def _remove_stop_words(self, text ):
if self.exclude_hyphenated_stopwords:
cleaned_words = [t for t in tokens if
t in self.frozen or # entire term in frozen
not any(t.lower() in self.effective_stop_words)]
not t.lower() in self.effective_stop_words]
return ' '.join(cleaned_words)
else:
cleaned_words = [t for t in tokens if
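
The one-line change in `_remove_stop_words` removes a stray `any()` wrapped around a boolean membership test (calling `any()` on a bool raises `TypeError`). A minimal sketch of the corrected filter, with `frozen` and `effective_stop_words` as invented stand-ins for the cleaner's attributes:

```python
# Minimal sketch of the corrected stop-word filter; `frozen` and
# `effective_stop_words` are stand-ins for the Vulture cleaner attributes.
frozen = {"machine-learning"}
effective_stop_words = {"the", "of"}
tokens = "the cost of machine-learning".split()

cleaned_words = [t for t in tokens if
                 t in frozen or                          # entire term in frozen
                 not t.lower() in effective_stop_words]  # drop plain stop words
print(" ".join(cleaned_words))  # "cost machine-learning"
```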
153 changes: 120 additions & 33 deletions TELF/pre_processing/Vulture/tokens_analysis/levenstein.py
@@ -1,4 +1,9 @@
import pandas as pd
from tqdm import tqdm
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import combinations
import os

def levenshtein_distance(s1, s2):
"""
@@ -15,7 +20,6 @@ def levenshtein_distance(s1, s2):
-------
int
The Levenshtein distance between s1 and s2.
"""
if len(s1) < len(s2):
return levenshtein_distance(s2, s1)
@@ -32,74 +36,157 @@ def levenshtein_distance(s1, s2):
previous_row = current_row
return previous_row[-1]

def is_levenshtein_similar(s1, s2, threshold=0.95):
def compare_keys(key1, key2, threshold=0.95, use_indel=False):
"""
Check if two strings are Levenshtein similar based on a given threshold.
Check if two strings are Levenshtein similar based on a given threshold. This function can optionally consider
insertion and deletion costs in the similarity calculation, which is controlled by the 'use_indel' parameter.
Parameters
----------
s1 : str
The first string.
s2 : str
The second string.
key1 : str
The first string to compare.
key2 : str
The second string to compare.
threshold : float, optional
The minimum similarity threshold (default is 0.95).
The minimum similarity threshold for considering the strings as similar (default is 0.95).
use_indel : bool, optional
Whether to include insertion and deletion costs in the similarity calculation (default is False).
Returns
-------
tuple
A tuple containing a boolean indicating if the strings are similar and the similarity score.
"""
max_len = max(len(s1), len(s2))
dist = levenshtein_distance(s1, s2)
similarity = (max_len - dist) / max_len
return similarity >= threshold, similarity
if use_indel:
raise ValueError("use_indel is not implemented yet -- pending dependency approval")
else:
max_len = max(len(key1), len(key2))
dist = levenshtein_distance(key1, key2)
similarity = (max_len - dist) / max_len
return similarity > threshold, similarity

def process_chunk(pairs, key_frequency, threshold=0.95, use_indel=False):
"""
Process a chunk of key pairs to determine if they are similar and decide the preferred key based on frequency.
Parameters
----------
pairs : list of tuple
A list of tuples each containing two keys to be compared.
key_frequency : dict
A dictionary with keys and their corresponding frequency count.
threshold : float, optional
The minimum similarity threshold for considering keys as similar (default is 0.95).
use_indel : bool, optional
Whether to include insertion and deletion costs in the similarity calculation (default is False).
Returns
-------
list
A list of tuples, each containing the less preferred key, the preferred key, and the similarity score.
"""
results = []
for key1, key2 in pairs:
similar_bool, similar_score = compare_keys(key1, key2, threshold, use_indel)
if similar_bool:
preferred_key = key1 if key_frequency[key1] > key_frequency[key2] else key2
less_preferred_key = key2 if preferred_key == key1 else key1
results.append((less_preferred_key, preferred_key, similar_score))
return results

def replace_similar_keys_levenshtein(dict_list, changes_made_save_path=None, similarity_threshold=0.95):
def replace_similar_keys_levenshtein(dict_list,
group_by_first_letter=True,
changes_made_save_path=None,
similarity_threshold=0.95,
use_indel=False,
n_jobs=-1):
"""
Replace similar keys in a list of dictionaries based on Levenshtein similarity.
Replace similar keys in a list of dictionaries based on similarity,
preferring the key that occurs more often. Optionally uses an alternative similarity calculation method.
This function can group keys by their first letter before comparing them to reduce computational load, which is
controlled by the 'group_by_first_letter' parameter. It supports parallel processing through the 'n_jobs' parameter.
Parameters
----------
dict_list : list
A list of dictionaries.
group_by_first_letter : bool, optional
Whether to group keys by the first letter before comparison (default is True).
changes_made_save_path : str, optional
The path to save the changes made (default is None).
similarity_threshold : float, optional
The minimum similarity threshold for considering keys as similar (default is 0.95).
use_indel : bool, optional
Whether to use an alternative method for similarity comparison, such as including insertions and deletions in the cost (default is False).
n_jobs : int, optional
The number of jobs to run in parallel (default is -1, which uses all processors).
Returns
-------
tuple
A tuple containing the updated list of dictionaries and a DataFrame of changes made.
"""
all_keys = set(key for d in dict_list for key in d.keys())

all_keys = [key for d in dict_list for key in d.keys()]
key_frequency = Counter(all_keys)
similar_keys = {}
changes = []

sorted_keys = sorted(all_keys)
for key1 in sorted_keys:
for key2 in sorted_keys:
if key1 != key2:
similar_bool, similar_score = is_levenshtein_similar(key1, key2, similarity_threshold)
if similar_bool:
smaller, larger = sorted([key1, key2], key=len)
similar_keys[larger] = (smaller, similar_score)

for dict_index, dict_ in enumerate(dict_list):
keys_to_replace = {k: v for k, v in similar_keys.items() if k in dict_}
for longer_key, (shorter_key, score) in keys_to_replace.items():
if longer_key in dict_:
dict_[shorter_key] = dict_.pop(longer_key)
sorted_keys = sorted(set(all_keys))

# Group keys by the first character
if group_by_first_letter:
grouped_keys = {}
for key in sorted_keys:
first_char = key[0]
if first_char not in grouped_keys:
grouped_keys[first_char] = []
grouped_keys[first_char].append(key)

# Generate all pairs where the first character matches
all_pairs = [pair for key_list in grouped_keys.values() for pair in combinations(key_list, 2)]
else:
all_pairs = list(combinations(sorted_keys, 2))

num_cpus = os.cpu_count()
if n_jobs == -1:
num_cpus = os.cpu_count() # Get the number of CPUs available
else:
# Make sure the thread count passed in is not greater than the number available
num_cpus = min(n_jobs, num_cpus)

chunk_size = int(len(all_pairs) / num_cpus) + 1
print(f"chunk_size = {chunk_size}, num_cpus = {num_cpus}, len all_pairs = {len(all_pairs)}")
chunks = [all_pairs[i:i + chunk_size] for i in range(0, len(all_pairs), chunk_size)]
progress = tqdm(total=len(chunks), desc="Processing Chunks")

with ThreadPoolExecutor(max_workers=min(num_cpus,len(chunks))) as executor:
results = list(executor.map(process_chunk, chunks, [key_frequency]*len(chunks), [similarity_threshold]*len(chunks), [use_indel]*len(chunks)))
for chunk_result in results:
for less_preferred_key, preferred_key, similar_score in chunk_result:
similar_keys[less_preferred_key] = (preferred_key, similar_score)
progress.update(1)

progress.close()

for dict_ in dict_list:
for less_preferred_key, (preferred_key, score) in similar_keys.items():
if less_preferred_key in dict_:
if isinstance(dict_[less_preferred_key], int):
dict_[preferred_key] = dict_.get(preferred_key, 0) + dict_.pop(less_preferred_key)
elif isinstance(dict_[less_preferred_key], str):
dict_[preferred_key] = dict_.get(preferred_key, '') + dict_.pop(less_preferred_key)

changes.append({
'Index': dict_index,
'Previous Word': longer_key,
'New Word': shorter_key,
'Previous Key': less_preferred_key,
'New Key': preferred_key,
'Similarity Score': score
})

changes_df = pd.DataFrame(changes)

if changes_made_save_path:
changes_df.to_csv(changes_made_save_path, index=False)

return dict_list, changes_df
return dict_list, changes_df
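
A hypothetical end-to-end call of the reworked `replace_similar_keys_levenshtein`: the token counts below are invented and the import path is assumed from the file location shown above. Per the diff, integer values of merged keys are summed into the more frequent key, and each merge is recorded in the returned DataFrame.

```python
# Hypothetical usage of the parallel key-merging routine; counts are invented,
# import path assumed from TELF/pre_processing/Vulture/tokens_analysis/levenstein.py.
from TELF.pre_processing.Vulture.tokens_analysis.levenstein import replace_similar_keys_levenshtein

vocab_counts = [
    {"tensor": 12, "factorization": 7},
    {"tensor": 3, "factorisation": 2, "factorization": 5},
]

updated, changes_df = replace_similar_keys_levenshtein(
    vocab_counts,
    group_by_first_letter=True,   # only compare keys that share a first letter
    similarity_threshold=0.85,    # looser than the 0.95 default so the toy pair merges
    n_jobs=2,
)
# "factorisation" (less frequent) folds into "factorization" and its count is summed;
# changes_df lists each (Previous Key, New Key, Similarity Score) merge.
print(updated)   # [{'tensor': 12, 'factorization': 7}, {'tensor': 3, 'factorization': 7}]
print(changes_df)
```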