Merge pull request #138 from lanl/develop
Develop
MaksimEkin authored Apr 22, 2024
2 parents 2ed58e6 + 8610807 commit 3207768
Showing 81 changed files with 688 additions and 495 deletions.
2 changes: 1 addition & 1 deletion CITATION.cff
@@ -20,7 +20,7 @@ authors:
- family-names: Alexandrov
given-names: Boian
title: "Tensor Extraction of Latent Features (T-ELF)"
version: 0.0.15
version: 0.0.16
url: https://github.com/lanl/T-ELF
doi: 10.5281/zenodo.10257897
date-released: 2023-12-04
2 changes: 1 addition & 1 deletion README.md
@@ -156,7 +156,7 @@ If you use T-ELF please cite.

**APA:**
```latex
Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.15) [Computer software]. https://doi.org/10.5281/zenodo.10257897
Eren, M., Solovyev, N., Barron, R., Bhattarai, M., Truong, D., Boureima, I., Skau, E., Rasmussen, K., & Alexandrov, B. (2023). Tensor Extraction of Latent Features (T-ELF) (Version 0.0.16) [Computer software]. https://doi.org/10.5281/zenodo.10257897
```

**BibTeX:**
44 changes: 41 additions & 3 deletions TELF/factorization/HNMFk.py
@@ -396,9 +396,9 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
)

#
# check if leaf node status
# check if leaf node status based on number of samples
#
if (current_node.num_samples == 1) or (self.sample_thresh > 0 and (current_node.num_samples <= self.sample_thresh)):
if (current_node.num_samples == 1):
current_node.leaf = True
pickle_path = f'{node_save_path}/node_{current_node.node_name}.p'
pickle.dump(current_node, open(pickle_path, "wb"))
@@ -420,6 +420,14 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
curr_X, save_at_node = self.generate_X_callback(current_node.original_indices)
current_node.user_node_data = save_at_node.copy()

#
# Based on number of features or samples, no separation possible
#
if min(curr_X.shape) <= 1:
current_node.leaf = True
pickle_path = f'{node_save_path}/node_{current_node.node_name}.p'
pickle.dump(current_node, open(pickle_path, "wb"))
return {"name":node_name, "target_jobs":[], "node_save_path":pickle_path}

#
# prepare the current nmfk parameters
@@ -431,6 +439,16 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
curr_nmfk_params = self.nmfk_params[select_params % len(self.nmfk_params)]
curr_nmfk_params["save_path"] = node_save_path

#
# check for K range
#
Ks = self._adjust_curr_Ks(curr_X.shape, Ks)
if len(Ks) == 0 or (len(Ks) == 1 and Ks[0] < 2):
current_node.leaf = True
pickle_path = f'{node_save_path}/node_{current_node.node_name}.p'
pickle.dump(current_node, open(pickle_path, "wb"))
return {"name":node_name, "target_jobs":[], "node_save_path":pickle_path}

#
# apply nmfk
#
@@ -461,6 +479,13 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
current_node.W = factors_data["W"]
current_node.H = factors_data["H"]
current_node.k = predict_k

# sample threshold check for leaf node determination
if self.sample_thresh > 0 and (current_node.num_samples <= self.sample_thresh):
current_node.leaf = True
pickle_path = f'{node_save_path}/node_{current_node.node_name}.p'
pickle.dump(current_node, open(pickle_path, "wb"))
return {"name":node_name, "target_jobs":[], "node_save_path":pickle_path}

#
# apply clustering
@@ -476,7 +501,7 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name
# obtain the unique number of clusters that samples falls to
n_clusters = len(set(cluster_labels))

# leaf node or single cluster or all samples in same cluster
# leaf node based on depth limit or single cluster or all samples in same cluster
if ((current_node.depth >= self.depth) and self.depth > 0) or current_node.k == 1 or n_clusters == 1:
current_node.leaf = True
pickle_path = f'{node_save_path}/node_{current_node.node_name}.p'
@@ -517,6 +542,16 @@ def _process_node(self, Ks, depth, original_indices, node_name, parent_node_name

return {"name":node_name, "target_jobs":target_jobs, "node_save_path":pickle_path}

def _adjust_curr_Ks(self, X_shape, Ks):
if max(Ks) >= min(X_shape):
try:
Ks = range(1, min(X_shape), self.Ks_deep_step)
except Exception as e:
print(e)
return []

return Ks

def _get_curr_Ks(self, node_k, num_samples):
if not self.K2:
if self.Ks_deep_max is None:
@@ -741,5 +776,8 @@ def _set_params(self, class_parameters):
def _save_checkpoint(self):
class_params = vars(self).copy()
del class_params["X"]
if self.generate_X_callback is not None:
del class_params["generate_X_callback"]

pickle.dump(class_params, open(os.path.join(
self.experiment_save_path, "checkpoint.p"), "wb"))
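
The new `_adjust_curr_Ks` guard underpins most of the added leaf-node checks: it shrinks the candidate rank range whenever a node's sub-matrix is too small, and an empty or sub-2 range marks the node as a leaf. Below is a standalone sketch of that behavior; `Ks_deep_step=1` is an assumed default, the real step comes from the HNMFk instance.

```python
# Standalone sketch of the K-range guard added to HNMFk in this commit.
# Ks_deep_step=1 is an assumed value; HNMFk reads it from self.Ks_deep_step.
def adjust_curr_Ks(X_shape, Ks, Ks_deep_step=1):
    # If any requested rank reaches the smaller matrix dimension, rebuild the
    # range with only the ranks this sub-matrix can support.
    if max(Ks) >= min(X_shape):
        try:
            Ks = range(1, min(X_shape), Ks_deep_step)
        except Exception as e:
            print(e)
            return []
    return Ks

# A 10x3 sub-matrix cannot support ranks >= 3, so the search range collapses to [1, 2].
print(list(adjust_curr_Ks((10, 3), range(1, 11))))  # [1, 2]
# A single-column node yields an empty range, which _process_node now treats as a leaf.
print(list(adjust_curr_Ks((10, 1), range(1, 11))))  # []
```

Together with the `min(curr_X.shape) <= 1` and `sample_thresh` checks added above, this keeps the hierarchy from recursing into sub-matrices that cannot be factorized.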
25 changes: 23 additions & 2 deletions TELF/pre_processing/Vulture/modules/acronym.py
@@ -8,6 +8,27 @@
FIRST_LETTER = 0
LAST_PART_INDEX = -1

def transform_acronyms_to_substitutions(old_list):
"""
Convert a list of acronym dictionaries into substitution maps. For each dictionary mapping an expanded phrase to its acronym, both the phrase and the acronym are mapped to the phrase joined by underscores, so the consolidation and substitution operators can replace either form with a single token.
Parameters
----------
old_list : list of dict
A list of dictionaries mapping expanded phrases to their acronyms.
Returns
-------
list of dict
A list of dictionaries mapping each phrase and its acronym to the underscore-joined phrase.
"""
new_list = []
for dictionary in old_list:
if dictionary:
index_dictionary = {}
for key, value in dictionary.items():
new_key = '_'.join(key.split())

index_dictionary[key] = new_key
index_dictionary[value] = new_key

new_list.append(index_dictionary)
else:
new_list.append({})

return new_list


def flatten_acronym_dict(acronym_dict):
"""
Transform the acronym operator data into the format that will work for consolidation and substitution operators.
@@ -23,7 +44,7 @@ def flatten_acronym_dict(acronym_dict):
a list of dict that contain the acronyms.
"""
acronym_dict_list = []
for id, data in acronym_dict:
for id, data in acronym_dict.items():
acronym_dict_list.append(data['Acronyms'])

return acronym_dict_list
@@ -161,4 +182,4 @@ def _detect_acronym_helper(self, df):
warnings.warn(warning_sring)
acronyms[words_composing_acronym] = acronym

return acronyms
return acronyms
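
Reading the new `transform_acronyms_to_substitutions` from the diff, it maps both an expanded phrase and its acronym to the same underscore-joined token, so downstream substitution operators can canonicalize either form. A hypothetical input/output sketch follows; the phrases are invented and the import path is assumed from the file location shown above.

```python
# Hypothetical example of the new helper; the phrases and acronyms are invented,
# and the import path is assumed from TELF/pre_processing/Vulture/modules/acronym.py.
from TELF.pre_processing.Vulture.modules.acronym import transform_acronyms_to_substitutions

detected = [
    {"machine learning": "ML", "latent dirichlet allocation": "LDA"},
    {},  # documents with no detected acronyms pass through as empty dicts
]

substitutions = transform_acronyms_to_substitutions(detected)
print(substitutions)
# [{'machine learning': 'machine_learning', 'ML': 'machine_learning',
#   'latent dirichlet allocation': 'latent_dirichlet_allocation',
#   'LDA': 'latent_dirichlet_allocation'},
#  {}]
```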
2 changes: 1 addition & 1 deletion TELF/pre_processing/Vulture/modules/simple_clean.py
@@ -244,7 +244,7 @@ def _remove_stop_words(self, text ):
if self.exclude_hyphenated_stopwords:
cleaned_words = [t for t in tokens if
t in self.frozen or # entire term in frozen
not any(t.lower() in self.effective_stop_words)]
not t.lower() in self.effective_stop_words]
return ' '.join(cleaned_words)
else:
cleaned_words = [t for t in tokens if
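
The one-line change in `_remove_stop_words` removes a stray `any()` wrapped around a boolean membership test (calling `any()` on a bool raises `TypeError`). A minimal sketch of the corrected filter, with `frozen` and `effective_stop_words` as invented stand-ins for the cleaner's attributes:

```python
# Minimal sketch of the corrected stop-word filter; `frozen` and
# `effective_stop_words` are stand-ins for the Vulture cleaner attributes.
frozen = {"machine-learning"}
effective_stop_words = {"the", "of"}
tokens = "the cost of machine-learning".split()

cleaned_words = [t for t in tokens if
                 t in frozen or                          # entire term in frozen
                 not t.lower() in effective_stop_words]  # drop plain stop words
print(" ".join(cleaned_words))  # "cost machine-learning"
```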
153 changes: 120 additions & 33 deletions TELF/pre_processing/Vulture/tokens_analysis/levenstein.py
@@ -1,4 +1,9 @@
import pandas as pd
from tqdm import tqdm
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import combinations
import os

def levenshtein_distance(s1, s2):
"""
@@ -15,7 +20,6 @@ def levenshtein_distance(s1, s2):
-------
int
The Levenshtein distance between s1 and s2.
"""
if len(s1) < len(s2):
return levenshtein_distance(s2, s1)
@@ -32,74 +36,157 @@ def levenshtein_distance(s1, s2):
previous_row = current_row
return previous_row[-1]

def is_levenshtein_similar(s1, s2, threshold=0.95):
def compare_keys(key1, key2, threshold=0.95, use_indel=False):
"""
Check if two strings are Levenshtein similar based on a given threshold.
Check if two strings are Levenshtein similar based on a given threshold. This function can optionally consider
insertion and deletion costs in the similarity calculation, which is controlled by the 'use_indel' parameter.
Parameters
----------
s1 : str
The first string.
s2 : str
The second string.
key1 : str
The first string to compare.
key2 : str
The second string to compare.
threshold : float, optional
The minimum similarity threshold (default is 0.95).
The minimum similarity threshold for considering the strings as similar (default is 0.95).
use_indel : bool, optional
Whether to include insertion and deletion costs in the similarity calculation (default is False).
Returns
-------
tuple
A tuple containing a boolean indicating if the strings are similar and the similarity score.
"""
max_len = max(len(s1), len(s2))
dist = levenshtein_distance(s1, s2)
similarity = (max_len - dist) / max_len
return similarity >= threshold, similarity
if use_indel:
raise ValueError("use_indel is not implemented yet -- pending dependency approval")
else:
max_len = max(len(key1), len(key2))
dist = levenshtein_distance(key1, key2)
similarity = (max_len - dist) / max_len
return similarity > threshold, similarity

def process_chunk(pairs, key_frequency, threshold=0.95, use_indel=False):
"""
Process a chunk of key pairs to determine if they are similar and decide the preferred key based on frequency.
Parameters
----------
pairs : list of tuple
A list of tuples each containing two keys to be compared.
key_frequency : dict
A dictionary with keys and their corresponding frequency count.
threshold : float, optional
The minimum similarity threshold for considering keys as similar (default is 0.95).
use_indel : bool, optional
Whether to include insertion and deletion costs in the similarity calculation (default is False).
Returns
-------
list
A list of tuples, each containing the less preferred key, the preferred key, and the similarity score.
"""
results = []
for key1, key2 in pairs:
similar_bool, similar_score = compare_keys(key1, key2, threshold, use_indel)
if similar_bool:
preferred_key = key1 if key_frequency[key1] > key_frequency[key2] else key2
less_preferred_key = key2 if preferred_key == key1 else key1
results.append((less_preferred_key, preferred_key, similar_score))
return results

def replace_similar_keys_levenshtein(dict_list, changes_made_save_path=None, similarity_threshold=0.95):
def replace_similar_keys_levenshtein(dict_list,
group_by_first_letter=True,
changes_made_save_path=None,
similarity_threshold=0.95,
use_indel=False,
n_jobs=-1):
"""
Replace similar keys in a list of dictionaries based on Levenshtein similarity.
Replace similar keys in a list of dictionaries based on similarity,
preferring the key that occurs more often. Optionally uses an alternative similarity calculation method.
This function can group keys by their first letter before comparing them to reduce computational load, which is
controlled by the 'group_by_first_letter' parameter. It supports parallel processing through the 'n_jobs' parameter.
Parameters
----------
dict_list : list
A list of dictionaries.
group_by_first_letter : bool, optional
Whether to group keys by the first letter before comparison (default is True).
changes_made_save_path : str, optional
The path to save the changes made (default is None).
similarity_threshold : float, optional
The minimum similarity threshold for considering keys as similar (default is 0.95).
use_indel : bool, optional
Whether to use an alternative method for similarity comparison, such as including insertions and deletions in the cost (default is False).
n_jobs : int, optional
The number of jobs to run in parallel (default is -1, which uses all processors).
Returns
-------
tuple
A tuple containing the updated list of dictionaries and a DataFrame of changes made.
"""
all_keys = set(key for d in dict_list for key in d.keys())

all_keys = [key for d in dict_list for key in d.keys()]
key_frequency = Counter(all_keys)
similar_keys = {}
changes = []

sorted_keys = sorted(all_keys)
for key1 in sorted_keys:
for key2 in sorted_keys:
if key1 != key2:
similar_bool, similar_score = is_levenshtein_similar(key1, key2, similarity_threshold)
if similar_bool:
smaller, larger = sorted([key1, key2], key=len)
similar_keys[larger] = (smaller, similar_score)

for dict_index, dict_ in enumerate(dict_list):
keys_to_replace = {k: v for k, v in similar_keys.items() if k in dict_}
for longer_key, (shorter_key, score) in keys_to_replace.items():
if longer_key in dict_:
dict_[shorter_key] = dict_.pop(longer_key)
sorted_keys = sorted(set(all_keys))

# Group keys by the first character
if group_by_first_letter:
grouped_keys = {}
for key in sorted_keys:
first_char = key[0]
if first_char not in grouped_keys:
grouped_keys[first_char] = []
grouped_keys[first_char].append(key)

# Generate all pairs where the first character matches
all_pairs = [pair for key_list in grouped_keys.values() for pair in combinations(key_list, 2)]
else:
all_pairs = list(combinations(sorted_keys, 2))

num_cpus = os.cpu_count()
if n_jobs == -1:
num_cpus = os.cpu_count() # Get the number of CPUs available
else:
# Make sure the thread count passed in is not greater than the number available
num_cpus = min(n_jobs, num_cpus)

chunk_size = int(len(all_pairs) / num_cpus) + 1
print(f"chunk_size = {chunk_size}, num_cpus = {num_cpus}, len all_pairs = {len(all_pairs)}")
chunks = [all_pairs[i:i + chunk_size] for i in range(0, len(all_pairs), chunk_size)]
progress = tqdm(total=len(chunks), desc="Processing Chunks")

with ThreadPoolExecutor(max_workers=min(num_cpus,len(chunks))) as executor:
results = list(executor.map(process_chunk, chunks, [key_frequency]*len(chunks), [similarity_threshold]*len(chunks), [use_indel]*len(chunks)))
for chunk_result in results:
for less_preferred_key, preferred_key, similar_score in chunk_result:
similar_keys[less_preferred_key] = (preferred_key, similar_score)
progress.update(1)

progress.close()

for dict_ in dict_list:
for less_preferred_key, (preferred_key, score) in similar_keys.items():
if less_preferred_key in dict_:
if isinstance(dict_[less_preferred_key], int):
dict_[preferred_key] = dict_.get(preferred_key, 0) + dict_.pop(less_preferred_key)
elif isinstance(dict_[less_preferred_key], str):
dict_[preferred_key] = dict_.get(preferred_key, '') + dict_.pop(less_preferred_key)

changes.append({
'Index': dict_index,
'Previous Word': longer_key,
'New Word': shorter_key,
'Previous Key': less_preferred_key,
'New Key': preferred_key,
'Similarity Score': score
})

changes_df = pd.DataFrame(changes)

if changes_made_save_path:
changes_df.to_csv(changes_made_save_path, index=False)

return dict_list, changes_df
return dict_list, changes_df
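
A hypothetical end-to-end call of the reworked `replace_similar_keys_levenshtein`: the token counts below are invented and the import path is assumed from the file location shown above. Per the diff, integer values of merged keys are summed into the more frequent key, and each merge is recorded in the returned DataFrame.

```python
# Hypothetical usage of the parallel key-merging routine; counts are invented,
# import path assumed from TELF/pre_processing/Vulture/tokens_analysis/levenstein.py.
from TELF.pre_processing.Vulture.tokens_analysis.levenstein import replace_similar_keys_levenshtein

vocab_counts = [
    {"tensor": 12, "factorization": 7},
    {"tensor": 3, "factorisation": 2, "factorization": 5},
]

updated, changes_df = replace_similar_keys_levenshtein(
    vocab_counts,
    group_by_first_letter=True,   # only compare keys that share a first letter
    similarity_threshold=0.85,    # looser than the 0.95 default so the toy pair merges
    n_jobs=2,
)
# "factorisation" (less frequent) folds into "factorization" and its count is summed;
# changes_df lists each (Previous Key, New Key, Similarity Score) merge.
print(updated)   # [{'tensor': 12, 'factorization': 7}, {'tensor': 3, 'factorization': 7}]
print(changes_df)
```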