From faf7467ce9000f2435a7ee8efab05a5c51beb184 Mon Sep 17 00:00:00 2001 From: Matt Jones Date: Thu, 7 Mar 2024 13:44:25 -0800 Subject: [PATCH 1/3] fixed bug in lca-reconstruct of ambiguous states --- cassiopeia/data/utilities.py | 5 +++++ test/data_tests/data_utilities_test.py | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/cassiopeia/data/utilities.py b/cassiopeia/data/utilities.py index bcf008f4..0a265e7b 100755 --- a/cassiopeia/data/utilities.py +++ b/cassiopeia/data/utilities.py @@ -72,6 +72,7 @@ def get_lca_characters( else: lca_vec[i] = all_states[0] else: + all_ambiguous = np.all([is_ambiguous_state(s) for s in all_states]) chars = set.intersection( *map( set, @@ -83,6 +84,10 @@ def get_lca_characters( ) if len(chars) == 1: lca_vec[i] = list(chars)[0] + if all_ambiguous: + # if we only have ambiguous states, we set the LCA state + # to be the intersection. + lca_vec[i] = tuple(chars) return lca_vec diff --git a/test/data_tests/data_utilities_test.py b/test/data_tests/data_utilities_test.py index 94902faa..67eb2fd7 100755 --- a/test/data_tests/data_utilities_test.py +++ b/test/data_tests/data_utilities_test.py @@ -316,6 +316,17 @@ def test_lca_characters_ambiguous(self): ) self.assertEqual(ret_vec, [1, 2, 3, 0, 5]) + def test_lca_characters_ambiguous2(self): + + s1 = [(4, 62), (3, 10), (3, 10, 16), (0, 3), (0, 2, 3), (0, 2, 3), (0, 4, 7), (0, 2, 23), (0, 1, 4, 44)] + s2 = [4, 3, -1, 0, 0, 0, (0, 7), (0, 2), (0, 4)] + + expected_reconstruction = [4, 3, (3, 10, 16), 0, 0, 0, (0, 7), (0, 2), (0, 4)] + ret_vec = data_utilities.get_lca_characters( + [s1, s2], missing_state_indicator=-1 + ) + self.assertEqual(ret_vec, expected_reconstruction) + def test_lca_characters_ambiguous_and_missing(self): vecs = [ [(1, 1), (0, 2), (3, 0), (4,), (5,)], From b5b84cfc33985afcb1adcc1038f9393d6d1e3a49 Mon Sep 17 00:00:00 2001 From: Matt Jones Date: Thu, 7 Mar 2024 13:45:47 -0800 Subject: [PATCH 2/3] reformatted --- cassiopeia/data/utilities.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cassiopeia/data/utilities.py b/cassiopeia/data/utilities.py index 0a265e7b..c2d4d970 100755 --- a/cassiopeia/data/utilities.py +++ b/cassiopeia/data/utilities.py @@ -1,6 +1,7 @@ """ General utilities for the datasets encountered in Cassiopeia. """ + import collections from joblib import delayed import multiprocessing @@ -61,7 +62,7 @@ def get_lca_characters( all_states = [ vec[i] for vec in vecs if vec[i] != missing_state_indicator ] - + # this check is specifically if all_states consists of a single # ambiguous state. if len(list(set(all_states))) == 1: @@ -72,7 +73,9 @@ def get_lca_characters( else: lca_vec[i] = all_states[0] else: - all_ambiguous = np.all([is_ambiguous_state(s) for s in all_states]) + all_ambiguous = np.all( + [is_ambiguous_state(s) for s in all_states] + ) chars = set.intersection( *map( set, @@ -233,9 +236,7 @@ def compute_dissimilarity_map( ] # load character matrix into shared memory - shm = shared_memory.SharedMemory( - create=True, size=cm.nbytes - ) + shm = shared_memory.SharedMemory(create=True, size=cm.nbytes) shared_cm = np.ndarray(cm.shape, dtype=cm.dtype, buffer=shm.buf) shared_cm[:] = cm[:] From dced9533d0a796bd9d4c65a8c44ecc726ec6dd65 Mon Sep 17 00:00:00 2001 From: Matt Jones Date: Thu, 7 Mar 2024 13:45:56 -0800 Subject: [PATCH 3/3] reformatted --- test/data_tests/data_utilities_test.py | 48 +++++++++++++++++++------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/test/data_tests/data_utilities_test.py b/test/data_tests/data_utilities_test.py index 67eb2fd7..2d40d867 100755 --- a/test/data_tests/data_utilities_test.py +++ b/test/data_tests/data_utilities_test.py @@ -85,7 +85,7 @@ def test_bootstrap_character_matrices_no_priors(self): self.assertEqual(len(bootstrap_samples), 10) - for (bootstrap_matrix, bootstrap_priors) in bootstrap_samples: + for bootstrap_matrix, bootstrap_priors in bootstrap_samples: self.assertCountEqual( self.character_matrix.index, bootstrap_matrix.index ) @@ -113,7 +113,7 @@ def test_bootstrap_character_matrices_with_priors(self): self.assertEqual(len(bootstrap_samples), 10) - for (bootstrap_matrix, bootstrap_priors) in bootstrap_samples: + for bootstrap_matrix, bootstrap_priors in bootstrap_samples: self.assertCountEqual( self.character_matrix.index, bootstrap_matrix.index ) @@ -317,11 +317,31 @@ def test_lca_characters_ambiguous(self): self.assertEqual(ret_vec, [1, 2, 3, 0, 5]) def test_lca_characters_ambiguous2(self): - - s1 = [(4, 62), (3, 10), (3, 10, 16), (0, 3), (0, 2, 3), (0, 2, 3), (0, 4, 7), (0, 2, 23), (0, 1, 4, 44)] + + s1 = [ + (4, 62), + (3, 10), + (3, 10, 16), + (0, 3), + (0, 2, 3), + (0, 2, 3), + (0, 4, 7), + (0, 2, 23), + (0, 1, 4, 44), + ] s2 = [4, 3, -1, 0, 0, 0, (0, 7), (0, 2), (0, 4)] - expected_reconstruction = [4, 3, (3, 10, 16), 0, 0, 0, (0, 7), (0, 2), (0, 4)] + expected_reconstruction = [ + 4, + 3, + (3, 10, 16), + 0, + 0, + 0, + (0, 7), + (0, 2), + (0, 4), + ] ret_vec = data_utilities.get_lca_characters( [s1, s2], missing_state_indicator=-1 ) @@ -336,7 +356,7 @@ def test_lca_characters_ambiguous_and_missing(self): ret_vec = data_utilities.get_lca_characters( vecs, missing_state_indicator=-1 ) - self.assertEqual(ret_vec, [1, (0,2), (3,0), 0, 5]) + self.assertEqual(ret_vec, [1, (0, 2), (3, 0), 0, 5]) def test_resolve_most_abundant(self): state = (1, 2, 3, 3) @@ -463,8 +483,10 @@ def test_inter_cluster_distance_basic(self): tree = CassiopeiaTree(tree=tree, cell_meta=meta_data) - inter_cluster_distances = data_utilities.compute_inter_cluster_distances( - tree, meta_item="CellType" + inter_cluster_distances = ( + data_utilities.compute_inter_cluster_distances( + tree, meta_item="CellType" + ) ) expected_distances = pd.DataFrame.from_dict( @@ -518,10 +540,12 @@ def test_inter_cluster_distance_custom_input(self): tree = CassiopeiaTree(tree=tree) - inter_cluster_distances = data_utilities.compute_inter_cluster_distances( - tree, - meta_data=meta_data["CellType"], - dissimilarity_map=weight_matrix, + inter_cluster_distances = ( + data_utilities.compute_inter_cluster_distances( + tree, + meta_data=meta_data["CellType"], + dissimilarity_map=weight_matrix, + ) ) expected_distances = pd.DataFrame.from_dict(