diff --git a/cassiopeia/data/utilities.py b/cassiopeia/data/utilities.py index bcf008f4..c2d4d970 100755 --- a/cassiopeia/data/utilities.py +++ b/cassiopeia/data/utilities.py @@ -1,6 +1,7 @@ """ General utilities for the datasets encountered in Cassiopeia. """ + import collections from joblib import delayed import multiprocessing @@ -61,7 +62,7 @@ def get_lca_characters( all_states = [ vec[i] for vec in vecs if vec[i] != missing_state_indicator ] - + # this check is specifically if all_states consists of a single # ambiguous state. if len(list(set(all_states))) == 1: @@ -72,6 +73,9 @@ def get_lca_characters( else: lca_vec[i] = all_states[0] else: + all_ambiguous = np.all( + [is_ambiguous_state(s) for s in all_states] + ) chars = set.intersection( *map( set, @@ -83,6 +87,10 @@ def get_lca_characters( ) if len(chars) == 1: lca_vec[i] = list(chars)[0] + if all_ambiguous: + # if we only have ambiguous states, we set the LCA state + # to be the intersection. + lca_vec[i] = tuple(chars) return lca_vec @@ -228,9 +236,7 @@ def compute_dissimilarity_map( ] # load character matrix into shared memory - shm = shared_memory.SharedMemory( - create=True, size=cm.nbytes - ) + shm = shared_memory.SharedMemory(create=True, size=cm.nbytes) shared_cm = np.ndarray(cm.shape, dtype=cm.dtype, buffer=shm.buf) shared_cm[:] = cm[:] diff --git a/test/data_tests/data_utilities_test.py b/test/data_tests/data_utilities_test.py index 94902faa..2d40d867 100755 --- a/test/data_tests/data_utilities_test.py +++ b/test/data_tests/data_utilities_test.py @@ -85,7 +85,7 @@ def test_bootstrap_character_matrices_no_priors(self): self.assertEqual(len(bootstrap_samples), 10) - for (bootstrap_matrix, bootstrap_priors) in bootstrap_samples: + for bootstrap_matrix, bootstrap_priors in bootstrap_samples: self.assertCountEqual( self.character_matrix.index, bootstrap_matrix.index ) @@ -113,7 +113,7 @@ def test_bootstrap_character_matrices_with_priors(self): self.assertEqual(len(bootstrap_samples), 10) - for (bootstrap_matrix, bootstrap_priors) in bootstrap_samples: + for bootstrap_matrix, bootstrap_priors in bootstrap_samples: self.assertCountEqual( self.character_matrix.index, bootstrap_matrix.index ) @@ -316,6 +316,37 @@ def test_lca_characters_ambiguous(self): ) self.assertEqual(ret_vec, [1, 2, 3, 0, 5]) + def test_lca_characters_ambiguous2(self): + + s1 = [ + (4, 62), + (3, 10), + (3, 10, 16), + (0, 3), + (0, 2, 3), + (0, 2, 3), + (0, 4, 7), + (0, 2, 23), + (0, 1, 4, 44), + ] + s2 = [4, 3, -1, 0, 0, 0, (0, 7), (0, 2), (0, 4)] + + expected_reconstruction = [ + 4, + 3, + (3, 10, 16), + 0, + 0, + 0, + (0, 7), + (0, 2), + (0, 4), + ] + ret_vec = data_utilities.get_lca_characters( + [s1, s2], missing_state_indicator=-1 + ) + self.assertEqual(ret_vec, expected_reconstruction) + def test_lca_characters_ambiguous_and_missing(self): vecs = [ [(1, 1), (0, 2), (3, 0), (4,), (5,)], @@ -325,7 +356,7 @@ def test_lca_characters_ambiguous_and_missing(self): ret_vec = data_utilities.get_lca_characters( vecs, missing_state_indicator=-1 ) - self.assertEqual(ret_vec, [1, (0,2), (3,0), 0, 5]) + self.assertEqual(ret_vec, [1, (0, 2), (3, 0), 0, 5]) def test_resolve_most_abundant(self): state = (1, 2, 3, 3) @@ -452,8 +483,10 @@ def test_inter_cluster_distance_basic(self): tree = CassiopeiaTree(tree=tree, cell_meta=meta_data) - inter_cluster_distances = data_utilities.compute_inter_cluster_distances( - tree, meta_item="CellType" + inter_cluster_distances = ( + data_utilities.compute_inter_cluster_distances( + tree, meta_item="CellType" + ) ) expected_distances = pd.DataFrame.from_dict( @@ -507,10 +540,12 @@ def test_inter_cluster_distance_custom_input(self): tree = CassiopeiaTree(tree=tree) - inter_cluster_distances = data_utilities.compute_inter_cluster_distances( - tree, - meta_data=meta_data["CellType"], - dissimilarity_map=weight_matrix, + inter_cluster_distances = ( + data_utilities.compute_inter_cluster_distances( + tree, + meta_data=meta_data["CellType"], + dissimilarity_map=weight_matrix, + ) ) expected_distances = pd.DataFrame.from_dict(