From 20922198a04cac8dc4511e1e110a050ecfadcc66 Mon Sep 17 00:00:00 2001 From: PauBadiaM Date: Thu, 20 Jun 2024 12:22:04 +0200 Subject: [PATCH] Refactored return data for all methods --- decoupler/__init__.py | 2 +- decoupler/method_aucell.py | 9 ++------ decoupler/method_gsea.py | 15 ++----------- decoupler/method_gsva.py | 9 ++------ decoupler/method_mdt.py | 9 ++------ decoupler/method_mlm.py | 10 ++------- decoupler/method_ora.py | 10 ++------- decoupler/method_udt.py | 9 ++------ decoupler/method_ulm.py | 10 ++------- decoupler/method_viper.py | 10 ++------- decoupler/method_wmean.py | 16 ++------------ decoupler/method_wsum.py | 16 ++------------ decoupler/pre.py | 22 ++++++++++++++++++- decoupler/tests/test_pre.py | 40 ++++++++++++++++++++++++++++++++++- docs/source/release_notes.rst | 2 ++ 15 files changed, 85 insertions(+), 104 deletions(-) diff --git a/decoupler/__init__.py b/decoupler/__init__.py index 894f1fc..79887cd 100644 --- a/decoupler/__init__.py +++ b/decoupler/__init__.py @@ -1,7 +1,7 @@ __version__ = '1.6.3' # noqa: F401 __version_info__ = tuple([int(num) for num in __version__.split('.')]) # noqa: F401 -from .pre import extract, match, rename_net, get_net_mat, filt_min_n, mask_features # noqa: F401 +from .pre import extract, match, rename_net, get_net_mat, filt_min_n, mask_features, return_data # noqa: F401 from .utils import ( melt, show_methods, check_corr, get_toy_data, summarize_acts, assign_groups, dense_run, p_adjust_fdr, shuffle_net, read_gmt # noqa: F401 diff --git a/decoupler/method_aucell.py b/decoupler/method_aucell.py index b631440..9723061 100644 --- a/decoupler/method_aucell.py +++ b/decoupler/method_aucell.py @@ -10,7 +10,7 @@ from numpy.random import default_rng from tqdm import tqdm -from .pre import extract, rename_net, filt_min_n +from .pre import extract, rename_net, filt_min_n, return_data from anndata import AnnData import numba as nb @@ -151,9 +151,4 @@ def run_aucell(mat, net, source='source', target='target', n_up=None, min_n=5, s estimate = pd.DataFrame(estimate, index=r, columns=net.index) estimate.name = 'aucell_estimate' - # AnnData support - if isinstance(mat, AnnData): - # Update obsm AnnData object - mat.obsm[estimate.name] = estimate - else: - return estimate + return return_data(mat=mat, results=(estimate, )) diff --git a/decoupler/method_gsea.py b/decoupler/method_gsea.py index cce5fdc..f1219ef 100644 --- a/decoupler/method_gsea.py +++ b/decoupler/method_gsea.py @@ -9,7 +9,7 @@ from numpy.random import default_rng from scipy.sparse import csr_matrix -from .pre import extract, rename_net, filt_min_n +from .pre import extract, rename_net, filt_min_n, return_data from .utils import p_adjust_fdr from anndata import AnnData @@ -369,15 +369,4 @@ def run_gsea(mat, net, source='source', target='target', times=1000, batch_size= pvals = pd.DataFrame(pvals, index=r, columns=net.index) pvals.name = 'gsea_pvals' - # AnnData support - if isinstance(mat, AnnData): - # Update obsm AnnData object - mat.obsm[estimate.name] = estimate - if norm_e is not None: - mat.obsm[norm_e.name] = norm_e - mat.obsm[pvals.name] = pvals - else: - if pvals is not None: - return estimate, norm_e, pvals - else: - return estimate + return return_data(mat=mat, results=(estimate, norm_e, pvals)) diff --git a/decoupler/method_gsva.py b/decoupler/method_gsva.py index ac3e975..69a047c 100644 --- a/decoupler/method_gsva.py +++ b/decoupler/method_gsva.py @@ -10,7 +10,7 @@ from scipy.sparse import csr_matrix from numpy.random import default_rng -from .pre import extract, rename_net, filt_min_n +from .pre import extract, rename_net, filt_min_n, return_data from .method_gsea import std from anndata import AnnData @@ -232,9 +232,4 @@ def run_gsva(mat, net, source='source', target='target', kcdf=False, mx_diff=Tru estimate = pd.DataFrame(estimate, index=r, columns=net.index) estimate.name = 'gsva_estimate' - # AnnData support - if isinstance(mat, AnnData): - # Update obsm AnnData object - mat.obsm[estimate.name] = estimate - else: - return estimate + return return_data(mat=mat, results=(estimate, )) diff --git a/decoupler/method_mdt.py b/decoupler/method_mdt.py index c1cad15..6e44785 100644 --- a/decoupler/method_mdt.py +++ b/decoupler/method_mdt.py @@ -7,7 +7,7 @@ import pandas as pd from scipy.sparse import csr_matrix -from .pre import extract, match, rename_net, get_net_mat, filt_min_n +from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data from anndata import AnnData from tqdm import tqdm @@ -117,9 +117,4 @@ def run_mdt(mat, net, source='source', target='target', weight='weight', trees=1 estimate = pd.DataFrame(estimate, index=r, columns=sources) estimate.name = 'mdt_estimate' - # AnnData support - if isinstance(mat, AnnData): - # Update obsm AnnData object - mat.obsm[estimate.name] = estimate - else: - return estimate + return return_data(mat=mat, results=(estimate, )) diff --git a/decoupler/method_mlm.py b/decoupler/method_mlm.py index 304dc0c..998cc31 100644 --- a/decoupler/method_mlm.py +++ b/decoupler/method_mlm.py @@ -7,7 +7,7 @@ import pandas as pd from scipy.sparse import csr_matrix -from .pre import extract, match, rename_net, get_net_mat, filt_min_n +from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data from anndata import AnnData from scipy import stats @@ -131,10 +131,4 @@ def run_mlm(mat, net, source='source', target='target', weight='weight', batch_s pvals = pd.DataFrame(pvals, index=r, columns=sources) pvals.name = 'mlm_pvals' - # AnnData support - if isinstance(mat, AnnData): - # Update obsm AnnData object - mat.obsm[estimate.name] = estimate - mat.obsm[pvals.name] = pvals - else: - return estimate, pvals + return return_data(mat=mat, results=(estimate, pvals)) diff --git a/decoupler/method_ora.py b/decoupler/method_ora.py index 90441ce..5233e4f 100644 --- a/decoupler/method_ora.py +++ b/decoupler/method_ora.py @@ -12,7 +12,7 @@ from scipy.stats import rankdata from math import log, exp, lgamma -from .pre import extract, rename_net, filt_min_n +from .pre import extract, rename_net, filt_min_n, return_data from .utils import p_adjust_fdr from anndata import AnnData @@ -315,10 +315,4 @@ def run_ora(mat, net, source='source', target='target', n_up=None, n_bottom=0, n estimate = pd.DataFrame(-np.log10(pvals), index=r, columns=pvals.columns) estimate.name = 'ora_estimate' - # AnnData support - if isinstance(mat, AnnData): - # Update obsm AnnData object - mat.obsm[estimate.name] = estimate - mat.obsm[pvals.name] = pvals - else: - return estimate, pvals + return return_data(mat=mat, results=(estimate, pvals)) diff --git a/decoupler/method_udt.py b/decoupler/method_udt.py index 178e288..f41957c 100644 --- a/decoupler/method_udt.py +++ b/decoupler/method_udt.py @@ -7,7 +7,7 @@ from scipy.sparse import csr_matrix import pandas as pd -from .pre import extract, match, rename_net, get_net_mat, filt_min_n +from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data from anndata import AnnData from tqdm import tqdm @@ -114,9 +114,4 @@ def run_udt(mat, net, source='source', target='target', weight='weight', min_lea estimate = pd.DataFrame(estimate, index=r, columns=sources) estimate.name = 'udt_estimate' - # AnnData support - if isinstance(mat, AnnData): - # Update obsm AnnData object - mat.obsm[estimate.name] = estimate - else: - return estimate + return return_data(mat=mat, results=(estimate, )) diff --git a/decoupler/method_ulm.py b/decoupler/method_ulm.py index 27ad2a5..53de352 100644 --- a/decoupler/method_ulm.py +++ b/decoupler/method_ulm.py @@ -9,7 +9,7 @@ from scipy.stats import t -from .pre import extract, match, rename_net, get_net_mat, filt_min_n +from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data from anndata import AnnData from tqdm import tqdm @@ -124,10 +124,4 @@ def run_ulm(mat, net, source='source', target='target', weight='weight', batch_s pvals = pd.DataFrame(pvals, index=r, columns=sources) pvals.name = 'ulm_pvals' - # AnnData support - if isinstance(mat, AnnData): - # Update obsm AnnData object - mat.obsm[estimate.name] = estimate - mat.obsm[pvals.name] = pvals - else: - return estimate, pvals + return return_data(mat=mat, results=(estimate, pvals)) diff --git a/decoupler/method_viper.py b/decoupler/method_viper.py index 9c9d019..09cfbb2 100644 --- a/decoupler/method_viper.py +++ b/decoupler/method_viper.py @@ -10,7 +10,7 @@ from scipy.stats import rankdata from scipy.stats import norm -from .pre import extract, match, rename_net, get_net_mat, filt_min_n +from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data from anndata import AnnData from tqdm import tqdm @@ -308,10 +308,4 @@ def run_viper(mat, net, source='source', target='target', weight='weight', pleio pvals = pd.DataFrame(pvals, index=r, columns=sources) pvals.name = 'viper_pvals' - # AnnData support - if isinstance(mat, AnnData): - # Update obsm AnnData object - mat.obsm[estimate.name] = estimate - mat.obsm[pvals.name] = pvals - else: - return estimate, pvals + return return_data(mat=mat, results=(estimate, pvals)) diff --git a/decoupler/method_wmean.py b/decoupler/method_wmean.py index 841e7aa..68b8b7e 100644 --- a/decoupler/method_wmean.py +++ b/decoupler/method_wmean.py @@ -7,7 +7,7 @@ import pandas as pd from scipy.sparse import csr_matrix -from .pre import extract, match, rename_net, get_net_mat, filt_min_n +from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data from .method_gsea import std from anndata import AnnData @@ -177,16 +177,4 @@ def run_wmean(mat, net, source='source', target='target', weight='weight', times pvals = pd.DataFrame(pvals, index=r, columns=sources) pvals.name = 'wmean_pvals' - # AnnData support - if isinstance(mat, AnnData): - # Update obsm AnnData object - mat.obsm[estimate.name] = estimate - if pvals is not None: - mat.obsm[norm.name] = norm - mat.obsm[corr.name] = corr - mat.obsm[pvals.name] = pvals - else: - if pvals is not None: - return estimate, norm, corr, pvals - else: - return estimate + return return_data(mat=mat, results=(estimate, norm, corr, pvals)) diff --git a/decoupler/method_wsum.py b/decoupler/method_wsum.py index 20adbde..dbd4eda 100644 --- a/decoupler/method_wsum.py +++ b/decoupler/method_wsum.py @@ -7,7 +7,7 @@ import pandas as pd from scipy.sparse import csr_matrix -from .pre import extract, match, rename_net, get_net_mat, filt_min_n +from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data from .method_gsea import std from anndata import AnnData @@ -173,16 +173,4 @@ def run_wsum(mat, net, source='source', target='target', weight='weight', times= pvals = pd.DataFrame(pvals, index=r, columns=sources) pvals.name = 'wsum_pvals' - # AnnData support - if isinstance(mat, AnnData): - # Update obsm AnnData object - mat.obsm[estimate.name] = estimate - if pvals is not None: - mat.obsm[norm.name] = norm - mat.obsm[corr.name] = corr - mat.obsm[pvals.name] = pvals - else: - if pvals is not None: - return estimate, norm, corr, pvals - else: - return estimate + return return_data(mat=mat, results=(estimate, norm, corr, pvals)) diff --git a/decoupler/pre.py b/decoupler/pre.py index d125698..268ad86 100644 --- a/decoupler/pre.py +++ b/decoupler/pre.py @@ -6,7 +6,7 @@ import numpy as np from scipy.sparse import csr_matrix, issparse import pandas as pd - +import logging from anndata import AnnData @@ -278,3 +278,23 @@ def mask_features(mat, log=False, thr=1, use_raw=False): else: raise ValueError("""mat must be a list of [matrix, samples, features], dataframe (samples x features) or an AnnData instance.""") + + +def add_to_anndata(mat, results): + for result in results: + if result is not None: + mat.obsm[result.name] = result + + +def return_data(mat, results): + if isinstance(mat, AnnData): + if mat.obs_names.size != results[0].index.size: + logging.warning('Provided AnnData contains empty observations. Returning repaired object.') + mat = mat[results[0].index, :].copy() + add_to_anndata(mat, results) + return mat + else: + add_to_anndata(mat, results) + return None + else: + return tuple([result for result in results if result is not None]) diff --git a/decoupler/tests/test_pre.py b/decoupler/tests/test_pre.py index 0b6de03..bc12674 100644 --- a/decoupler/tests/test_pre.py +++ b/decoupler/tests/test_pre.py @@ -3,7 +3,10 @@ import numpy as np from scipy.sparse import csr_matrix from anndata import AnnData -from ..pre import check_mat, extract, filt_min_n, match, rename_net, get_net_mat, mask_features +from ..pre import ( + check_mat, extract, filt_min_n, match, rename_net, get_net_mat, mask_features, + return_data, add_to_anndata +) def test_check_mat(): @@ -101,3 +104,38 @@ def test_mask_features(): mask_features('asdfg') with pytest.raises(ValueError): mask_features(adata, use_raw=True) + + +def test_add_to_anndata(): + m = np.array([[1, 0, 2], [1, 0, 3]]) + r = np.array(['S1', 'S2']) + c = np.array(['G1', 'G2', 'G3']) + df = pd.DataFrame(m, index=r, columns=c) + adata = AnnData(df.astype(np.float32)) + estimate = np.array([[1], [4]]) + s = np.array(['S1']) + estimate = pd.DataFrame(estimate, index=r, columns=s) + estimate.name = 'estimate' + add_to_anndata(mat=adata, results=(estimate, None)) + assert 'estimate' in adata.obsm + + +def test_return_data(): + m = np.array([[1, 0, 2], [1, 0, 3], [0, 0, 0]]) + r = np.array(['S1', 'S2', 'S3']) + c = np.array(['G1', 'G2', 'G3']) + df = pd.DataFrame(m, index=r, columns=c) + adata = AnnData(df.astype(np.float32)) + estimate = np.array([[1], [4]]) + s = np.array(['S1']) + estimate = pd.DataFrame(estimate, index=r[:-1], columns=s) + estimate.name = 'estimate' + pvals = np.array([[0.4], [0.01]]) + pvals = pd.DataFrame(pvals, index=estimate.index, columns=estimate.columns) + pvals.name = 'pvals' + ret = return_data(mat=adata, results=(estimate, pvals)) + assert isinstance(ret, AnnData) + ret = return_data(mat=adata[estimate.index, :].copy(), results=(estimate, pvals)) + assert ret is None + ret = return_data(mat=df, results=(estimate, pvals)) + assert isinstance(ret, tuple) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 124da05..c6943c5 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -9,10 +9,12 @@ Bug fixes - Fixed error in in ``get_contrast`` by reverting use of ``copy.deepcopy`` to ``copy``. - Fixed verbose error regarding the number of unique sources being used in ``benchmark``. - Added check for minimum version of ``igraph>=0.10.0`` to properly render ``plot_network``. +- Fixed return error of methods triggered when an observation was empty and input was ``AnnData``. Changes ~~~~~~~ - Resource functions such as ``get_resource`` or ``get_collectri`` now accept different ``genesymbol_resource`` than UniProt for gene translation to other organisms. +- Deprecated ``sklearn`` and switched to ``sklearn`` for ``udt``. Additions ~~~~~~~~~