From ebfaed3328f67487d52cdf5549b8df0ba484469e Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 29 Oct 2024 17:00:54 +0100 Subject: [PATCH 01/19] Next development iteration `0.7.1.dev0`. --- docs/conf.py | 2 +- src/gpsea/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 48b41634..7c204680 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -62,7 +62,7 @@ # The short X.Y version. version = u'0.7' # The full version, including alpha/beta/rc tags. -release = u'0.7.0' +release = u'0.7.1.dev0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/src/gpsea/__init__.py b/src/gpsea/__init__.py index cddd77e8..fb3db8e4 100644 --- a/src/gpsea/__init__.py +++ b/src/gpsea/__init__.py @@ -2,4 +2,4 @@ GPSEA is a library for analyzing genotype-phenotype correlations in cohorts of rare disease patients. """ -__version__ = "0.7.0" +__version__ = "0.7.1.dev0" From 1c42593c01b1fb7075fa3b8a0ebc7ea121ede34c Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Mon, 11 Nov 2024 23:06:16 +0100 Subject: [PATCH 02/19] Simplify extracting significant phenotypes from HPO analysis results. --- src/gpsea/analysis/_base.py | 43 ++++++++++++++ tests/analysis/test_analysis_result.py | 82 ++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 tests/analysis/test_analysis_result.py diff --git a/src/gpsea/analysis/_base.py b/src/gpsea/analysis/_base.py index c132f795..4bfe4824 100644 --- a/src/gpsea/analysis/_base.py +++ b/src/gpsea/analysis/_base.py @@ -3,6 +3,7 @@ import os import typing +import numpy as np import pandas as pd from .predicate.phenotype import PhenotypePolyPredicate, P @@ -205,6 +206,48 @@ def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]: The sequence includes a `NaN` value for each phenotype that was *not* tested. """ return self._corrected_pvals + + def n_significant_for_alpha( + self, + alpha: float = .05, + ) -> typing.Optional[int]: + """ + Get the count of the corrected p values with the value being less than or equal to `alpha`. + + :param alpha: a `float` with significance level. + """ + if self.corrected_pvals is None: + return None + else: + return sum(p_val <= alpha for p_val in self.corrected_pvals) + + def significant_phenotype_indices( + self, + alpha: float = .05, + pval_kind: typing.Literal["corrected", "nominal"] = "corrected", + ) -> typing.Optional[typing.Sequence[int]]: + """ + Get the indices of phenotypes that attain significance for provided `alpha`. + """ + if pval_kind == "corrected": + if self.corrected_pvals is None: + vals = None + else: + vals = np.array(self.corrected_pvals) + elif pval_kind == "nominal": + vals = np.array(self.pvals) + else: + raise ValueError(f"Unsupported `pval_kind` value {pval_kind}") + + if vals is None: + return None + + not_na = ~np.isnan(vals) + significant = vals <= alpha + selected = not_na & significant + + return tuple(int(idx) for idx in np.argsort(vals) if selected[idx]) + @property def total_tests(self) -> int: diff --git a/tests/analysis/test_analysis_result.py b/tests/analysis/test_analysis_result.py new file mode 100644 index 00000000..e7603e0c --- /dev/null +++ b/tests/analysis/test_analysis_result.py @@ -0,0 +1,82 @@ +import math +import typing + +import hpotk +import pandas as pd +import pytest + +from gpsea.analysis import MultiPhenotypeAnalysisResult +from gpsea.analysis.predicate.genotype import GenotypePolyPredicate +from gpsea.analysis.predicate.phenotype import HpoPredicate +from gpsea.analysis.pcats.stats import FisherExactTest + + +@pytest.fixture(scope="class") +def multi_phenotype_analysis_result( + hpo: hpotk.MinimalOntology, + suox_gt_predicate: GenotypePolyPredicate, +) -> MultiPhenotypeAnalysisResult: + is_arachnodactyly = HpoPredicate( + hpo=hpo, + query=hpotk.TermId.from_curie("HP:0001166"), # Arachnodactyly + ) + is_seizure = HpoPredicate( + hpo=hpo, + query=hpotk.TermId.from_curie("HP:0001250"), # Seizure + ) + is_polydactyly = HpoPredicate( + hpo=hpo, + query=hpotk.TermId.from_curie("HP:0100259"), # Postaxial polydactyly + ) + is_clinodactyly = HpoPredicate( + hpo=hpo, + query=hpotk.TermId.from_curie("HP:0030084"), # Clinodactyly + ) + return MultiPhenotypeAnalysisResult( + gt_predicate=suox_gt_predicate, + statistic=FisherExactTest(), + mtc_correction="fdr_bh", + pheno_predicates=( + is_arachnodactyly, + is_seizure, + is_polydactyly, + is_clinodactyly, + ), + n_usable=(40, 20, 115, 10), + all_counts=( + pd.DataFrame( + data=[[10, 5], [10, 15]], + index=pd.Index(is_arachnodactyly.get_categories()), + columns=pd.Index(suox_gt_predicate.get_categories()), + ), + pd.DataFrame( + data=[[5, 0], [5, 10]], + index=pd.Index(is_seizure.get_categories()), + columns=pd.Index(suox_gt_predicate.get_categories()), + ), + pd.DataFrame( + data=[[50, 0], [5, 60]], + index=pd.Index(is_polydactyly.get_categories()), + columns=pd.Index(suox_gt_predicate.get_categories()), + ), + pd.DataFrame( + data=[[0, 0], [10, 0]], + index=pd.Index(is_clinodactyly.get_categories()), + columns=pd.Index(suox_gt_predicate.get_categories()), + ), + ), + pvals=(math.nan, 0.005, 0.0005, .05), + corrected_pvals=(math.nan, 0.01, 0.001, .5), + ) + + +class TestMultiPhenotypeAnalysisResult: + + def test_significant_phenotype_indices( + self, + multi_phenotype_analysis_result: MultiPhenotypeAnalysisResult, + ): + indices = multi_phenotype_analysis_result.significant_phenotype_indices( + alpha=0.05, pval_kind="corrected" + ) + assert indices == (2, 1) From a6b02ba1904d70eab8c28824e022c711e69d5f32 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Tue, 12 Nov 2024 08:47:44 +0100 Subject: [PATCH 03/19] adding zinc finger to ProteinFeatures --- src/gpsea/model/_protein.py | 21 ++++++---- .../P17010_manual_download.json | 1 + tests/preprocessing/test_uniprot_json.py | 41 +++++++++++++++++++ 3 files changed, 56 insertions(+), 7 deletions(-) create mode 100644 tests/preprocessing/data/uniprot_response/P17010_manual_download.json diff --git a/src/gpsea/model/_protein.py b/src/gpsea/model/_protein.py index 2fe55684..178c86e1 100644 --- a/src/gpsea/model/_protein.py +++ b/src/gpsea/model/_protein.py @@ -118,21 +118,28 @@ class FeatureType(enum.Enum): A region of interest that cannot be described in other subsections. """ + ZINC_FINGER = enum.auto() + """ + A zinc finger is a small, functional, independently folded domain that coordinates one or more zinc ions to stabilize its structure through cysteine and/or histidine residues. + """ + @staticmethod def from_string(category: str) -> "FeatureType": - cat_lover = category.lower() - if cat_lover == "repeat": + cat_lower = category.lower() + if cat_lower == "repeat": return FeatureType.REGION - elif cat_lover == "motif": + elif cat_lower == "motif": return FeatureType.MOTIF - elif cat_lover == "domain": + elif cat_lower == "domain": return FeatureType.DOMAIN - elif cat_lover == "region": + elif cat_lower == "region": return FeatureType.REGION - elif cat_lover == "coiled coil": + elif cat_lower == "coiled coil": return FeatureType.REGION - elif cat_lover == "compositional bias": + elif cat_lower == "compositional bias": return FeatureType.COMPOSITIONAL_BIAS + elif cat_lower == "zinc finger": + return FeatureType.ZINC_FINGER else: raise ValueError(f'Unrecognized protein feature type: "{category}"') diff --git a/tests/preprocessing/data/uniprot_response/P17010_manual_download.json b/tests/preprocessing/data/uniprot_response/P17010_manual_download.json new file mode 100644 index 00000000..f7a36cec --- /dev/null +++ b/tests/preprocessing/data/uniprot_response/P17010_manual_download.json @@ -0,0 +1 @@ +{"entryType":"UniProtKB reviewed (Swiss-Prot)","primaryAccession":"P17010","features":[{"type":"Zinc finger","location":{"start":{"value":425,"modifier":"EXACT"},"end":{"value":447,"modifier":"EXACT"}},"description":"C2H2-type 1","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":456,"modifier":"EXACT"},"end":{"value":478,"modifier":"EXACT"}},"description":"C2H2-type 2","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":488,"modifier":"EXACT"},"end":{"value":510,"modifier":"EXACT"}},"description":"C2H2-type 3","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":519,"modifier":"EXACT"},"end":{"value":542,"modifier":"EXACT"}},"description":"C2H2-type 4","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":548,"modifier":"EXACT"},"end":{"value":570,"modifier":"EXACT"}},"description":"C2H2-type 5","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":576,"modifier":"EXACT"},"end":{"value":599,"modifier":"EXACT"}},"description":"C2H2-type 6","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":605,"modifier":"EXACT"},"end":{"value":627,"modifier":"EXACT"}},"description":"C2H2-type 7","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":633,"modifier":"EXACT"},"end":{"value":656,"modifier":"EXACT"}},"description":"C2H2-type 8","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":662,"modifier":"EXACT"},"end":{"value":684,"modifier":"EXACT"}},"description":"C2H2-type 9","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":690,"modifier":"EXACT"},"end":{"value":713,"modifier":"EXACT"}},"description":"C2H2-type 10","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":719,"modifier":"EXACT"},"end":{"value":741,"modifier":"EXACT"}},"description":"C2H2-type 11","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":747,"modifier":"EXACT"},"end":{"value":770,"modifier":"EXACT"}},"description":"C2H2-type 12","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":776,"modifier":"EXACT"},"end":{"value":798,"modifier":"EXACT"}},"description":"C2H2-type 13","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]}],"extraAttributes":{"uniParcId":"UPI000013C504"}} \ No newline at end of file diff --git a/tests/preprocessing/test_uniprot_json.py b/tests/preprocessing/test_uniprot_json.py index 6765640d..e1e56e9b 100644 --- a/tests/preprocessing/test_uniprot_json.py +++ b/tests/preprocessing/test_uniprot_json.py @@ -9,6 +9,10 @@ ITPR1_protein_len = 2758 +#P17010 +ZFX_protein_len = 805 +ZFX_protein_id = "NP_001171555.1" + class TestUniprotJsonToMetadata: """ Test function that ingests UniProt JSON and transforms it to a ProteinMetadata object @@ -36,6 +40,28 @@ def q8izt6_protein_metadata( protein_length=ITPR1_protein_len, ) + @pytest.fixture + def P17010_json_file_path( + self, + fpath_preprocessing_data_dir: str, + ) -> str: + return os.path.join(fpath_preprocessing_data_dir, "uniprot_response", "P17010_manual_download.json") + + @pytest.fixture + def P17010_protein_metadata( + self, + P17010_json_file_path#: str, + ) -> ProteinMetadata: + """ + :returns: ProteinMetadata created from a downloaded UniProt JSON file + """ + return ProteinMetadata.from_uniprot_json( + protein_id=ZFX_protein_id, + label=ZFX_protein_id, + uniprot_json=P17010_json_file_path, + protein_length=ZFX_protein_len, + ) + def test_general_info( self, q8izt6_protein_metadata: ProteinMetadata, @@ -68,3 +94,18 @@ def test_first_feature( assert feature_0.info.start == 919 assert feature_0.info.end == 1056 assert feature_0.info.name == "Calponin-homology (CH) 1" + + + def test_ZFX(self, + P17010_protein_metadata): + """ + :[{"type":"Zinc finger", + "location":{"start":{"value":425,"modifier":"EXACT"}, + "end":{"value":447,"modifier":"EXACT"}}," + """ + assert P17010_protein_metadata is not None + feature_0 = P17010_protein_metadata.protein_features[0] + assert feature_0.feature_type == FeatureType.ZINC_FINGER + assert feature_0.info.start == 424 ## zero based open-closed + assert feature_0.info.end == 447 + From b2e43d767a0e93c2873d324a1f9f4ca68a418e66 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 12 Nov 2024 09:33:44 +0100 Subject: [PATCH 04/19] Do not use `get_question`. --- src/gpsea/analysis/predicate/genotype/_variant.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/_variant.py b/src/gpsea/analysis/predicate/genotype/_variant.py index c725f24f..4b24daef 100644 --- a/src/gpsea/analysis/predicate/genotype/_variant.py +++ b/src/gpsea/analysis/predicate/genotype/_variant.py @@ -57,7 +57,7 @@ def all(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: >>> genes = ('SURF1', 'SURF2',) >>> predicate = VariantPredicates.all(VariantPredicates.gene(g) for g in genes) - >>> predicate.get_question() + >>> predicate.description '(affects SURF1 AND affects SURF2)' Args: @@ -88,7 +88,7 @@ def any(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: >>> tx_id = 'NM_123456.7' >>> effects = (VariantEffect.MISSENSE_VARIANT, VariantEffect.STOP_GAINED,) >>> predicate = VariantPredicates.any(VariantPredicates.variant_effect(e, tx_id) for e in effects) - >>> predicate.get_question() + >>> predicate.description '(MISSENSE_VARIANT on NM_123456.7 OR STOP_GAINED on NM_123456.7)' Args: @@ -117,7 +117,7 @@ def variant_effect( >>> from gpsea.model import VariantEffect >>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> predicate = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id='NM_123.4') - >>> predicate.get_question() + >>> predicate.description 'MISSENSE_VARIANT on NM_123.4' Args: @@ -278,7 +278,7 @@ def structural_type( >>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> predicate = VariantPredicates.structural_type('SO:1000029') - >>> predicate.get_question() + >>> predicate.description 'structural type is SO:1000029' Args: @@ -301,7 +301,7 @@ def variant_class( >>> from gpsea.model import VariantClass >>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> predicate = VariantPredicates.variant_class(VariantClass.DEL) - >>> predicate.get_question() + >>> predicate.description 'variant class is DEL' Args: @@ -359,7 +359,7 @@ def change_length( >>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> predicate = VariantPredicates.change_length('<=', -10) - >>> predicate.get_question() + >>> predicate.description 'change length <= -10' Args: @@ -391,7 +391,7 @@ def is_structural_deletion( >>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> predicate = VariantPredicates.is_structural_deletion(-20) - >>> predicate.get_question() + >>> predicate.description '(structural type is SO:1000029 OR (variant class is DEL AND change length <= -20))' Args: From 812e1756ddfc78f39443839f972e53fa0db735c0 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 12 Nov 2024 14:21:21 +0100 Subject: [PATCH 05/19] Add convenience methods to `Cohort` to simplify reporting. --- src/gpsea/analysis/pcats/_impl.py | 6 +++ src/gpsea/model/_cohort.py | 77 +++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py index 7c75706c..620d404f 100644 --- a/src/gpsea/analysis/pcats/_impl.py +++ b/src/gpsea/analysis/pcats/_impl.py @@ -329,6 +329,12 @@ def mtc_filter_results(self) -> typing.Sequence[PhenotypeMtcResult]: """ return self._mtc_filter_results + def n_filtered_out(self) -> int: + """ + Get the number of phenotype terms that were filtered out by the MTC filter. + """ + return sum(result.is_filtered_out() for result in self.mtc_filter_results) + def __eq__(self, other): return isinstance(other, HpoTermAnalysisResult) \ and super(MultiPhenotypeAnalysisResult, self).__eq__(other) \ diff --git a/src/gpsea/model/_cohort.py b/src/gpsea/model/_cohort.py index e6eecf3b..e0f1a523 100644 --- a/src/gpsea/model/_cohort.py +++ b/src/gpsea/model/_cohort.py @@ -319,6 +319,16 @@ def all_phenotypes(self) -> typing.Set[Phenotype]: Get a set of all phenotypes (observed or excluded) in the cohort members. """ return set(itertools.chain(phenotype for patient in self._members for phenotype in patient.phenotypes)) + + def count_distinct_hpo_terms(self) -> int: + """ + Get count of distinct HPO terms (either in present or excluded state) seen in the cohort members. + """ + terms = set( + itertools.chain(phenotype.identifier for patient in self._members for phenotype in patient.phenotypes) + ) + return len(terms) + def all_measurements(self) -> typing.Set[Measurement]: """ @@ -332,6 +342,14 @@ def all_diseases(self) -> typing.Set[Disease]: """ return set(itertools.chain(disease for patient in self._members for disease in patient.diseases)) + def count_with_disease_onset(self) -> int: + """ + Get the count of individuals with recorded disease onset. + """ + return self._count_individuals_with_condition( + lambda i: any(d.onset is not None for d in i.diseases), + ) + def all_variants(self) -> typing.Set[Variant]: """ Get a set of all variants observed in the cohort members. @@ -483,6 +501,65 @@ def get_variant_by_key(self, variant_key) -> Variant: else: raise ValueError(f"Variant key {variant_key} not found in cohort.") + def count_males(self) -> int: + """ + Get the number of males in the cohort. + """ + return self._count_individuals_with_sex(Sex.MALE) + + def count_females(self) -> int: + """ + Get the number of females in the cohort. + """ + return self._count_individuals_with_sex(Sex.FEMALE) + + def count_unknown_sex(self) -> int: + """ + Get the number of individuals with unknown sex in the cohort. + """ + return self._count_individuals_with_sex(Sex.UNKNOWN_SEX) + + def _count_individuals_with_sex(self, sex: Sex) -> int: + return self._count_individuals_with_condition(lambda i: i.sex == sex) + + def count_alive(self) -> int: + """ + Get the number of individuals reported to be alive at the time of last encounter. + """ + return self._count_individuals_with_condition( + lambda i: i.vital_status is not None and i.vital_status.is_alive + ) + + def count_deceased(self) -> int: + """ + Get the number of individuals reported to be deceased. + """ + return self._count_individuals_with_condition( + lambda i: i.vital_status is not None and i.vital_status.is_deceased + ) + + def count_unknown_vital_status(self) -> int: + """ + Get the number of individuals with unknown or no reported vital status. + """ + return self._count_individuals_with_condition( + lambda i: i.vital_status is None or i.vital_status.is_unknown + ) + + def count_with_age_of_last_encounter(self) -> int: + """ + Get the number of individuals with a known age of last encounter. + """ + return self._count_individuals_with_condition( + lambda i: i.vital_status is not None and i.vital_status.age_of_death is not None + ) + + def _count_individuals_with_condition( + self, + predicate: typing.Callable[[Patient], bool], + ) -> int: + return sum(predicate(individual) for individual in self._members) + def __eq__(self, other): return isinstance(other, Cohort) and self._members == other._members From 758a326fbbad814ba1d808d2441c30a22864a65e Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 12 Nov 2024 17:07:20 +0100 Subject: [PATCH 06/19] Use the `Cohort` features in `CohortViewer`. --- src/gpsea/view/_base.py | 34 ++++ src/gpsea/view/_cohort.py | 181 +++++++------------ src/gpsea/view/templates/cohort.html | 256 +++++++++++++++------------ tests/view/test_view.py | 19 +- 4 files changed, 254 insertions(+), 236 deletions(-) create mode 100644 src/gpsea/view/_base.py diff --git a/src/gpsea/view/_base.py b/src/gpsea/view/_base.py new file mode 100644 index 00000000..f26e34a3 --- /dev/null +++ b/src/gpsea/view/_base.py @@ -0,0 +1,34 @@ +import abc + +from jinja2 import Environment, PackageLoader + + +def pluralize(count: int, stem: str) -> str: + if isinstance(count, int): + if count == 1: + return f"{count} {stem}" + else: + if stem.endswith("s"): + return f"{count} {stem}es" + else: + return f"{count} {stem}s" + else: + raise ValueError(f"{count} must be an `int`!") + + +def was_were(count: int) -> str: + if isinstance(count, int): + if count == 1: + return "1 was" + else: + return f"{count} were" + else: + raise ValueError(f"{count} must be an `int`!") + + +class BaseViewer(metaclass=abc.ABCMeta): + + def __init__(self): + self._environment = Environment(loader=PackageLoader("gpsea.view", "templates")) + self._environment.filters["pluralize"] = pluralize + self._environment.filters["was_were"] = was_were diff --git a/src/gpsea/view/_cohort.py b/src/gpsea/view/_cohort.py index 2cd6d404..557e94ad 100644 --- a/src/gpsea/view/_cohort.py +++ b/src/gpsea/view/_cohort.py @@ -2,26 +2,26 @@ from collections import namedtuple, defaultdict -from hpotk import MinimalOntology -from jinja2 import Environment, PackageLoader +import hpotk from gpsea.model import Cohort, Sex +from ._base import BaseViewer from ._report import GpseaReport, HtmlGpseaReport from ._formatter import VariantFormatter ToDisplay = namedtuple('ToDisplay', ['hgvs_cdna', 'hgvsp', 'variant_effects']) -class CohortViewer: +class CohortViewer(BaseViewer): """ `CohortViewer` summarizes the most salient :class:`~gpsea.model.Cohort` aspects into an HTML report. """ def __init__( - self, - hpo: MinimalOntology, - top_phenotype_count: int = 10, - top_variant_count: int = 10, + self, + hpo: hpotk.MinimalOntology, + top_phenotype_count: int = 10, + top_variant_count: int = 10, ): """ Args: @@ -29,11 +29,11 @@ def __init__( top_phenotype_count(int): Maximum number of HPO terms to display in the HTML table (default: 10) top_variant_count(int): Maximum number of variants to display in the HTML table (default: 10) """ + super().__init__() self._hpo = hpo self._top_phenotype_count = top_phenotype_count self._top_variant_count = top_variant_count - environment = Environment(loader=PackageLoader('gpsea.view', 'templates')) - self._cohort_template = environment.get_template("cohort.html") + self._cohort_template = self._environment.get_template("cohort.html") def process( self, @@ -62,35 +62,18 @@ def _prepare_context( ) -> typing.Mapping[str, typing.Any]: hpo_counts = list() - for hpo in cohort.list_present_phenotypes(top=self._top_phenotype_count): - hpo_id = hpo[0] - individual_count = hpo[1] - hpo_label = "n/a" - if hpo_id in self._hpo: - hpo_label = self._hpo.get_term_name(hpo_id) + for term_id, count in cohort.list_present_phenotypes(top=self._top_phenotype_count): + label = self._hpo.get_term_name(term_id) + if label is None: + label = 'N/A' + hpo_counts.append( { - "HPO": hpo_label, - "ID": hpo_id, - "Count": individual_count, + "label": label, + "term_id": term_id, + "count": count, } ) - # Show the same max number of measuresments as phenotypes - measurement_counts = list() - for msmt in cohort.list_measurements(top=self._top_phenotype_count): - msmt_id = msmt[0] - individual_count = msmt[1] - msmt_label = "n/a" - #if hpo_id in self._hpo: - # hpo_label = self._hpo.get_term_name(hpo_id) - msmt_label = "need a way to retrieve measuremnt label" - measurement_counts.append( - { - "Assay": msmt_label, - "ID": msmt_id, - "Count": individual_count, - }) - n_measurements = len(measurement_counts) variant_counts = list() variant_to_display_d = CohortViewer._get_variant_description(cohort, transcript_id) @@ -98,87 +81,68 @@ def _prepare_context( # get HGVS or human readable variant if variant_key in variant_to_display_d: display = variant_to_display_d[variant_key] - hgvs_cdna = display.hgvs_cdna - protein_name = display.hgvsp + hgvsc = display.hgvs_cdna + hgvsp = display.hgvsp effects = '' if display.variant_effects is None else ", ".join(display.variant_effects) else: - display = variant_key - hgvs_cdna = '' - protein_name = '' + hgvsc = '' + hgvsp = '' effects = '' variant_counts.append( { - "variant": variant_key, - "variant_name": hgvs_cdna, - "protein_name": protein_name, - "variant_effects": effects, - "Count": count, + "count": count, + "key": variant_key, + "hgvsc": hgvsc, + "hgvsp": hgvsp, + "effects": effects, } ) disease_counts = list() for disease_id, disease_count in cohort.list_all_diseases(): - disease_name = "Unknown" + label = "Unknown" for disease in cohort.all_diseases(): if disease.identifier.value == disease_id: - disease_name = disease.name + label = disease.name + break + disease_counts.append( { "disease_id": disease_id, - "disease_name": disease_name, + "label": label, "count": disease_count, } ) - n_diseases = len(disease_counts) - - var_effects_list = list() - var_effects_d = dict() - if transcript_id is not None: - has_transcript = True - data_by_tx = cohort.variant_effect_count_by_tx(tx_id=transcript_id) - # e.g., data structure - # -- {'effect}': 'FRAMESHIFT_VARIANT', 'count': 175}, - # -- {'effect}': 'STOP_GAINED', 'count': 67}, - for tx_id, counter in data_by_tx.items(): + variant_effects = list() + has_transcript = transcript_id is not None + if has_transcript: + var_effects_d = dict() + for tx_id, counter in cohort.variant_effect_count_by_tx(tx_id=transcript_id).items(): + # e.g., data structure + # -- {'effect}': 'FRAMESHIFT_VARIANT', 'count': 175}, + # -- {'effect}': 'STOP_GAINED', 'count': 67}, if tx_id == transcript_id: for effect, count in counter.items(): var_effects_d[effect] = count + break total = sum(var_effects_d.values()) # Sort in descending order based on counts - sorted_counts_desc = dict(sorted(var_effects_d.items(), key=lambda item: item[1], reverse=True)) - for effect, count in sorted_counts_desc.items(): - percent = f"{round(count / total * 100)}%" - var_effects_list.append({"effect": effect, "count": count, "percent": percent}) + for effect, count in sorted(var_effects_d.items(), key=lambda item: item[1], reverse=True): + variant_effects.append( + { + "effect": effect, + "count": count, + "percent": round(count / total * 100), + } + ) else: - has_transcript = False transcript_id = "MANE transcript ID" - n_male = 0 - n_female = 0 - n_unknown_sex = 0 - n_deceased = 0 - n_alive = 0 - n_has_age_at_last_encounter = 0 n_has_disease_onset = 0 hpo_id_to_has_cnset_count_d = defaultdict(int) for pat in cohort.all_patients: - if pat.sex == Sex.MALE: - n_male += 1 - elif pat.sex == Sex.FEMALE: - n_female += 1 - else: - n_unknown_sex += 1 - patient_not_deceased = True - if pat.vital_status is not None: - if pat.vital_status.is_deceased: - n_deceased += 1 - patient_not_deceased = False - if patient_not_deceased: - n_alive += 1 - if pat.age is not None: - n_has_age_at_last_encounter += 1 diseases = pat.diseases for d in diseases: if d.onset is not None: @@ -191,47 +155,34 @@ def _prepare_context( terms_and_ancestors_with_onset_information.update(ancs) for hpo_id in terms_and_ancestors_with_onset_information: hpo_id_to_has_cnset_count_d[hpo_id] += 1 + # When we get here, we want to present the counts of HPO terms that have onset information - n_individuals = len(cohort.all_patients) - onset_threshold = int(0.2 * n_individuals) # only show terms with a decent amount of information - has_onset_information = list() - for hpo_id, count in hpo_id_to_has_cnset_count_d.items(): - if count < onset_threshold: - continue - label = self._hpo.get_term_name(hpo_id) - if label is not None: - display_label = f"{label} ({hpo_id})" - else: - display_label = hpo_id - has_onset_information.append({"HPO": display_label, "count": count}) - n_has_onset_info = len(has_onset_information) + # onset_threshold = int(0.2 * len(cohort)) # only show terms with a decent amount of information + # has_onset_information = list() + # for hpo_id, count in hpo_id_to_has_cnset_count_d.items(): + # if count < onset_threshold: + # continue + # label = self._hpo.get_term_name(hpo_id) + # if label is not None: + # display_label = f"{label} ({hpo_id})" + # else: + # display_label = hpo_id + # has_onset_information.append( + # {"HPO": display_label, "count": count}) + # n_has_onset_info = len(has_onset_information) # The following dictionary is used by the Jinja2 HTML template return { - "n_individuals": n_individuals, - "n_male": n_male, - "n_female": n_female, - "n_unknown_sex": n_unknown_sex, - "n_deceased": n_deceased, - "n_alive": n_alive, - "n_has_disease_onset": n_has_disease_onset, - "n_has_age_at_last_encounter": n_has_age_at_last_encounter, - "has_onset_information": has_onset_information, - "n_has_onset_info": n_has_onset_info, - "n_excluded": cohort.get_excluded_count(), - "total_hpo_count": len(cohort.all_phenotypes()), + "cohort": cohort, + "transcript_id": transcript_id, "top_hpo_count": self._top_phenotype_count, "hpo_counts": hpo_counts, - "total_measurement_count": n_measurements, - "measurement_counts": measurement_counts, - "unique_variant_count": len(cohort.all_variant_infos()), "top_var_count": self._top_variant_count, "var_counts": variant_counts, - "n_diseases": n_diseases, + # "n_diseases": n_diseases, "disease_counts": disease_counts, "has_transcript": has_transcript, - "var_effects_list": var_effects_list, - "transcript_id": transcript_id, + "variant_effects": variant_effects, } @staticmethod diff --git a/src/gpsea/view/templates/cohort.html b/src/gpsea/view/templates/cohort.html index fdd45d0a..d566d63f 100644 --- a/src/gpsea/view/templates/cohort.html +++ b/src/gpsea/view/templates/cohort.html @@ -1,9 +1,10 @@ + Cohort -