From ebfaed3328f67487d52cdf5549b8df0ba484469e Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Tue, 29 Oct 2024 17:00:54 +0100
Subject: [PATCH 01/19] Next development iteration `0.7.1.dev0`.

---
 docs/conf.py          | 2 +-
 src/gpsea/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 48b41634..7c204680 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -62,7 +62,7 @@
 # The short X.Y version.
 version = u'0.7'
 # The full version, including alpha/beta/rc tags.
-release = u'0.7.0'
+release = u'0.7.1.dev0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/src/gpsea/__init__.py b/src/gpsea/__init__.py
index cddd77e8..fb3db8e4 100644
--- a/src/gpsea/__init__.py
+++ b/src/gpsea/__init__.py
@@ -2,4 +2,4 @@
 GPSEA is a library for analyzing genotype-phenotype correlations in cohorts of rare disease patients.
 """
 
-__version__ = "0.7.0"
+__version__ = "0.7.1.dev0"

From 1c42593c01b1fb7075fa3b8a0ebc7ea121ede34c Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Mon, 11 Nov 2024 23:06:16 +0100
Subject: [PATCH 02/19] Simplify extracting significant phenotypes from HPO
 analysis results.

---
 src/gpsea/analysis/_base.py            | 43 ++++++++++++++
 tests/analysis/test_analysis_result.py | 82 ++++++++++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 tests/analysis/test_analysis_result.py

diff --git a/src/gpsea/analysis/_base.py b/src/gpsea/analysis/_base.py
index c132f795..4bfe4824 100644
--- a/src/gpsea/analysis/_base.py
+++ b/src/gpsea/analysis/_base.py
@@ -3,6 +3,7 @@
 import os
 import typing
 
+import numpy as np
 import pandas as pd
 
 from .predicate.phenotype import PhenotypePolyPredicate, P
@@ -205,6 +206,48 @@ def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]:
         The sequence includes a `NaN` value for each phenotype that was *not* tested.
         """
         return self._corrected_pvals
+    
+    def n_significant_for_alpha(
+        self,
+        alpha: float = .05,
+    ) -> typing.Optional[int]:
+        """
+        Get the count of the corrected p values with the value being less than or equal to `alpha`.
+
+        :param alpha: a `float` with significance level.
+        """
+        if self.corrected_pvals is None:
+            return None
+        else:
+            return sum(p_val <= alpha for p_val in self.corrected_pvals)
+        
+    def significant_phenotype_indices(
+        self,
+        alpha: float = .05,
+        pval_kind: typing.Literal["corrected", "nominal"] = "corrected",
+    ) -> typing.Optional[typing.Sequence[int]]:
+        """
+        Get the indices of phenotypes that attain significance for provided `alpha`.
+        """
+        if pval_kind == "corrected":
+            if self.corrected_pvals is None:
+                vals = None
+            else:
+                vals = np.array(self.corrected_pvals)
+        elif pval_kind == "nominal":
+            vals = np.array(self.pvals)
+        else:
+            raise ValueError(f"Unsupported `pval_kind` value {pval_kind}")
+        
+        if vals is None:
+            return None
+        
+        not_na = ~np.isnan(vals)
+        significant = vals <= alpha
+        selected = not_na & significant
+
+        return tuple(int(idx) for idx in np.argsort(vals) if selected[idx])
+
 
     @property
     def total_tests(self) -> int:
diff --git a/tests/analysis/test_analysis_result.py b/tests/analysis/test_analysis_result.py
new file mode 100644
index 00000000..e7603e0c
--- /dev/null
+++ b/tests/analysis/test_analysis_result.py
@@ -0,0 +1,82 @@
+import math
+import typing
+
+import hpotk
+import pandas as pd
+import pytest
+
+from gpsea.analysis import MultiPhenotypeAnalysisResult
+from gpsea.analysis.predicate.genotype import GenotypePolyPredicate
+from gpsea.analysis.predicate.phenotype import HpoPredicate
+from gpsea.analysis.pcats.stats import FisherExactTest
+
+
+@pytest.fixture(scope="class")
+def multi_phenotype_analysis_result(
+    hpo: hpotk.MinimalOntology,
+    suox_gt_predicate: GenotypePolyPredicate,
+) -> MultiPhenotypeAnalysisResult:
+    is_arachnodactyly = HpoPredicate(
+        hpo=hpo,
+        query=hpotk.TermId.from_curie("HP:0001166"),  # Arachnodactyly
+    )
+    is_seizure = HpoPredicate(
+        hpo=hpo,
+        query=hpotk.TermId.from_curie("HP:0001250"),  # Seizure
+    )
+    is_polydactyly = HpoPredicate(
+        hpo=hpo,
+        query=hpotk.TermId.from_curie("HP:0100259"),  # Postaxial polydactyly
+    )
+    is_clinodactyly = HpoPredicate(
+        hpo=hpo,
+        query=hpotk.TermId.from_curie("HP:0030084"),  # Clinodactyly
+    )
+    return MultiPhenotypeAnalysisResult(
+        gt_predicate=suox_gt_predicate,
+        statistic=FisherExactTest(),
+        mtc_correction="fdr_bh",
+        pheno_predicates=(
+            is_arachnodactyly,
+            is_seizure,
+            is_polydactyly,
+            is_clinodactyly,
+        ),
+        n_usable=(40, 20, 115, 10),
+        all_counts=(
+            pd.DataFrame(
+                data=[[10, 5], [10, 15]],
+                index=pd.Index(is_arachnodactyly.get_categories()),
+                columns=pd.Index(suox_gt_predicate.get_categories()),
+            ),
+            pd.DataFrame(
+                data=[[5, 0], [5, 10]],
+                index=pd.Index(is_seizure.get_categories()),
+                columns=pd.Index(suox_gt_predicate.get_categories()),
+            ),
+            pd.DataFrame(
+                data=[[50, 0], [5, 60]],
+                index=pd.Index(is_polydactyly.get_categories()),
+                columns=pd.Index(suox_gt_predicate.get_categories()),
+            ),
+            pd.DataFrame(
+                data=[[0, 0], [10, 0]],
+                index=pd.Index(is_clinodactyly.get_categories()),
+                columns=pd.Index(suox_gt_predicate.get_categories()),
+            ),
+        ),
+        pvals=(math.nan, 0.005, 0.0005, .05),
+        corrected_pvals=(math.nan, 0.01, 0.001, .5),
+    )
+
+
+class TestMultiPhenotypeAnalysisResult:
+
+    def test_significant_phenotype_indices(
+        self,
+        multi_phenotype_analysis_result: MultiPhenotypeAnalysisResult,
+    ):
+        indices = multi_phenotype_analysis_result.significant_phenotype_indices(
+            alpha=0.05, pval_kind="corrected"
+        )
+        assert indices == (2, 1)

From a6b02ba1904d70eab8c28824e022c711e69d5f32 Mon Sep 17 00:00:00 2001
From: Peter Robinson <peter.robinson@jax.org>
Date: Tue, 12 Nov 2024 08:47:44 +0100
Subject: [PATCH 03/19] adding zinc finger to ProteinFeatures

---
 src/gpsea/model/_protein.py                   | 21 ++++++----
 .../P17010_manual_download.json               |  1 +
 tests/preprocessing/test_uniprot_json.py      | 41 +++++++++++++++++++
 3 files changed, 56 insertions(+), 7 deletions(-)
 create mode 100644 tests/preprocessing/data/uniprot_response/P17010_manual_download.json

diff --git a/src/gpsea/model/_protein.py b/src/gpsea/model/_protein.py
index 2fe55684..178c86e1 100644
--- a/src/gpsea/model/_protein.py
+++ b/src/gpsea/model/_protein.py
@@ -118,21 +118,28 @@ class FeatureType(enum.Enum):
     A region of interest that cannot be described in other subsections.
     """
 
+    ZINC_FINGER = enum.auto()
+    """
+    A zinc finger is a small, functional, independently folded domain that coordinates one or more zinc ions to stabilize its structure through cysteine and/or histidine residues.
+    """
+
     @staticmethod
     def from_string(category: str) -> "FeatureType":
-        cat_lover = category.lower()
-        if cat_lover == "repeat":
+        cat_lower = category.lower()
+        if cat_lower == "repeat":
             return FeatureType.REGION
-        elif cat_lover == "motif":
+        elif cat_lower == "motif":
             return FeatureType.MOTIF
-        elif cat_lover == "domain":
+        elif cat_lower == "domain":
             return FeatureType.DOMAIN
-        elif cat_lover == "region":
+        elif cat_lower == "region":
             return FeatureType.REGION
-        elif cat_lover == "coiled coil":
+        elif cat_lower == "coiled coil":
             return FeatureType.REGION
-        elif cat_lover == "compositional bias":
+        elif cat_lower == "compositional bias":
             return FeatureType.COMPOSITIONAL_BIAS
+        elif cat_lower == "zinc finger":
+            return FeatureType.ZINC_FINGER
         else:
             raise ValueError(f'Unrecognized protein feature type: "{category}"')
 
diff --git a/tests/preprocessing/data/uniprot_response/P17010_manual_download.json b/tests/preprocessing/data/uniprot_response/P17010_manual_download.json
new file mode 100644
index 00000000..f7a36cec
--- /dev/null
+++ b/tests/preprocessing/data/uniprot_response/P17010_manual_download.json
@@ -0,0 +1 @@
+{"entryType":"UniProtKB reviewed (Swiss-Prot)","primaryAccession":"P17010","features":[{"type":"Zinc finger","location":{"start":{"value":425,"modifier":"EXACT"},"end":{"value":447,"modifier":"EXACT"}},"description":"C2H2-type 1","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":456,"modifier":"EXACT"},"end":{"value":478,"modifier":"EXACT"}},"description":"C2H2-type 2","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":488,"modifier":"EXACT"},"end":{"value":510,"modifier":"EXACT"}},"description":"C2H2-type 3","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":519,"modifier":"EXACT"},"end":{"value":542,"modifier":"EXACT"}},"description":"C2H2-type 4","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":548,"modifier":"EXACT"},"end":{"value":570,"modifier":"EXACT"}},"description":"C2H2-type 5","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":576,"modifier":"EXACT"},"end":{"value":599,"modifier":"EXACT"}},"description":"C2H2-type 6","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":605,"modifier":"EXACT"},"end":{"value":627,"modifier":"EXACT"}},"description":"C2H2-type 7","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":633,"modifier":"EXACT"},"end":{"value":656,"modifier":"EXACT"}},"description":"C2H2-type 8","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":662,"modifier":"EXACT"},"end":{"value":684,"modifier":"EXACT"}},"description":"C2H2-type 9","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":690,"modifier":"EXACT"},"end":{"value":713,"modifier":"EXACT"}},"description":"C2H2-type 10","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":719,"modifier":"EXACT"},"end":{"value":741,"modifier":"EXACT"}},"description":"C2H2-type 11","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":747,"modifier":"EXACT"},"end":{"value":770,"modifier":"EXACT"}},"description":"C2H2-type 12","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":776,"modifier":"EXACT"},"end":{"value":798,"modifier":"EXACT"}},"description":"C2H2-type 13","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]}],"extraAttributes":{"uniParcId":"UPI000013C504"}}
\ No newline at end of file
diff --git a/tests/preprocessing/test_uniprot_json.py b/tests/preprocessing/test_uniprot_json.py
index 6765640d..e1e56e9b 100644
--- a/tests/preprocessing/test_uniprot_json.py
+++ b/tests/preprocessing/test_uniprot_json.py
@@ -9,6 +9,10 @@
 ITPR1_protein_len = 2758
 
 
+#P17010
+ZFX_protein_len = 805
+ZFX_protein_id = "NP_001171555.1"
+
 class TestUniprotJsonToMetadata:
     """
     Test function that ingests UniProt JSON and transforms it to a ProteinMetadata object
@@ -36,6 +40,28 @@ def q8izt6_protein_metadata(
             protein_length=ITPR1_protein_len,
         )
 
+    @pytest.fixture
+    def P17010_json_file_path(
+            self,
+            fpath_preprocessing_data_dir: str,
+    ) -> str:
+        return os.path.join(fpath_preprocessing_data_dir, "uniprot_response", "P17010_manual_download.json")
+
+    @pytest.fixture
+    def P17010_protein_metadata(
+            self,
+            P17010_json_file_path#: str,
+    ) -> ProteinMetadata:
+        """
+        :returns: ProteinMetadata created from a downloaded UniProt JSON file
+        """
+        return ProteinMetadata.from_uniprot_json(
+            protein_id=ZFX_protein_id,
+            label=ZFX_protein_id,
+            uniprot_json=P17010_json_file_path,
+            protein_length=ZFX_protein_len,
+        )
+
     def test_general_info(
         self,
         q8izt6_protein_metadata: ProteinMetadata,
@@ -68,3 +94,18 @@ def test_first_feature(
         assert feature_0.info.start == 919
         assert feature_0.info.end == 1056
         assert feature_0.info.name == "Calponin-homology (CH) 1"
+
+
+    def test_ZFX(self,
+                 P17010_protein_metadata):
+        """
+        :[{"type":"Zinc finger",
+        "location":{"start":{"value":425,"modifier":"EXACT"},
+        "end":{"value":447,"modifier":"EXACT"}},"
+        """
+        assert P17010_protein_metadata is not None
+        feature_0 = P17010_protein_metadata.protein_features[0]
+        assert feature_0.feature_type == FeatureType.ZINC_FINGER
+        assert feature_0.info.start == 424 ## zero based open-closed
+        assert feature_0.info.end == 447
+

From b2e43d767a0e93c2873d324a1f9f4ca68a418e66 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Tue, 12 Nov 2024 09:33:44 +0100
Subject: [PATCH 04/19] Do not use `get_question`.

---
 src/gpsea/analysis/predicate/genotype/_variant.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/gpsea/analysis/predicate/genotype/_variant.py b/src/gpsea/analysis/predicate/genotype/_variant.py
index c725f24f..4b24daef 100644
--- a/src/gpsea/analysis/predicate/genotype/_variant.py
+++ b/src/gpsea/analysis/predicate/genotype/_variant.py
@@ -57,7 +57,7 @@ def all(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate:
 
         >>> genes = ('SURF1', 'SURF2',)
         >>> predicate = VariantPredicates.all(VariantPredicates.gene(g) for g in genes)
-        >>> predicate.get_question()
+        >>> predicate.description
         '(affects SURF1 AND affects SURF2)'
 
         Args:
@@ -88,7 +88,7 @@ def any(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate:
         >>> tx_id = 'NM_123456.7'
         >>> effects = (VariantEffect.MISSENSE_VARIANT, VariantEffect.STOP_GAINED,)
         >>> predicate = VariantPredicates.any(VariantPredicates.variant_effect(e, tx_id) for e in effects)
-        >>> predicate.get_question()
+        >>> predicate.description
         '(MISSENSE_VARIANT on NM_123456.7 OR STOP_GAINED on NM_123456.7)'
 
         Args:
@@ -117,7 +117,7 @@ def variant_effect(
         >>> from gpsea.model import VariantEffect
         >>> from gpsea.analysis.predicate.genotype import VariantPredicates
         >>> predicate = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id='NM_123.4')
-        >>> predicate.get_question()
+        >>> predicate.description
         'MISSENSE_VARIANT on NM_123.4'
 
         Args:
@@ -278,7 +278,7 @@ def structural_type(
 
         >>> from gpsea.analysis.predicate.genotype import VariantPredicates
         >>> predicate = VariantPredicates.structural_type('SO:1000029')
-        >>> predicate.get_question()
+        >>> predicate.description
         'structural type is SO:1000029'
 
         Args:
@@ -301,7 +301,7 @@ def variant_class(
         >>> from gpsea.model import VariantClass
         >>> from gpsea.analysis.predicate.genotype import VariantPredicates
         >>> predicate = VariantPredicates.variant_class(VariantClass.DEL)
-        >>> predicate.get_question()
+        >>> predicate.description
         'variant class is DEL'
 
         Args:
@@ -359,7 +359,7 @@ def change_length(
 
         >>> from gpsea.analysis.predicate.genotype import VariantPredicates
         >>> predicate = VariantPredicates.change_length('<=', -10)
-        >>> predicate.get_question()
+        >>> predicate.description
         'change length <= -10'
 
         Args:
@@ -391,7 +391,7 @@ def is_structural_deletion(
 
         >>> from gpsea.analysis.predicate.genotype import VariantPredicates
         >>> predicate = VariantPredicates.is_structural_deletion(-20)
-        >>> predicate.get_question()
+        >>> predicate.description
         '(structural type is SO:1000029 OR (variant class is DEL AND change length <= -20))'
 
         Args:

From 812e1756ddfc78f39443839f972e53fa0db735c0 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Tue, 12 Nov 2024 14:21:21 +0100
Subject: [PATCH 05/19] Add convenience methods to `Cohort` to simplify
 reporting.

---
 src/gpsea/analysis/pcats/_impl.py |  6 +++
 src/gpsea/model/_cohort.py        | 77 +++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)

diff --git a/src/gpsea/analysis/pcats/_impl.py b/src/gpsea/analysis/pcats/_impl.py
index 7c75706c..620d404f 100644
--- a/src/gpsea/analysis/pcats/_impl.py
+++ b/src/gpsea/analysis/pcats/_impl.py
@@ -329,6 +329,12 @@ def mtc_filter_results(self) -> typing.Sequence[PhenotypeMtcResult]:
         """
         return self._mtc_filter_results
 
+    def n_filtered_out(self) -> int:
+        """
+        Get the number of phenotype terms that were filtered out by the MTC filter.
+        """
+        return sum(result.is_filtered_out() for result in self.mtc_filter_results)
+
     def __eq__(self, other):
         return isinstance(other, HpoTermAnalysisResult) \
             and super(MultiPhenotypeAnalysisResult, self).__eq__(other) \
diff --git a/src/gpsea/model/_cohort.py b/src/gpsea/model/_cohort.py
index e6eecf3b..e0f1a523 100644
--- a/src/gpsea/model/_cohort.py
+++ b/src/gpsea/model/_cohort.py
@@ -319,6 +319,16 @@ def all_phenotypes(self) -> typing.Set[Phenotype]:
         Get a set of all phenotypes (observed or excluded) in the cohort members.
         """
         return set(itertools.chain(phenotype for patient in self._members for phenotype in patient.phenotypes))
+    
+    def count_distinct_hpo_terms(self) -> int:
+        """
+        Get count of distinct HPO terms (either in present or excluded state) seen in the cohort members.
+        """
+        terms = set(
+            itertools.chain(phenotype.identifier for patient in self._members for phenotype in patient.phenotypes)
+        )
+        return len(terms)
+
 
     def all_measurements(self) -> typing.Set[Measurement]:
         """
@@ -332,6 +342,14 @@ def all_diseases(self) -> typing.Set[Disease]:
         """
         return set(itertools.chain(disease for patient in self._members for disease in patient.diseases))
 
+    def count_with_disease_onset(self) -> int:
+        """
+        Get the count of individuals with recorded disease onset.
+        """
+        return self._count_individuals_with_condition(
+            lambda i: any(d.onset is not None for d in i.diseases),
+        )
+
     def all_variants(self) -> typing.Set[Variant]:
         """
         Get a set of all variants observed in the cohort members.
@@ -483,6 +501,65 @@ def get_variant_by_key(self, variant_key) -> Variant:
         else:
             raise ValueError(f"Variant key {variant_key} not found in cohort.")
 
+    def count_males(self) -> int:
+        """
+        Get the number of males in the cohort.
+        """
+        return self._count_individuals_with_sex(Sex.MALE)
+
+    def count_females(self) -> int:
+        """
+        Get the number of females in the cohort.
+        """
+        return self._count_individuals_with_sex(Sex.FEMALE)
+
+    def count_unknown_sex(self) -> int:
+        """
+        Get the number of individuals with unknown sex in the cohort.
+        """
+        return self._count_individuals_with_sex(Sex.UNKNOWN_SEX)
+
+    def _count_individuals_with_sex(self, sex: Sex) -> int:
+        return self._count_individuals_with_condition(lambda i: i.sex == sex)
+
+    def count_alive(self) -> int:
+        """
+        Get the number of individuals reported to be alive at the time of last encounter.
+        """
+        return self._count_individuals_with_condition(
+            lambda i: i.vital_status is not None and i.vital_status.is_alive
+        )
+
+    def count_deceased(self) -> int:
+        """
+        Get the number of individuals reported to be deceased.
+        """
+        return self._count_individuals_with_condition(
+            lambda i: i.vital_status is not None and i.vital_status.is_deceased
+        )
+    
+    def count_unknown_vital_status(self) -> int:
+        """
+        Get the number of individuals with unknown or no reported vital status.
+        """
+        return self._count_individuals_with_condition(
+            lambda i: i.vital_status is None or i.vital_status.is_unknown
+        )
+
+    def count_with_age_of_last_encounter(self) -> int:
+        """
+        Get the number of individuals with a known age of last encounter.
+        """
+        return self._count_individuals_with_condition(
+            lambda i: i.vital_status is not None and i.vital_status.age_of_death is not None
+        )
+
+    def _count_individuals_with_condition(
+        self,
+        predicate: typing.Callable[[Patient], bool],
+    ) -> int:
+        return sum(predicate(individual) for individual in self._members)
+
     def __eq__(self, other):
         return isinstance(other, Cohort) and self._members == other._members
 

From 758a326fbbad814ba1d808d2441c30a22864a65e Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Tue, 12 Nov 2024 17:07:20 +0100
Subject: [PATCH 06/19] Use the `Cohort` features in `CohortViewer`.

---
 src/gpsea/view/_base.py              |  34 ++++
 src/gpsea/view/_cohort.py            | 181 +++++++------------
 src/gpsea/view/templates/cohort.html | 256 +++++++++++++++------------
 tests/view/test_view.py              |  19 +-
 4 files changed, 254 insertions(+), 236 deletions(-)
 create mode 100644 src/gpsea/view/_base.py

diff --git a/src/gpsea/view/_base.py b/src/gpsea/view/_base.py
new file mode 100644
index 00000000..f26e34a3
--- /dev/null
+++ b/src/gpsea/view/_base.py
@@ -0,0 +1,34 @@
+import abc
+
+from jinja2 import Environment, PackageLoader
+
+
+def pluralize(count: int, stem: str) -> str:
+    if isinstance(count, int):
+        if count == 1:
+            return f"{count} {stem}"
+        else:
+            if stem.endswith("s"):
+                return f"{count} {stem}es"
+            else:
+                return f"{count} {stem}s"
+    else:
+        raise ValueError(f"{count} must be an `int`!")
+
+
+def was_were(count: int) -> str:
+    if isinstance(count, int):
+        if count == 1:
+            return "1 was"
+        else:
+            return f"{count} were"
+    else:
+        raise ValueError(f"{count} must be an `int`!")
+
+
+class BaseViewer(metaclass=abc.ABCMeta):
+
+    def __init__(self):
+        self._environment = Environment(loader=PackageLoader("gpsea.view", "templates"))
+        self._environment.filters["pluralize"] = pluralize
+        self._environment.filters["was_were"] = was_were
diff --git a/src/gpsea/view/_cohort.py b/src/gpsea/view/_cohort.py
index 2cd6d404..557e94ad 100644
--- a/src/gpsea/view/_cohort.py
+++ b/src/gpsea/view/_cohort.py
@@ -2,26 +2,26 @@
 
 from collections import namedtuple, defaultdict
 
-from hpotk import MinimalOntology
-from jinja2 import Environment, PackageLoader
+import hpotk
 
 from gpsea.model import Cohort, Sex
+from ._base import BaseViewer
 from ._report import GpseaReport, HtmlGpseaReport
 from ._formatter import VariantFormatter
 
 ToDisplay = namedtuple('ToDisplay', ['hgvs_cdna', 'hgvsp', 'variant_effects'])
 
 
-class CohortViewer:
+class CohortViewer(BaseViewer):
     """
     `CohortViewer` summarizes the most salient :class:`~gpsea.model.Cohort` aspects into an HTML report.
     """
 
     def __init__(
-            self,
-            hpo: MinimalOntology,
-            top_phenotype_count: int = 10,
-            top_variant_count: int = 10,
+        self,
+        hpo: hpotk.MinimalOntology,
+        top_phenotype_count: int = 10,
+        top_variant_count: int = 10,
     ):
         """
         Args:
@@ -29,11 +29,11 @@ def __init__(
             top_phenotype_count(int): Maximum number of HPO terms to display in the HTML table (default: 10)
             top_variant_count(int): Maximum number of variants to display in the HTML table (default: 10)
         """
+        super().__init__()
         self._hpo = hpo
         self._top_phenotype_count = top_phenotype_count
         self._top_variant_count = top_variant_count
-        environment = Environment(loader=PackageLoader('gpsea.view', 'templates'))
-        self._cohort_template = environment.get_template("cohort.html")
+        self._cohort_template = self._environment.get_template("cohort.html")
 
     def process(
         self,
@@ -62,35 +62,18 @@ def _prepare_context(
     ) -> typing.Mapping[str, typing.Any]:
 
         hpo_counts = list()
-        for hpo in cohort.list_present_phenotypes(top=self._top_phenotype_count):
-            hpo_id = hpo[0]
-            individual_count = hpo[1]
-            hpo_label = "n/a"
-            if hpo_id in self._hpo:
-                hpo_label = self._hpo.get_term_name(hpo_id)
+        for term_id, count in cohort.list_present_phenotypes(top=self._top_phenotype_count):
+            label = self._hpo.get_term_name(term_id)
+            if label is None:
+                label = 'N/A'
+
             hpo_counts.append(
                 {
-                    "HPO": hpo_label,
-                    "ID": hpo_id,
-                    "Count": individual_count,
+                    "label": label,
+                    "term_id": term_id,
+                    "count": count,
                 }
             )
-        # Show the same max number of measuresments as phenotypes
-        measurement_counts = list()
-        for msmt in cohort.list_measurements(top=self._top_phenotype_count):
-            msmt_id = msmt[0]
-            individual_count = msmt[1]
-            msmt_label = "n/a"
-            #if hpo_id in self._hpo:
-            #    hpo_label = self._hpo.get_term_name(hpo_id)
-            msmt_label = "need a way to retrieve measuremnt label"
-            measurement_counts.append(
-                {
-                    "Assay": msmt_label,
-                    "ID": msmt_id,
-                    "Count": individual_count,
-                })
-        n_measurements = len(measurement_counts)
 
         variant_counts = list()
         variant_to_display_d = CohortViewer._get_variant_description(cohort, transcript_id)
@@ -98,87 +81,68 @@ def _prepare_context(
             # get HGVS or human readable variant
             if variant_key in variant_to_display_d:
                 display = variant_to_display_d[variant_key]
-                hgvs_cdna = display.hgvs_cdna
-                protein_name = display.hgvsp
+                hgvsc = display.hgvs_cdna
+                hgvsp = display.hgvsp
                 effects = '' if display.variant_effects is None else ", ".join(display.variant_effects)
             else:
-                display = variant_key
-                hgvs_cdna = ''
-                protein_name = ''
+                hgvsc = ''
+                hgvsp = ''
                 effects = ''
 
             variant_counts.append(
                 {
-                    "variant": variant_key,
-                    "variant_name": hgvs_cdna,
-                    "protein_name": protein_name,
-                    "variant_effects": effects,
-                    "Count": count,
+                    "count": count,
+                    "key": variant_key,
+                    "hgvsc": hgvsc,
+                    "hgvsp": hgvsp,
+                    "effects": effects,
                 }
             )
 
         disease_counts = list()
         for disease_id, disease_count in cohort.list_all_diseases():
-            disease_name = "Unknown"
+            label = "Unknown"
             for disease in cohort.all_diseases():
                 if disease.identifier.value == disease_id:
-                    disease_name = disease.name
+                    label = disease.name
+                    break
+                
             disease_counts.append(
                 {
                     "disease_id": disease_id,
-                    "disease_name": disease_name,
+                    "label": label,
                     "count": disease_count,
                 }
             )
 
-        n_diseases = len(disease_counts)
-
-        var_effects_list = list()
-        var_effects_d = dict()
-        if transcript_id is not None:
-            has_transcript = True
-            data_by_tx = cohort.variant_effect_count_by_tx(tx_id=transcript_id)
-            # e.g., data structure
-            #   -- {'effect}': 'FRAMESHIFT_VARIANT', 'count': 175},
-            #   -- {'effect}': 'STOP_GAINED', 'count': 67},
-            for tx_id, counter in data_by_tx.items():
+        variant_effects = list()
+        has_transcript = transcript_id is not None
+        if has_transcript:
+            var_effects_d = dict()
+            for tx_id, counter in cohort.variant_effect_count_by_tx(tx_id=transcript_id).items():
+                # e.g., data structure
+                #   -- {'effect}': 'FRAMESHIFT_VARIANT', 'count': 175},
+                #   -- {'effect}': 'STOP_GAINED', 'count': 67},
                 if tx_id == transcript_id:
                     for effect, count in counter.items():
                         var_effects_d[effect] = count
+                    break
             total = sum(var_effects_d.values())
             # Sort in descending order based on counts
-            sorted_counts_desc = dict(sorted(var_effects_d.items(), key=lambda item: item[1], reverse=True))
-            for effect, count in sorted_counts_desc.items():
-                percent = f"{round(count / total * 100)}%"
-                var_effects_list.append({"effect": effect, "count": count, "percent": percent})
+            for effect, count in sorted(var_effects_d.items(), key=lambda item: item[1], reverse=True):
+                variant_effects.append(
+                    {
+                        "effect": effect,
+                        "count": count,
+                        "percent": round(count / total * 100),
+                    }
+                )
         else:
-            has_transcript = False
             transcript_id = "MANE transcript ID"
 
-        n_male = 0
-        n_female = 0
-        n_unknown_sex = 0
-        n_deceased = 0
-        n_alive = 0
-        n_has_age_at_last_encounter = 0
         n_has_disease_onset = 0
         hpo_id_to_has_cnset_count_d = defaultdict(int)
         for pat in cohort.all_patients:
-            if pat.sex == Sex.MALE:
-                n_male += 1
-            elif pat.sex == Sex.FEMALE:
-                n_female += 1
-            else:
-                n_unknown_sex += 1
-            patient_not_deceased = True
-            if pat.vital_status is not None:
-                if pat.vital_status.is_deceased:
-                    n_deceased += 1
-                    patient_not_deceased = False
-            if patient_not_deceased:
-                n_alive += 1
-            if pat.age is not None:
-                n_has_age_at_last_encounter += 1
             diseases = pat.diseases
             for d in diseases:
                 if d.onset is not None:
@@ -191,47 +155,34 @@ def _prepare_context(
                     terms_and_ancestors_with_onset_information.update(ancs)
             for hpo_id in terms_and_ancestors_with_onset_information:
                 hpo_id_to_has_cnset_count_d[hpo_id] += 1
+        
         # When we get here, we want to present the counts of HPO terms that have onset information
-        n_individuals = len(cohort.all_patients)
-        onset_threshold = int(0.2 * n_individuals)  # only show terms with a decent amount of information
-        has_onset_information = list()
-        for hpo_id, count in hpo_id_to_has_cnset_count_d.items():
-            if count < onset_threshold:
-                continue
-            label = self._hpo.get_term_name(hpo_id)
-            if label is not None:
-                display_label = f"{label} ({hpo_id})"
-            else:
-                display_label = hpo_id
-            has_onset_information.append({"HPO": display_label, "count": count})
-        n_has_onset_info = len(has_onset_information)
+        # onset_threshold = int(0.2 * len(cohort))  # only show terms with a decent amount of information
+        # has_onset_information = list()
+        # for hpo_id, count in hpo_id_to_has_cnset_count_d.items():
+        #     if count < onset_threshold:
+        #         continue
+        #     label = self._hpo.get_term_name(hpo_id)
+        #     if label is not None:
+        #         display_label = f"{label} ({hpo_id})"
+        #     else:
+        #         display_label = hpo_id
+        #     has_onset_information.append(
+        #         {"HPO": display_label, "count": count})
+        # n_has_onset_info = len(has_onset_information)
 
         # The following dictionary is used by the Jinja2 HTML template
         return {
-            "n_individuals": n_individuals,
-            "n_male": n_male,
-            "n_female": n_female,
-            "n_unknown_sex": n_unknown_sex,
-            "n_deceased": n_deceased,
-            "n_alive": n_alive,
-            "n_has_disease_onset": n_has_disease_onset,
-            "n_has_age_at_last_encounter": n_has_age_at_last_encounter,
-            "has_onset_information": has_onset_information,
-            "n_has_onset_info": n_has_onset_info,
-            "n_excluded": cohort.get_excluded_count(),
-            "total_hpo_count": len(cohort.all_phenotypes()),
+            "cohort": cohort,
+            "transcript_id": transcript_id,
             "top_hpo_count": self._top_phenotype_count,
             "hpo_counts": hpo_counts,
-            "total_measurement_count": n_measurements,
-            "measurement_counts": measurement_counts,
-            "unique_variant_count": len(cohort.all_variant_infos()),
             "top_var_count": self._top_variant_count,
             "var_counts": variant_counts,
-            "n_diseases": n_diseases,
+            # "n_diseases": n_diseases,
             "disease_counts": disease_counts,
             "has_transcript": has_transcript,
-            "var_effects_list": var_effects_list,
-            "transcript_id": transcript_id,
+            "variant_effects": variant_effects,
         }
 
     @staticmethod
diff --git a/src/gpsea/view/templates/cohort.html b/src/gpsea/view/templates/cohort.html
index fdd45d0a..d566d63f 100644
--- a/src/gpsea/view/templates/cohort.html
+++ b/src/gpsea/view/templates/cohort.html
@@ -1,9 +1,10 @@
 <!DOCTYPE html>
 <html lang="en">
+
 <head>
   <meta charset="utf-8">
   <title>Cohort</title>
-    <style>
+  <style>
     table {
       border-collapse: collapse;
       margin: 25px 0;
@@ -16,6 +17,7 @@
     .table .column-1 {
       text-align: left;
     }
+
     th {
       background-color: LightSkyBlue;
       border: 1px solid #dddddd;
@@ -38,7 +40,8 @@
       background-color: #f2f2f2;
     }
 
-    .table td, tr {
+    .table td,
+    tr {
       text-align: right;
     }
 
@@ -50,7 +53,8 @@
       display: block;
       width: 100%;
       padding-right: 1%;
-      margin-bottom: 20px; /* Add margin to separate divs */
+      margin-bottom: 20px;
+      /* Add margin to separate divs */
     }
 
     caption {
@@ -64,133 +68,165 @@
 
 <body>
   <h1>GPSEA cohort analysis</h1>
-  <p>Successfully loaded {{ n_individuals }} individuals. {{ n_male }} were recorded as male, {{ n_female }} as female,
-      and {{ n_unknown_sex }} as unknown sex. {{ n_alive }} were reported to be alive at the time of last
-      encounter, and {{ n_deceased }} were deceased. {{ n_has_disease_onset }} had disease onset information
-      and {{n_has_age_at_last_encounter  }} had information about the age at last encounter. In the cohort,
-      there were a total of {{ total_hpo_count }} HPO terms used for annotation.
-    {% if total_measurement_count > 0 %}
-      Information was provided for {{ total_measurement_count }}
-      measurement assays.
+  <p>
+    Successfully loaded {{ cohort | count | pluralize('individual') }}.
+
+    {# Sex #}
+    {{ cohort.count_males() | was_were }} recorded as male,
+    {{ cohort.count_females() }} as female,
+    and {{ cohort.count_unknown_sex() }} as unknown sex.
+
+    {# Vital status #}
+    {% if cohort | count == cohort.count_unknown_vital_status() %}
+    No information about individuals' vital status was reported.
+    {% else %}
+    {{ cohort.count_alive() | was_were }} reported to be alive at the time of last encounter,
+    {{ cohort.count_deceased() | was_were }} deceased,
+    and vital status was unreported for {{ cohort.count_unknown_vital_status() | pluralize('individual')}}.
+    {% endif %}
+
+    {# HPO terms #}
+    The cohort included {{ cohort.count_distinct_hpo_terms() | pluralize('HPO term') }}.
+
+    {# Disease and age at last encounter #}
+    {{ cohort.count_with_disease_onset() | pluralize('individual')}} had disease onset information
+    and {{ cohort.count_with_age_of_last_encounter() }} had information about the age of last encounter.
+
+
+    {# Measurements #}
+    {% if cohort.all_measurement | count > 0 %}
+    Information was provided for {{ cohort.all_measurement | count | pluralize('measurement assay') }}.
     {% endif %}
   </p>
-  {% if n_excluded > 0 %}
-  <p>Not able to load {{ n_excluded }} individuals.</p>
+
+  {% if cohort.get_excluded_count() > 0 %}
+  <p>Not able to load {{ cohort.get_excluded_count() }} individuals.</p>
   {% else %}
   <p>No errors encountered.</p>
   {% endif %}
 
-    <table>
+  {# The most common HPO terms #}
+  <table>
     <caption style="color: black;">
-        <h3>Top {{top_hpo_count}} HPO Terms</h3>
-        A total of {{ total_hpo_count }} HPO terms were used to annotated the cohort.
+      <h3>Top {{top_hpo_count}} HPO terms</h3>
+      A total of {{ cohort.count_distinct_hpo_terms() }} HPO terms were used to annotated the cohort.
+      {% if n_has_onset_info %} {# Optional onset info #}
+      {{ n_has_onset_info }} had onset information for at least 20% of individuals.
+      {% endif %}
     </caption>
-        <tbody>
-            <tr class="strng">
-                <th class="lft">HPO Term</th>
-                <th>ID</th>
-                <th>Seen in <em>n</em> individuals</th>
-            </tr>
-            {% for hpo_count in hpo_counts %}
-            <tr>
-                <td class="lft">{{ hpo_count.HPO }}</td>
-                <td>{{ hpo_count.ID }}</td>
-                <td>{{ hpo_count.Count }}</td>
-                </tr>
-            {% endfor %}
-                </tbody>
-    </table>
+    <tbody>
+      <tr class="strng">
+        <th class="lft">HPO Term</th>
+        <th>ID</th>
+        <th>Seen in <em>n</em> individuals</th>
+      </tr>
 
-  {% if n_has_onset_info > 0 %}
-      <p> {{ n_has_onset_info }} had onset information for at least 20% of individuals.</p>
-       <caption style="color: black;">
-        <h3>Top {{top_var_count}} HPO Terms</h3>
-        A total of {{ total_hpo_count }} HPO terms were used to annotated the cohort.
-    </caption>
-            <tbody>
-                <tr class="strng">
-                    <th>HPO</th>
-                    <th>Count</th>
-                </tr>
-                {% for onset_info in has_onset_information %}
-                <tr>
-                    <td>{{ onset_info.display_label }}</td>
-                    <td>{{ onset_info.count }}</td>
-                </tr>
-                {% endfor %}
-            </tbody>
-        </table>
-    <table>
+      {% for count in hpo_counts %}
+      <tr>
+        <td class="lft">{{ count['label'] }}</td>
+        <td>{{ count['term_id'] }}</td>
+        <td>{{ count['count'] }}</td>
+      </tr>
+      {% endfor %}
+    </tbody>
+  </table>
 
+  {# The table is inconsistent. Skip it for now!
+  {% if n_has_onset_info > 0 %}
+  <h3>Top {{top_var_count}} HPO Terms</h3>
+  <p>
+    A total of {{ cohort.count_distinct_hpo_terms() }} HPO terms were used to annotated the cohort.
+  </p>
+  <table>
+    <tbody>
+      <tr class="strng">
+        <th>HPO</th>
+        <th>Count</th>
+      </tr>
+      {% for onset_info in has_onset_information %}
+      <tr>
+        <td>{{ onset_info.display_label }}</td>
+        <td>{{ onset_info.count }}</td>
+      </tr>
+      {% endfor %}
+    </tbody>
+  </table>
   {% endif %}
+  #}
 
-    <table>
+  {# The most common variants #}
+  <table>
     <caption style="color: black;">
-        <h3>Top {{top_var_count}} Variants</h3>
-        Variants are shown according to {{ transcript_id }}. A total of {{ unique_variant_count}} unique variants were identified in the cohort.
+      <h3>Top {{top_var_count}} variants</h3>
+      Variants are shown according to {{ transcript_id }}.
+      A total of {{ cohort.all_variant_infos() | count | pluralize('unique variant') }}
+      were identified in the cohort.
     </caption>
-            <tbody>
-                <tr class="strng">
-                    <th>Seen in <em>n</em> individuals</th>
-                    <th class="lft">Variant key</th>
-                    <th>Variant Name</th>
-                    <th>Protein Variant</th>
-                    <th>Variant Class</th>
-                </tr>
-                {% for var_count in var_counts %}
-                <tr>
-                    <td>{{ var_count.Count }}</td>
-                    <td class="lft">{{ var_count.variant }}</td>
-                    <td>{{ var_count.variant_name }}</td>
-                    <td>{{ var_count.protein_name }}</td>
-                    <td>{{ var_count.variant_effects }}</td>
-                </tr>
-                {% endfor %}
-            </tbody>
-        </table>
-    <table>
+    <tbody>
+      <tr class="strng">
+        <th>Seen in <em>n</em> individuals</th>
+        <th class="lft">Variant key</th>
+        <th>Variant Name</th>
+        <th>Protein Effect</th>
+        <th>Variant Class</th>
+      </tr>
+      {% for count in var_counts %}
+      <tr>
+        <td>{{ count['count'] }}</td>
+        <td class="lft">{{ count['key'] }}</td>
+        <td>{{ count['hgvsc'] }}</td>
+        <td>{{ count['hgvsp'] }}</td>
+        <td>{{ count['effects'] }}</td>
+      </tr>
+      {% endfor %}
+    </tbody>
+  </table>
+  
+  {# The most common diseases #}
+  <table>
     <caption style="color: black;">
-        <h3>Diseases</h3>
+      <h3>Diseases</h3>
     </caption>
-        <tbody>
-            <tr class="strng">
-                <th class="lft">Disease Name</th>
-                <th >Disease ID</th>
-                <th>Annotation Count</th>
-              </tr>
-              {% for disease_count in disease_counts %}
-              <tr>
-                <td class="lft">{{ disease_count.disease_name   }}</td>
-                <td>{{ disease_count.disease_id }}</td>
-                <td>{{ disease_count.count }}</td>
-                </tr>
-            {% endfor %}
-           </tbody>
-    </table>
-    {% if has_transcript > 0 %}
-    <table>
+    <tbody>
+      <tr class="strng">
+        <th class="lft">Name</th>
+        <th>ID</th>
+        <th><em>N</em> diagnosed individuals</th>
+      </tr>
+      {% for disease_count in disease_counts %}
+      <tr>
+        <td class="lft">{{ disease_count['label'] }}</td>
+        <td>{{ disease_count['disease_id'] }}</td>
+        <td>{{ disease_count['count'] }}</td>
+      </tr>
+      {% endfor %}
+    </tbody>
+  </table>
+  
+  {# The most common variant effects #}
+  {% if has_transcript %}
+  <table>
     <caption style="color: black;">
-        <h3>Variant categories for {{ transcript_id }}</h3>
+      <h3>Variant categories for {{ transcript_id }}</h3>
     </caption>
-        <tbody>
-            <tr class="strng">
-                <th class="lft">Variant effect</th>
-                <th>Annotation Count</th>
-                <th>Percent</th>
-              </tr>
-              {% for var_effect in var_effects_list %}
-              <tr>
-                <td class="lft">{{ var_effect.effect }}</td>
-                <td>{{ var_effect.count }}</td>
-               <td>{{ var_effect.percent }}</td>
-                </tr>
-            {% endfor %}
-           </tbody>
-    </table>
+    <tbody>
+      <tr class="strng">
+        <th class="lft">Variant effect</th>
+        <th>Annotation count</th>
+        <th>Percent</th>
+      </tr>
+      {% for effect in variant_effects %}
+      <tr>
+        <td class="lft">{{ effect['effect'] }}</td>
+        <td>{{ effect['count'] }}</td>
+        <td>{{ effect['percent'] }}%</td>
+      </tr>
+      {% endfor %}
+    </tbody>
+  </table>
   {% else %}
   <p>Call this function with transcript to see table with variant effect counts.</p>
   {% endif %}
 
-
 </body>
-</html>
+</html>
\ No newline at end of file
diff --git a/tests/view/test_view.py b/tests/view/test_view.py
index 05792b48..8fe802c9 100644
--- a/tests/view/test_view.py
+++ b/tests/view/test_view.py
@@ -1,4 +1,4 @@
-import io
+import os
 
 import hpotk
 import pytest
@@ -7,6 +7,7 @@
 from gpsea.view import CohortViewer
 
 
+@pytest.mark.skip("Just for manual testing and debugging")
 class TestCohortViewer:
 
     @pytest.fixture
@@ -21,17 +22,13 @@ def cohort_viewer(
     def test_process(
         self,
         cohort_viewer: CohortViewer,
-        toy_cohort: Cohort,
+        suox_cohort: Cohort,
+        suox_mane_tx_id: str,
     ):
-        toy_transcript_id = "NM_123.1"
-
         report = cohort_viewer.process(
-            cohort=toy_cohort, transcript_id=toy_transcript_id
+            cohort=suox_cohort,
+            transcript_id=suox_mane_tx_id,
         )
 
-        buf = io.StringIO()
-        report.write(buf)
-        html = buf.getvalue()
-
-        # A dummy test for now.
-        assert len(html) != 0
+        with open(os.path.join("dev", "SUOX.cohort.html"), "w") as fh:
+            report.write(fh)

From 6080fe4167910ebc187918bb2b18e9f90349e05f Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Wed, 13 Nov 2024 09:31:11 +0100
Subject: [PATCH 07/19] Plug in `PhenopacketOntologyTermOnsetParser` into
 phenopacket loading.

---
 src/gpsea/preprocessing/_phenopacket.py | 87 ++++++++++++++++++-------
 1 file changed, 64 insertions(+), 23 deletions(-)

diff --git a/src/gpsea/preprocessing/_phenopacket.py b/src/gpsea/preprocessing/_phenopacket.py
index 3c7486c6..fd8f26ce 100644
--- a/src/gpsea/preprocessing/_phenopacket.py
+++ b/src/gpsea/preprocessing/_phenopacket.py
@@ -10,7 +10,7 @@
 
 import phenopackets.schema.v2.core.individual_pb2 as ppi
 from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket
-from phenopackets.schema.v2.core.base_pb2 import TimeElement as PPTimeElement
+from phenopackets.schema.v2.core.base_pb2 import TimeElement as PPTimeElement, OntologyClass as PPOntologyClass
 from phenopackets.schema.v2.core.phenotypic_feature_pb2 import PhenotypicFeature as PPPhenotypicFeature
 from phenopackets.schema.v2.core.disease_pb2 import Disease as PPDisease
 from phenopackets.schema.v2.core.measurement_pb2 import Measurement as PPMeasurement
@@ -226,6 +226,16 @@ def _looks_like_large_sv(
         return structural_type is not None and gene_context is not None
 
 
+class PhenopacketOntologyTermOnsetParser:
+    
+    def process(
+        self,
+        ontology_class: PPOntologyClass,
+        notepad: Notepad,
+    ) -> typing.Optional[Age]:
+        return None
+
+
 class PhenopacketPatientCreator(PatientCreator[Phenopacket]):
     """
     `PhenopacketPatientCreator` transforms `Phenopacket` into :class:`~gpsea.model.Patient`.
@@ -247,6 +257,7 @@ def __init__(
         functional_annotator: FunctionalAnnotator,
         imprecise_sv_functional_annotator: ImpreciseSvFunctionalAnnotator,
         hgvs_coordinate_finder: VariantCoordinateFinder[str],
+        term_onset_parser: typing.Optional[PhenopacketOntologyTermOnsetParser] = None,
     ):
         self._logger = logging.getLogger(__name__)
         # Violates DI, but it is specific to this class, so I'll leave it "as is".
@@ -255,8 +266,12 @@ def __init__(
         )
         self._gt_parser = PhenopacketGenotypeParser()
         self._validator = validate_instance(validator, ValidationRunner, 'validator')
+        if term_onset_parser is not None:
+            assert isinstance(term_onset_parser, PhenopacketOntologyTermOnsetParser)
+        self._term_onset_parser = term_onset_parser
         self._phenotype_creator = PhenopacketPhenotypicFeatureCreator(
             hpo=validate_instance(hpo, hpotk.MinimalOntology, 'hpo'),
+            term_onset_parser=term_onset_parser,
         )
         self._functional_annotator = validate_instance(
             functional_annotator, FunctionalAnnotator, "functional_annotator"
@@ -404,9 +419,10 @@ def _add_diseases(
             else:
                 term_id = hpotk.TermId.from_curie(dis.term.id)
             
-            if dis.HasField("onset"):
-                onset = PhenopacketPatientCreator._parse_time_element(
+            if dis.HasField("onset") and self._term_onset_parser is not None:
+                onset = parse_onset_element(
                     time_element=dis.onset,
+                    term_onset_parser=self._term_onset_parser,
                     notepad=ith_disease_subsection,
                 )
             else:
@@ -423,22 +439,6 @@ def _add_diseases(
             )
 
         return final_diseases
-    
-    @staticmethod
-    def _parse_time_element(
-        time_element: PPTimeElement,
-        notepad: Notepad,
-    ) -> typing.Optional[Age]:
-        case = time_element.WhichOneof("element")
-        if case == "gestational_age":
-            age = time_element.gestational_age
-            return Age.gestational(weeks=age.weeks, days=age.days)
-        elif case == "age":
-            age = time_element.age
-            return Age.from_iso8601_period(value=age.iso8601duration)
-        else:
-            notepad.add_warning(f"`time_element` is in currently unsupported format `{case}`")
-            return None
 
     def _add_measurements(
         self,
@@ -508,7 +508,7 @@ def _extract_age(
     ) -> typing.Optional[Age]:
         if individual.HasField("time_at_last_encounter"):
             tale = individual.time_at_last_encounter
-            return PhenopacketPatientCreator._parse_time_element(tale, notepad)
+            return parse_age_element(tale, notepad)
         return None
 
     @staticmethod
@@ -530,7 +530,7 @@ def _extract_vital_status(
                 status = Status.UNKNOWN
             
             if vital_status.HasField("time_of_death"):
-                age_of_death = PhenopacketPatientCreator._parse_time_element(
+                age_of_death = parse_age_element(
                     time_element=vital_status.time_of_death,
                     notepad=notepad,
                 )
@@ -746,8 +746,10 @@ class PhenopacketPhenotypicFeatureCreator:
     def __init__(
         self,
         hpo: hpotk.MinimalOntology,
+        term_onset_parser: typing.Optional[PhenopacketOntologyTermOnsetParser],
     ):
         self._hpo = hpo
+        self._term_onset_parser = term_onset_parser
 
     def process(
         self,
@@ -795,9 +797,10 @@ def process(
                 f'Replace {term_id} with the primary term ID {term.identifier}',
             )
         
-        if pf.HasField("onset"):
-            onset = PhenopacketPatientCreator._parse_time_element(
+        if pf.HasField("onset") and self._term_onset_parser is not None:
+            onset = parse_onset_element(
                 time_element=pf.onset,
+                term_onset_parser=self._term_onset_parser,
                 notepad=notepad,
             )
         else:
@@ -808,3 +811,41 @@ def process(
             is_observed=not pf.excluded,
             onset=onset,
         )
+
+def parse_onset_element(
+    time_element: PPTimeElement,
+    term_onset_parser: PhenopacketOntologyTermOnsetParser,
+    notepad: Notepad,
+) -> typing.Optional[Age]:
+    """
+    We allow to use `GestationalAge`, `Age` or `OntologyClass` as onset.
+    """
+    case = time_element.WhichOneof("element")
+    if case == "ontology_class":
+        term_onset_parser.process(
+            ontology_class=time_element.ontology_class,
+            notepad=notepad,
+        )
+    else:
+        return parse_age_element(
+            time_element=time_element,
+            notepad=notepad,
+        )
+
+def parse_age_element(
+    time_element: PPTimeElement,
+    notepad: Notepad,
+) -> typing.Optional[Age]:
+    """
+    We allow to use `GestationalAge` or `Age` as age.
+    """
+    case = time_element.WhichOneof("element")
+    if case == "gestational_age":
+        age = time_element.gestational_age
+        return Age.gestational(weeks=age.weeks, days=age.days)
+    elif case == "age":
+        age = time_element.age
+        return Age.from_iso8601_period(value=age.iso8601duration)
+    else:
+        notepad.add_warning(f"`time_element` is in currently unsupported format `{case}`")
+        return None

From a8fe9458f407528d71adf565b677d2cbf6b4d70b Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Wed, 13 Nov 2024 10:49:33 +0100
Subject: [PATCH 08/19] Implement `PhenopacketOntologyTermOnsetParser`.

---
 src/gpsea/model/_temporal.py                  | 28 +++++-
 src/gpsea/preprocessing/_phenopacket.py       | 85 ++++++++++++++++++-
 src/gpsea/preprocessing/_test__phenopacket.py | 44 ++++++++++
 tests/model/test_temporal.py                  | 29 ++++++-
 4 files changed, 181 insertions(+), 5 deletions(-)
 create mode 100644 src/gpsea/preprocessing/_test__phenopacket.py

diff --git a/src/gpsea/model/_temporal.py b/src/gpsea/model/_temporal.py
index 2aede84e..32329eb1 100644
--- a/src/gpsea/model/_temporal.py
+++ b/src/gpsea/model/_temporal.py
@@ -92,6 +92,23 @@ def gestational(
         total = weeks * Age.DAYS_IN_WEEK + days
         return Age(days=float(total), timeline=Timeline.GESTATIONAL)
 
+    @staticmethod
+    def last_menstrual_period() -> "Age":
+        """
+        Age of an individual at last menstrual period.
+        """
+        return LAST_MENSTRUAL_PERIOD
+
+    @staticmethod
+    def gestational_days(days: typing.Union[int, float]) -> "Age":
+        """
+        Create a gestational age corresponding to the number of `days`
+        since the last menstrual period.
+        """
+        if isinstance(days, int):
+            days = float(days)
+        return Age(days=days, timeline=Timeline.GESTATIONAL)
+
     @staticmethod
     def birth() -> "Age":
         """
@@ -100,8 +117,14 @@ def birth() -> "Age":
         return BIRTH
 
     @staticmethod
-    def postnatal_days(days: int) -> "Age":
-        return Age(days=float(days), timeline=Timeline.POSTNATAL)
+    def postnatal_days(days: typing.Union[int, float]) -> "Age":
+        """
+        Create a postnatal age corresponding to the number of `days`
+        since birth.
+        """
+        if isinstance(days, int):
+            days = float(days)
+        return Age(days=days, timeline=Timeline.POSTNATAL)
 
     @staticmethod
     def postnatal_years(
@@ -274,3 +297,4 @@ def __str__(self) -> str:
 
 
 BIRTH = Age(days=0.0, timeline=Timeline.POSTNATAL)
+LAST_MENSTRUAL_PERIOD = Age(days=0.0, timeline=Timeline.GESTATIONAL)
diff --git a/src/gpsea/preprocessing/_phenopacket.py b/src/gpsea/preprocessing/_phenopacket.py
index fd8f26ce..ffc32ce9 100644
--- a/src/gpsea/preprocessing/_phenopacket.py
+++ b/src/gpsea/preprocessing/_phenopacket.py
@@ -227,13 +227,96 @@ def _looks_like_large_sv(
 
 
 class PhenopacketOntologyTermOnsetParser:
+    """
+    Parser for mapping an onset formatted as an ontology class to the corresponding :class:`~gpsea.model.Age`.
+
+    Each HPO onset includes start and end bounds (e.g. 29th day to 16th year for Pediatric onset) of the onset range
+    and the onset is mapped into the midpoint of the range.
+
+    Use `default_parser` to create the parser for parsing current HPO
+    or provide the curie -> :class:`~gpsea.model.Age` mapping via `__init__`.
+    """
     
+    @staticmethod
+    def default_parser() -> "PhenopacketOntologyTermOnsetParser":
+        # These ranges are horribly general.
+        # Assuming 40 weeks as birth date and 80 years as age of death.
+        weeks_at_birth = 40
+        age_at_death = 80
+        term_id_to_range={
+            'HP:0030674': (Age.last_menstrual_period(), Age.gestational(weeks=weeks_at_birth)),  # Antenatal onset
+            'HP:0011460': (Age.last_menstrual_period(), Age.gestational(weeks=11)),  # Embryonal onset
+            'HP:0011461': (Age.gestational(weeks=11), Age.gestational(weeks=weeks_at_birth)),  # Fetal onset
+            'HP:0034199': (Age.gestational(weeks=11), Age.gestational(weeks=14)),  # Late first trimester onset
+            'HP:0034198': (Age.gestational(weeks=14), Age.gestational(weeks=28)),  # Second trimester onset
+            'HP:0034197': (Age.gestational(weeks=28), Age.gestational(weeks=weeks_at_birth)),  # Third trimester onset
+            
+            'HP:0003577': (Age.birth(), Age.birth()),  # Congenital onset
+            'HP:0003623': (Age.birth(), Age.postnatal_days(29)),  # Neonatal onset
+
+            'HP:0410280': (Age.postnatal_days(29), Age.postnatal_years(16)),  # Pediatric onset
+            'HP:0003593': (Age.postnatal_days(29), Age.postnatal_years(1)),  # Infantile onset
+            'HP:0011463': (Age.postnatal_years(1), Age.postnatal_years(5)),  # Childhood onset
+            'HP:0003621': (Age.postnatal_years(5), Age.postnatal_years(16)),  # Juvenile onset
+
+            'HP:0003581': (Age.postnatal_years(16), Age.postnatal_years(age_at_death)),  # Adult onset
+            'HP:0011462': (Age.postnatal_years(16), Age.postnatal_years(40)),  # Young adult onset
+            'HP:0025708': (Age.postnatal_years(16), Age.postnatal_years(19)),  # Early young adult onset
+            'HP:0025709': (Age.postnatal_years(19), Age.postnatal_years(25)),  # Intermediate young adult onset
+            'HP:0025710': (Age.postnatal_years(25), Age.postnatal_years(40)),  # Late young adult onset
+            
+            'HP:0003596': (Age.postnatal_years(40), Age.postnatal_years(60)),  # Middle age onset
+            'HP:0003584': (Age.postnatal_years(60), Age.postnatal_years(age_at_death)),  # Late onset
+        }
+        return PhenopacketOntologyTermOnsetParser(
+            term_id_to_age={
+                curie: PhenopacketOntologyTermOnsetParser._median_age(start, end)
+                for curie, (start, end) in term_id_to_range.items()
+            },
+        )
+
+    @staticmethod
+    def _median_age(left: Age, right: Age) -> Age:
+        # Assuming right is at or after left
+        days = left.days + ((right.days - left.days) / 2) 
+        if left.is_gestational and right.is_gestational:
+            return Age.gestational_days(days=days)
+        elif left.is_postnatal and right.is_postnatal:
+            return Age.postnatal_days(days=days)
+        else:
+            raise ValueError(f'`left` and `right` must be on the same timeline, but left={left.timeline}, right={right.timeline}`')
+
+    def __init__(
+        self,
+        term_id_to_age: typing.Mapping[str, Age],
+    ):
+        """
+        Create the onset parser from term to age mapping.
+
+        :param term_id_to_age: a mapping from HPO curie (e.g. `HP:0410280`) to the corresponding :class:`~gpsea.model.Age`.
+        """
+        self._term_id_to_age = dict(term_id_to_age)
+
     def process(
         self,
         ontology_class: PPOntologyClass,
         notepad: Notepad,
     ) -> typing.Optional[Age]:
-        return None
+        curie = ontology_class.id       
+        if curie.startswith('HP:'):
+            age = self._term_id_to_age.get(curie, None)
+            if age is None:
+                notepad.add_warning(
+                    f'Unknown onset term {curie}',
+                    solution='Use a term from HPO\'s Onset [HP:0003674] module',
+                )
+            return age
+        else:
+            notepad.add_warning(
+                f'Unsupported ontology class {curie}',
+                solution='Use a term from HPO\'s Onset [HP:0003674] module',
+            )
+            return None
 
 
 class PhenopacketPatientCreator(PatientCreator[Phenopacket]):
diff --git a/src/gpsea/preprocessing/_test__phenopacket.py b/src/gpsea/preprocessing/_test__phenopacket.py
new file mode 100644
index 00000000..54418b9c
--- /dev/null
+++ b/src/gpsea/preprocessing/_test__phenopacket.py
@@ -0,0 +1,44 @@
+import pytest
+
+from stairval.notepad import create_notepad
+from phenopackets.schema.v2.core.base_pb2 import OntologyClass
+
+from ._phenopacket import PhenopacketOntologyTermOnsetParser
+
+class TestPhenopacketOntologyTermOnsetParser:
+
+    @pytest.fixture(scope='class')
+    def parser(self) -> PhenopacketOntologyTermOnsetParser:
+        return PhenopacketOntologyTermOnsetParser.default_parser()
+
+    @pytest.mark.parametrize(
+        'curie, days, is_postnatal',
+        [
+            ('HP:0030674', 140., False),   # Antenatal onset
+            ('HP:0011460', 38.5, False),   # Embryonal onset
+            
+            ('HP:0003577', 0., True),      # Congenital onset
+            ('HP:0003623', 14.5, True),    # Neonatal onset
+            ('HP:0410280', 2936.5, True),  # Pediatric onset
+            ('HP:0011462', 10227.0, True), # Young adult onset
+
+            ('HP:0003584', 25567.5, True), # Late onset
+        ]
+    )
+    def test_process(
+        self,
+        curie: str,
+        days: float,
+        is_postnatal: bool,
+        parser: PhenopacketOntologyTermOnsetParser,
+    ):
+        notepad = create_notepad(label='Top')
+        oc = OntologyClass(id=curie, label='Whatever')
+        
+        age = parser.process(oc, notepad)
+        
+        assert age is not None
+        assert age.is_postnatal == is_postnatal
+        assert age.days == pytest.approx(days)
+
+        assert not notepad.has_errors_or_warnings(include_subsections=True)
diff --git a/tests/model/test_temporal.py b/tests/model/test_temporal.py
index 78e228bf..2c0170de 100644
--- a/tests/model/test_temporal.py
+++ b/tests/model/test_temporal.py
@@ -1,3 +1,5 @@
+import typing
+
 import pytest
 
 from gpsea.model import Age
@@ -13,11 +15,33 @@ class TestAge:
             (20, 0, 140.0),
         ],
     )
-    def test_gestational(self, weeks: int, days: int, expected: int):
+    def test_gestational(
+        self,
+        weeks: int,
+        days: int,
+        expected: int,
+    ):
         age = Age.gestational(weeks, days=days)
         assert age.days == expected
         assert age.is_gestational
 
+    @pytest.mark.parametrize(
+        "days, expected",
+        [
+            (6, 6.0),
+            (10., 10.),  # input as `float` is also OK.
+        ],
+    )
+    def test_gestational_days(
+        self,
+        days: typing.Union[int, float],
+        expected: int,
+    ):
+        age = Age.gestational_days(days=days)
+        assert age.days == expected
+        assert age.is_gestational
+
+
     @pytest.mark.parametrize(
         "years, expected",
         [
@@ -40,11 +64,12 @@ def test_postnatal_years(
             (0, 0.),
             (1, 1.),
             (10, 10.),
+            (10., 10.),  # input as `float` is also OK.
         ],
     )
     def test_postnatal_days(
         self,
-        days: int,
+        days: typing.Union[int, float],
         expected: float,
     ):
         age = Age.postnatal_days(days=days)

From a49f57ea75be3c7b01fbb7b5b99549efe9a8be36 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Wed, 13 Nov 2024 11:27:52 +0100
Subject: [PATCH 09/19] Parse onsets formatted as an ontology class.

---
 src/gpsea/model/_cohort.py                    | 62 +++++++++++++++----
 src/gpsea/preprocessing/__init__.py           |  4 +-
 src/gpsea/preprocessing/_config.py            | 12 +++-
 src/gpsea/preprocessing/_phenopacket.py       | 17 ++---
 tests/preprocessing/test_phenopacket.py       | 16 ++++-
 .../PMID_30968594_individual_1.json           |  6 ++
 6 files changed, 94 insertions(+), 23 deletions(-)

diff --git a/src/gpsea/model/_cohort.py b/src/gpsea/model/_cohort.py
index e0f1a523..55898bb8 100644
--- a/src/gpsea/model/_cohort.py
+++ b/src/gpsea/model/_cohort.py
@@ -13,6 +13,12 @@
 from ._variant import Variant, VariantInfo
 
 
+I = typing.TypeVar('I', bound=hpotk.model.Identified)
+"""
+Anything that extends `Identified` (e.g. `Disease`, `Phenotype`, `Measurement`).
+"""
+
+
 class Status(enum.Enum):
     UNKNOWN = 0
     ALIVE = 1
@@ -162,6 +168,16 @@ def phenotypes(self) -> typing.Sequence[Phenotype]:
         Get the phenotypes observed and excluded in the patient.
         """
         return self._phenotypes
+    
+    def phenotype_by_id(
+        self,
+        term_id: typing.Union[str, hpotk.TermId],
+    ) -> typing.Optional[Phenotype]:
+        """
+        Get a phenotype with an identifier or `None` if the individual has no such phenotype.
+        """
+        term_id = Patient._check_id(term_id)
+        return Patient._find_first_by_id(term_id, self.phenotypes)
 
     @property
     def measurements(self) -> typing.Sequence[Measurement]:
@@ -181,18 +197,8 @@ def measurement_by_id(
             representing the term ID of a measurement (e.g. `LOINC:2986-8` for *Testosterone[Mass/Vol]*).
         :returns: the corresponding :class:`Measurement` or `None` if not found in the patient.
         """
-        if isinstance(term_id, str):
-            pass
-        elif isinstance(term_id, hpotk.TermId):
-            term_id = term_id.value
-        else:
-            raise ValueError(f'`term_id` must be a `str` or `hpotk.TermId` but was {type(term_id)}')
-
-        for m in self._measurements:
-            if m.identifier.value == term_id:
-                return m
-
-        return None
+        term_id = Patient._check_id(term_id)
+        return Patient._find_first_by_id(term_id, self.measurements)
 
     @property
     def diseases(self) -> typing.Sequence[Disease]:
@@ -201,6 +207,16 @@ def diseases(self) -> typing.Sequence[Disease]:
         """
         return self._diseases
 
+    def disease_by_id(
+        self,
+        term_id: typing.Union[str, hpotk.TermId],
+    ) -> typing.Optional[Disease]:
+        """
+        Get a disease with an identifier or `None` if the individual has no such disease.
+        """
+        term_id = Patient._check_id(term_id)
+        return Patient._find_first_by_id(term_id, self.diseases)
+
     @property
     def variants(self) -> typing.Sequence[Variant]:
         """
@@ -232,6 +248,28 @@ def excluded_diseases(self) -> typing.Iterator[Disease]:
         """
         return filter(lambda d: not d.is_present, self._diseases)
 
+    @staticmethod
+    def _check_id(
+        term_id: typing.Union[str, hpotk.TermId],
+    ) -> hpotk.TermId:
+        if isinstance(term_id, str):
+            return hpotk.TermId.from_curie(term_id)
+        elif isinstance(term_id, hpotk.TermId):
+            return term_id
+        else:
+            raise ValueError(f'`term_id` must be a `str` or `hpotk.TermId` but was {type(term_id)}')
+        
+    @staticmethod
+    def _find_first_by_id(
+        term_id: hpotk.TermId,
+        items: typing.Iterable[I],
+    ) -> typing.Optional[I]:
+        for m in items:
+            if m.identifier == term_id:
+                return m
+
+        return None
+
     def __str__(self) -> str:
         return (f"Patient("
                 f"labels:{self._labels}, "
diff --git a/src/gpsea/preprocessing/__init__.py b/src/gpsea/preprocessing/__init__.py
index 3c85d3e6..7cfb2797 100644
--- a/src/gpsea/preprocessing/__init__.py
+++ b/src/gpsea/preprocessing/__init__.py
@@ -6,7 +6,7 @@
 from ._config import configure_default_protein_metadata_service, configure_protein_metadata_service
 from ._generic import DefaultImpreciseSvFunctionalAnnotator
 from ._patient import PatientCreator, CohortCreator
-from ._phenopacket import PhenopacketVariantCoordinateFinder, PhenopacketPatientCreator
+from ._phenopacket import PhenopacketVariantCoordinateFinder, PhenopacketPatientCreator, PhenopacketOntologyTermOnsetParser
 from ._protein import ProteinAnnotationCache, ProtCachingMetadataService
 from ._uniprot import UniprotProteinMetadataService
 from ._variant import VariantAnnotationCache, VarCachingFunctionalAnnotator
@@ -18,7 +18,7 @@
     'configure_default_protein_metadata_service', 'configure_protein_metadata_service',
     'VariantCoordinateFinder', 'FunctionalAnnotator', 'ImpreciseSvFunctionalAnnotator', 'ProteinMetadataService',
     'PatientCreator', 'CohortCreator',
-    'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator',
+    'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator', 'PhenopacketOntologyTermOnsetParser',
     'load_phenopacket_folder', 'load_phenopacket_files', 'load_phenopackets',
     'PreprocessingValidationResult',
     'TranscriptCoordinateService', 'GeneCoordinateService',
diff --git a/src/gpsea/preprocessing/_config.py b/src/gpsea/preprocessing/_config.py
index b69ac7e9..e8b99d5c 100644
--- a/src/gpsea/preprocessing/_config.py
+++ b/src/gpsea/preprocessing/_config.py
@@ -24,7 +24,7 @@
 from ._api import FunctionalAnnotator, PreprocessingValidationResult
 from ._generic import DefaultImpreciseSvFunctionalAnnotator
 from ._patient import CohortCreator
-from ._phenopacket import PhenopacketPatientCreator
+from ._phenopacket import PhenopacketPatientCreator, PhenopacketOntologyTermOnsetParser
 
 from ._protein import (
     ProteinMetadataService,
@@ -44,6 +44,7 @@ def configure_caching_cohort_creator(
     genome_build: str = "GRCh38.p13",
     validation_runner: typing.Optional[ValidationRunner] = None,
     cache_dir: typing.Optional[str] = None,
+    include_ontology_class_onsets: bool = True,
     variant_fallback: str = "VEP",
     timeout: float = 30.0,
 ) -> CohortCreator[Phenopacket]:
@@ -58,6 +59,8 @@ def configure_caching_cohort_creator(
     :param cache_dir: path to the folder where we will cache the results fetched from the remote APIs or `None`
         if the cache location should be determined as described in :func:`~gpsea.config.get_cache_dir_path`.
         In any case, the directory will be created if it does not exist (including non-existing parents).
+    :param include_ontology_class_onsets: `True` if onsets in the ontology class format 
+        (e.g. `HP:0003621` for Juvenile onset) should be included (default `True`).
     :param variant_fallback: the fallback variant annotator to use if we cannot find the annotation locally.
      Choose from ``{'VEP'}`` (just one fallback implementation is available at the moment).
     :param timeout: timeout in seconds for the REST APIs
@@ -73,6 +76,7 @@ def configure_caching_cohort_creator(
         build, cache_dir, timeout
     )
     hgvs_annotator = VVHgvsVariantCoordinateFinder(build)
+    term_onset_parser = PhenopacketOntologyTermOnsetParser.default_parser() if include_ontology_class_onsets else None
     pc = PhenopacketPatientCreator(
         hpo=hpo,
         validator=validator,
@@ -80,6 +84,7 @@ def configure_caching_cohort_creator(
         functional_annotator=functional_annotator,
         imprecise_sv_functional_annotator=imprecise_sv_functional_annotator,
         hgvs_coordinate_finder=hgvs_annotator,
+        term_onset_parser=term_onset_parser,
     )
 
     return CohortCreator(pc)
@@ -89,6 +94,7 @@ def configure_cohort_creator(
     hpo: hpotk.MinimalOntology,
     genome_build: str = "GRCh38.p13",
     validation_runner: typing.Optional[ValidationRunner] = None,
+    include_ontology_class_onsets: bool = True,
     variant_fallback: str = "VEP",
     timeout: float = 30.0,
 ) -> CohortCreator[Phenopacket]:
@@ -102,6 +108,8 @@ def configure_cohort_creator(
     :param validation_runner: an instance of the validation runner.
      if the data should be cached in `.cache` folder in the current working directory.
      In any case, the directory will be created if it does not exist (including non-existing parents).
+    :param include_ontology_class_onsets: `True` if onsets in the ontology class format 
+        (e.g. `HP:0003621` for Juvenile onset) should be included (default `True`).
     :param variant_fallback: the fallback variant annotator to use if we cannot find the annotation locally.
      Choose from ``{'VEP'}`` (just one fallback implementation is available at the moment).
     :param timeout: timeout in seconds for the VEP API
@@ -116,6 +124,7 @@ def configure_cohort_creator(
         timeout=timeout,
     )
     hgvs_annotator = VVHgvsVariantCoordinateFinder(build)
+    term_onset_parser = PhenopacketOntologyTermOnsetParser.default_parser() if include_ontology_class_onsets else None
     pc = PhenopacketPatientCreator(
         hpo=hpo,
         validator=validator,
@@ -123,6 +132,7 @@ def configure_cohort_creator(
         functional_annotator=functional_annotator,
         imprecise_sv_functional_annotator=imprecise_sv_functional_annotator,
         hgvs_coordinate_finder=hgvs_annotator,
+        term_onset_parser=term_onset_parser,
     )
 
     return CohortCreator(pc)
diff --git a/src/gpsea/preprocessing/_phenopacket.py b/src/gpsea/preprocessing/_phenopacket.py
index ffc32ce9..c7d90fb8 100644
--- a/src/gpsea/preprocessing/_phenopacket.py
+++ b/src/gpsea/preprocessing/_phenopacket.py
@@ -502,7 +502,7 @@ def _add_diseases(
             else:
                 term_id = hpotk.TermId.from_curie(dis.term.id)
             
-            if dis.HasField("onset") and self._term_onset_parser is not None:
+            if dis.HasField("onset"):
                 onset = parse_onset_element(
                     time_element=dis.onset,
                     term_onset_parser=self._term_onset_parser,
@@ -880,7 +880,7 @@ def process(
                 f'Replace {term_id} with the primary term ID {term.identifier}',
             )
         
-        if pf.HasField("onset") and self._term_onset_parser is not None:
+        if pf.HasField("onset"):
             onset = parse_onset_element(
                 time_element=pf.onset,
                 term_onset_parser=self._term_onset_parser,
@@ -897,7 +897,7 @@ def process(
 
 def parse_onset_element(
     time_element: PPTimeElement,
-    term_onset_parser: PhenopacketOntologyTermOnsetParser,
+    term_onset_parser: typing.Optional[PhenopacketOntologyTermOnsetParser],
     notepad: Notepad,
 ) -> typing.Optional[Age]:
     """
@@ -905,10 +905,13 @@ def parse_onset_element(
     """
     case = time_element.WhichOneof("element")
     if case == "ontology_class":
-        term_onset_parser.process(
-            ontology_class=time_element.ontology_class,
-            notepad=notepad,
-        )
+        if term_onset_parser is None:
+            return None
+        else:
+            return term_onset_parser.process(
+                ontology_class=time_element.ontology_class,
+                notepad=notepad,
+            )
     else:
         return parse_age_element(
             time_element=time_element,
diff --git a/tests/preprocessing/test_phenopacket.py b/tests/preprocessing/test_phenopacket.py
index 6c1a2e27..336519ca 100644
--- a/tests/preprocessing/test_phenopacket.py
+++ b/tests/preprocessing/test_phenopacket.py
@@ -24,7 +24,7 @@
     ImpreciseSvFunctionalAnnotator,
     DefaultImpreciseSvFunctionalAnnotator,
 )
-from gpsea.preprocessing import PhenopacketPatientCreator
+from gpsea.preprocessing import PhenopacketPatientCreator, PhenopacketOntologyTermOnsetParser
 from gpsea.preprocessing import VVMultiCoordinateService
 
 
@@ -173,6 +173,10 @@ def variant_coordinate_finder(
         return VVHgvsVariantCoordinateFinder(
             genome_build=genome_build,
         )
+    
+    @pytest.fixture(scope='class')
+    def onset_term_parser(self) -> PhenopacketOntologyTermOnsetParser:
+        return PhenopacketOntologyTermOnsetParser.default_parser()
 
     @pytest.fixture
     def patient_creator(
@@ -183,6 +187,7 @@ def patient_creator(
         functional_annotator: FunctionalAnnotator,
         imprecise_sv_functional_annotator: ImpreciseSvFunctionalAnnotator,
         variant_coordinate_finder: VariantCoordinateFinder,
+        onset_term_parser: PhenopacketOntologyTermOnsetParser,
     ) -> PhenopacketPatientCreator:
         return PhenopacketPatientCreator(
             hpo=hpo,
@@ -191,6 +196,7 @@ def patient_creator(
             functional_annotator=functional_annotator,
             imprecise_sv_functional_annotator=imprecise_sv_functional_annotator,
             hgvs_coordinate_finder=variant_coordinate_finder,
+            term_onset_parser=onset_term_parser,
         )
 
     @pytest.fixture
@@ -251,6 +257,14 @@ def test_phenopacket_patient_creator(
             True,
             False,
         )
+        # Check onset of Hyperpigmentation of the skin `HP:0000953`
+        hyperpigmentation_of_the_skin = patient.phenotype_by_id('HP:0000953')
+        assert hyperpigmentation_of_the_skin is not None
+        # Expecting congenital onset
+        onset = hyperpigmentation_of_the_skin.onset
+        assert onset is not None
+        assert onset.days == pytest.approx(0.)
+        assert onset.is_postnatal
 
         # 6 measurements
         assert len(patient.measurements) == 6
diff --git a/tests/test_data/phenopackets/PMID_30968594_individual_1.json b/tests/test_data/phenopackets/PMID_30968594_individual_1.json
index 81b5f510..71236669 100644
--- a/tests/test_data/phenopackets/PMID_30968594_individual_1.json
+++ b/tests/test_data/phenopackets/PMID_30968594_individual_1.json
@@ -22,6 +22,12 @@
       "type": {
         "id": "HP:0000953",
         "label": "Hyperpigmentation of the skin"
+      },
+      "onset": {
+        "ontologyClass": {
+          "id": "HP:0003577",
+          "label": "Congenital onset"
+        }
       }
     },
     {

From 22351f5bd682b93187b98162a828f3ee97d1d3df Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Wed, 13 Nov 2024 11:57:36 +0100
Subject: [PATCH 10/19] Add method for setting up the default HPO term
 analysis.

---
 src/gpsea/analysis/pcats/__init__.py |  4 ++++
 src/gpsea/analysis/pcats/_config.py  | 29 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 src/gpsea/analysis/pcats/_config.py

diff --git a/src/gpsea/analysis/pcats/__init__.py b/src/gpsea/analysis/pcats/__init__.py
index 59eceb97..51042892 100644
--- a/src/gpsea/analysis/pcats/__init__.py
+++ b/src/gpsea/analysis/pcats/__init__.py
@@ -16,16 +16,20 @@
 
 The results are provided as :class:`MultiPhenotypeAnalysisResult`
 (or more specific :class:`HpoTermAnalysisResult` for :class:`HpoTermAnalysis`).
+
+Use :func:`configure_hpo_term_analysis` to configure the HPO term analysis with the default parameters. 
 """
 
 from ._impl import MultiPhenotypeAnalysis, MultiPhenotypeAnalysisResult
 from ._impl import DiseaseAnalysis
 from ._impl import HpoTermAnalysis, HpoTermAnalysisResult
 from ._impl import apply_predicates_on_patients
+from ._config import configure_hpo_term_analysis
 
 __all__ = [
     'MultiPhenotypeAnalysis', 'MultiPhenotypeAnalysisResult',
     'DiseaseAnalysis',
     'HpoTermAnalysis', 'HpoTermAnalysisResult',
     'apply_predicates_on_patients',
+    'configure_hpo_term_analysis',
 ]
diff --git a/src/gpsea/analysis/pcats/_config.py b/src/gpsea/analysis/pcats/_config.py
new file mode 100644
index 00000000..2a06fc3e
--- /dev/null
+++ b/src/gpsea/analysis/pcats/_config.py
@@ -0,0 +1,29 @@
+import typing
+
+import hpotk
+
+from ..mtc_filter import HpoMtcFilter
+from ._impl import HpoTermAnalysis
+from .stats import CountStatistic, FisherExactTest
+
+
+def configure_hpo_term_analysis(
+    hpo: hpotk.MinimalOntology,
+    count_statistic: CountStatistic = FisherExactTest(),
+    mtc_correction: typing.Optional[str] = "fdr_bh",
+    mtc_alpha: float = 0.05,
+) -> HpoTermAnalysis:
+    """
+    Configure HPO term analysis with default parameters.
+
+    The default analysis will pre-filter HPO terms with :class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`,
+    then compute nominal p values using `count_statistic` (default Fisher exact test),
+    and apply multiple testing correction (default Benjamini/Hochberg (`fdr_bh`))
+    with target `mtc_alpha` (default `0.05`).
+    """
+    return HpoTermAnalysis(
+        mtc_filter=HpoMtcFilter.default_filter(hpo),
+        count_statistic=count_statistic,
+        mtc_correction=mtc_correction,
+        mtc_alpha=mtc_alpha,
+    )

From 525bd81e1ef8c9bcf44571b3182f894bf2f67fe8 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Wed, 13 Nov 2024 13:37:15 +0100
Subject: [PATCH 11/19] Document the intended usage
 `configure_hpo_term_analysis`.

---
 docs/tutorial.rst                             | 35 ++------
 docs/user-guide/analyses/phenotype-groups.rst | 81 ++++++++++++-------
 docs/user-guide/mtc.rst                       | 28 +++----
 3 files changed, 70 insertions(+), 74 deletions(-)

diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 48972404..71e1fdbc 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -237,39 +237,20 @@ Testing multiple hypothesis on the same dataset increases the chance of receivin
 However, GPSEA simplifies the application of an appropriate multiple testing correction.
 
 For general use, we recommend using a combination
-of a *Phenotype MTC filter* (:class:`~gpsea.analysis.mtc_filter.PhenotypeMtcFilter`) with a *multiple testing correction*.
-Phenotype MTC filter chooses the HPO terms to test according to several heuristics, which
+of a *phenotype MT filter* (:class:`~gpsea.analysis.mtc_filter.PhenotypeMtcFilter`) with a *multiple testing correction*.
+Phenotype MT filter chooses the HPO terms to test according to several heuristics, which
 reduce the multiple testing burden and focus the analysis
-on the most interesting terms (see :ref:`HPO MTC filter <hpo-mtc-filter-strategy>` for more info).
+on the most interesting terms (see :ref:`HPO MT filter <hpo-mtc-filter-strategy>` for more info).
 Then the multiple testing correction, such as Bonferroni or Benjamini-Hochberg,
 is used to control the family-wise error rate or the false discovery rate.
 See :ref:`mtc` for more information.
 
-In this example, we will use a combination of the HPO MTC filter (:class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`)
-with Benjamini-Hochberg procedure (``mtc_correction='fdr_bh'``)
-with a false discovery control level at (``mtc_alpha=0.05``):
+>>> from gpsea.analysis.pcats import configure_hpo_term_analysis
+>>> analysis = configure_hpo_term_analysis(hpo)
 
->>> from gpsea.analysis.mtc_filter import HpoMtcFilter
->>> mtc_filter = HpoMtcFilter.default_filter(hpo)
->>> mtc_correction = 'fdr_bh'
->>> mtc_alpha = 0.05
-
-Choosing the statistical procedure for assessment of association between genotype and phenotype
-groups is the last missing piece of the analysis. We will use Fisher Exact Test:
-
->>> from gpsea.analysis.pcats.stats import FisherExactTest
->>> count_statistic = FisherExactTest()
-
-and we finalize the analysis setup by putting all components together
-into :class:`~gpsea.analysis.pcats.HpoTermAnalysis`:
-
->>> from gpsea.analysis.pcats import HpoTermAnalysis
->>> analysis = HpoTermAnalysis(
-...     count_statistic=count_statistic,
-...     mtc_filter=mtc_filter,
-...     mtc_correction=mtc_correction,
-...     mtc_alpha=mtc_alpha,
-... )
+:func:`~gpsea.analysis.pcats.configure_hpo_term_analysis` configures the analysis
+that uses HPO MTC filter (:class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`) for selecting HPO terms of interest,
+Fisher Exact test for computing nominal p values, and Benjamini-Hochberg for multiple testing correction.
 
 Now we can perform the analysis and investigate the results.
 
diff --git a/docs/user-guide/analyses/phenotype-groups.rst b/docs/user-guide/analyses/phenotype-groups.rst
index 86b72f59..d9a7b811 100644
--- a/docs/user-guide/analyses/phenotype-groups.rst
+++ b/docs/user-guide/analyses/phenotype-groups.rst
@@ -167,59 +167,84 @@ The function finds 369 HPO terms that annotate at least one individual,
 including the *indirect* annotations whose presence is implied by the :ref:`true-path-rule`.
 
 
-Statistical test
-----------------
+Statistical analysis
+--------------------
 
 We will use :ref:`fisher-exact-test` to test the association
 between genotype and phenotype groups, as described previously.
 
->>> from gpsea.analysis.pcats.stats import FisherExactTest
->>> count_statistic = FisherExactTest()
+In the case of this cohort, we can test association between having a frameshift variant and one of 369 HPO terms.
+However, testing multiple hypotheses on the same dataset increases the risk of finding
+a significant association solely by chance.
+GPSEA uses a two-pronged strategy to reduce the number of tests and, therefore, mitigate this risk:
+a phenotype multiple testing (MT) filter and multiple testing correction (MTC).
 
-FET will compute a p value for each genotype phenotype group.
+Phenotype MT filter selects a (sub)set of HPO terms for testing,
+for instance only the user-selected terms (see :class:`~gpsea.analysis.mtc_filter.SpecifyTermsStrategy`)
+or the terms selected by :class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`.
 
+Multiple testing correction then adjusts the nominal p values for the increased risk
+of false positive G/P associations.
+The available MTC procedures are listed in the :ref:`mtc-correction-procedures` section.
 
-Multiple testing correction
----------------------------
+We must pick one of these to perform genotype-phenotype analysis.
 
-In the case of this cohort, we could test association between having a frameshift variant and one of 369 HPO terms.
-However, testing multiple hypotheses on the same dataset increases the risk of finding a significant association
-by chance.
-GPSEA uses a two-pronged strategy to mitigate this risk - a phenotype MTC filter and multiple testing correction.
 
-.. note::
+Default analysis
+^^^^^^^^^^^^^^^^
+
+We recommend using HPO MT filter (:class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`) as a phenotype MT filter
+and Benjamini-Hochberg for multiple testing correction.
+The default analysis can be configured with :func:`~gpsea.analysis.pcats.configure_hpo_term_analysis` convenience method.
+
+>>> from gpsea.analysis.pcats import configure_hpo_term_analysis
+>>> analysis = configure_hpo_term_analysis(hpo)
 
-   See the :ref:`mtc` section for more info on multiple testing procedures.
 
-Here we will use a combination of the HPO MTC filter (:class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`)
-with Benjamini-Hochberg procedure (``mtc_correction='fdr_bh'``)
-with a false discovery control level set to `0.05` (``mtc_alpha=0.05``):
+Custom analysis
+^^^^^^^^^^^^^^^
+
+If the defaults do not work, we can configure the analysis manually.
+First, we choose a phenotype MT filter (e.g. :class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`):
 
 >>> from gpsea.analysis.mtc_filter import HpoMtcFilter
->>> mtc_filter = HpoMtcFilter.default_filter(hpo, term_frequency_threshold=0.2)
->>> mtc_correction = 'fdr_bh'
->>> mtc_alpha = 0.05
+>>> mtc_filter = HpoMtcFilter.default_filter(hpo, term_frequency_threshold=.2)
+
+.. note::
 
+   See the :ref:`mtc-filters` section for more info on the available MT filters.
 
-Final analysis
---------------
+then a statistical test (e.g. Fisher Exact test):
+
+>>> from gpsea.analysis.pcats.stats import FisherExactTest
+>>> count_statistic = FisherExactTest()
+
+.. note::
+
+   See the :mod:`gpsea.analysis.pcats.stats` module for the available multiple testing procedures
+   (TL;DR, just Fisher Exact test at this time).
+
+and we finalize the setup by choosing a MTC procedure
+(e.g. `fdr_bh` for Benjamini-Hochberg) along with the MTC alpha:
+
+>>> mtc_correction = 'fdr_bh'
+>>> mtc_alpha = 0.05
 
-We finalize the analysis setup by putting all components together
-into :class:`~gpsea.analysis.pcats.HpoTermAnalysis`:
+The final :class:`~gpsea.analysis.pcats.HpoTermAnalysis` is created as:
 
 >>> from gpsea.analysis.pcats import HpoTermAnalysis
 >>> analysis = HpoTermAnalysis(
 ...     count_statistic=count_statistic,
 ...     mtc_filter=mtc_filter,
-...     mtc_correction=mtc_correction,
-...     mtc_alpha=mtc_alpha,
+...     mtc_correction='fdr_bh',
+...     mtc_alpha=0.05,
 ... )
 
 
 Analysis
 ========
 
-We can now execute the analysis:
+We can now test associations between the genotype groups and the HPO terms:
 
 >>> result = analysis.compare_genotype_vs_phenotypes(
 ...     cohort=cohort,
@@ -232,8 +257,8 @@ We can now execute the analysis:
 24
 
 
-Thanks to phenotype MTC filter, we only tested 24 out of 369 terms.
-We can learn more by showing the MTC filter report:
+Thanks to phenotype MT filter, we only tested 24 out of 369 terms.
+We can learn more by showing the MT filter report:
 
 >>> from gpsea.view import MtcStatsViewer
 >>> mtc_viewer = MtcStatsViewer()
diff --git a/docs/user-guide/mtc.rst b/docs/user-guide/mtc.rst
index ffae4ded..b0fb478a 100644
--- a/docs/user-guide/mtc.rst
+++ b/docs/user-guide/mtc.rst
@@ -36,8 +36,9 @@ Therefore, unless we take into account the fact that multiple statistical tests
 it is likely that we will obtain one or more false-positive results.
 
 GPSEA offers two approaches to mitigate this problem: multiple-testing correction (MTC) procedures
-and MTC filters to choose the terms to be tested.
+and MT filters to choose the terms to be tested.
 
+.. _mtc-correction-procedures:
 
 Multiple-testing correction procedures
 ======================================
@@ -106,7 +107,7 @@ when creating an instance of :class:`~gpsea.analysis.pcats.HpoTermAnalysis`:
 
 .. _mtc-filters:
 
-MTC filters: Choosing which terms to test
+MT filters: Choosing which terms to test
 =========================================
 
 We can reduce the overall MTC burden by choosing which terms to test. 
@@ -117,18 +118,7 @@ may "survive" the multiple-testing correction.
 
 In the context of GPSEA, we represent the concept of phenotype filtering 
 by :class:`~gpsea.analysis.mtc_filter.PhenotypeMtcFilter`.
-The filter must be chosen before the :class:`~gpsea.analysis.pcats.MultiPhenotypeAnalysis`,
-such as :class:`~gpsea.analysis.pcats.HpoTermAnalysis`, is run:
-
->>> from gpsea.analysis.pcats import HpoTermAnalysis
->>> analysis = HpoTermAnalysis()  # doctest: +ELLIPSIS
-Traceback (most recent call last):
-  ...
-TypeError: HpoTermAnalysis.__init__() missing 2 required positional arguments: 'count_statistic' and 'mtc_filter'
-
-Note the missing `mtc_filter` option.
-
-We describe the three filtering strategies in the following sections.
+We provide three filtering strategies.
 
 
 .. _use-all-terms-strategy:
@@ -136,9 +126,9 @@ We describe the three filtering strategies in the following sections.
 Test all terms
 --------------
 
-The first MTC filtering strategy is the simplest - do not apply any filtering at all.
-This will result in testing all terms. We do not recommend this strategy, 
-but it can be useful to disable MTC filtering.
+The first MT filtering strategy is the simplest - do not apply any filtering at all.
+This will result in testing all terms and we do not recommend this strategy, 
+but it can be used to disable MT filtering.
 
 The strategy is implemented in :class:`~gpsea.analysis.mtc_filter.UseAllTermsMtcFilter`.
 
@@ -171,10 +161,10 @@ we pass an iterable (e.g. a tuple) with these two terms as an argument:
 
 .. _hpo-mtc-filter-strategy:
 
-HPO MTC filter strategy
+HPO MT filter strategy
 -----------------------
 
-Last, the HPO MTC strategy involves making several domain judgments to take advantage of the HPO structure.
+The HPO MT strategy involves making several domain judgments and takes advantage of the HPO structure.
 The strategy needs access to HPO:
 
 >>> import hpotk

From 2b07b0ee909e720e4bc9416d509ec19b7c9364d7 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Wed, 13 Nov 2024 15:45:01 +0100
Subject: [PATCH 12/19] Reformat Uniprot response for `P17010`.

---
 .../P17010_manual_download.json               | 283 +++++++++++++++++-
 1 file changed, 282 insertions(+), 1 deletion(-)

diff --git a/tests/preprocessing/data/uniprot_response/P17010_manual_download.json b/tests/preprocessing/data/uniprot_response/P17010_manual_download.json
index f7a36cec..eb8e66d3 100644
--- a/tests/preprocessing/data/uniprot_response/P17010_manual_download.json
+++ b/tests/preprocessing/data/uniprot_response/P17010_manual_download.json
@@ -1 +1,282 @@
-{"entryType":"UniProtKB reviewed (Swiss-Prot)","primaryAccession":"P17010","features":[{"type":"Zinc finger","location":{"start":{"value":425,"modifier":"EXACT"},"end":{"value":447,"modifier":"EXACT"}},"description":"C2H2-type 1","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":456,"modifier":"EXACT"},"end":{"value":478,"modifier":"EXACT"}},"description":"C2H2-type 2","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":488,"modifier":"EXACT"},"end":{"value":510,"modifier":"EXACT"}},"description":"C2H2-type 3","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":519,"modifier":"EXACT"},"end":{"value":542,"modifier":"EXACT"}},"description":"C2H2-type 4","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":548,"modifier":"EXACT"},"end":{"value":570,"modifier":"EXACT"}},"description":"C2H2-type 5","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":576,"modifier":"EXACT"},"end":{"value":599,"modifier":"EXACT"}},"description":"C2H2-type 6","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":605,"modifier":"EXACT"},"end":{"value":627,"modifier":"EXACT"}},"description":"C2H2-type 7","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":633,"modifier":"EXACT"},"end":{"value":656,"modifier":"EXACT"}},"description":"C2H2-type 8","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":662,"modifier":"EXACT"},"end":{"value":684,"modifier":"EXACT"}},"description":"C2H2-type 9","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":690,"modifier":"EXACT"},"end":{"value":713,"modifier":"EXACT"}},"description":"C2H2-type 10","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":719,"modifier":"EXACT"},"end":{"value":741,"modifier":"EXACT"}},"description":"C2H2-type 11","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":747,"modifier":"EXACT"},"end":{"value":770,"modifier":"EXACT"}},"description":"C2H2-type 12","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]},{"type":"Zinc finger","location":{"start":{"value":776,"modifier":"EXACT"},"end":{"value":798,"modifier":"EXACT"}},"description":"C2H2-type 13","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00042"}]}],"extraAttributes":{"uniParcId":"UPI000013C504"}}
\ No newline at end of file
+{
+    "entryType": "UniProtKB reviewed (Swiss-Prot)",
+    "primaryAccession": "P17010",
+    "features": [
+        {
+            "type": "Zinc finger",
+            "location": {
+                "start": {
+                    "value": 425,
+                    "modifier": "EXACT"
+                },
+                "end": {
+                    "value": 447,
+                    "modifier": "EXACT"
+                }
+            },
+            "description": "C2H2-type 1",
+            "evidences": [
+                {
+                    "evidenceCode": "ECO:0000255",
+                    "source": "PROSITE-ProRule",
+                    "id": "PRU00042"
+                }
+            ]
+        },
+        {
+            "type": "Zinc finger",
+            "location": {
+                "start": {
+                    "value": 456,
+                    "modifier": "EXACT"
+                },
+                "end": {
+                    "value": 478,
+                    "modifier": "EXACT"
+                }
+            },
+            "description": "C2H2-type 2",
+            "evidences": [
+                {
+                    "evidenceCode": "ECO:0000255",
+                    "source": "PROSITE-ProRule",
+                    "id": "PRU00042"
+                }
+            ]
+        },
+        {
+            "type": "Zinc finger",
+            "location": {
+                "start": {
+                    "value": 488,
+                    "modifier": "EXACT"
+                },
+                "end": {
+                    "value": 510,
+                    "modifier": "EXACT"
+                }
+            },
+            "description": "C2H2-type 3",
+            "evidences": [
+                {
+                    "evidenceCode": "ECO:0000255",
+                    "source": "PROSITE-ProRule",
+                    "id": "PRU00042"
+                }
+            ]
+        },
+        {
+            "type": "Zinc finger",
+            "location": {
+                "start": {
+                    "value": 519,
+                    "modifier": "EXACT"
+                },
+                "end": {
+                    "value": 542,
+                    "modifier": "EXACT"
+                }
+            },
+            "description": "C2H2-type 4",
+            "evidences": [
+                {
+                    "evidenceCode": "ECO:0000255",
+                    "source": "PROSITE-ProRule",
+                    "id": "PRU00042"
+                }
+            ]
+        },
+        {
+            "type": "Zinc finger",
+            "location": {
+                "start": {
+                    "value": 548,
+                    "modifier": "EXACT"
+                },
+                "end": {
+                    "value": 570,
+                    "modifier": "EXACT"
+                }
+            },
+            "description": "C2H2-type 5",
+            "evidences": [
+                {
+                    "evidenceCode": "ECO:0000255",
+                    "source": "PROSITE-ProRule",
+                    "id": "PRU00042"
+                }
+            ]
+        },
+        {
+            "type": "Zinc finger",
+            "location": {
+                "start": {
+                    "value": 576,
+                    "modifier": "EXACT"
+                },
+                "end": {
+                    "value": 599,
+                    "modifier": "EXACT"
+                }
+            },
+            "description": "C2H2-type 6",
+            "evidences": [
+                {
+                    "evidenceCode": "ECO:0000255",
+                    "source": "PROSITE-ProRule",
+                    "id": "PRU00042"
+                }
+            ]
+        },
+        {
+            "type": "Zinc finger",
+            "location": {
+                "start": {
+                    "value": 605,
+                    "modifier": "EXACT"
+                },
+                "end": {
+                    "value": 627,
+                    "modifier": "EXACT"
+                }
+            },
+            "description": "C2H2-type 7",
+            "evidences": [
+                {
+                    "evidenceCode": "ECO:0000255",
+                    "source": "PROSITE-ProRule",
+                    "id": "PRU00042"
+                }
+            ]
+        },
+        {
+            "type": "Zinc finger",
+            "location": {
+                "start": {
+                    "value": 633,
+                    "modifier": "EXACT"
+                },
+                "end": {
+                    "value": 656,
+                    "modifier": "EXACT"
+                }
+            },
+            "description": "C2H2-type 8",
+            "evidences": [
+                {
+                    "evidenceCode": "ECO:0000255",
+                    "source": "PROSITE-ProRule",
+                    "id": "PRU00042"
+                }
+            ]
+        },
+        {
+            "type": "Zinc finger",
+            "location": {
+                "start": {
+                    "value": 662,
+                    "modifier": "EXACT"
+                },
+                "end": {
+                    "value": 684,
+                    "modifier": "EXACT"
+                }
+            },
+            "description": "C2H2-type 9",
+            "evidences": [
+                {
+                    "evidenceCode": "ECO:0000255",
+                    "source": "PROSITE-ProRule",
+                    "id": "PRU00042"
+                }
+            ]
+        },
+        {
+            "type": "Zinc finger",
+            "location": {
+                "start": {
+                    "value": 690,
+                    "modifier": "EXACT"
+                },
+                "end": {
+                    "value": 713,
+                    "modifier": "EXACT"
+                }
+            },
+            "description": "C2H2-type 10",
+            "evidences": [
+                {
+                    "evidenceCode": "ECO:0000255",
+                    "source": "PROSITE-ProRule",
+                    "id": "PRU00042"
+                }
+            ]
+        },
+        {
+            "type": "Zinc finger",
+            "location": {
+                "start": {
+                    "value": 719,
+                    "modifier": "EXACT"
+                },
+                "end": {
+                    "value": 741,
+                    "modifier": "EXACT"
+                }
+            },
+            "description": "C2H2-type 11",
+            "evidences": [
+                {
+                    "evidenceCode": "ECO:0000255",
+                    "source": "PROSITE-ProRule",
+                    "id": "PRU00042"
+                }
+            ]
+        },
+        {
+            "type": "Zinc finger",
+            "location": {
+                "start": {
+                    "value": 747,
+                    "modifier": "EXACT"
+                },
+                "end": {
+                    "value": 770,
+                    "modifier": "EXACT"
+                }
+            },
+            "description": "C2H2-type 12",
+            "evidences": [
+                {
+                    "evidenceCode": "ECO:0000255",
+                    "source": "PROSITE-ProRule",
+                    "id": "PRU00042"
+                }
+            ]
+        },
+        {
+            "type": "Zinc finger",
+            "location": {
+                "start": {
+                    "value": 776,
+                    "modifier": "EXACT"
+                },
+                "end": {
+                    "value": 798,
+                    "modifier": "EXACT"
+                }
+            },
+            "description": "C2H2-type 13",
+            "evidences": [
+                {
+                    "evidenceCode": "ECO:0000255",
+                    "source": "PROSITE-ProRule",
+                    "id": "PRU00042"
+                }
+            ]
+        }
+    ],
+    "extraAttributes": {
+        "uniParcId": "UPI000013C504"
+    }
+}
\ No newline at end of file

From b597fba780f8d8be8084eadbb506ec870d1b8110 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Wed, 13 Nov 2024 15:49:50 +0100
Subject: [PATCH 13/19] Add typing annotations to the test.

---
 tests/preprocessing/test_uniprot_json.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/preprocessing/test_uniprot_json.py b/tests/preprocessing/test_uniprot_json.py
index e1e56e9b..51558b5c 100644
--- a/tests/preprocessing/test_uniprot_json.py
+++ b/tests/preprocessing/test_uniprot_json.py
@@ -50,7 +50,7 @@ def P17010_json_file_path(
     @pytest.fixture
     def P17010_protein_metadata(
             self,
-            P17010_json_file_path#: str,
+            P17010_json_file_path: str,
     ) -> ProteinMetadata:
         """
         :returns: ProteinMetadata created from a downloaded UniProt JSON file
@@ -96,8 +96,10 @@ def test_first_feature(
         assert feature_0.info.name == "Calponin-homology (CH) 1"
 
 
-    def test_ZFX(self,
-                 P17010_protein_metadata):
+    def test_ZFX(
+        self,
+        P17010_protein_metadata: ProteinMetadata,
+    ):
         """
         :[{"type":"Zinc finger",
         "location":{"start":{"value":425,"modifier":"EXACT"},

From ede82e5b088c519b01d0d729d38ac408266e481e Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Wed, 13 Nov 2024 15:57:59 +0100
Subject: [PATCH 14/19] Fix `UniprotProteinMetadataService` pydoc.

---
 src/gpsea/preprocessing/_uniprot.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gpsea/preprocessing/_uniprot.py b/src/gpsea/preprocessing/_uniprot.py
index ccdf5ed7..03ee6663 100644
--- a/src/gpsea/preprocessing/_uniprot.py
+++ b/src/gpsea/preprocessing/_uniprot.py
@@ -110,7 +110,9 @@ def annotate(self, protein_id: str) -> ProteinMetadata:
         Args:
             protein_id (str): A protein ID
         Returns:
-            Sequence[ProteinMetadata]: A sequence of ProteinMetadata objects, or an empty sequence if no data was found.
+            ProteinMetadata: A :class:`~gpsea.model.ProteinMetadata` corresponding to the input `protein_id`.
+        Raises:
+            ValueError: in case of issues with `protein_id`, I/O issues, or parsing the REST response.
         """
         if not isinstance(protein_id, str):
             raise ValueError(f'Protein ID must be a str but it was {type(protein_id)}')

From 9b6a6b1a5af91e1dc503bce9d708370f345aee93 Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Wed, 13 Nov 2024 15:58:50 +0100
Subject: [PATCH 15/19] Raise an exception if unable to parse `ProteinMetadata`
 from Uniprot JSON.

---
 src/gpsea/model/_protein.py | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/src/gpsea/model/_protein.py b/src/gpsea/model/_protein.py
index 178c86e1..6db36e09 100644
--- a/src/gpsea/model/_protein.py
+++ b/src/gpsea/model/_protein.py
@@ -368,19 +368,16 @@ def from_uniprot_json(
 
         regions = list()
         for feature in data["features"]:
-            try:
-                region_name = feature["description"]
-                locus = feature["location"]
-                region_start = int(locus["start"]["value"]) - 1  # convert to 0-based coordinates
-                region_end = int(locus["end"]["value"])
-                feature_type = FeatureType.from_string(feature["type"])
-                finfo = FeatureInfo(
-                    name=region_name, region=Region(start=region_start, end=region_end)
-                )
-                pfeature = ProteinFeature.create(info=finfo, feature_type=feature_type)
-                regions.append(pfeature)
-            except Exception as feature_exception:
-                print(f"Could not parse feature: {str(feature_exception)} (skipping)")
+            region_name = feature["description"]
+            locus = feature["location"]
+            region_start = int(locus["start"]["value"]) - 1  # convert to 0-based coordinates
+            region_end = int(locus["end"]["value"])
+            feature_type = FeatureType.from_string(feature["type"])
+            finfo = FeatureInfo(
+                name=region_name, region=Region(start=region_start, end=region_end)
+            )
+            pfeature = ProteinFeature.create(info=finfo, feature_type=feature_type)
+            regions.append(pfeature)
 
         return ProteinMetadata(
             protein_id=protein_id,

From c19ad7ed83a9eca9ff50192bd1820990c8b2f4e4 Mon Sep 17 00:00:00 2001
From: Peter Robinson <peter.robinson@jax.org>
Date: Wed, 13 Nov 2024 17:36:28 +0100
Subject: [PATCH 16/19] updating DeVries score

---
 docs/user-guide/predicates/devries.rst | 42 ++++++++++++++-------
 src/gpsea/analysis/pscore/_hpo.py      | 51 ++++++++++++++------------
 2 files changed, 56 insertions(+), 37 deletions(-)

diff --git a/docs/user-guide/predicates/devries.rst b/docs/user-guide/predicates/devries.rst
index 37989c86..84bbd948 100644
--- a/docs/user-guide/predicates/devries.rst
+++ b/docs/user-guide/predicates/devries.rst
@@ -15,7 +15,7 @@ Statistical significance of a difference in the De Vries score between groups ca
 determined using the Mann-Whitney-U test.
 
 We refer to `Feenstra et al. (2011) <https://pubmed.ncbi.nlm.nih.gov/21712853/>`_ for
-the original description of the adjusted De Vries score. Here we offer a version of the
+the original description of the adjusted De Vries score. Here we offer an adapted version of the
 score that leverages the structure of the Human Phenotype Ontology to assess the phenotype.
 
 
@@ -113,38 +113,54 @@ is 2 because the same individual cannot have both tall and short stature or both
 Facial dysmorphic features
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-This section assigns two points if two or more anomalies are identified in the following
-categories: hypertelorism, nasal anomalies and ear anomalies. Our implementation of this feature counts the total
-number of terms or descendents of the following HPO terms.
+This section assigns two points if two or more facial dysmorphisms are identified. In contrast to the list of anomalies described
+in the original 2011 publication of the DeVries score, we leverage the structure of the HPO to include many more HPO terms that 
+denote various kinds of facial dysmorphism (e.g., `Abnormality of globe location <https://hpo.jax.org/browse/term/HP:0100886>`_ instead of just
+`Hypertelorism (HP:0000316) <https://hpo.jax.org/browse/term/HP:0000316>`_).
+
+Our implementation of this feature counts the total number of terms or descendents of the following HPO terms. Up to one point is given
+for each of the categories.
 
 +----------------------------------------------------------------------------------------------------------+-----------+
 | HPO term                                                                                                 | Score     |
 +==========================================================================================================+===========+
-| `Hypertelorism (HP:0000316) <https://hpo.jax.org/browse/term/HP:0000316>`_                               | 1         |
+| `Abnormality of globe location (HP:0000316) <https://hpo.jax.org/browse/term/HP:0100886>`_               | 0 or 1    |
++----------------------------------------------------------------------------------------------------------+-----------+
+| `Abnormal lip morphology (HP:0000159) <https://hpo.jax.org/browse/term/HP:0000159>`_                     |  0 or 1   |
++----------------------------------------------------------------------------------------------------------+-----------+
+| `Abnormal facial shape (HP:0001999) <https://hpo.jax.org/browse/term/HP:0001999>`_                       |  0 or 1   |
++----------------------------------------------------------------------------------------------------------+-----------+
+| `Abnormal midface morphology (HP:0000309) <https://hpo.jax.org/browse/term/HP:0000309>`_                 |  0 or 1   |
 +----------------------------------------------------------------------------------------------------------+-----------+
-| `Abnormal external nose morphology (HP:0010938) <https://hpo.jax.org/browse/term/HP:0010938>`_           | 1 each    |
+| `Abnormal forehead morphology (HP:0000290) <https://hpo.jax.org/browse/term/HP:0000290>`_                | 0 or 1    |
 +----------------------------------------------------------------------------------------------------------+-----------+
-| `Abnormal pinna morphology (HP:0000377)  <https://hpo.jax.org/browse/term/HP:0000377>`_                  | 1 each    |
+| `Abnormal chin morphology (HP:0000306) <https://hpo.jax.org/browse/term/HP:0000306>`_                    |  0 or 1   |
++----------------------------------------------------------------------------------------------------------+-----------+
+| `Abnormal external nose morphology (HP:0010938) <https://hpo.jax.org/browse/term/HP:0010938>`_           |  0 or 1   |
++----------------------------------------------------------------------------------------------------------+-----------+
+| `Abnormal pinna morphology (HP:0000377)  <https://hpo.jax.org/browse/term/HP:0000377>`_                  |  0 or 1   |
 +----------------------------------------------------------------------------------------------------------+-----------+
 
-If two or more terms are found, the score is 2, otherwise a score of zero is assigned.
+If items from two or more categories are found, the score is 2, otherwise a score of zero is assigned.
 
 
 Non-facial dysmorphism and congenital abnormalities
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-One point is assigned for either the
-corresponding HPO terms or any of their descendents up to a maximum of two points.
+One point is assigned for either the corresponding HPO terms or any of their descendents up to a maximum of two points.
+A maximum of one point is assigned for each of the following categories.
 
 +----------------------------------------------------------------------------------------------------------+-----------+
 | HPO term                                                                                                 | Score     |
 +==========================================================================================================+===========+
-| `Abnormal hand morphology (HP:0005922) <https://hpo.jax.org/browse/term/HP:0005922>`_                    | 1 each    |
+| `Abnormal hand morphology (HP:0005922) <https://hpo.jax.org/browse/term/HP:0005922>`_                    | 0 or 1    |
 +----------------------------------------------------------------------------------------------------------+-----------+
-| `Abnormal heart morphology (HP:0001627) <https://hpo.jax.org/browse/term/HP:0001627>`_                   | 1 each    |
+| `Abnormal heart morphology (HP:0001627) <https://hpo.jax.org/browse/term/HP:0001627>`_                   |  0 or 1   |
 +----------------------------------------------------------------------------------------------------------+-----------+
-| `Hypospadias (HP:0000047) <https://hpo.jax.org/browse/term/HP:0000047>`_                                 | 1         |
+| `Abnormal external genitalia morphology (HP:0000811) <https://hpo.jax.org/browse/term/HP:0000811>`_      |  0 or 1   |
 +----------------------------------------------------------------------------------------------------------+-----------+
 
+The score for this section can thus be 0, 1, or 2.
+
 
 Final score
 ~~~~~~~~~~~
diff --git a/src/gpsea/analysis/pscore/_hpo.py b/src/gpsea/analysis/pscore/_hpo.py
index 6187b0ea..09b57451 100644
--- a/src/gpsea/analysis/pscore/_hpo.py
+++ b/src/gpsea/analysis/pscore/_hpo.py
@@ -166,7 +166,7 @@ def _developmental_delay_score(
     ) -> float:
         """
         Calculate the dev delay component of the score
-        
+
         Args:
             observed_term_ids: terms observed in patient
 
@@ -177,12 +177,12 @@ def _developmental_delay_score(
         for t in observed_term_ids:
             if t in self._gdd_tids:
                 return self._gdd_tids[t]
-        
+
         # Intellectual disability
         for t in observed_term_ids:
             if t in self._idd_tids:
                 return self._idd_tids[t]
-        
+
         return 0
 
     def _term_or_descendant(
@@ -202,7 +202,7 @@ def _term_or_descendant(
             if term_id == target_tid \
                or any(ancestor == target_tid for ancestor in self._hpo.graph.get_ancestors(term_id)):
                 return 1
-        
+
         return 0
 
     def _term_or_descendant_count(
@@ -216,14 +216,13 @@ def _term_or_descendant_count(
             observed_term_ids: all terms observed in patient
 
         Returns:
-            the total count of the terms equal to or descending from the target_tid
+            1 if at least one term is equal to or descending from the target_tid, otherwise 0
         """
-        total_count = 0
         for term_id in observed_term_ids:
             for desc_tid in self._hpo.graph.get_ancestors(term_id, include_source=True):
                 if desc_tid.value == target_tid:
-                    total_count += 1
-        return total_count
+                    return  1
+        return 0
 
     def _postnatal_growth_score(
         self,
@@ -231,7 +230,7 @@ def _postnatal_growth_score(
     ) -> int:
         """
         Calculate the postnatal growth component of the score.
-        
+
         Args:
             observed_term_ids: terms observed in patient
 
@@ -264,14 +263,22 @@ def _facial_dysmorphism_score(
         Returns: facial dysmorphism score (between 0 and 2)
 
         """
-        hypertelorism = 'HP:0000316'
+        globe_location = 'HP:0100886' # include Hypertelorism and others
+        lip = 'HP:0000159' # Abnormal lip morphology HP:0000159
         external_nose = 'HP:0010938'
         pinna_morphology = 'HP:0000377'
+        facial_shape = 'HP:0001999'  # Abnormal facial shape
+        midface = 'HP:0000309' #Abnormal midface morphology
+        chin = 'HP:0000306' #Abnormality of the chin
 
-        # No need to inspect descendants since Hypertelorism has none.
-        total_count = 1 if hypertelorism in observed_term_ids else 0
+        total_count = self._term_or_descendant_count(target_tid=globe_location, observed_term_ids=observed_term_ids)
+        total_count += self._term_or_descendant_count(target_tid=lip, observed_term_ids=observed_term_ids)
         total_count += self._term_or_descendant_count(target_tid=external_nose, observed_term_ids=observed_term_ids)
         total_count += self._term_or_descendant_count(target_tid=pinna_morphology, observed_term_ids=observed_term_ids)
+        total_count += self._term_or_descendant_count(target_tid=facial_shape, observed_term_ids=observed_term_ids)
+        total_count += self._term_or_descendant_count(target_tid=midface, observed_term_ids=observed_term_ids)
+        total_count += self._term_or_descendant_count(target_tid=chin, observed_term_ids=observed_term_ids)
+
         if total_count > 1:
             return 2
         else:
@@ -284,24 +291,20 @@ def _congenital_score(
         """
         Non-facial dysmorphism and congenital abnormalities component.
         One point is assigned for either the corresponding HPO terms or any of their descendents up to a maximum of 2.
-        
+
         Args:
             observed_term_ids:  terms observed in patient
 
         Returns:   Non-facial dysmorphism and congenital abnormalities score (between 0 and 2)
 
         """
-        hypospadias = 'HP:0000047'
+        abn_external_genitalia = 'HP:0000811' # Abnormal external genitalia
         abnormal_hand_morphology = 'HP:0005922'
         abnormal_heart_morphology = 'HP:0001627'
-        # total_count = len([t for t in observed_term_ids if t == hypospadias])
-        total_count = self._term_or_descendant_count(
-            target_tid=hypospadias, observed_term_ids=observed_term_ids,
-        )
-        total_count += self._term_or_descendant_count(target_tid=abnormal_hand_morphology,
-                                                      observed_term_ids=observed_term_ids)
-        total_count += self._term_or_descendant_count(target_tid=abnormal_heart_morphology,
-                                                      observed_term_ids=observed_term_ids)
+
+        total_count = self._term_or_descendant_count(target_tid=abn_external_genitalia, observed_term_ids=observed_term_ids,)
+        total_count += self._term_or_descendant_count(target_tid=abnormal_hand_morphology, observed_term_ids=observed_term_ids)
+        total_count += self._term_or_descendant_count(target_tid=abnormal_heart_morphology, observed_term_ids=observed_term_ids)
         return min(2, total_count)
 
     def _prenatal_growth_score(
@@ -328,7 +331,7 @@ def _prenatal_growth_score(
     def score(self, patient: Patient) -> float:
         """
         Calculate score based on list of strings with term identifiers or observed HPO terms.
-        
+
         Args:
             patient: list of strings with term identifiers or observed HPO terms
 
@@ -342,5 +345,5 @@ def score(self, patient: Patient) -> float:
         facial_score = self._facial_dysmorphism_score(observed_term_ids)
         congen_score = self._congenital_score(observed_term_ids)
         prenatal_score = self._prenatal_growth_score(observed_term_ids)
-        
+
         return delay_score + growth_score + facial_score + congen_score + prenatal_score

From c26186d7eb0c055760b2a5e3d0226c31d24d837b Mon Sep 17 00:00:00 2001
From: Peter Robinson <peter.robinson@jax.org>
Date: Wed, 13 Nov 2024 17:37:33 +0100
Subject: [PATCH 17/19] updating DeVries score

---
 src/gpsea/analysis/pscore/_hpo.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/gpsea/analysis/pscore/_hpo.py b/src/gpsea/analysis/pscore/_hpo.py
index 09b57451..fae0cd74 100644
--- a/src/gpsea/analysis/pscore/_hpo.py
+++ b/src/gpsea/analysis/pscore/_hpo.py
@@ -221,7 +221,7 @@ def _term_or_descendant_count(
         for term_id in observed_term_ids:
             for desc_tid in self._hpo.graph.get_ancestors(term_id, include_source=True):
                 if desc_tid.value == target_tid:
-                    return  1
+                    return 1
         return 0
 
     def _postnatal_growth_score(
@@ -263,13 +263,13 @@ def _facial_dysmorphism_score(
         Returns: facial dysmorphism score (between 0 and 2)
 
         """
-        globe_location = 'HP:0100886' # include Hypertelorism and others
+        globe_location = 'HP:0100886'  # include Hypertelorism and others
         lip = 'HP:0000159' # Abnormal lip morphology HP:0000159
         external_nose = 'HP:0010938'
         pinna_morphology = 'HP:0000377'
         facial_shape = 'HP:0001999'  # Abnormal facial shape
-        midface = 'HP:0000309' #Abnormal midface morphology
-        chin = 'HP:0000306' #Abnormality of the chin
+        midface = 'HP:0000309'  # Abnormal midface morphology
+        chin = 'HP:0000306'  # Abnormality of the chin
 
         total_count = self._term_or_descendant_count(target_tid=globe_location, observed_term_ids=observed_term_ids)
         total_count += self._term_or_descendant_count(target_tid=lip, observed_term_ids=observed_term_ids)
@@ -298,7 +298,7 @@ def _congenital_score(
         Returns:   Non-facial dysmorphism and congenital abnormalities score (between 0 and 2)
 
         """
-        abn_external_genitalia = 'HP:0000811' # Abnormal external genitalia
+        abn_external_genitalia = 'HP:0000811'  # Abnormal external genitalia
         abnormal_hand_morphology = 'HP:0005922'
         abnormal_heart_morphology = 'HP:0001627'
 

From 230fff5441fb274ddb59028fdb6959fa24c1921e Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Fri, 15 Nov 2024 11:34:45 +0100
Subject: [PATCH 18/19] Set release to `0.7.1`.

---
 docs/conf.py          | 2 +-
 src/gpsea/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 7c204680..1e8d725a 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -62,7 +62,7 @@
 # The short X.Y version.
 version = u'0.7'
 # The full version, including alpha/beta/rc tags.
-release = u'0.7.1.dev0'
+release = u'0.7.1'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/src/gpsea/__init__.py b/src/gpsea/__init__.py
index fb3db8e4..4b512d92 100644
--- a/src/gpsea/__init__.py
+++ b/src/gpsea/__init__.py
@@ -2,4 +2,4 @@
 GPSEA is a library for analyzing genotype-phenotype correlations in cohorts of rare disease patients.
 """
 
-__version__ = "0.7.1.dev0"
+__version__ = "0.7.1"

From 79e14ecfea8ff77c64876eda399f16ca5cb6e26a Mon Sep 17 00:00:00 2001
From: Daniel Danis <daniel.gordon.danis@protonmail.com>
Date: Fri, 15 Nov 2024 11:48:09 +0100
Subject: [PATCH 19/19] Simplify DeVries descendant counting.

---
 src/gpsea/analysis/pscore/_hpo.py | 27 +++------------------------
 1 file changed, 3 insertions(+), 24 deletions(-)

diff --git a/src/gpsea/analysis/pscore/_hpo.py b/src/gpsea/analysis/pscore/_hpo.py
index fae0cd74..2f12d288 100644
--- a/src/gpsea/analysis/pscore/_hpo.py
+++ b/src/gpsea/analysis/pscore/_hpo.py
@@ -185,26 +185,6 @@ def _developmental_delay_score(
 
         return 0
 
-    def _term_or_descendant(
-        self,
-        target_tid: str,
-        observed_term_ids: typing.Iterable[str],
-    ) -> int:
-        """
-        Args:
-            target_tid: term of interest
-            observed_term_ids: all terms observed in patient
-
-        Returns:
-            1 if the term or any descendant is present in the patient, otherwise 0
-        """
-        for term_id in observed_term_ids:
-            if term_id == target_tid \
-               or any(ancestor == target_tid for ancestor in self._hpo.graph.get_ancestors(term_id)):
-                return 1
-
-        return 0
-
     def _term_or_descendant_count(
         self,
         target_tid: str,
@@ -219,9 +199,8 @@ def _term_or_descendant_count(
             1 if at least one term is equal to or descending from the target_tid, otherwise 0
         """
         for term_id in observed_term_ids:
-            for desc_tid in self._hpo.graph.get_ancestors(term_id, include_source=True):
-                if desc_tid.value == target_tid:
-                    return 1
+            if term_id == target_tid or self._hpo.graph.is_descendant_of(term_id, target_tid):
+                return 1
         return 0
 
     def _postnatal_growth_score(
@@ -242,7 +221,7 @@ def _postnatal_growth_score(
         tall_stature = 'HP:0000098'
         total_count = 0
         for tid in (microcephaly, short_stature, macrocephaly, tall_stature):
-            total_count += self._term_or_descendant(tid, observed_term_ids)
+            total_count += self._term_or_descendant_count(tid, observed_term_ids)
         if total_count > 2:
             raise ValueError(f"Inconsistent annotations for postnatal growth score {total_count}:  {observed_term_ids}")
         return total_count