diff --git a/.DS_Store b/.DS_Store index 31c5ba3..87a427a 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/inst/pages/chapter1/figures/fig-evidence-histograms.pdf b/inst/pages/chapter1/figures/fig-evidence-histograms.pdf deleted file mode 100644 index 141a39a..0000000 Binary files a/inst/pages/chapter1/figures/fig-evidence-histograms.pdf and /dev/null differ diff --git a/inst/pages/chapter1/figures/fig-therapy-filter.pdf b/inst/pages/chapter1/figures/fig-therapy-filter.pdf deleted file mode 100644 index fb0d14d..0000000 Binary files a/inst/pages/chapter1/figures/fig-therapy-filter.pdf and /dev/null differ diff --git a/inst/pages/chapter1/figures/fig-therapy-validate-all.pdf b/inst/pages/chapter1/figures/fig-therapy-validate-all.pdf deleted file mode 100644 index 4adc6ad..0000000 Binary files a/inst/pages/chapter1/figures/fig-therapy-validate-all.pdf and /dev/null differ diff --git a/manuscript/.gitignore b/manuscript/.gitignore index afb7ea8..81a94e3 100644 --- a/manuscript/.gitignore +++ b/manuscript/.gitignore @@ -2,7 +2,6 @@ */.ipynb_checkpoints/* /.quarto/ -/_manuscript /agujournal2019.cls /trackchanges.sty diff --git a/manuscript/_freeze/index/execute-results/tex.json b/manuscript/_freeze/index/execute-results/tex.json index c3eb80c..7cbe6f0 100644 --- a/manuscript/_freeze/index/execute-results/tex.json +++ b/manuscript/_freeze/index/execute-results/tex.json @@ -1,8 +1,8 @@ { - "hash": "567ec0e678583c4a3f6acd4a830efe81", + "hash": "96bb29d73374ec85a3096915bc3b875f", "result": { "engine": "knitr", - "markdown": "---\ntitle: \"Cell type-specific contextualisation of the phenomic landscape: a comprehensive and scalable approach towards the diagnosis, prognosis and treatment of all rare diseases\"\nauthor:\n - name: Brian M. 
Schilder\n orcid: 0000-0001-5949-2191\n corresponding: true\n email: brian_schilder@alumni.brown.edu\n roles:\n - Investigation\n - Project administration\n - Software\n - Visualization\n affiliations:\n - Imperial College London\n - name: Kitty B. Murphy\n orcid: 0000-0002-8669-3076\n corresponding: false\n roles: []\n affiliations:\n - Imperial College London\n - name: Robert Gordon-Smith\n orcid: 0000-0001-6698-7387\n corresponding: false\n roles: []\n affiliations:\n - Imperial College London\n - name: Jai Chapman\n corresponding: false\n roles: []\n affiliations:\n - Imperial College London\n - name: Momoko Otani\n corresponding: false\n roles: []\n affiliations:\n - Imperial College London\n - name: Nathan G. Skene\n orcid: 0000-0002-6807-3180\n corresponding: true\n email: n.skene@imperial.ac.uk\n roles:\n - Project administration\n affiliations:\n - Imperial College London\nkeywords:\n - rare disease\n - phenotype\n - single-cell\n - gene therapy\nplain-language-summary: |\n We identified the cell types underlying the symptoms of all rare diseases.\nkey-points:\n - We used the Human Phenotype Ontology and single-cell RNA-seq references to characterise the phenome.\n - We then demonstrated how these results can be applied to clinical diagnosis, prognosis and therapeutics development.\ndate: last-modified\nbibliography: references.bib\ncitation:\n container-title: Nature Medicine\nformat:\n nature-pdf:\n journal: \"sn-nature\"\n keep-tex: true\n execute:\n cache: true\n echo: false\n docx:\n execute:\n cache: true\n echo: false\n---\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nlibrary(data.table)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nhpo <- HPOExplorer::get_hpo()\np2g <- HPOExplorer::load_phenotype_to_genes()\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n```{.r 
.cell-code .hidden}\nannot <- HPOExplorer::load_phenotype_to_genes(3)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n+ Version: v2024-02-08\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nper_disease <- p2g[,list(ng=data.table::uniqueN(gene_symbol),\n np=data.table::uniqueN(hpo_id)),by=\"disease_id\"]\nper_phenotype <- p2g[,list(ng=data.table::uniqueN(gene_symbol),\n nd=data.table::uniqueN(disease_id)),by=\"hpo_id\"]\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Import precomputed results for reporting summaries\nresults <- MSTExplorer::load_example_results()\nresults <- HPOExplorer::add_hpo_name(results, hpo = hpo)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding HPO names.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to names.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nresults <- HPOExplorer::add_ont_lvl(results)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGetting absolute ontology level for 18,082 IDs.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nresults <- HPOExplorer::add_ancestor(results, hpo = hpo)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding level-2 ancestor to each HPO ID.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding ancestor metadata.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAncestor metadata already present. 
Use force_new=TRUE to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2,206,994 associations remain after filtering.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nresults <- MSTExplorer::map_celltype(results)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMapping cell types to cell ontology terms.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding stage information.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nMSTExplorer::add_logfc(results)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding logFC column.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nresults[,effect:=estimate]\n## Substitute B for \\beta for now since Quarto doesn't seem to support\n## Greek letters after they've been stored in a data.table...\nresults[,summary:=paste0(\n \"$\",\n \"FDR_{p,c}=\",format(q,digits=3),\",\",\n \"B=\",format(estimate,digits=3),\n \"$\"\n)] \n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nres_summ <- MSTExplorer::summarise_results(results = results)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving results --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//Rtmphp2ypb/summarise_results.csv10f4a3d4aac16\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nres_summ_all <- res_summ$tmerged[ctd==\"all\"] \n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Must use `cache.lazy=FALSE` because sparse matrices not yet supported for caching\nctd_list <- MSTExplorer::load_example_ctd(c(\"ctd_DescartesHuman.rds\",\n \"ctd_HumanCellLandscape.rds\"),\n multi_dataset=TRUE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading ctd_DescartesHuman.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading 
ctd_HumanCellLandscape.rds\n```\n\n\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nvalidate_associations_mkg_out <- MSTExplorer::validate_associations_mkg(results = results)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\ni All local files already up-to-date!\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRemaining: 82 phenotypes across 65 celltypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n90.24% phenotypes recovered.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nkg_hp <- validate_associations_mkg_out$kg[grepl(\"^HP:\",from)]\nX <- \"`**!!RECOMPUTE!!**`\"\nY <- \"`**!!RECOMPUTE!!**`\"\n```\n:::\n\n\n## Abstract\n\nRare diseases (RDs) are an extremely heterogeneous and underserved category of medical conditions.\nWhile the majority of RDs are strongly genetic, it remains largely unknown via which physiological mechanisms genetics cause RD.\nTherefore, we sought to systematically characterise the cell type-specific mechanisms underlying all RD phenotypes with a known genetic cause by leveraging the Human Phenotype Ontology and transcriptomic single-cell atlases of the entire human body from embryonic, foetal, and adult samples.\nIn total we identified significant associations between 201 cell types and 9,575/11,028 (86.7%) unique phenotypes across 8,628 RDs. 
We estimate that this represents an over 500-fold increase in the collective knowledge of RD phenotype-cell type mechanisms.\n\nNext, we demonstrated how these results may be used for personalised patient diagnosis and prognosis, as well as the development of novel therapeutics.\nFinally, we take a data-driven approach to highlight several of the most promising gene/cell therapy candidates with the highest probability of animal model-to-human patient translation.\nFurthermore, we have made these results entirely reproducible and freely accessible to the global community to maximise their impact.\nTo summarise, this work represents a significant step forward in the mission to treat patients across an extremely diverse spectrum of serious RDs.\n\n## Introduction {#sec-introduction}\n\nWhile rare diseases (RDs) are individually uncommon, they collectively account for an enormous global disease burden with over 10,000 recognised RDs affecting at least 300-400 million people globally [@Ferreira2019-jp] (1 in 10-20 people) [@Zhu2020-vo] . Over 75% of RDs primarily affect children with a 30% mortality rate by 5 years of age [@noauthor_undated-kp]. Despite the prevalence and severity of RDs, patients suffering from these conditions are vastly underserved due to several contributing factors. First, diagnosis is extremely challenging due to the highly variable clinical presentations of many of these diseases. 
The diagnostic odyssey can take patients and their families decades, with an average time to diagnosis of 5 years [@Marwaha2022-uy].\nOf those, \\~46% receive at least one incorrect diagnosis and over 75% of all patients never receive any diagnosis [@Molster2016-da].\nSecond, prognosis is also made difficult by high variability in disease course and outcomes which makes matching patients with effective and timely treatment plans even more challenging.\nFinally, even for patients who receive an accurate diagnosis/prognosis, treatments are currently only available for less than 5% of all RDs [@Halley2022-pd].\nIn addition to the scientific challenges of understanding RDs, there are strong financial disincentives for pharmaceutical and biotechnology companies to develop expensive therapeutics for exceedingly small RD patient populations with little or no return on investment [@Institute_of_Medicine_US_Committee_on_Accelerating_Rare_Diseases_Research_and_Orphan_Product_Development2010-vj; @Yates2022-ra].\nThose that have been produced are amongst the world’s most expensive drugs, greatly limiting patients’ ability to access them [@Nuijten2022-yc; @Thielen2022-ud]. The provision of timely, effective and affordable care for RD patients will require substantive transformations to our existing scientific, clinical, and regulatory frameworks.\n\nA major challenge in both healthcare and scientific research is the scalable exchange of information.\nEven in the age of electronic healthcare records (EHR) much of the information about an individual’s history is currently fractured across healthcare providers, often with differing nomenclatures for the same conditions.\nThe Human Phenotype Ontology (HPO) is a hierarchically organised set of controlled clinical terms that provides a much needed common framework by which clinicians and researchers can precisely communicate patient conditions [@Gargano2024-fc; @Kohler2019-pc; @Robinson2008-ys; @Kohler2021-wk].\nThe HPO spans all domains 
of human physiology and currently describes 18082 phenotypes across 10,300 RDs.\nEach phenotype and disease is assigned its own unique identifier and organised as a hierarchical graph, such that higher-level terms describe broad phenotypic categories or *branches* (e.g. *HP:0033127*: 'Abnormality of the musculoskeletal system' which contains 4495 unique phenotypes) and lower-level terms describe increasingly precise phenotypes (e.g. *HP:0030675*: \"Contracture of proximal interphalangeal joints of 2nd-5th fingers\").\nIt has already been integrated into healthcare systems and clinical diagnostic tools around the world, with increasing adoption over time [@Gargano2024-fc].\nCommon ontology-controlled frameworks like the HPO open a wealth of new opportunities, especially when addressing RDs.\nServices such as the Matchmaker Exchange [@Osmond2022-ml; @Philippakis2015-dq] have enabled the discovery of hundreds of underlying genetic etiologies, and led to the diagnosis of many patients.\nThis also opens the possibility of gathering cohorts of geographically dispersed patients to run clinical trials, the only viable option for treatment in many individuals.\nTo further increase the number of individuals who qualify for these treatments, as well as the trial sample size, proposals have been made to deviate from the traditional single-disease clinical trial model and instead perform basket trials on groups of RDs with shared molecular etiologies (SaME) [@Zanello2023-zd].\nHowever, this approach, and indeed much of RD patient care, hinges upon first characterising the molecular mechanisms underlying each RD.\n\nOver 80% of RDs have a known genetic cause [@Nguengang_Wakap2020-cz; @noauthor_2022-ok].\nDespite this, our knowledge of the physiological mechanisms via which genetics cause pathogenesis is lacking for most RDs, severely hindering our ability to effectively diagnose, prognose and treat RD patients.\nThe availability of standardised, ontology-controlled databases presents 
opportunities to systematically investigate RDs at scale.\nSince 2008, the HPO has been continuously updated using knowledge from the medical literature, as well as by integrating databases of expert validated gene-phenotype relationships, such as OMIM [@Amberger2019-vl; @Amberger2017-tg; @McKusick2007-di], Orphanet [@Maiella2013-oo; @Weinreich2008-wm], and DECIPHER @Firth2009-qg.\nA subset of the HPO contains gene annotations for 11,047 phenotypes across 8,631 diseases.\nYet genes alone do not tell the full story of how RDs come to be, as their expression and functional relevance varies drastically across the multitude of tissues and cell types contained within the human body.\n\nOur knowledge of single-cell-resolution biology has exploded over the course of the last decade and a half, with numerous applications in both scientific and clinical practices [@Baysoy2023-vt; @Haque2017-bn; @Qi2023-ev].\nMore recently, comprehensive single-cell transcriptomic atlases across tissues have also emerged [@CZI_Single-Cell_Biology_Program2023-fs; @Svensson2020-lg].\nIn particular, the Descartes Human @Cao2020-qz and Human Cell Landscape @Han2020-iq projects provide comprehensive multi-system single-cell RNA-seq (scRNA-seq) atlases in embryonic, foetal, and adult human samples from across the human body.\nThese datasets provide data-driven gene signatures for hundreds of cell subtypes.\nThey also allow us to investigate disease mechanisms in the context of specific life stages.\n\nHere, we combine and extend several of the most comprehensive genomic and transcriptomic resources currently available to systematically uncover the cell types underlying granular phenotypes across 8,628 diseases.\nWe then go on to highlight thousands of novel phenotype-cell type associations which collectively expand our knowledge of cell type-resolved phenotypes by an estimated 567-fold.\nNext, we present several potential avenues for real world applications of these results in the context of RD 
patient diagnosis, prognosis, treatment, and therapeutics development.\n\n## Results {#sec-results}\n\n### Phenotype-cell type associations\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Create phenotype-gene matrix filled with aggregated GenCC evidence scores\nymat <- HPOExplorer::hpo_to_matrix(formula = \"gene_symbol ~ hpo_id\")\n## Run phenomix with DescartesHuman CellTypeDataset\nlm_res1 <- MSTExplorer::run_phenomix(ctd_name = \"DescartesHuman\",\n annotLevel = 2, \n test_method = \"glm_univariate\",\n ymat = ymat)\n## Run phenomix with HumanCellLandscape CellTypeDataset\nlm_res2 <- MSTExplorer::run_phenomix(ctd_name = \"HumanCellLandscape\",\n annotLevel = 3, \n test_method = \"glm_univariate\",\n ymat = ymat)\n## Merge results\nresults <- data.table::rbindlist(list(DescartesHuman=lm_res1,\n HumanCellLandscape=lm_res2),\n idcol = \"ctd\")\n## Apply multiple testing correction\nresults[,q:=stats::p.adjust(p,method=\"fdr\")]\n```\n:::\n\n\nIn this study we systematically investigated the cell types underlying phenotypes across the HPO. \nA summary of the phenome-wide results stratified by single-cell atlas can be found in @tbl-summary.\nWithin the results using the Descartes Human single-cell atlas, 19,929/ 848,078 (2.35%) tests across 77/ 77 (100%) cell types and 7,340/11,047 (66.4%) phenotypes revealed significant phenotype-cell type associations after multiple-testing correction ($FDR_{p,c}<0.05$). Using the Human Cell Landscape single-cell atlas, 26,585/1,358,916 (1.96%) tests across 124/124 (100%) cell types and 9,049/11,047 (81.9%) phenotypes showed significant phenotype-cell type associations ($FDR_{p,c}<0.05$). 
The median number of significantly associated phenotypes per cell type was 252 (Descartes Human) and 200 (Human Cell Landscape), respectively.\n\nAcross both single-cell references, the median number of significantly associated cell types per phenotype was 3, suggesting reasonable specificity of the testing strategy.\n8,628/8,631 (\\~100%) of diseases within the HPO gene annotations showed significant cell type associations for at least one of their respective phenotypes.\n\n### Validation of expected phenotype-cell type relationships\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nplot_bar_dendro_out <- MSTExplorer::plot_bar_dendro(\n results = results, \n show_plot = FALSE) \n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: ggdendro\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAncestor columns already present. Skipping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2,206,994 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltered 'ancestor_name' : 999,488 / 2,206,994 rows dropped.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\ncardiocyte\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading cached ontology: /Users/bms20/Library/Caches/org.R-project.R/R/KGExplorer/cl.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nendocrine cell\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nphotoreceptor cellretinal cell\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nleukocyte\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\ncell of skeletal musclechondrocyte\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nneural cell\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nrespiratory epithelial cellepithelial cell of lung\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading cached ontology: /Users/bms20/Library/Caches/org.R-project.R/R/KGExplorer/cl.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nConverted ontology to: igraph 
\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nConverted ontology to: igraph_dist \n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to names.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: tol\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: Invalid .internal.selfref detected and fixed by taking a (shallow)\ncopy of the data.table so that := can add this new column by reference. At an\nearlier point, this data.table has been copied by R (or was created manually\nusing structure() or similar). Avoid names<- and attr<- which in R currently\n(and oddly) may copy the whole data.table. Use set* syntax instead to avoid\ncopying: ?set, ?setnames and ?setattr. If this message doesn't help, please\nreport your use case to the data.table issue tracker so the root cause can be\nfixed or this message improved.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRunning tests: across_branches_per_celltype\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: Invalid .internal.selfref detected and fixed by taking a (shallow)\ncopy of the data.table so that := can add this new column by reference. At an\nearlier point, this data.table has been copied by R (or was created manually\nusing structure() or similar). Avoid names<- and attr<- which in R currently\n(and oddly) may copy the whole data.table. Use set* syntax instead to avoid\ncopying: ?set, ?setnames and ?setattr. 
If this message doesn't help, please\nreport your use case to the data.table issue tracker so the root cause can be\nfixed or this message improved.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nScale for x is already present.\nAdding another scale for x, which will replace the existing scale.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nScale for y is already present.\nAdding another scale for y, which will replace the existing scale.\nAncestor columns already present. Skipping.\n\n2,206,994 associations remain after filtering.\n\nCell type columns already present. Skipping mapping.\n\ncardiocyte\n\nTranslating ontology terms to ids.\n\nLoading cached ontology: /Users/bms20/Library/Caches/org.R-project.R/R/KGExplorer/cl.rds\n\nendocrine cell\n\nTranslating ontology terms to ids.\n\nphotoreceptor cellretinal cell\n\nTranslating ontology terms to ids.\n\nleukocyte\n\nTranslating ontology terms to ids.\n\ncell of skeletal musclechondrocyte\n\nTranslating ontology terms to ids.\n\nneural cell\n\nTranslating ontology terms to ids.\n\nrespiratory epithelial cellepithelial cell of lung\n\nTranslating ontology terms to ids.\n\nProportional enrichment summary stats:\n\n - pct_min: 17.22\n\n - pct_max: 64.86\n\n - pct_max_mean: 40.3\n\n - pct_max_sd: 16.36\n\n - enrichment_mean: 6.09\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\noverrep_dat <- plot_bar_dendro_out$ggbars_out$data_summary\n\noverrep_dat[,summary:=paste0(ancestor_name,\": \", \n n_celltypes_sig,\"/\",n_celltypes,\n \" types of \",shQuote(target_celltypes),\n \" were overrepresented\",\n \" ($N_{p}$=\",phenotypes_per_ancestor,\").\")] \n```\n:::\n\n\nWithin each high-level branch in the HPO shown in [Fig. 
@fig-summary]b, we tested whether each cell type was more often associated with phenotypes in that branch relative to those in all other branches (including those not shown).\nWe then checked whether each cell type was overrepresented (at $FDR_{b,c}<0.05$) within its respective on-target HPO branch, where $N_{p}$ denotes the number of phenotypes within that branch.\nAbnormality of the cardiovascular system: 5/6 types of 'cardiocyte' were overrepresented ($N_{p}$=673). Abnormality of the endocrine system: 3/4 types of 'endocrine cell' were overrepresented ($N_{p}$=291). Abnormality of the eye: 5/5 types of 'photoreceptor cell/retinal cell' were overrepresented ($N_{p}$=721). Abnormality of the immune system: 4/4 types of 'leukocyte' were overrepresented ($N_{p}$=255). Abnormality of the musculoskeletal system: 4/4 types of 'cell of skeletal muscle/chondrocyte' were overrepresented ($N_{p}$=2155). Abnormality of the nervous system: 19/23 types of 'neural cell' were overrepresented ($N_{p}$=1647). Abnormality of the respiratory system: 2/2 types of 'respiratory epithelial cell/epithelial cell of lung' were overrepresented ($N_{p}$=292).\n\nAs an additional form of validation ([Fig. @fig-summary]d), we tested for a relationship between phenotype-cell type association significance ($-log_{e}(p_{p,c})$ where $log_{e}$ denotes natural log and $p_{p,c}$ denotes uncorrected phenotype-cell type association p-values) and the proportion of on-target cell types. The list of on-target cell types was determined by matching each high-level HPO branch to a corresponding CL branch. 
These cross-ontology mappings can be found in @tbl-celltypes.\nFor this analysis we used raw p-values ($p_{p,c}$) rather than multiple-testing corrected p-values ($FDR_{p,c}$) to provide a more dynamic range of values (as the latter can drive values to 1).\nAll 7/7 high-level HPO branches showed a consistent upwards trend towards greater proportions of on-target cell types with increasing degrees of significance.\nFurthermore, all branches also showed a proportion of on-target cell types above that expected by chance (baseline = on-target cell types / total cell types) at $-log_{e}(p_{p,c})>1$.\n\n\n::: {#cell-fig-summary .cell}\n\n```{.r .cell-code .hidden}\nplot_bar_dendro_out$plot\n```\n\n::: {.cell-output-display}\n![Summary of significant associations between phenotypes and cell types, aggregated by HPO branch. Here we show **a**, the total number of significant phenotype enrichments per cell type ($FDR_{p,c}<0.05$) across all branches of the HPO. **b**, Number of phenotype association related to several high-level branches of the HPO. Asterisks above each bar indicate whether that cell type was significantly more often enriched in that branch relative to all other HPO branches, including those not shown here, as a proxy for how specifically that cell type is associated with that branch; $FDR _{b,c}<1e-04$ (\\*\\*\\*\\*), $FDR _{b,c}<0.001$ (\\*\\*\\*), $FDR_{b,c}<0.01$ (\\*\\*), $FDR _{b,c}<0.05$ (\\*). **c**, Dendrogram derived from the Cell Ontology (CL) showing the relatedness of all tested cell types to one another. For simplicity, cell type labels shown here are aligned to the CL [@Diehl2016-gt] and can therefore encompass one or more cell types annotated by the original authors of scRNA-seq datasets [@Cao2020-qz; @Han2020-iq]. **d**, Percentage of significant phenotype associations with on-target cell types (second row of facet labels), respective to the HPO branch. 
As significance increases ($-log_{10}(p)$ along the *x-axis*) the percentage of on-target enriched cell types also increases (*y-axis*).](index_files/figure-pdf/fig-summary-1.pdf){#fig-summary fig-pos='H'}\n:::\n:::\n\n\n### Validation of inter- and intra-dataset consistency\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nlibrary(data.table)\n## Across CTD\nvalidate_associations_correlate_ctd_out <- MSTExplorer::validate_associations_correlate_ctd(\n results=results, \n group_var=\"ctd\")\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCasting results.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n13 comparable celltypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n10945 comparable phenotypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n10 comparable celltypes @FDR<0.05.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n554 comparable phenotypes @FDR<0.05.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGenerating plots.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRegistered S3 method overwritten by 'ggside':\n method from \n +.gg ggplot2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering statistics.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\n## Replace p-values of exactly 0 with smallest number R can compute\nvalidate_associations_correlate_ctd_out$data_stats$p.all$summary_data$p.value <- max(validate_associations_correlate_ctd_out$data_stats$p.all$summary_data$p.value,\n .Machine$double.xmin)\n\n## Within CTD: across developmental stages\nvalidate_associations_correlate_ctd_out_hcl <- 
MSTExplorer::validate_associations_correlate_ctd(\n results=results,\n filters= list(ctd=c(\"HumanCellLandscape\"), \n stage=c(\"Fetus\",\"Adult\")),\n group_var=\"stage\")\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltered 'ctd' : 848,078 / 2,206,994 rows dropped.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltered 'stage' : 54,795 / 1,358,916 rows dropped.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCasting results.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n38 comparable celltypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n10959 comparable phenotypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n33 comparable celltypes @FDR<0.05.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n1584 comparable phenotypes @FDR<0.05.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGenerating plots.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering statistics.\n```\n\n\n:::\n:::\n\n\nNext, we sought to validate the consistency of our results across the two single-cell reference datasets (Descartes Human vs. Human Cell Landscape) across the subset of overlapping cell types [Fig. 
@fig-ctd-correlation].\nIn total there were 142285 phenotype-cell type associations to compare across the two datasets (across 10945 phenotypes and 13 cell types annotated to the exact same CL term).\nWe found that the correlation between p-values of the two datasets was high ($rho=0.492, p=1.08e-93$).\nWithin the subset of results that were significant in both single-cell datasets ($FDR_{p,c}<0.05$), we found that the correlation of the association effect sizes was even stronger ($rho=0.723, p=1.08e-93$).\nWe also checked for the intra-dataset consistency between the p-values of the foetal and adult samples in the Human Cell Landscape, showing a very similar degree of correlation to the inter-dataset comparison ($rho=0.436, p=2.36e-149$).\nTogether, these results suggest that our approach to identifying phenotype-cell type associations is highly replicable and generalisable to new datasets.\n\n### More specific phenotypes are associated with fewer genes and cell types\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nplot_ontology_levels_out <- MSTExplorer::plot_ontology_levels(\n results = results, \n ctd_list = ctd_list,\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: gginnards\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version:
v2024-02-08\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nplot_ontology_levels_out_stats <- plot_ontology_levels_out$data_stats$summary_data|>\n data.table::setkeyv(\"parameter2\")\n## replace pvalues of exactly 0 with the minimum computable number in R\n## This avoids creating -Inf when logging values.\nplot_ontology_levels_out_stats[p.value==0, p.value:=.Machine$double.xmin]\nplot_ontology_levels_out_stats[q.value==0, q.value:=.Machine$double.xmin]\nplot_ontology_levels_out_stats[,summary:=paste0(\n \"$\",\n \"p=\",format(p.value,digits=3),\", \", \n \"q=\",format(q.value,digits=3),\", \",\n \"rho=\",format(estimate,digits=3),\n \"$\"\n )][,summary:=gsub(\"=[ ]\",\"=\",summary)]\n```\n:::\n\n\nFirst, we found that phenotype ontology level showed a significant negative correlation with the number of genes annotated to that phenotype in the HPO data ([Fig. @fig-ontology-lvl]a; $p=2.23e-308, q=2.23e-308, rho=-0.2634$).\nThis is expected as broader phenotypes tend to have large gene set annotations.\nNext, we reasoned that deeper HPO ontology levels representing more specific phenotypes were likely to be associated with fewer, more specific subsets of cell types.\nThis was indeed the case, as we observed a strongly significant negative correlation between the two variables ([Fig. @fig-ontology-lvl]b; $p=2.23e-308, q=2.23e-308, rho=-0.2927$).\nWe also found that the effect size of significant phenotype-cell type associations ($FDR_{p,c}<0.05$) increased with greater phenotype specificity, though the relationship was rather weak ([Fig. @fig-ontology-lvl]c; $p=7.30e-97, q=7.30e-97, rho=0.0966$).\nFinally, we found that the mean expression specificity of phenotype-associated genes (within the cell types significantly associated with those respective phenotypes at $FDR_{p,c}<0.05$) was positively correlated with phenotype ontology depth ([Fig.
@fig-ontology-lvl]d; $p=2.71e-174, q=3.61e-174, rho=0.1398$).\n\n\n::: {#cell-fig-ontology-lvl .cell}\n\n```{.r .cell-code .hidden}\nplot_ontology_levels_out$plot\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`geom_smooth()` using formula = 'y ~ x'\n`geom_smooth()` using formula = 'y ~ x'\n`geom_smooth()` using formula = 'y ~ x'\n`geom_smooth()` using formula = 'y ~ x'\n```\n\n\n:::\n\n::: {.cell-output-display}\n![More specific phenotypes are associated with fewer, more specific genes and cell types. Box plots showing relationship between HPO phenotype level and **a**, the number of genes annotated to each phenotype, **b**, the number of significantly enriched cell types, **c**, the effect size of phenotype-cell type association tests at $FDR_{p,c}<0.05$, and **d**, the mean expression specificity of phenotype-associated genes in the cell types significantly associated with those respective phenotypes ($FDR_{p,c}<0.05$). Ontology level 0 represents the most inclusive HPO term 'All', while higher ontology levels (max=16) indicate progressively more specific HPO terms (e.g. 'Contracture of proximal interphalangeal joints of 2nd-5th fingers'). Boxes are coloured by the mean value (respective to the subplot) within each HPO level.](index_files/figure-pdf/fig-ontology-lvl-1.pdf){#fig-ontology-lvl fig-pos='H'}\n:::\n:::\n\n\n### Hepatoblasts have a unique role in recurrent Neisserial infections\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nresults_tmp <- HPOExplorer::add_ancestor(data.table::copy(results),\n lvl = 7,\n force_new = TRUE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nForce new. 
Removing existing ancestor columns.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding level-7 ancestor to each HPO ID.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding ancestor metadata.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGetting absolute ontology level for 18,082 IDs.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n3027 ancestors found at level 7\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to names.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2,206,994 associations remain after filtering.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ntarget_branches <- list(\"Recurrent bacterial infections\"=\"leukocyte\")\ninfections_out <- MSTExplorer::plot_bar_dendro_facets(\n results=results_tmp,\n target_branches=target_branches,\n facets = \"hpo_name\",\n legend.position=\"top\",\n lvl=9,\n ncol=2,\n vlines=\"hepatoblast\",\n fill_var=\"ancestor_name_original\",\n facets_n=NULL,\n q_threshold=0.05,\n background_full=FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nForce new. 
Removing existing ancestor columns.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding level-9 ancestor to each HPO ID.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding ancestor metadata.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGetting absolute ontology level for 18,082 IDs.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2384 ancestors found at level 9\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to names.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2,206,994 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nKeeping descendants of 1 term(s).\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n20 terms remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n4,020 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nleukocyte\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading cached ontology: /Users/bms20/Library/Caches/org.R-project.R/R/KGExplorer/cl.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRunning tests: across_branches_per_celltype\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: gnuplot\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nremove(results_tmp)\n\nstaph_res <- infections_out$data[hpo_name==\"Recurrent staphylococcal infections\"]\nstaph_res_top <- staph_res[,.SD[p %in% head(sort(p), 1)], by=c(\"hpo_id\")]\n \nrecurrent_infections_ids <- KGExplorer::get_ontology_descendants(ont = hpo,\n terms = \"Recurrent infections\")[[1]]\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRecurrent infections\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nhepatoblast_res <- results[q<0.05 &hpo_id %in% recurrent_infections_ids & cl_name==\"hepatoblast\"]\nhepatocyte_res <- results[q<0.05 & ancestor_name==\"Abnormality of the immune system\" & grepl(\"hepatocyte\",CellType,ignore.case = TRUE)] \n```\n:::\n\n\nWe selected the HPO term 'Recurrent bacterial infections' and all of its descendants (19 phenotypes) as an example of how investigations at the level of granular phenotypes can reveal different cell type-specific mechanisms ([Fig. @fig-rni]).\nAs expected, these phenotypes are primarily associated with immune cell types (e.g. 
macrophages, dendritic cells, T cells, monocytes, neutrophils).\nSome associations confirm relationships previously suggested in the literature, such as that between 'Recurrent staphylococcal infections' and myeloid cells [@Heim2014-du; @Pidwill2020-le; @Stoll2018-dc; @Tebartz2015-xs].\nSpecifically, our results pinpoint monocytes as the most strongly associated cell subtype ($FDR_{p,c}= 1.03e-30,B= 1.76e-01$).\n\nIn contrast to all other recurrent infection types, 'Recurrent Neisserial infections' highlighted a novel association with hepatoblasts (Descartes Human: $FDR_{p,c}= 1.13e-06,B= 8.24e-02$).\nWhilst unexpected, a convincing explanation involves the complement system, a key driver of innate immune response to Neisserial infections.\nHepatocytes, which derive from hepatoblasts, produce the majority of complement proteins [@Zhou2016-kq], and Kupffer cells express complement receptors [@Dixon2013-ok].\nIn addition, individuals with deficits in complement are at high risk for Neisserial infections [@Ladhani2019-nf; @Rosain2017-ih], and a genome-wide association study in those with a Neisserial infection identified risk variants within complement proteins [@The_International_Meningococcal_Genetics_Consortium2010-if].\nWhile the potential of therapeutically targeting complement in RDs (including Neisserial infections) has been proposed previously [@Lung2019-il; @Reis2015-yz], performing this in a gene- and cell type-specific manner may help to improve efficacy and reduce toxicity (e.g.
due to off-target effects).\nImportantly, there are over 56 known genes within the complement system [@Seal2023-pa], highlighting the need for a systematic, evidence-based approach to identify effective gene targets.\n\nAlso of note, despite the fact that our datasets contain both hepatoblasts and their mature counterpart, hepatocytes, only the hepatoblasts showed this association.\nThis suggests that the genetic factors that predispose individuals to Neisserial infections are specifically affecting hepatoblasts before they become fully differentiated.\nIt is also notable that these phenotypes were the only ones significantly associated with hepatoblasts within the 'Recurrent bacterial infections' branch, or even the broader 'Recurrent infections' branch, perhaps indicating a unique role for hepatoblasts in recurrent infectious disease.\nThe only phenotypes within the even broader 'Abnormality of the immune system' HPO branch that were significantly associated with mature hepatocytes were 'Pancreatitis' ($FDR_{p,c}= 2.08e-02,B= 5.25e-02$) and 'Susceptibility to chickenpox' ($FDR_{p,c}= 1.20e-02,B= 5.49e-02$), both of which are well-known to involve the liver [@Al-Hamoudi2009-le; @Brewer2018-dg; @Eshchar1973-tz].\n\n\n::: {#cell-fig-rni .cell}\n\n```{.r .cell-code .hidden}\ninfections_out$plot + ggplot2::guides(fill=ggplot2::guide_legend(ncol=2))\n```\n\n::: {.cell-output-display}\n![Hepatoblasts have a unique role in recurrent Neisserial infections. Significant phenotype-cell type tests for phenotypes within the branch 'Recurrent bacterial infections'. Amongst all different kinds of recurrent bacterial infections, hepatoblasts (highlighted by vertical dotted lines) are exclusively enriched in 'Recurrent gram−negative bacterial infections'. Note that terms from multiple levels of the same ontology branch are shown as separate facets (e.g.
'Recurrent bacterial infections' and 'Recurrent gram−negative bacterial infections').](index_files/figure-pdf/fig-rni-1.pdf){#fig-rni fig-pos='H'}\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Annotate results with disease/symptom-level and gene-level information\n## filtering q-values at this step yields the same results as filtering at the next step, \n## albeit with much fast computation.\nresults_annot <- HPOExplorer::add_genes(results[q<0.05],\n allow.cartesian = TRUE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nresults_annot <- MSTExplorer::add_symptom_results(results = results_annot, \n ctd_list = ctd_list)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding symptom-level results.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSubsetting results by q_threshold and effect.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2,301,874 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\n## Plot multi-scale mechanisms as an 
interactive network\nphenotype <- \"Recurrent Neisserial infections\"\nvn_rni <- MSTExplorer::prioritise_targets_network(\n top_targets = results_annot[hpo_name==phenotype], \n main = NULL, \n height = \"400px\",\n width = \"500px\",\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: visNetwork\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. 
Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//RtmpRH55Yj/file11bed3d97c051_prioritise_targets_network.html\n```\n\n\n:::\n:::\n\n\nNext, we sought to link multi-scale mechanisms at the levels of disease, phenotype, cell type, and gene and visualise these as a network ([Fig. @fig-network-rni]).\nThis revealed that genetic deficiencies in different complement system genes (*C5*, *C8*, and *C7*) are primarily mediated by different cell types (hepatoblasts, stratified epithelial cells, and stromal cells, respectively).\nWhile genes of the complement system are expressed throughout many different tissues and cell types, these results indicate that different subsets of these genes may mediate their effects through different cell types.\nThis finding suggests that investigating (during diagnosis) and targeting (during treatment) different cell types may be critical for the diagnosis and treatment of these closely related, yet mechanistically distinct, diseases.\n\n\n::: {#cell-fig-network-rni .cell}\n\n```{.r .cell-code .hidden}\nvn_rni$plot\n```\n\n![Multi-scale mechanisms of Recurrent Neisserial infections. Starting from the bottom of the plot, one can trace how causal genes (yellow boxes) mediate their effects through cell types (orange circles), phenotypes (purple cylinders) and ultimately diseases (blue cylinders).
Cell types are connected to phenotypes via association testing ($FDR_{p,c}<0.05$), and to diseases when the symptom gene set overlap is >25%. Nodes were spatially arranged using the Sugiyama algorithm [@Sugiyama1981-ev].](index_files/figure-pdf/fig-network-rni-1.pdf){#fig-network-rni fig-pos='H'}\n:::\n\n\n### Monarch Knowledge Graph recall\n\nNext, we used the Monarch Knowledge Graph (MKG) as a proxy for the field's current state of knowledge of phenotype-cell type associations.\nWe evaluated the proportion of MKG associations that were recapitulated by our results.\nIn total, our results contained at least one significant cell type association for \\>90% of the phenotypes described in the MKG.\nOf these phenotypes, we captured \\>45% of the MKG phenotype-cell type associations when only considering exact overlap of CL-aligned cell type annotations.\nThis proportion increased with greater flexibility in the matching of cell type annotations, reaching a maximum of `**!!RECOMPUTE!!**`% at an ontology graph distance of `**!!RECOMPUTE!!**` when considering the overlap of cell type annotations at the level of cell type ontology terms.\nThis suggests that our results are in line with the current state of knowledge, and that our approach can be used to identify novel phenotype-cell type associations.\n\n### Annotation of phenotypes using generative large language models\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ngpt_check <- HPOExplorer::gpt_annot_check()\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n151 phenotypes do not have matching HPO IDs.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading in GPT
annotations for 16,982 phenotypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nNumber of phenotype hits per query group:\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n - intellectual_disability: 6\n - impaired_mobility: 292\n - physical_malformations: 78\n - blindness: 1\n - sensory_impairments: 252\n - immunodeficiency: 5\n - cancer: 695\n - reduced_fertility: 5\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: The `facets` argument of `facet_grid()` is deprecated as of ggplot2 2.2.0.\ni Please use the `rows` argument instead.\ni The deprecated feature was likely used in the HPOExplorer package.\n Please report the issue at\n .\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ngpt_annot <- HPOExplorer::gpt_annot_codify()\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n151 phenotypes do not have matching HPO IDs.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading in GPT annotations for 16,982 phenotypes.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ngpt_annot$annot_weighted[,hpo_name:=gsub(\"^obsolete \",\"\",hpo_name)]\nleast_severe_phenotype <- gpt_annot$annot_weighted[hpo_name==\"Thin toenail\" & severity_score_gpt==0,]\n```\n:::\n\n\nSeverity annotations were gathered from GPT-4 for 16982/18082 (93.9166%) HPO phenotypes.\nIn our companion study, we performed benchmarking tests of these results using ground-truth HPO branch annotations.\nFor example, phenotypes within the 'Blindness' HPO branch (*HP:0000618*) were correctly annotated as causing blindness by GPT-4.\nAcross all annotations, the recall rate of GPT-4 annotations was 91.26%
(min=70.1%, max=100%, SD=11.84) with a mean consistency score of 91.21% (min=80.96%, max=97.48%, SD=5.739) for phenotypes whose annotations were collected more than once.\nThis clearly demonstrates the ability of GPT-4 to accurately annotate phenotypes.\nThis allowed us to begin using these annotations to compute systematically collected severity scores for all phenotypes in the HPO.\n\nFrom these annotations we computed a weighted severity score metric for each phenotype ranging from 0-100 (100 being the theoretical maximum severity of a phenotype that always causes every annotation).\nWithin our annotations, the most severe phenotype was 'Anencephaly' (*HP:0002323*) with a severity score of 58, followed by 'Atrophy/Degeneration affecting the central nervous system' (*HP:0007367*) with a severity score of 58.\nThere were 677 phenotypes with a severity score of 0 (e.g. 'Thin toenail').\nThe mean severity score across all phenotypes was 14.89 (median=14, standard deviation=8.517).\n\n### Enrichment of foetal cell types in congenital phenotypes\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nplot_congenital_annotations_out <- MSTExplorer::plot_congenital_annotations(\n results = results)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n151 phenotypes do not have matching HPO IDs.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading in GPT annotations for 16,982 phenotypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present.
Skipping mapping.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nplot_congenital_annotations_out_data <- \n data.table::data.table(plot_congenital_annotations_out$data, key=\"congenital_onset\")\nplot_congenital_annotations_out_data[,summary:=paste0(\n shQuote(congenital_onset),\"=\",.label,\n \" (n=\",counts,\" associations)\"\n)]\nplot_congenital_annotations_out_stats <- data.table::data.table(\n plot_congenital_annotations_out$data_stats$summary_data\n )[,summary:=paste0(\n \"$\",\n \"p=\",format(p.value,digits=2),\",\",\n \"\\\\chi^2_{Pearson}=\",format(statistic,digits=2),\",\",\n \"\\\\hat{V}_{Cramer}=\",format(estimate,digits=2),\n \"$\"\n)]\n```\n:::\n\n\nThe frequency of congenital onset of each phenotype (as determined by GPT-4 annotations) was strongly predictive of the proportion of significantly associated foetal cell types in our results ($p=2e-203,\chi^2_{Pearson}=940,\hat{V}_{Cramer}=0.14$).\nFurthermore, increasing congenital frequency annotation (on an ordinal scale) corresponded to an increase in the proportion of foetal cell types: 'always'=24% (n=1636 associations), 'often'=20% (n=2979 associations), 'rarely'=12% (n=1956 associations), 'never'=10% (n=811 associations).\nThis is consistent with the expected role of foetal cell types in development and the aetiology of congenital disorders.\n\n\n::: {#cell-fig-congenital .cell}\n\n```{.r .cell-code .hidden}\nplot_congenital_annotations_out$plot\n```\n\n::: {.cell-output-display}\n![Congenital phenotypes are more often associated with foetal cell types.
The more often a phenotype is congenital in nature, the greater the proportion of foetal cell types that are significantly associated with it.](index_files/figure-pdf/fig-congenital-1.pdf){#fig-congenital fig-pos='H'}\n:::\n:::\n\n\n### Diagnosis via cell type-specific disease prediction\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Define input phenotypes/genes\nphenotypes_diagnose <- c(\"Generalized neonatal hypotonia\",\n \"Scrotal hypospadias\",\n \"Increased circulating progesterone\")\nphenotypes_diagnose <- HPOExplorer::map_phenotypes(phenotypes_diagnose,\n to=\"id\",\n hpo=hpo)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ngenes_include <- c(\"HSD3B2\",\"HERC2\")\ngenes_exclude <- c(\"SNORD115-1\")\n## Predict cell types\npredict_celltypes_out <- MSTExplorer::predict_celltypes(\n phenotypes = names(phenotypes_diagnose),\n genes_include = genes_include, \n genes_exclude = genes_exclude,\n phenotype_to_genes = p2g,\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding logFC column.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMapping cell types to cell ontology terms.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding stage information.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading ctd_DescartesHuman.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading ctd_HumanCellLandscape.rds\n```\n\n\n:::\n\n:::
{.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating gene-disease associations with Evidence Score\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering data from GenCC.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: Invalid .internal.selfref detected and fixed by taking a (shallow)\ncopy of the data.table so that := can add this new column by reference. At an\nearlier point, this data.table has been copied by R (or was created manually\nusing structure() or similar). Avoid names<- and attr<- which in R currently\n(and oddly) may copy the whole data.table. Use set* syntax instead to avoid\ncopying: ?set, ?setnames and ?setattr. If this message doesn't help, please\nreport your use case to the data.table issue tracker so the root cause can be\nfixed or this message improved.\n```\n\n\n:::\n:::\n\n\nUsing the function `MSTExplorer::predict_celltypes` we input 3 inclusion phenotypes ('Generalized neonatal hypotonia' (*HP:0008935*), 'Scrotal hypospadias' (*HP:0012853*), 'Increased circulating progesterone' (*HP:0031216*)), 2 genes in which the patient is known to have deleterious mutations (*HSD3B2*, *HERC2*) and 1 gene in which the patient is known not to have any deleterious mutations (*SNORD115-1*). This predicted that cortical cell of adrenal gland (score sum=1.38, score mean=0.0256, score standard deviation=0.137) were the most probable cell types underlying this combination of phenotypes and genotypes ([Fig. 
@fig-diagnosis]), which is highly consistent with existing evidence that adrenal insufficiency can cause both phenotypes via mutations in these genes [@Srivastava2023-ge; @Utsch2004-re]. This was the only cell type to receive a score two standard deviations from the mean score of all cell types (mean score: 0.000668).\n\n\n::: {#cell-fig-diagnosis .cell}\n\n```{.r .cell-code .hidden}\npredict_celltypes_out$plot\n```\n\n::: {.cell-output-display}\n![Diagnosis - Observed phenotypes/genotypes can be used to identify causal cell types in individuals. Our phenotype-cell type association results can be used to make predictions about which cell types are underlying a set of phenotypes observed in a given patient. Here we input three inclusion phenotypes, two inclusion genes, and one exclusion gene into the function `MSTExplorer::predict_celltypes`. The output is a ranked list of the top 10 most probable cell types (*x-axis*) underlying this combination of phenotypes/genotypes (highest to lowest rank from left to right). The score on the *y-axis* is computed by aggregating phenotype-celltype association summary statistics and evidence-weighted phenotype-gene associations. In this simple example, cortical cells of the adrenal gland were predicted as the most probable cell type. The mean of the score sum is shown as a dashed line, while one standard deviation (SD) above this is shown as a dotted line. 
Each bar is coloured by its mean.](index_files/figure-pdf/fig-diagnosis-1.pdf){#fig-diagnosis fig-pos='H'}\n:::\n:::\n\n\n### Prognosis via cell type-mediated differential outcomes\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Count number of diseases associated with these phenotypes\nkeep_descendants <- \"Hypotonia\" ## HP:0001252\nhypotonia_results <- HPOExplorer::filter_descendants(results, \n keep_descendants = keep_descendants) \n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nKeeping descendants of 1 term(s).\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n14 terms remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2,814 associations remain after filtering.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nhypotonia_results <- HPOExplorer::add_death(hypotonia_results,\n allow.cartesian = TRUE,\n agg_by = c(\"disease_id\",\"hpo_id\"))\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with AgeOfDeath.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nhypotonia_results <- MSTExplorer::map_celltype(hypotonia_results) \n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\n## Generate plot\nplot_differential_outcomes_heatmap_out <- MSTExplorer::plot_differential_outcomes_heatmap( \n results = hypotonia_results, \n print_phenotypes = TRUE,\n fill_limits = c(1,8),\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding symptom-level results.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSubsetting results by q_threshold and effect.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n52,234 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading ctd_DescartesHuman.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading ctd_HumanCellLandscape.rds\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nmin_ageofdeath_score_celltypes <- unique(plot_differential_outcomes_heatmap_out$data_agg[which(AgeOfDeath_score_min==min(AgeOfDeath_score_min))]$celltype_symptom)\n```\n:::\n\n\nHypotonia (*HP:0001252*) is a very broad phenotype containing 13 subterms (e.g. 
\"Generalized neonatal hypotonia\") and is associated with 2569 unique diseases in the HPO gene annotations.\nTogether, these hypotonia phenotypes were significantly associated with 29/99 (29.29%) unique CL-aligned cell types.\nThis reflects the highly variable set of disease etiologies that can cause this broad-level phenotype.\nAcross all diseases, hypotonia phenotypes tended to be most consistently severe (lower mean age of death score) when associated with the cell type inhibitory interneuron.\nWhile other cell types were associated with lower mean age of death scores (e.g. stromal cell, astrocyte), the severity of the outcomes was more variable.\n\n\n::: {#cell-fig-prognosis .cell}\n\n```{.r .cell-code .hidden}\nplot_differential_outcomes_heatmap_out$plot\n```\n\n::: {.cell-output-display}\n![Prognosis - Cell types predict the probability of deadly diseases. The broad phenotype 'Hypotonia' and its descendants occur in many different diseases (1,832 diseases in the HPO annotations). Therefore, it can be difficult to prognose clinical outcomes of a newborn individual with hypotonia. With additional knowledge of the particular cell types underlying a patient's hypotonia phenotype, one can greatly narrow down the range of potential outcomes (e.g. age of death). **a**, Here, we show the various cell types by which hypotonia phenotypes confer disease risk. **b**, We also computed the mean age of death score for each cell type across hypotonia-associated diseases, revealing that disrupted inhibitory neurons confer the greatest risk of early death. Ordinal age of death categories from the HPO disease annotations were encoded numerically and averaged ([Table @tbl-death]) to produce mean Age of Death scores for each disease (on a scale from 1-8). 
For example, a score of 1 corresponds to prenatal death, while a score of 8 corresponds to death in late adulthood.](index_files/figure-pdf/fig-prognosis-1.pdf){#fig-prognosis fig-pos='H'}\n:::\n:::\n\n\n### Therapeutic target identification\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nprioritise_targets_out <- MSTExplorer::prioritise_targets(\n results = results, \n ctd_list = ctd_list,\n phenotype_to_genes = p2g,\n hpo = hpo,\n \n keep_deaths=NULL,\n keep_onsets=NULL,\n keep_specificity_quantiles = seq(30,40), ## NULL:70, 30-40:64 \n keep_mean_exp_quantiles = seq(30,40), ## NULL:65, 10:55\n info_content_threshold=8, ## 8:55, 5:64 \n effect_threshold=NULL, ## 1:39\n severity_score_gpt_threshold=NULL, ## 10:78, NULL:82\n symptom_intersection_threshold=.25, ## .25:57\n evidence_score_threshold=3, ## 5:47, 4:47, 3:64\n top_n = 10, ## 5:38, 20:42, 30:45, 40:52, 50:55\n group_vars = \"hpo_id\")\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritising gene targets.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. 
Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding term definitions.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding information_content scores.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='start' \n - Rows: 51,874,984 \n - Phenotypes: 11,028 \n - Diseases: 12,467 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltering @ q-value <= 0.05\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='q_threshold' \n - Rows: 2,115,670 \n - Phenotypes: 9,575 \n - Diseases: 12,467 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='effect_threshold' \n - Rows: 2,115,670 \n - Phenotypes: 9,575 \n - Diseases: 12,467 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with AgeOfDeath.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_deaths' \n - Rows: 2,115,670 \n - Phenotypes: 9,575 \n - Diseases: 12,467 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAncestor columns already present. 
Skipping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nKeeping descendants of 1 term(s).\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n17,548 terms remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n1,889,042 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_descendants' \n - Rows: 1,889,042 \n - Phenotypes: 9,499 \n - Diseases: 12,364 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_ont_levels' \n - Rows: 1,889,042 \n - Phenotypes: 9,499 \n - Diseases: 12,364 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n151 phenotypes do not have matching HPO IDs.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading in GPT annotations for 16,982 phenotypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='gpt_filters' \n - Rows: 1,889,042 \n - Phenotypes: 9,499 \n - Diseases: 12,364 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='severity_score_gpt_threshold' \n - Rows: 1,889,042 \n - Phenotypes: 9,499 \n - Diseases: 12,364 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='info_content_threshold' \n - Rows: 979,360 \n - Phenotypes: 7,691 
\n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with onset.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_onsets' \n - Rows: 979,360 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Tiers.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_tiers' \n - Rows: 979,360 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with modifiers\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='severity_threshold' \n - Rows: 979,435 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='severity_threshold_max' \n - Rows: 979,435 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with n_diseases\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: genes_to_phenotype.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: 
step='pheno_ndiseases_threshold' \n - Rows: 979,435 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenotype frequencies.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='pheno_frequency_threshold' \n - Rows: 981,649 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_celltypes' \n - Rows: 981,649 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nConverting phenos to GRanges.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: ensembldb\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering metadata for 4926 unique genes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: EnsDb.Hsapiens.v75\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='symptom_gene_overlap' \n - Rows: 1,092,098 \n - Phenotypes: 7,015 \n - Diseases: 8,102 \n - Cell types: 201 \n - Genes: 4,650\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltering by keep_chr.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_chr' \n - Rows: 1,092,098 \n - Phenotypes: 7,015 \n - Diseases: 8,102 \n - Cell types: 201 \n - Genes: 4,650\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltering by gene-disease association evidence.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating gene-disease associations with Evidence Score\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering data from GenCC.\n```\n\n\n:::\n\n::: {.cell-output 
.cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='evidence_score_threshold' \n - Rows: 825,255 \n - Phenotypes: 6,605 \n - Diseases: 6,622 \n - Cell types: 201 \n - Genes: 3,938\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltering by gene size.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n3,938 / 3,938 genes kept.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='gene_size' \n - Rows: 825,255 \n - Phenotypes: 6,605 \n - Diseases: 6,622 \n - Cell types: 201 \n - Genes: 3,938\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_biotypes' \n - Rows: 825,255 \n - Phenotypes: 6,605 \n - Diseases: 6,622 \n - Cell types: 201 \n - Genes: 3,938\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='add_driver_genes' \n - Rows: 255,497 \n - Phenotypes: 6,447 \n - Diseases: 6,442 \n - Cell types: 201 \n - Genes: 3,873\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding symptom-level results.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSubsetting results by q_threshold and effect.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n255,497 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: 
phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='symptom_intersection_threshold' \n - Rows: 255,497 \n - Phenotypes: 6,447 \n - Diseases: 6,442 \n - Cell types: 201 \n - Genes: 3,873\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating gene frequencies.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: genes_to_phenotype.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='gene_frequency_threshold' \n - Rows: 350,367 \n - Phenotypes: 6,447 \n - Diseases: 6,442 \n - Cell types: 201 \n - Genes: 3,873\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPruning ancestors.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n5287 / 6447 terms were kept after pruning.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='prune_ancestors' \n - Rows: 196,856 \n - Phenotypes: 5,287 \n - Diseases: 6,147 \n - Cell types: 201 \n - Genes: 3,774\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSorting rows.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFinding top 10 gene targets per: hpo_id\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='top_n' \n - Rows: 32,735 \n - Phenotypes: 5,287 \n - Diseases: 4,850 \n - Cell types: 201 \n - Genes: 3,180\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='end' \n - Rows: 32,735 \n - Phenotypes: 5,287 \n - Diseases: 4,850 \n - Cell types: 201 \n - Genes: 3,180\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ngenes_per_pheno <- 
prioritise_targets_out$top_targets[,list(n=data.table::uniqueN(gene_symbol)),by=\"hpo_id\"]\np2g <- HPOExplorer::add_ont_lvl(p2g)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGetting absolute ontology level for 18,082 IDs.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nmin_ont_lvl <- 3\ngenes_per_pheno_all <- p2g[ontLvl>min_ont_lvl,\n list(n=data.table::uniqueN(gene_symbol)),by=\"hpo_id\"]\n\n\ntop_celltypes <- prioritise_targets_out$top_targets[,list(\n np=data.table::uniqueN(hpo_id)),\n by=\"cl_name\"]|>data.table::setorderv(\"np\",-1)\ntop_ancestors <- prioritise_targets_out$top_targets[,list(\n np=data.table::uniqueN(hpo_id),\n nc=data.table::uniqueN(CellType),\n ng=data.table::uniqueN(gene_symbol)\n ),\n by=\"ancestor_name\"]|>\n data.table::setorderv(\"np\",-1)\n```\n:::\n\n\nNext, we identified putative cell type-specific gene targets for several severe disease phenotypes.\nThis yielded putative therapeutic targets for 5287 phenotypes across 4850 diseases in 201 cell types and 3180 genes ([Fig. @fig-therapy-filter]).\nWhile this constitutes a large number of genes in total, each phenotype was assigned a median of 2 gene targets (mean=3.29, min=1, max=10).\nRelative to the number of gene annotations per phenotype in the HPO overall (median=7, mean=61.95, min=1, max=5003), this represents a substantial decrease in the number of candidate target genes, even when excluding high-level phenotypes (HPO level\\>3).\nIt is also important to note that the phenotypes in the prioritised targets list are ranked by their severity, allowing us to distinguish phenotypes with a high medical urgency (e.g. 'Hydranencephaly') from those with lower medical urgency (e.g. 
'Hyperplastic labia majora').\nThis can be useful for clinicians, biomedical scientists, and pharmaceutical manufacturers who wish to focus their research efforts on phenotypes with the greatest need for intervention.\n\nAcross all phenotypes, epithelial cells were most commonly implicated (834 phenotypes), followed by stromal cells (627 phenotypes), neurons (478 phenotypes), chondrocytes (385 phenotypes), and endothelial cells (363 phenotypes).\nGrouped by higher-order ontology category, 'Abnormality of the musculoskeletal system' had the greatest number of enriched phenotypes (961 phenotypes, 863 genes), followed by 'Abnormality of the nervous system' (745 phenotypes, 1163 genes), 'Abnormality of head or neck' (545 phenotypes, 997 genes), 'Abnormality of the genitourinary system' (446 phenotypes, 710 genes), and 'Abnormality of the eye' (379 phenotypes, 572 genes).\n\n### Therapeutic target validation\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Gene therapy only\nttd_check_out <- MSTExplorer::ttd_check(\n top_targets=prioritise_targets_out$top_targets, \n drug_types = \"Gene therapy\",\n allow.cartesian = TRUE,\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: readxl\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRetrieving all organisms available in gprofiler.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing stored `gprofiler_orgs`.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMapping species name: hsapiens\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n1 organism identified from search: hsapiens\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n35,812 / 51,339 (69.76%) genes mapped.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output 
.cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nNon-failed gene targets enrichment p-value: 0.0104281415165849\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFailed gene targets depletion p-value: 0.364508393285371\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in ggplot2::geom_text(data = unique(dat_sub[, c(\"HIGHEST_STATUS\", : Ignoring unknown aesthetics: fill\nIgnoring unknown aesthetics: fill\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\n## All therapy types\nttd_check_all_out <- MSTExplorer::ttd_check(\n top_targets=prioritise_targets_out$top_targets, \n allow.cartesian = TRUE,\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRetrieving all organisms available in gprofiler.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing stored `gprofiler_orgs`.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMapping species name: hsapiens\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n1 organism identified from search: hsapiens\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n35,812 / 51,339 (69.76%) genes mapped.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nNon-failed gene targets enrichment p-value: 3.05778572878398e-19\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFailed gene targets depletion p-value: 2.51631585215379e-199\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in ggplot2::geom_text(data = unique(dat_sub[, c(\"HIGHEST_STATUS\", : Ignoring unknown aesthetics: fill\nIgnoring unknown aesthetics: 
fill\n```\n\n\n:::\n:::\n\n\nTo determine whether the genes prioritised by our therapeutic targets pipeline were plausible, we checked what percentage of gene therapy targets we recapitulated.\nData on therapeutic approval status was gathered from the Therapeutic Target Database (TTD; release 2024-03-22) [@Liu2011-qd].\nOverall, we prioritised 79% of all non-failed existing gene therapy targets.\nA hypergeometric test confirmed that our prioritised targets were significantly enriched for non-failed gene therapy targets ($p=0.0104$).\nImportantly, we did not prioritise any of the failed therapeutics (0%), defined as having been terminated or withdrawn from the market.\nThe hypergeometric test for depletion of failed targets did not reach significance ($p=0.365$), but this is to be expected as there was only one failed gene therapy target in the TTD database.\n\nEven when considering therapeutics of any kind ([Fig. @fig-therapy-validate-all]), not just gene therapies, we recapitulated 44% of the non-failed therapeutic targets and 0% of the terminated/withdrawn therapeutic targets (n=1255).\nHere we found that our prioritised targets were significantly enriched for non-failed therapeutics ($p=3e-19$), and highly significantly depleted for failed therapeutics ($p=3e-199$).\nThis suggests that our multi-scale evidence-based prioritisation pipeline is capable of selectively identifying genes that are likely to be effective therapeutic targets.\n\n\n::: {#cell-fig-therapy-validate .cell}\n\n```{.r .cell-code .hidden}\nttd_check_out$plot\n```\n\n::: {.cell-output-display}\n![Therapeutics - Validation of prioritised therapeutic targets. The proportion of existing gene therapy targets (documented in the Therapeutic Target Database) recapitulated by our prioritisation pipeline. 
Therapeutics are stratified by the stage of clinical development they had reached at the time of writing.](index_files/figure-pdf/fig-therapy-validate-1.pdf){#fig-therapy-validate fig-pos='H'}\n:::\n:::\n\n\n### Selected example targets\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ntop_targets <- prioritise_targets_out$top_targets[,n_genes:=data.table::uniqueN(gene_symbol),\n by=\"hpo_id\"][n_genes<5 & proportion_driver_genes_symptom>.25]\n\ntop_phenotypes <- unique(top_targets$hpo_name)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nheight <- \"80vh\"\nwidth <- \"100vw\"\nphenotypes_network <- c(\"Lethal skeletal dysplasia\",\n \"GM2-ganglioside accumulation\",\n \"Alzheimer disease\",\n \"Parkinson disease\")\n\nphenotype <- phenotypes_network[1]\nvn_therapy_eg1 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[hpo_name==phenotype], \n main = NULL,\n height = height,\n width = width,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: visNetwork\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. 
Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//RtmpXiPAxF/file12af371adf511_prioritise_targets_network.html\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nphenotype <- phenotypes_network[2]\nvn_therapy_eg2 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[hpo_name==phenotype], \n main = NULL,\n height = height,\n width = width,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. 
Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//RtmpXiPAxF/file12af3697390d1_prioritise_targets_network.html\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nphenotype <- phenotypes_network[3]\nvn_therapy_eg3 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[grepl(paste(phenotype,collapse = \"|\"), disease_name,ignore.case = TRUE)], \n main = NULL, \n height = height,\n width = width,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. 
Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//RtmpXiPAxF/file12af3212f9ea0_prioritise_targets_network.html\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nphenotype <- phenotypes_network[4]\nvn_therapy_eg4 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[grepl(paste(phenotype,collapse = \"|\"), disease_name,ignore.case = TRUE)], \n main = NULL, \n height = height,\n width = width,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. 
Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//RtmpXiPAxF/file12af325e21359_prioritise_targets_network.html\n```\n\n\n:::\n:::\n\n::: {#fig-therapy-examples .cell layout=\"[[1,1], [1], [1]]\"}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg1$plot\n```\n\n![Lethal skeletal dysplasia](index_files/figure-pdf/fig-therapy-examples-1.pdf){#fig-therapy-examples-1}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg2$plot\n```\n\n![GM2-ganglioside accumulation](index_files/figure-pdf/fig-therapy-examples-2.pdf){#fig-therapy-examples-2}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg3$plot\n```\n\n![Alzheimer disease](index_files/figure-pdf/fig-therapy-examples-3.pdf){#fig-therapy-examples-3}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg4$plot\n```\n\n![Parkinson disease](index_files/figure-pdf/fig-therapy-examples-4.pdf){#fig-therapy-examples-4}\n\nExample cell type-specific gene therapy targets for several severe phenotypes and their associated diseases. Each disease (blue cylinders) is connected to its phenotype (purple cylinders) based on well-established clinical observations recorded within the HPO [@Gargano2024-fc]. Phenotypes are connected to cell types (red circles) via association testing between weighted gene sets ($FDR_{p,c}<0.05$). 
Each cell type is connected to the prioritised gene targets (yellow boxes) based on the driver gene analysis. The thickness of the edges connecting the nodes represents the (mean) fold-change from the bootstrapped enrichment tests. Nodes were spatially arranged using the Sugiyama algorithm [@Sugiyama1981-ev].\n:::\n\n\nFrom our prioritised targets, we selected the following four sets of phenotypes or diseases as examples: 'Lethal skeletal dysplasia', 'GM2-ganglioside accumulation', 'Alzheimer disease', 'Parkinson disease'.\n\nSkeletal dysplasia is a heterogeneous group of over 450 disorders that affect the growth and development of bone and cartilage.\nThis phenotype can be lethal when deficient bone growth leads to the constriction of vital organs such as the lungs.\nEven after surgical interventions, these complications continue to arise as the child develops.\nPharmacological interventions to treat this condition have largely been ineffective.\nWhile there are various cell types involved in skeletal system development, our pipeline nominated chondrocytes as the causal cell type underlying the lethal form of this condition.\nReassuringly, we found that the disease 'Achondrogenesis Type 1B' is caused by the genes *SLC26A2* and *COL2A1* via chondrocytes.\nWe also found that 'Platyspondylic lethal skeletal dysplasia, Torrance type'.\nThus, in cases where surgical intervention is insufficient, targeting these genes within chondrocytes may prove a viable long-term solution for children suffering from lethal skeletal dysplasia.\n\nTay-Sachs disease is a devastating disease in which children are born appearing healthy but gradually deteriorate, leading to death after 3-5 years.\nThe underlying cause is the toxic accumulation of gangliosides in the nervous system due to a loss of the enzyme produced by *HEXA*.\nWhile this could in theory be corrected with gene editing technologies, there remain some outstanding challenges.\nOne of which is early detection and diagnosis, 
before irreversible damage has occurred.\nOur pipeline implicated extravillous trophoblasts of the placenta in 'GM2-ganglioside accumulation'.\nWhile not necessarily a target for gene therapy, checking these cells *in utero* for an absence of *HEXA* may serve as a viable biomarker as these cells normally express the gene at high levels.\nEarly detection of Tay-Sachs disease may lengthen the window of opportunity for therapeutic intervention, especially when genetic sequencing is not available or variants of unknown significance are found within *HEXA*.\n\nAlzheimer disease (AD) is the most common neurodegenerative condition. It is characterised by a set of variably penetrant phenotypes including memory loss, cognitive decline, cerebral proteinopathy. Interestingly, we found that different forms of early onset AD (which are defined by the presence of a specific disease gene) are each associated with different cell types via different phenotypes. For example, AD 3 and AD 4 are primarily associated with cells of the digestive system ('enterocyte', 'gastric goblet cell') and are implied to be responsible for the phenotypes 'Senile plaques', 'Alzheimer disease', 'Parietal hypometabolism in FDG PET', 'Cerebral amyloid angiopathy'. Meanwhile, early-onset autosomal dominant\nAD and AD 2 are primarily associated with immune cells ('alternatively activated macrophage', 'microglial cell') and are implied to be responsible for the phenotypes 'Neurofibrillary tangles', 'Long-tract signs', 'Finger agnosia', 'Semantic dementia'. This suggests that different forms of AD may be driven by different cell types and phenotypes, which may help explain its variability in onset and clinical presentation. \n\nFinally, Parkinson disease (PD) is characterised by motor symptoms such as tremor, rigidity, and bradykinesia. However there are a number of additional phenotypes associated with the disease that span multiple physiological systems. 
PD 19a and PD 8 seemed to align most closely with the canonical understanding of PD as a disease of the central nervous system in that they implicated oligodendrocytes and neurons. Though the reference datasets being used in this study were not annotated at sufficient resolution to distinguish between different subtypes of neurons, in particular dopaminergic neurons. PD 19a/8 also suggested that risk variants in *LRRK2* mediate their effects on PD through both myeloid cells and oligodendrocytes by causing gliosis of the substantia nigra. The remaining clusters of PD mechanisms revolved around chondrocytes (PD 20), amacrine cells of the eye (hereditary late-onset PD), and the respiratory/immune system (PD 14). While the diversity in cell type-specific mechanisms is somewhat surprising, it may help to explain the wide variety of cross-system phenotypes frequently observed in PD.\n\nIt should be noted that the HPO only includes gene annotations for the monogenic forms of AD and PD. However it has previously been shown that there is at least partial overlap in their phenotypic and genetic aetiology with respect to their common forms. 
Thus understanding the monogenic forms of these diseases may shed light onto their more common counterparts.\n\n### Experimental model translatability\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\npheno_map_genes_match <- KGExplorer::map_upheno_data()\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached data: /Users/bms20/Library/Caches/org.R-project.R/R/KGExplorer/pheno_map_genes_match.rds\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\npheno_map_targets <- pheno_map_genes_match[\n id1 %in% unique(prioritise_targets_out$top_targets$hpo_id)\n ]|>\n data.table::setorderv(\"phenotype_genotype_score\",-1)\ntaxa_count <- sort(table(pheno_map_targets$gene_taxon_label2), decreasing = TRUE)\n\npheno_map_targets_severe <- pheno_map_targets[\n id1 %in% unique(prioritise_targets_out$top_targets[severity_score_gpt>10,]$hpo_id)\n ]\n\npheno_map_targets_severe[,summary:=paste0(\n shQuote(object_label1), \n \" ($SIM_{o,g}=\",round(phenotype_genotype_score,3),\"$)\"\n )]\n```\n:::\n\n\nWe computed interspecies translatability scores using a combination of both ontological ($SIM_{o}$) and genotypic ($SIM_{g}$) similarity relative to each homologous human phenotype and its associated genes [Fig. @fig-animal-models]. In total, we mapped 278 non-human phenotypes (in *Caenorhabditis elegans*, *Danio rerio*, *Mus musculus*, *Rattus norvegicus*) to 849 homologous human phenotypes. Amongst the 5287 phenotypes within our prioritised therapy targets, 356 had viable animal models in at least one non-human species. Per species, the number of homologous phenotypes was: *Danio rerio* (n=214), *Mus musculus* (n=152), *Caenorhabditis elegans* (n=35), *Rattus norvegicus* (n=3). 
Amongst our prioritised targets with a GPT-4 severity score of >10, the phenotypes with the greatest animal model similarity were 'Anterior vertebral fusion' ($SIM_{o,g}=0.967$), 'Disc-like vertebral bodies' ($SIM_{o,g}=0.964$), 'Metaphyseal enchondromatosis' ($SIM_{o,g}=0.946$), 'Peripheral retinal avascularization' ($SIM_{o,g}=0.943$), 'Retinal vascular malformation' ($SIM_{o,g}=0.943$).\n\n## Discussion {#sec-discussion}\n\nAcross the 201 cell types and 11,047 RD-associated phenotypes investigated, more than 46,514 significant phenotype-cell type relationships were discovered.\nThe examples we have highlighted above recapitulate well-known relationships, provide additional cellular context to many of these known relationships, and discover novel relationships at multiple biological scales.\n\nInvestigating RDs at the level of phenotypes offers several key advantages.\nFirst, the vast majority of RDs only have one associated gene (7671/8631 diseases = 89%).\nAggregating gene sets across diseases into phenotype-centric \"buckets\" permits sufficiently well-powered analyses, with an average of \\~76 genes per phenotype (median=7) [see Fig. 
@fig-diagram].\nSecond, we hypothesise that these phenotype-level gene sets converge on a limited number of molecular and cellular pathways.\nPerturbations to these pathways manifest as one or more phenotypes which, when considered together, tend to be clinically diagnosed as a certain disease.\nThird, RDs are often highly heterogeneous in their clinical presentation across individuals, leading to the creation of an ever increasing number of disease subtypes (some of which only have a single documented case).\nIn contrast, a phenotype-centric approach enables us to more accurately describe a particular individual’s version of a disease without relying on the generation of additional disease subcategories.\nBy characterising an individual’s precise phenotypes over time, we may better understand the underlying biological mechanisms that have caused their condition.\nHowever, in order to achieve a truly precision-based approach to clinical care, we must first characterise the molecular and cellular mechanisms that cause the emergence of each phenotype.\nHere, we provide a highly reproducible framework that enables this at the scale of the entire phenome.\nThis presents an opportunity to design basket trials of patients with different diseases but overlapping phenotypes and cellular mechanisms [@Zanello2023-zd].\nIt may be especially helpful for complex patients with diagnostically ambiguous sets of phenotypes who would otherwise be excluded from traditional clinical trials [@Diaz-Santiago2020-ep].\n\nIt was paramount to the success of this study to ensure our results were anchored in ground-truth benchmarks, generated falsifiable hypotheses, and rigorously guarded against false-positive associations.\nExtensive validation using multiple approaches demonstrated that our methodology consistently recapitulates expected phenotype-cell type associations ([Fig. @fig-summary]-[Fig. 
@fig-congenital]).\nThis was made possible by the existence of comprehensive, structured ontologies for all phenotypes (HPO) and cell types (CL), which provide an abundance of clear and falsifiable hypotheses against which to test our predictions.\nSeveral key examples include 1) strong enrichment of associations between cell types and phenotypes within the same anatomical systems ([Fig. @fig-summary]b-d), 2) a strong relationship between phenotype-specificity and the strength and number of cell type associations ([Fig. @fig-ontology-lvl]), 3) identification of the precise cell subtypes involved in susceptibility to various subtypes of recurrent bacterial infections ([Fig. @fig-rni]), 4) a strong positive correlation between the frequency of congenital onset of a phenotype and the proportion of developmental cell types associated with it ([Fig. @fig-congenital]), and 5) consistent phenotype-cell type associations across multiple independent single-cell datasets ([Fig. @fig-ctd-correlation]).\nHaving validated our phenotype-cell type associations, we then went on to demonstrate how these results may be used in each stage of clinical care: diagnosis ([Fig. @fig-diagnosis]), prognosis ([Fig. @fig-prognosis]), treatment, and therapeutics development ([Fig. 
@fig-therapy-examples]).\n\nDiagnosis is an essential but challenging step in RD patient care.\nAdditional phenotypes that emerge over time may assist a clinician to reach a more confident disease diagnosis.\nHowever many of these phenotypes can have a serious impact on patient quality of life or survival and avoiding them would be far better for patient outcomes.\nOften times phenotypes alone cannot clearly pinpoint the disease and thus a diagnosis is never reached.\nHaving a more complete understanding of the mechanisms underlying observed phenotypes allows clinicians to far more effectively make predictions about what additional, less obvious phenotypes they should search for to confirm or reject their hypothesis of disease diagnosis (e.g. with imaging or biomarker tests).\n\nConsider the following hypothetical scenario.\nA clinician observes that a newborn patient has several phenotypes ('Generalized neonatal hypotonia', 'Scrotal hypospadias', 'Increased circulating progesterone'), none of which conclusively point to a single disease diagnosis.\nUnder the strong suspicion that the phenotypes are genetic in origin, the clinician orders whole-genome sequencing (WGS) on the patient as well as the patient’s family.\nThe clinician finds that the patient has a number of putative causal mutations, narrowing down the number of potential diseases from hundreds to just 10.\nFurther narrowing down the possibilities at this stage can be extremely challenging even for expert clinical geneticists.\nHowever, additional knowledge of which tissues and cell types are primarily affected allow the clinician to make a series of testable hypotheses that they may begin to investigate.\nFor example, two of the putative diseases are known to cause aberrant splicing events in a gene that is only expressed in adrenocortical cells ([Fig. 
@fig-diagnosis]), providing justification to order a needle biopsy of the adrenal gland.\nRNA sequencing is performed on the tissue biopsy and it is discovered that the patient does indeed have high expression of the dysfunctional transcript, confirming the disease diagnosis [@Lord2021-rf].\nThis opens new avenues for the patient to receive timely and effective treatments for their specific condition, which is important as their version of the disease tends to lead to death in early childhood if left untreated ([Fig. @fig-prognosis]).\nFortunately, their diagnosis now qualifies them to participate in a clinical trial of a novel gene therapy with promising preliminary results.\nFurthermore, it is predicted that this patient would respond especially well to this treatment given that the mechanism of action of the gene therapy primarily acts on adrenocortical cells ([Fig. @fig-therapy-examples]).\n\nUnfortunately, there are currently only treatments available for less than 5% of RDs [@Halley2022-pd].\nNovel technologies including CRISPR, prime editing, antisense oligonucleotides, viral vectors, and/or lipid nanoparticles, have undergone significant advances in the last several years [@Bueren2023-ma; @Bulaklak2020-ta; @Godbout2023-uo; @Kohn2023-vh; @Zhao2023-qy] and shown remarkable clinical success in an increasing number of clinical applications [@Darrow2019-om; @Mendell2017-kg; @Mueller2017-fz; @Russell2017-dh].\nThe U.S. 
Food and Drug Administration (FDA) recently announced a landmark program aimed towards improving the international regulatory framework to take advantage of the evolving gene/cell therapy technologies [@Lu2024-kl] with the aim of bringing dozens more therapies to patients in a substantially shorter timeframe than traditional pharmaceutical product development (typically 5-20 years with a median of 8.3 years) [@Brown2022-ye].\nWhile these technologies have the potential to revolutionise RD medicine, their successful application is dependent on first understanding the mechanisms causing each disease.\n\nTo address this critical gap in knowledge, we used our results to create a reproducible and customisable pipeline to nominate cell type-resolved therapeutic targets ([Fig. @fig-therapy-filter]-[Fig. @fig-therapy-examples]).\nTargeting cell type-specific mechanisms underlying granular RD phenotypes can improve therapeutic effectiveness by treating the causal root of an individual's conditions [@Bulaklak2020-ta; @Moffat2017-al].\nA cell type-specific approach also helps to reduce the number of harmful side effects caused by unintentionally delivering the therapeutic to off-target tissues/cell types (which may induce aberrant gene activity), especially when combined with technologies that can target cell surface antigens (e.g. viral vectors) [@Zhou2013-wx].\nThis has the additional benefit of reducing the minimal effective dose of a therapeutic, which can be both immunogenic and extremely financially costly [@Bueren2023-ma; @Kohn2023-vh; @Nuijten2022-yc; @Thielen2022-ud].\nHere, we demonstrate the utility of a high-throughput evidence-based approach to RD therapeutics discovery by highlighting several of the most promising therapeutic candidates.\nOur pipeline takes into account a myriad of factors, including the strength of the phenotype-cell type associations, symptom-cell type associations, cell type-specificity of causal genes, the severity and frequency of the 
phenotypes, suitability for gene therapy delivery systems (e.g. recombinant adeno-associated viral vectors (rAAV)), as well as a quantitative analysis of phenotypic and genetic animal model translatability ([Fig. @fig-animal-models]).\nWe validated these candidates by comparing the proportional overlap with gene therapies that are presently in the market or undergoing clinical trials, in which we recovered 79% of all active gene therapies and 0% of failed gene therapies ([Fig. @fig-therapy-validate], [Fig. @fig-therapy-validate-all]). Despite nominating a large number of putative targets, hypergeometric tests confirmed that our targets were strongly enriched for targets of existing therapies that are either approved or currently undergoing clinical trials.\n\nIt should be noted that our study has several key limitations.\nFirst, while our cell type datasets are amongst the most comprehensive human scRNA-seq references currently available, they are nevertheless missing certain tissues, cell types (e.g. spermatocytes, oocytes), and life stages (post-natal childhood, senility).\nIt is also possible that we have not captured certain cell state signatures that only occur in disease (e.g. 
disease-associated microglia \\[**CITATION**\\]).\nThough we reasoned that using only control cell type signatures would mitigate bias towards any particular disease, and avoid degradation of gene signatures due to loss of function mutations.\nSecond, the collective knowledge of gene-phenotype and gene-disease associations is far from complete and we fully anticipate that these annotations will continue to expand and change well into the future.\nIt is for this reason we designed this study to be easily reproduced within a single containerised script so that we (or others) may rerun it with updated datasets at any point.\nFinally, causality is notoriously difficult to prove definitively from associative testing alone, and our study is not exempt from this rule.\nDespite this, there are several reasons to believe that our approach is able to better approximate causal relationships than traditional approaches.\nFirst, we did not intentionally preselect any subset of phenotypes or cell types to investigate here.\nAlong with a scaling prestep during linear modelling, this means that all the results are internally consistent and can be directly compared to one another (in stark contrast to literature meta-analyses).\nFurthermore, for the phenotype gene signatures we used expert-curated GenCC annotations [@DiStefano2022-ao; @DiStefano2023-np] to weight the current strength of evidence supporting a causal relationship between each gene and phenotype.\nThis is especially important for phenotypes with large genes lists (thousands of annotations) for which some of the relationships may be tenuous.\nWithin the cell type references, we deliberately chose to use specificity scores (rather than raw gene expression) as this normalisation procedure has previously been demonstrated to better distinguish between signatures of highly similar cell types/subtypes [@Skene2016-rb].\n\nMoving forward, we are now actively seeking industry and academic partnerships to begin experimentally 
validating our multi-scale target predictions and exploring their potential for therapeutic translation.\nNevertheless, there are more promising therapeutic targets here than our research group could ever hope to pursue by ourselves.\nIn the interest of accelerating research and ensuring RD patients are able to benefit from this work as quickly as possible, we have decided to publicly release all of the results described in this study.\nThese can be accessed in multiple ways, including through a suite of R packages as well as a web app, the [Rare Disease Celltyping Portal](https://neurogenomics.github.io/rare_disease_celltyping_apps/home/).\nThe latter allows our results to be easily queried, filtered, visualised, and downloaded without any knowledge of programming.\nThrough these resources we aim to make our findings useful to a wide variety of RD stakeholders including subdomain experts, clinicians, advocacy groups, and patients.\n\n## Conclusions {#sec-conclusions}\n\nUltimately, our primary objective was to develop a methodology capable of generating high-throughput phenome-wide predictions while preserving the accuracy and clinical utility typically associated with more narrowly focused studies.\nWith the rapid advancement of gene therapy technologies, and a regulatory landscape that is evolving to better meet the needs of a large and diverse patient population, there is finally momentum to begin to realise the promise of personalised medicine.\nThis has especially important implications for the global RD community which has remained relatively neglected.\nHere, we lay out the groundwork necessary for this watershed moment by providing a scalable, cost-effective, and fully reproducible means of resolving the multi-scale, cell-type specific mechanisms of virtually all rare diseases.\n\n## Methods {#sec-methods}\n\n### Human Phenotype Ontology\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ngencc <- KGExplorer::get_gencc(agg_by = NULL)\n```\n\n::: {.cell-output 
.cell-output-stderr .hidden}\n\n```\nGathering data from GenCC.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ngencc_version <- KGExplorer::get_version(gencc, return_version = TRUE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n:::\n\n\nThe latest version of the HPO (release 2024-02-08) was downloaded from the EMBL-EBI Ontology Lookup Service [@Cote2010-gp] and imported into R using the `HPOExplorer` package.\nThis R object was used to extract ontological relationships between phenotypes as well as to assign absolute and relative ontological levels to each phenotype.\nThe latest version of the HPO phenotype-to-gene mappings and phenotype annotations were downloaded from the official HPO GitHub repository and imported into R using `HPOExplorer`.\nThis contains lists of genes associated with phenotypes via particular diseases, formatted as three columns in a table (gene, phenotype, disease).\n\nHowever, not all genes have equally strong evidence of causality with a disease or phenotype, especially when considering that the variety of resources used to generate these annotations (OMIM, Orphanet, DECIPHER) use variable methodologies (e.g. expert-curated review of the medical literature vs. 
automated text mining of the literature).\nTherefore we imported data from the Gene Curation Coalition (GenCC) [@DiStefano2022-ao; @DiStefano2023-np], which (as of 2024-03-01) contained 21798 evidence scores across 7229 diseases and 5142 genes.\nEvidence scores are defined by GenCC using a standardised ordinal rubric which we then encoded as a semi-quantitative score ranging from 0 (no evidence of disease-gene relationship) to 6 (strongest evidence of disease-gene relationship) (see @tbl-gencc).\nWe then summed evidence scores per disease, merged this table with the HPO disease-phenotype-gene annotation table, and then cast the data into a gene-by-phenotype matrix filled with the aggregated mean evidence score.\nThis can be expressed as the following equations.\n\nLet us denote:\n\n- $D$ as the set of $d$ diseases.\n\n- $p$ as a phenotype.\n\n- $g$ as a gene.\n\nThe final evidence-weighted gene-by-phenotype matrix ($M_{g,p}$) can be expressed as:\n\n::: {#eq-evidence-scores .content-hidden unless-format=\"html\"}\n![](equations/eq1.png){height=\"300px\"}\n\nConstruction of the evidence-weighted gene-by-phenotype matrix.\n:::\n\n\\\n\\\n\n::: {.content-visible unless-format=\"html\"}\n\n```{=tex}\n\\begin{equation*}\n \\eqnmarkbox[NavyBlue]{n1}{M_{g,p}}\n =\n \\frac{\n \\eqnmarkbox[Cerulean]{n3a}{\\sum_{d \\in D}}\n \\eqnmarkbox[blue]{n4a}{R(g,p,d)} \n \\times \n \\eqnmarkbox[BlueViolet]{n5}{E(g,d)} \n }{\n \\eqnmarkbox[Cerulean]{n3b}{\\sum_{d \\in D}}\n \\eqnmarkbox[blue]{n4b}{R(g,p,d)}\n }\n\\end{equation*}\n\\annotate[yshift=1em]{left}{n1}{Weighted gene-by-phenotype \\\\evidence score matrix} \n\\annotate[yshift=-2em]{below,left}{n3a,n3b}{Iterate over all diseases}\n\\annotate[yshift=-2.5em,xshift=2.5em]{below,right}{n4a,n4b}{Binary gene-by-phenotype \\\\relationship matrix,\\\\ (1=relationship, 0=no relationship)}\n\\annotate[yshift=2em]{left}{n5}{Weighted gene-by-disease \\\\evidence score matrix}\n```\n\n:::\n\n\\\n\nHistograms of evidence score distributions at each 
step in processing can be found in [Fig. @fig-evidence-histograms].\n\n### Single-cell transcriptomic atlases\n\nIn this study, the gene by cell type specificity matrix was constructed using the Descartes Human transcriptome atlas of foetal gene expression, which contains a mixture of single-nucleus and single-cell RNA-seq data (collected with sci-RNA-seq3) [@Cao2020-qz].\nThis dataset contains 377,456 cells representing 77 distinct cell types across 15 tissues.\nAll 121 human foetal samples ranged from 72 to 129 days in estimated postconceptual age.\nTo independently replicate our findings, we also used the Human Cell Landscape which contains single-cell transcriptomic data (collected with microwell-seq) from embryonic, foetal, and adult human samples across 49 tissues [@Han2020-iq].\n\nSpecificity matrices were generated separately for each transcriptomic atlas using the R package `EWCE` (v1.11.3) [@Skene2016-rb].\nWithin each atlas, cell types were defined using the authors’ original freeform annotations in order to preserve the granularity of cell subtypes as well as incorporate expert-identified rare cell types.\nCell types were only aligned and aggregated to the level of corresponding Cell Ontology (CL) [@Diehl2016-gt] annotations afterwards when generating summary figures and performing cross-atlas analyses.\nUsing the original gene-by-cell count matrices from each single-cell atlas, we computed gene-by-cell type expression specificity matrices as follows.\n\nLet us denote: $g$ as a gene, $c$ as a cell type, and $i$ as a single cell.\nGenes with no expression across any cell types were considered to be uninformative and were therefore removed from the input gene-by-cell matrix $F(g,i,c)$.\n\n::: {#eq-ctd-filter .content-hidden unless-format=\"html\"}\n![](equations/eq2.png){height=\"200px\"}\n\nFiltering of the gene-by-cell expression matrices.\n:::\n\n\\\n\n::: {.content-visible unless-format=\"html\"}\n\n```{=tex}\n\\begin{equation*}\n 
\\eqnmarkbox[purple]{f1}{F(g,i,c)}\n =\n \\begin{cases}\n \\eqnmarkbox[WildStrawberry]{f2}{r_{g,i}},\n \\text{ }l_i = c\\\\0,\n \\text{ }l_i \\neq c \n \\end{cases}\n\\end{equation*}\n\\annotate[yshift=1em]{left}{f1}{Filtered gene-by-cell expression matrix} \n\\annotate[yshift=2em]{left}{f2}{Expression of gene $g$ in cell $i$} \n```\n\n:::\n\n\\\n\nNext, we calculated the mean expression per cell type and normalised the resulting matrix to transform it into a gene-by-cell type expression specificity matrix ($S_{g,c}$).\nIn other words, each gene in each cell type had a 0-1 score where 1 indicated the gene was most specifically expressed in that particular cell type relative to all other cell types.\nThis procedure was repeated separately for each of the single-cell atlases and can be summarised as:\n\n::: {#eq-ctd-specificity .content-hidden unless-format=\"html\"}\n![](equations/eq3.png){height=\"300px\"}\n\nConstruction of the gene-by-cell type specificity matrix.\n:::\n\n\\\n\n::: {.content-visible unless-format=\"html\"}\n\n```{=tex}\n\\begin{equation*}\n \\eqnmarkbox[orange]{s1}{S_{g,c}}\n =\n \\frac{\n \\eqnmarkbox[purple]{s3a}{\n \\frac{\n \\sum_{i=1}^{|L|} F(g,i,c)\n }{\n N_c \n }\n } \n }{\n \\eqnmarkbox[OrangeRed]{s6}{\\sum_{r=1}^{k}}(\n \\eqnmarkbox[purple]{s3b}{\n \\frac{\n \\sum_{i=1}^{|L|} F(g,i,r)\n }{\n N_r \n }\n } \n ) \n }\n\\end{equation*}\n\\annotate[yshift=1em]{left}{s1}{Gene-by-cell type specificity matrix} \n\\annotate[yshift=2em]{left}{s3a,s3b}{Compute mean expression of each gene per cell type} \n\\annotate{below,left}{s6}{Compute row sums of \\\\mean gene-by-cell type matrix}\n```\n\n:::\n\n\\\n\n### Phenotype-cell type associations\n\nTo test for relationships between each pairwise combination of phenotype (n=11,047) and cell type (n=201) we ran a series of univariate generalised linear models implemented via the `stats::glm` function in R.\nFirst, we filtered the gene-by-phenotype evidence score matrix ($M _{g,p}$) and the 
gene-by-cell type expression specificity matrix ($S _{g,c}$) to only include genes present in both matrices (n=4,949 genes in the Descartes Human analyses; n=4,653 genes in the Human Cell Landscape analyses).\nThen, within each matrix any rows or columns with a sum of 0 were removed as these were uninformative data points that did not vary.\nTo improve interpretability of the results $\\beta$ coefficient estimates across models (i.e. effect size), we performed a scaling prestep on all dependent and independent variables.\nInitial tests showed that this had virtually no impact on the total number of significant results or any of the benchmarking metrics based on p-value thresholds [Fig. @fig-summary].\nThis scaling prestep improved our ability to rank cell types by the strength of their association with a given phenotype as determined by separate linear models.\n\nWe repeated the aforementioned procedure separately for each of the single-cell references.\nOnce all results were generated using both cell type references (2,206,994 association tests total), we applied Benjamini-Hochberg false discovery rate [@Benjamini1995-vo] (denoted as $FDR_{p,c}$) to account for multiple testing.\nOf note, we applied this correction across all results at once (as opposed to each single-cell reference separately) to ensure the $FDR_{p,c}$ was stringently controlled for across all tests performed in this study.\n\n### Symptom-cell type associations\n\nHere we define a symptom as a phenotype as it presents within the context of the specific disease.\nThe features of a given symptom can be described as the subset of genes annotated to phenotype $p$ via a particular disease $d$, denoted as $G_{d,p}$ ([see Fig. 
@fig-diagram]).\nTo attribute our phenotype-level cell type enrichment signatures to specific diseases, we first identified the gene subset that was most strongly driving the phenotype-cell type association by computing the intersect of genes that were both in the phenotype annotation and within the top 25% specificity percentile for the associated cell type.\nWe then computed the intersect between symptom genes ($G_{d,p}$) and driver genes ($G_{p,c}$), resulting in the gene subset $G_{d \\cap p \\cap c}$.\nOnly $G_{d \\cap p \\cap c}$ gene sets with 25% or greater overlap with the symptom gene subset ($G_{d,p}$) were kept.\nThis procedure was repeated for all phenotype-cell type-disease triads, which can be summarised as follows:\n\n::: {#eq-symptoms .content-hidden unless-format=\"html\"}\n![](equations/eq4.png){height=\"300px\"}\n:::\n\n\\\n\n::: {.content-visible unless-format=\"html\"}\n\n```{=tex}\n\\begin{equation*}\n \\frac{\n \\eqnmarkbox[Chartreuse3]{g1}{|G_{d \\cap p \\cap c} |}\n }{\n \\eqnmarkbox[Emerald]{g2}{|G_{d,p}|}} \n \\geq \\eqnmarkbox[SeaGreen]{g3}{.25} \n\\end{equation*}\n\\annotate[yshift=1em]{left}{g1}{Intersect between \\\\symptom genes ($G_{d,p}$) and driver genes ($G_{p,c}$)} \n\\annotate[yshift=-1em]{below,left}{g2}{Symptom genes \\\\(i.e. genes annotated to a phenotype\\\\ via a specific disease)} \n\\annotate[yshift=-1em]{below,right}{g3}{Minimum proportion of overlap \\\\between $G_{d \\cap p \\cap c}$ and $G_{d,p}$}\n```\n\n:::\n\n\\\n\n### Validation of expected phenotype-cell type relationships\n\nWe first sought to confirm that our tests (across both single-cell references) were able to recover expected phenotype-cell type relationships across seven high-level branches within the HPO ([Fig. 
@fig-summary]), including abnormalities of the cardiovascular system, endocrine system, eye, immune system, musculoskeletal system, nervous system, and respiratory system.\nWithin each branch the number of significant tests in a given cell type was plotted ([Fig. @fig-summary]b).\nMappings between freeform annotations (the level at which we performed our phenotype-cell type association tests) provided by the original atlas authors and their closest CL term equivalents were provided by CellxGene [@CZI_Single-Cell_Biology_Program2023-fs].\nCL terms along the *x-axis* of [Fig. @fig-summary]b were assigned colours corresponding to which HPO branch showed the greatest number of enrichments (after normalising within each branch to account for differences in scale).\nThe normalised colouring allows readers to quickly assess which HPO branch was most often associated with each cell type, while accounting for differences in the number of phenotypes across branches.\nWe then ran a series of Analysis of Variance (ANOVA) tests to determine whether (within a given branch) a given cell type was more often enriched ($FDR_{p,c}<0.05$) within that branch relative to all of the other HPO branches of an equivalent level in the ontology (including all branches not shown in [Fig. @fig-summary]b).\nAfter applying Benjamini-Hochberg multiple testing correction [@Benjamini1995-vo] (denoted as $FDR _{b,c}$), we annotated each respective branch-by-cell type bar according to the significance (\\*\\*\\*\\* : $FDR _{b,c}<1e-04$, \\*\\*\\* : $FDR _{b,c}<0.001$, \\*\\* : $FDR _{b,c}<0.01$, \\* : $FDR _{b,c}<0.05$).\nCell types in [Fig. @fig-summary]a-b were ordered along the *x-axis* according to a dendrogram derived from the CL ontology ([Fig. @fig-summary]c), which provides ground-truth semantic relationships between all cell types (e.g. 
different neuronal subtypes are grouped together).\n\nAs an additional measure of the accuracy of our phenotype-cell type test results, we identified conceptually matched branches across the HPO and the CL ([Fig. @fig-summary]d and @tbl-celltypes).\nFor example, 'Abnormality of the cardiovascular system' in the HPO was matched with 'cardiocytes' in the CL which includes all cell types specific to the heart.\nAnalogously, 'Abnormality of the nervous system' in the HPO was matched with 'neural cell' in the CL which includes all descendant subtypes of neurons and glia.\nThis cross-ontology matching was repeated for each HPO branch, and the matched cell types are referred to here as on-target cell types.\nWithin each branch, the $-log_{10}(FDR _{p,c})$ values of on-target cell types were binned by rounding to the nearest integer (*x-axis*) and the percentage of tests for on-target cell types relative to all cell types was computed at each bin (*y-axis*) ([Fig. @fig-summary]d).\nThe baseline level (dotted horizontal line) illustrates the percentage of on-target cell types relative to the total number of observed cell types.\nAny percentages above this baseline level represent greater than chance representation of the on-target cell types in the significant tests.\n\n### Monarch Knowledge Graph recall\n\nFinally, we gathered known phenotype-cell type relationships from the Monarch Knowledge Graph (MKG), a comprehensive database of links between many aspects of disease biology [@Putman2024-et].\nThis currently includes 103 links between HPO phenotypes (n=103) and CL cell types (n=79).\nOf these, we only considered the 82 phenotypes that we were able to test given that our approach was reliant on gene annotations.\nWe considered instances where we found a significant relationship between exactly matching pairs of HPO-CL terms as a hit.\n\nHowever, as the cell types in MKG were not necessarily annotated at the same level as our single-cell references, we also considered instances where the MKG cell 
type was an ancestor term of our cell type (e.g. 'myeloid cell' vs. 'monocyte'), or *vice versa*, as hits.\nUsing these criteria, we determined our results recapitulated `**!!RECOMPUTE!!**`% of known phenotype-cell type relationships in the MKG.\nWe next computed how far along the CL ontological tree we would need to travel in order to reach a common ancestor between the MKG cell type and our cell type, for each phenotype-cell type link in the MKG.\nThis provides a metric of not just whether we recapitulated the exact cell types, but how dissimilar our identified cell types were for a given phenotype-cell type association ([Fig. @fig-monarch-recall]).\n\n### Annotation of phenotypes using generative large language models\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ngpt_codes <- formals(HPOExplorer::gpt_annot_codify)\ncode_dict <- paste0(shQuote(names(eval(gpt_codes$code_dict))),\"=\",\n eval(gpt_codes$code_dict), collapse = \", \")\ntiers_dict <- paste0(shQuote(names(eval(gpt_codes$tiers_dict))),\"=\",\n eval(gpt_codes$tiers_dict), collapse = \", \") \n```\n:::\n\n\nOnly a small fraction of the phenotypes in HPO (\\<1%) have metadata annotations containing information on their time course, consequences, and severity.\nThis is due to the time-consuming nature of manually annotating thousands of phenotypes.\nTo generate such annotations at scale, we used Generative Pre-trained Transformer 4 (GPT-4), a large language model (LLM) as implemented within OpenAI’s ChatGPT Application Programming Interface (API).\nAfter extensive prompt engineering and ground-truth benchmarking, we were able to acquire annotations on how often each phenotype directly causes intellectual disability, death, impaired mobility, physical malformations, blindness, sensory impairments, immunodeficiency, cancer, reduced fertility, or is associated with a congenital onset.\nThese criteria were previously defined in surveys of medical experts as a means of systematically assessing phenotype 
severity [@Lazarin2014-we].\nResponses for each metric were provided in a consistent one-word format which could be one of: 'never', 'rarely', 'often', 'always'.\nThis procedure was repeated in batches (to avoid exceeding token limits) until annotations were gathered for 16982/18082 HPO phenotypes.\n\nWe then encoded these responses into a semi-quantitative scoring system ('never'=0, 'rarely'=1, 'often'=2, 'always'=3), which were then weighted by multiplying a semi-subjective scoring of the relevance of each metric to the concept of severity on a scale from 1-5, with 5 being the most severe ('intellectual_disability'=5, 'death'=5, 'impaired_mobility'=4, 'physical_malformations'=3, 'blindness'=4, 'sensory_impairments'=3, 'immunodeficiency'=3, 'cancer'=3, 'reduced_fertility'=1, 'congenital_onset'=4).\nFinally, the product of the score was normalised to a quantitative severity score ranging from 0-100, where 100 is the theoretical maximum severity score.\nThis phenotype severity scoring procedure can be expressed as follows.\n\n::: {#eq-gpt .content-hidden unless-format=\"html\"}\n![](equations/eq5.png){height=\"300px\"}\n\nComputing normalised severity score from encoded GPT-4 annotations.\n:::\n\n\\\n\\\n\n::: {.content-visible unless-format=\"html\"}\n\n```{=tex}\n\\begin{equation*}\n \\eqnmarkbox[Brown4]{nss}{NSS_p}\n =\n \\frac{ \n \\eqnmarkbox[Goldenrod]{nss2}{\\sum_{j=1}^{m}} \n (\n \\eqnmarkbox[Goldenrod4]{nss3}{F_{pj}}\n \\times \n \\eqnmarkbox[IndianRed4]{nss4}{W_j}\n )\n }{\n \\eqnmarkbox[Tan]{nss5}{\\sum_{j=1}^{m}(\\max\\{F_j\\} \\times W_j)} \n } \\times 100\n\\end{equation*}\n\\annotate[yshift=1em]{left}{nss}{Normalised Severity Score \\\\for each phenotype}\n\\annotate[yshift=3em]{left}{nss2}{Sum of weighted annotation values \\\\across all metrics}\n\\annotate[yshift=3em]{right}{nss3}{Numerically encoded annotation value \\\\of metric $j$ for phenotype $p$}\n\\annotate[yshift=1em]{right}{nss4}{Weight for metric $j$} 
\n\\annotate[yshift=-1em]{below,right}{nss5}{Theoretical maximum severity score}\n```\n\n:::\n\n\\\n\n### Enrichment of foetal cell types in congenital phenotypes\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nfetal_keywords <- shQuote(eval(formals(MSTExplorer::plot_congenital_annotations)$fetal_keywords) )\n```\n:::\n\n\nThe GPT-4 annotations also enabled us to assess whether foetal cell types were more often significantly associated with congenital phenotypes in our Human Cell Landscape results as this single-cell reference contained both adult and foetal versions of cell types ([Fig. @fig-congenital]).\nTo do this, we performed a chi-squared ($\\chi^2$) test on the proportion of significantly associated cell types containing any of the substrings 'fetal', 'fetus', 'primordial', 'hESC' or 'embryonic' (within cell type annotations from the original Human Cell Landscape authors [@Han2020-iq]) vs. those without any of these substrings, stratified by how often the corresponding phenotype had a congenital onset according to the GPT phenotype annotations (including 'never', 'rarely', 'often', 'always').\nIn addition, a series of $\\chi^2$ tests was performed within each congenital onset frequency stratum, to determine whether the observed proportion of foetal cell types vs. non-foetal cell types significantly deviated from the proportions expected by chance.\n\n### Diagnosis via cell type-specific disease prediction\n\nWe designed an algorithm that uses our results to predict the most likely cell types underlying a set of phenotypic and genotypic traits observed in a patient ([Fig. 
@fig-diagnosis]).\nThis is implemented within `MSTExplorer::predict_celltypes` and takes HPO phenotypes as inputs.\nIt can optionally take included risk genes, excluded risk genes, included diseases and/or excluded diseases as additional inputs.\nIt then outputs a weighted ranking of cell types, where a higher ranking indicates a higher likelihood of being the underlying mechanism of the patient’s particular form of disease(s).\n\n### Prognosis via cell type-mediated differential outcomes\n\nThe phenotype hypotonia is associated with diseases that range in severity from benign to debilitating to fatal [@Ahmed2016-ag].\nIn the absence of additional information, making an accurate diagnosis is extremely challenging even for experienced physicians.\nThe magnitude of this challenge is highlighted by the fact that each disease is associated with anywhere between 1-595 unique phenotypes (median=61, mean=77.74) within the HPO.\nConversely, each phenotype is associated with 1-5404 diseases (median=6, mean=60.74).\nWe addressed this challenge by applying our phenotype-cell type association results in combination with expert-curated HPO annotations of clinical outcomes associated with each phenotype-disease pairing ([Fig. 
@fig-prognosis]).\nWe first extracted results for the phenotype 'Hypotonia' (*HP:0001252*) and its 13 descendant subterms from our phenotype-cell type association analyses.\nNext, we encoded the \"Age of Death\" categories associated with each disease in an ordinal scale ranging from 1, corresponding to prenatal death, to 8, corresponding to death in late adulthood (@tbl-death).\nTo determine whether cell type identity significantly predicted the age of death, we conducted an ANOVA where cell type was the predictor and \"Age of Death score\" was the outcome.\n\n### Therapeutic target identification\n\nWe developed a systematic and automated strategy for identifying putative cell type-specific gene targets for each phenotype based on a series of filters at phenotype, cell type, and gene levels.\nThe entire target prioritisation procedure can be replicated with a single function: `MSTExplorer::prioritise_targets`.\nThis function automates all of the reference data gathering (e.g. phenotype metadata, cell type metadata, cell type signature reference, gene lengths, severity tiers) and takes a variety of arguments at each step for greater customisability.\n\n### Therapeutic target validation\n\nTo assess whether our prioritised therapeutic targets were likely to be viable, we computed the overlap between our gene targets and those of existing gene therapies at various stages of clinical development ([Fig. 
@fig-therapy-validate]).\nGene targets were obtained for each therapy from the Therapeutic Target Database (TTD; release 2024-03-22) and mapped onto standardised HUGO Gene Nomenclature Committee (HGNC) gene symbols using the `orthogene` R package.\nWe stratified our overlap metrics according to whether the therapies had failed (unsuccessful clinical trials or withdrawn), or were non-failed (successful or ongoing clinical trials).\nWe then conducted hypergeometric tests to determine whether the observed overlap between our prioritised targets and the non-failed therapy targets was significantly greater than expected by chance (i.e. enrichment).\nWe also conducted a second hypergeometric test to determine whether the observed overlap between our prioritised targets and the failed therapy targets was significantly less than expected by chance (i.e. depletion).\nFinally, we repeated the analysis against all therapeutic targets, not just those of gene therapies, to determine whether our prioritised targets had relevance to other therapeutic modalities.\n\n### Experimental model translatability\n\nTo improve the likelihood of successful translation between preclinical animal models and human patients, we created an interspecies translatability prediction tool for each phenotype nominated by our gene therapy prioritised pipeline ([Fig. 
@fig-animal-models]).\nFirst, we extracted ontological similarity scores of homologous phenotypes across species from the MKG [@Putman2024-et].\nBriefly, the ontological similarity scores ($SIM_o$) are computed for each homologous pair of phenotypes across two ontologies by calculating the overlap in homologous phenotypes that are ancestors or descendants of the target phenotype.\nNext, we generated genotypic similarity scores ($SIM_g$) for each homologous phenotype pair by computing the proportion of 1:1 orthologous genes using gene annotation from their respective ontologies.\nInterspecies orthologs were also obtained from the MKG.\nFinally, both scores are multiplied together to yield a unified ontological-genotypic similarity score ($SIM_{o,g}$).\n\n### Novel R packages\n\nTo facilitate all analyses described in this study and to make them more easily reproducible by others, we created several open-source R packages.\n[`KGExplorer`](https://github.com/neurogenomics/KGExplorer) imports and analyses large-scale biomedical knowledge graphs and ontologies.\n[`HPOExplorer`](https://github.com/neurogenomics/HPOExplorer) aids in managing and querying the directed acyclic ontology graph within the HPO.\n[`MSTExplorer`](https://github.com/neurogenomics/MSTExplorer) facilitates the efficient analysis of many thousands of phenotype-cell type association tests, and provides a suite of multi-scale therapeutic target prioritisation and visualisation functions.\nThese R packages also include various functions for distributing the post-processed results from this study in an organised, tabular format.\nOf note, `MSTExplorer::load_example_results` loads all summary statistics from our phenotype-cell type tests performed here.\n\n### Rare Disease Celltyping Portal\n\nTo further increase the ease of access for stakeholders in the RD community without the need for programmatic experience, we developed a series of web apps to interactively explore, visualise, and download the 
results from our study.\nCollectively, these web apps are called the Rare Disease Celltyping Portal.\nThe landing page for the website was made using HTML, CSS, and javascript and the web apps were created using the Shiny Web application framework for R and deployed on the [shinyapps.io](https://www.shinyapps.io) server.\nThe website can be accessed [here](https://neurogenomics.github.io/rare_disease_celltyping_apps/home). All code used to generate the website can be found [here](https://github.com/neurogenomics/rare_disease_celltyping_apps).\n\n## Tables\n\n\n::: {#tbl-summary .cell tbl-cap='Summary statistics of enrichment results stratified by single-cell atlas.' tbl-subcap='Summary statistics at multiple levels (tests, cell types, phenotypes, diseases, cell types per phenotype, phenotypes per cell type) stratified by the single-cell atlas that was used as a cell type signature reference (Descartes Human or Human Cell Atlas).'}\n\n```{.r .cell-code .hidden}\ndata.frame(res_summ$tmerged[,-c(\"ctd\")], \n row.names = res_summ$tmerged$ctd,\n check.names = FALSE) |>\n t()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n DescartesHuman HumanCellLandscape all \ntests significant \"19,929\" \"26,585\" \"46,514\" \ntests \" 848,078\" \"1,358,916\" \"2,206,994\"\ntests significant (%) \"2.35\" \"1.96\" \"2.11\" \ncell types significant \" 77\" \"124\" \"201\" \ncell types \" 77\" \"124\" \"201\" \ncell types significant (%) \"100\" \"100\" \"100\" \nphenotypes significant \"7,340\" \"9,049\" \"9,575\" \nphenotypes tested \"11,014\" \"10,959\" \"11,028\" \nphenotypes \"11,047\" \"11,047\" \"11,047\" \nphenotypes significant (%) \"66.4\" \"81.9\" \"86.7\" \ndiseases significant \"8,628\" \"8,627\" \"8,628\" \ndiseases \"8,631\" \"8,631\" \"8,631\" \ndiseases significant (%) \"100\" \"100\" \"100\" \ncell types per phenotype (mean) \"1.81\" \"2.43\" \"4.22\" \ncell types per phenotype (median) \"1\" \"2\" \"3\" \ncell types per phenotype (min) \"0\" \"0\" \"0\" 
\ncell types per phenotype (max) \"31\" \"28\" \"59\" \nphenotypes per cell type (mean) \"259\" \"214\" \"231\" \nphenotypes per cell type (median) \"252\" \"200\" \"209\" \nphenotypes per cell type (min) \"71\" \"57\" \"57\" \nphenotypes per cell type (max) \"696\" \"735\" \"735\" \n```\n\n\n:::\n:::\n\n\n## Data and Code Availability\n\nAll data and code is made freely available through preexisting databases and/or GitHub repositories / software associated with this publication.\n\n- [Human Phenotype Ontology](https://hpo.jax.org)\n- [GenCC](https://thegencc.org/)\n- [Descartes Human scRNA-seq atlas](https://cellxgene.cziscience.com/collections/c114c20f-1ef4-49a5-9c2e-d965787fb90c)\n- [Human Cell Landscape scRNA-seq atlas](https://cellxgene.cziscience.com/collections/38833785-fac5-48fd-944a-0f62a4c23ed1)\n- [Rare Disease Celltyping Portal](https://neurogenomics.github.io/rare_disease_celltyping_apps/home)\n- [`KGExplorer`](https://github.com/neurogenomics/KGExplorer)\n- [`HPOExplorer`](https://github.com/neurogenomics/HPOExplorer)\n- [`MSTExplorer`](https://github.com/neurogenomics/MSTExplorer)\n- [Code to replicate analyses](https://github.com/neurogenomics/rare_disease_celltyping)\n- [Cell type-specific gene target prioritisation](https://neurogenomics.github.io/RareDiseasePrioritisation/reports/prioritise_targets) \n- [Complement system gene list](https://www.genenames.org/data/genegroup/#!/group/492)\n\n\n## Acknowledgements\n\nWe would like to thank the following individuals for their insightful feedback and assistance with data resources: Sarah J. Marzi, Gerton Lunter, Peter Robinson, Melissa Haendel, Ben Coleman, Nico Matentzoglu, Shawn T. O'Neil, Alan E. 
Murphy, Sarada Gurung.\n\n### Funding\n\nThis work was supported by a UK Dementia Research Institute (UK DRI) Future Leaders Fellowship \\[MR/T04327X/1\\] and the UK DRI which receives its funding from UK DRI Ltd, funded by the UK Medical Research Council, Alzheimer’s Society and Alzheimer's Research UK.\n\n\n## References {.unnumbered}\n\n:::{#refs}\n\n:::\n\n\\\n\n\n{{< pagebreak >}}\n\n\n\n## Supplementary Materials\n\n### Supplementary Figures\n\n\n::: {#cell-fig-evidence-histograms .cell}\n\n```{.r .cell-code .hidden}\nevidence_plot <- HPOExplorer::plot_evidence(phenotype_to_genes = p2g,\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering data from GenCC.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering data from GenCC.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating gene-disease associations with Evidence Score\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering data from 
GenCC.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nConstructing HPO gene x phenotype matrix.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating gene-disease associations with Evidence Score\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering data from GenCC.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nevidence_plot$plot\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.\n`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.\n`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.\n`stat_bin()` using `bins = 30`. 
Pick better value with `binwidth`.\n```\n\n\n:::\n\n::: {.cell-output-display}\n![Distribution of evidence scores at each processing step.](index_files/figure-pdf/fig-evidence-histograms-1.pdf){#fig-evidence-histograms fig-pos='H'}\n:::\n:::\n\n\n\n::: {#fig-diagram}\n![](img/fig-diagram.png)\n\nDiagrammatic overview of multi-scale disease investigation strategy.\nHere we provide an abstract example of differential disease aetiology across multiple scales: diseases ($D$), phenotypes ($P$), cell types ($C$), genes ($G$), and clinical outcomes ($O$).\nIn the HPO, genes are assigned to phenotypes via particular diseases ($G_{d,p}$).\nTherefore, the final gene list for each phenotype is aggregated from across multiple diseases ($G_{p}$).\nWe performed association tests for all pairwise combinations of cell types and phenotypes and filtered results after multiple testing corrections ($FDR_{p,c}<0.05$).\nEach phenotype in the context of a given disease is referred to here as a symptom.\nLinks were established between symptoms and cell types through proportional gene set overlap at a minimum threshold of 25%.\n:::\n\n\n::: {#fig-ctd-correlation .cell layout-ncol=\"2\"}\n\n```{.r .cell-code .hidden}\nvalidate_associations_correlate_ctd_out$plot$p.all\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.\n`stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nvalidate_associations_correlate_ctd_out$plot$logFC.significant\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.\n`stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nvalidate_associations_correlate_ctd_out_hcl$plot$p.all\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`stat_xsidebin()` using `bins = 30`. 
Pick better value with `binwidth`.\n`stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nvalidate_associations_correlate_ctd_out_hcl$plot$logFC.significant\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.\n`stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.\n```\n\n\n:::\n\n::: {.cell-output-display}\n![Correlation between the uncorrected p-values from all phenotype-cell type association tests using the Descartes Human vs. Human Cell Landscape CTDs.](index_files/figure-pdf/fig-ctd-correlation-1.pdf){#fig-ctd-correlation-1}\n:::\n\n::: {.cell-output-display}\n![Correlation between the $log_{10}(fold-change)$ from significant phenotype-cell type association tests ($FDR_{p,c}<0.05$) using the Descartes Human vs. Human Cell Landscape CTDs.](index_files/figure-pdf/fig-ctd-correlation-2.pdf){#fig-ctd-correlation-2}\n:::\n\n::: {.cell-output-display}\n![Correlation between the uncorrected p-values from all phenotype-cell type association tests using the Human Cell Landscape fetal samples vs. Human Cell Landscape adult samples.](index_files/figure-pdf/fig-ctd-correlation-3.pdf){#fig-ctd-correlation-3}\n:::\n\n::: {.cell-output-display}\n![Correlation between the $log_{10}(fold-change)$ from significant phenotype-cell type association tests ($FDR_{p,c}<0.05$) using the Human Cell Landscape fetal samples vs. Human Cell Landscape adult samples.](index_files/figure-pdf/fig-ctd-correlation-4.pdf){#fig-ctd-correlation-4}\n:::\n\nInter- and intra-dataset validation across the different CellTypeDataset (CTD) and developmental stages. Correlations are computed using Pearson's correlation coefficient. 
Point density is plotted using a 2D kernel density estimate.\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n### Generate filtering report plot\nplot_report_out <- MSTExplorer::plot_report(\n results = results,\n rep_dt = prioritise_targets_out$report,\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nplot_report:: Preparing data.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nplot_report:: Preparing plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot ==> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//RtmpQjrQLG/file73fc5fbe045d_plot_report.pdf\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving 5.5 x 3.5 in image\n```\n\n\n:::\n:::\n\n::: {#cell-fig-therapy-filter .cell}\n\n```{.r .cell-code .hidden}\nplot_report_out\n```\n\n::: {.cell-output-display}\n![Therapeutics - Prioritised target filtering steps. This plot visualises the number of unique phenotype-cell type associations, cell types, genes, and phenotypes (*y-axis*) at each filtering step (*x-axis*) within the multi-scale therapeutic target prioritisation pipeline. Each step in the pipeline can be easily adjusted according to user preference and use case. See **Methods** for descriptions and criteria of each filtering step. **a**, The percentage of phenotypes belonging to each severity Tier after each filtering step (Tier 1 being the most severe). 
**b**, The number of phenotypes, cell types, associated diseases and genes remaining after each filtering step during the gene prioritisation pipeline.](index_files/figure-pdf/fig-therapy-filter-1.pdf){#fig-therapy-filter fig-pos='H'}\n:::\n:::\n\n::: {#cell-fig-monarch-recall .cell}\n\n```{.r .cell-code .hidden}\ncat(\"!!!RECOMPUTE!!!\")\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n!!!RECOMPUTE!!!\n```\n\n\n:::\n:::\n\n::: {#cell-fig-therapy-validate-all .cell}\n\n```{.r .cell-code .hidden}\nttd_check_all_out$plot\n```\n\n::: {.cell-output-display}\n![Therapeutics - Validation of prioritised therapeutic targets. Proportion of all existing therapy targets (documented in the Therapeutic Target Database) recapitulated by our prioritisation pipeline.](index_files/figure-pdf/fig-therapy-validate-all-1.pdf){#fig-therapy-validate-all fig-pos='H'}\n:::\n:::\n\n::: {#cell-fig-animal-models .cell}\n\n```{.r .cell-code .hidden}\nlibrary(ggplot2) # <-- Necessary due to bug in one of the plotting dependencies \n\ntop_ids <- unique(prioritise_targets_out$top_targets$hpo_id)[1:1000]\nplot_upheno_out <- KGExplorer::plot_upheno(\n pheno_map_genes_match = pheno_map_genes_match, \n filters=list(id1=top_ids)\n )\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltered 'id1' : 883 / 987 rows dropped.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n3 / 4 species remain after filtering by `subset_db1`.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: ggdist\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: tidyquant\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRegistered S3 method overwritten by 'quantmod':\n method from\n as.zoo.data.frame zoo \n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding ancestor metadata.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr 
.hidden}\n\n```\nLoading cached ontology: /Users/bms20/Library/Caches/org.R-project.R/R/KGExplorer/upheno.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAncestor metadata already present. Use force_new=TRUE to overwrite.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nplot_upheno_out$heatmap\n```\n\n::: {.cell-output-display}\n![Identification of translatable experimental models. Interspecies translatability of human phenotypes nominated by our gene therapy prioritised pipeline. Above, our combined ontological-genotypic similarity score ($SIM_{o,g}$) is displayed as the heatmap fill colour stratified by the model organism (*x-axis*). An additional column (“n_genes_db1” on the far left) displays the total number of unique genes annotated to the phenotypic within the HPO. Phenotypes are clustered according to their ontological similarity in the HPO (*y-axis*).](index_files/figure-pdf/fig-animal-models-1.pdf){#fig-animal-models fig-pos='H'}\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ntop_targets <- prioritise_targets_out$top_targets[,n_genes:=data.table::uniqueN(gene_symbol),\n by=\"hpo_id\"][n_genes<5 & proportion_driver_genes_symptom>.25]\n\nheight <- \"60vh\"\n\nphenotype <- \"respiratory failure\"\nvn_therapy_eg1 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[grepl(paste(phenotype,collapse = \"|\"), disease_name,ignore.case = TRUE)],\n main = phenotype, \n height = height,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: visNetwork\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//Rtmpc8VtV7/file7a42436e8b7a_prioritise_targets_network.html\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nphenotype <- \"amyotrophic lateral sclerosis\"\nvn_therapy_eg2 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[grepl(paste(phenotype,collapse = \"|\"), disease_name,ignore.case = TRUE)],\n main = phenotype,\n height = height,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//Rtmpc8VtV7/file7a4247859f27_prioritise_targets_network.html\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nphenotype <- \"neurodegeneration\"\nvn_therapy_eg3 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[grepl(paste(phenotype,collapse = \"|\"), disease_name,ignore.case = TRUE)], \n main = phenotype, \n height = height,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//Rtmpc8VtV7/file7a421edc11f1_prioritise_targets_network.html\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nphenotype <- \"small vessel disease\"\nvn_therapy_eg4 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[grepl(paste(phenotype,collapse = \"|\"), disease_name,ignore.case = TRUE)], \n main = phenotype, \n height = height,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//Rtmpc8VtV7/file7a427cc2a642_prioritise_targets_network.html\n```\n\n\n:::\n:::\n\n::: {#fig-therapy-examples-supp .cell layout=\"[[1,1], [1], [1]]\"}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg1$plot\n```\n\n![Respiratory failure](index_files/figure-pdf/fig-therapy-examples-supp-1.pdf){#fig-therapy-examples-supp-1}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg2$plot\n```\n\n![Amyotrophic lateral sclerosis](index_files/figure-pdf/fig-therapy-examples-supp-2.pdf){#fig-therapy-examples-supp-2}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg3$plot\n```\n\n![Neurodegeneration](index_files/figure-pdf/fig-therapy-examples-supp-3.pdf){#fig-therapy-examples-supp-3}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg4$plot\n```\n\n![Small vessel 
disease](index_files/figure-pdf/fig-therapy-examples-supp-4.pdf){#fig-therapy-examples-supp-4}\n\nExample cell type-specific gene therapy targets for several severe phenotypes and their associated diseases. Each disease (blue cylinders) is connected to its phenotype (purple cylinders) based on well-established clinical observations recorded within the HPO [@Gargano2024-fc].Phenotypes are connected to cell types (red circles) via association testing between weighted gene sets ($FDR_{p,c}<0.05$). Each cell type is connected to the prioritised gene targets (yellow boxes) based on the driver gene analysis.The thickness of the edges connecting the nodes represent the (mean) fold-change from the bootstrapped enrichment tests. Nodes were spatially arranged using the Sugiyama algorithm [@Sugiyama1981-ev].\n:::\n\n\n### Supplementary Methods\n\n#### Therapeutics: Gene therapy target identification\n\nDescriptions of each step in the prioritisation pipeline are as follows:\n\n1. **start**: All phenotype-cell type association results.\n\n2. **q_threshold**: Keep only results that were significant after multiple-testing correction (q\\<0.05).\n\n3. **fold_threshold**: Keep only results with fold change\\>=1.\n\n4. **keep_ont_levels**: Keep only phenotypes at certain absolute ontology levels within the HPO.\n\n5. **keep_onsets**: Keep only phenotypes with postnatal age of onsets to circumvent technical and ethical challenges associated with antenatal gene therapeutics delivery.\n\n6. **keep_tiers**: Keep only phenotypes with high severity Tiers.\n\n 1. We used a combination of manual curation and automated text-based substring queries to assign each phenotype a severity Tier as characterised in a survey of healthcare professionals [@Lazarin2014-we].\n\n 2. Tier 1: Diseases that shortened life span in adolescence or earlier or resulted in intellectual disability.\n\n 3. 
Tier 2: Diseases that shortened lifespan prematurely in adulthood, or resulted in impaired mobility or internal physical malformation.\n\n 4. Tier 3: Diseases causing sensory impairments (hearing, vision, touch, pain, or other), immunodeficiency/cancer, mental illness, or dysmorphic features.\n\n 5. Tier 4: Diseases that reduce fertility.\n Of the 49 phenotypes that were available in this severity ranking, we selected three that were classified as Tier 1 (the most severe disease category): mental deterioration, coma and respiratory failure.\n\n7. **severity_threshold**: Keep only phenotypes with mean severity score equal to or below the threshold.\n\n 1. Severity scores were computed by assigning each severity modifier term found in the HPO annotations a numerical value.\n In order of increasing severity:\n\n 2. HP:0012825 \"Mild\" (Severity_score=4)\n\n 3. HP:0012827 \"Borderline\" (Severity_score=3)\n\n 4. HP:0012828 \"Severe\" (Severity_score=2)\n\n 5. HP:0012829 \"Profound\" (Severity_score=1)\n\n8. **pheno_frequency_threshold**: Keep only phenotypes with mean frequency equal to or above the threshold (i.e. how frequently a phenotype is associated with any diseases in which it occurs).\n\n 1. Keep phenotypes with a mean frequency ≥10% or are NA by default.\n\n9. **keep_celltypes**: Keep only terminally differentiated cell types.\n\n 1. Of the 77 cell types tested in the Descartes cell type reference, the 40 terminally differentiated cell types were identified through a literature search. Of these, three (extravillous trophoblasts, syncytiotrophoblasts and trophoblast giant cells) were excluded as they only played a role in pregnancy [@Chang2018-qj; @Fogarty2011-ph; @Hu2010-eh], which would raise additional technical and ethical challenges as rAAV therapy has not yet been used to target foetuses in clinical trials.\n\n10. **keep_seqnames**: Remove genes on non-standard chromosomes.\n\n 1. Only keep chromosomes 1-22, X, and Y.\n\n11. 
**gene_size**: Keep only genes \\<4.3kb in length.\n\n 1. Due to limitations in the length of the gene that can be carried by the rAAV vector, genes with a length of \\>4.3kb were excluded.\n\n12. **keep_biotypes**: Keep only genes belonging to certain biotypes (e.g. \"protein_coding\", \"processed_transcript\", \"snRNA\", \"lincRNA\", \"snoRNA\", \"IG_C_gene\").\n\n 1. Keep all biotypes by default.\n\n13. **gene_frequency_threshold**: Keep only genes at or above a certain mean frequency threshold (i.e. how frequently a gene is associated with a given phenotype when observed within a disease).\n\n 1. Keep genes with a mean frequency ≥10% or are NA by default.\n\n14. **keep_specificity_quantiles**: Keep only genes in top specificity quantiles from the cell type dataset.\n\n 1. To further narrow down genes, we extracted relevant metrics from the Descartes reference for each gene in each cell type. These included mean expression, specificity, and specificity quantiles (using 40 bins). Only genes with the most specific quantiles (39-40) were included for further analysis, as cell type-specific genes may be less likely to have off-target effects in other cell types.\n\n15. **keep_mean_exp_quantiles**: Keep only genes in top mean expression quantiles from the cell type dataset\n\n16. 
**end**: Final table of prioritised cell type- / phenotype-specific gene targets.\n\nFinally, for more comprehensive target search, the we removed the filters for onsets (keep_onsets=NULL), Tier (keep_tiers=NULL), severity (severity_threshold=NULL), as well as relaxed the filters for phenotype frequency threshold (pheno_frequency_threshold=c(10,NA)), gene frequency threshold (gene_frequency_threshold = c(10,NA)), gene specificity quantiles (keep_specificity_quantiles = seq(20,40)), and gene expression quantiles (keep_mean_exp_quantiles = seq(20,40)).\n \n", + "markdown": "---\ntitle: \"Cell type-specific contextualisation of the phenomic landscape: a comprehensive and scalable approach towards the diagnosis, prognosis and treatment of all rare diseases\"\nauthor:\n - name: Brian M. Schilder\n orcid: 0000-0001-5949-2191\n corresponding: true\n email: brian_schilder@alumni.brown.edu\n roles:\n - Investigation\n - Project administration\n - Software\n - Visualization\n affiliations:\n - Imperial College London\n - name: Kitty B. Murphy\n orcid: 0000-0002-8669-3076\n corresponding: false\n roles: []\n affiliations:\n - Imperial College London\n - name: Robert Gordon-Smith\n orcid: 0000-0001-6698-7387\n corresponding: false\n roles: []\n affiliations:\n - Imperial College London\n - name: Jai Chapman\n corresponding: false\n roles: []\n affiliations:\n - Imperial College London\n - name: Momoko Otani\n corresponding: false\n roles: []\n affiliations:\n - Imperial College London\n - name: Nathan G. 
Skene\n orcid: 0000-0002-6807-3180\n corresponding: true\n email: n.skene@imperial.ac.uk\n roles:\n - Project administration\n affiliations:\n - Imperial College London\nkeywords:\n - rare disease\n - phenotype\n - single-cell\n - gene therapy\nplain-language-summary: |\n We identified the cell types underlying the symptoms of all rare diseases.\nkey-points:\n - We used the Human Phenotype Ontology and single-cell RNA-seq references to characterise the phenome.\n - We then demonstrated how these results can be applied to clinical diagnosis, prognosis and therapeutics development.\ndate: last-modified\nbibliography: references.bib\ncitation:\n container-title: Nature Medicine\nformat:\n nature-pdf:\n journal: \"sn-nature\"\n keep-tex: true\n execute:\n cache: true\n echo: false\n docx:\n execute:\n cache: true\n echo: false\n---\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nlibrary(data.table)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nhpo <- HPOExplorer::get_hpo()\np2g <- HPOExplorer::load_phenotype_to_genes()\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nannot <- HPOExplorer::load_phenotype_to_genes(3)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n+ Version: v2024-02-08\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nper_disease <- p2g[,list(ng=data.table::uniqueN(gene_symbol),\n np=data.table::uniqueN(hpo_id)),by=\"disease_id\"]\nper_phenotype <- p2g[,list(ng=data.table::uniqueN(gene_symbol),\n nd=data.table::uniqueN(disease_id)),by=\"hpo_id\"]\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Import precomputed results for reporting summaries\nresults <- MSTExplorer::load_example_results()\nresults <- HPOExplorer::add_hpo_name(results, hpo = hpo)\n```\n\n::: {.cell-output 
.cell-output-stderr .hidden}\n\n```\nAdding HPO names.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to names.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nresults <- HPOExplorer::add_ont_lvl(results)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGetting absolute ontology level for 18,082 IDs.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nresults <- HPOExplorer::add_ancestor(results, hpo = hpo)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding level-2 ancestor to each HPO ID.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding ancestor metadata.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAncestor metadata already present. Use force_new=TRUE to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2,206,994 associations remain after filtering.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nresults <- MSTExplorer::map_celltype(results)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMapping cell types to cell ontology terms.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding stage information.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nMSTExplorer::add_logfc(results)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding logFC column.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nresults[,effect:=estimate]\n## Substitute B for \\beta for now since Quarto doesn't seem to support\n## Greek letters after they've been stored in a data.table...\nresults[,summary:=paste0(\n \"$\",\n \"FDR_{p,c}=\",format(q,digits=3),\",\",\n \"B=\",format(estimate,digits=3),\n \"$\"\n)] \n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nres_summ <- MSTExplorer::summarise_results(results = results)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output 
.cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving results --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//Rtmphp2ypb/summarise_results.csv10f4a3d4aac16\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nres_summ_all <- res_summ$tmerged[ctd==\"all\"] \n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Must use `cache.lazy=FALSE` because sparse matrices not yet supported for caching\nctd_list <- MSTExplorer::load_example_ctd(c(\"ctd_DescartesHuman.rds\",\n \"ctd_HumanCellLandscape.rds\"),\n multi_dataset=TRUE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading ctd_DescartesHuman.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading ctd_HumanCellLandscape.rds\n```\n\n\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nvalidate_associations_mkg_out <- MSTExplorer::validate_associations_mkg(results = results)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. 
Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\ni All local files already up-to-date!\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRemaining: 82 phenotypes across 65 celltypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n90.24% phenotypes recovered.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nkg_hp <- validate_associations_mkg_out$kg[grepl(\"^HP:\",from)]\nX <- \"`**!!RECOMPUTE!!**`\"\nY <- \"`**!!RECOMPUTE!!**`\"\n```\n:::\n\n\n## Abstract\n\nRare diseases (RDs) are an extremely heterogeneous and underserved category of medical conditions.\nWhile the majority of RDs are strongly genetic, it remains largely unknown via which physiological mechanisms genetics cause RD.\nTherefore, we sought to systematically characterise the cell type-specific mechanisms underlying all RD phenotypes with a known genetic cause by leveraging the Human Phenotype Ontology and transcriptomic single-cell atlases of the entire human body from embryonic, foetal, and adult samples.\nIn total we identified significant associations between 201 cell types and 9,575/11,028 (86.7%) unique phenotypes across 8,628 RDs. 
We estimate that this represents a more than 500-fold increase in the collective knowledge of RD phenotype-cell type mechanisms.\n\nNext, we demonstrated how these results may be used for personalised patient diagnosis and prognosis, as well as the development of novel therapeutics.\nFinally, we took a data-driven approach to highlight several of the most promising gene/cell therapy candidates with the highest probability of animal model-to-human patient translation.\nFurthermore, we have made these results entirely reproducible and freely accessible to the global community to maximise their impact.\nTo summarise, this work represents a significant step forward in the mission to treat patients across an extremely diverse spectrum of serious RDs.\n\n## Introduction {#sec-introduction}\n\nWhile rare diseases (RDs) are individually uncommon, they collectively account for an enormous global disease burden with over 10,000 recognised RDs affecting at least 300-400 million people globally [@Ferreira2019-jp] (1 in 10-20 people) [@Zhu2020-vo]. Over 75% of RDs primarily affect children with a 30% mortality rate by 5 years of age [@noauthor_undated-kp]. Despite the prevalence and severity of RDs, patients suffering from these conditions are vastly underserved due to several contributing factors. First, diagnosis is extremely challenging due to the highly variable clinical presentations of many of these diseases. 
The diagnostic odyssey can take patients and their families decades, with an average time to diagnosis of 5 years [@Marwaha2022-uy].\nOf those, \\~46% receive at least one incorrect diagnosis and over 75% of all patients never receive any diagnosis [@Molster2016-da].\nSecond, prognosis is also made difficult by high variability in disease course and outcomes which makes matching patients with effective and timely treatment plans even more challenging.\nFinally, even for patients who receive an accurate diagnosis/prognosis, treatments are currently only available for less than 5% of all RDs [@Halley2022-pd].\nIn addition to the scientific challenges of understanding RDs, there are strong financial disincentives for pharmaceutical and biotechnology companies to develop expensive therapeutics for exceedingly small RD patient populations with little or no return on investment [@Institute_of_Medicine_US_Committee_on_Accelerating_Rare_Diseases_Research_and_Orphan_Product_Development2010-vj; @Yates2022-ra].\nThose that have been produced are amongst the world’s most expensive drugs, greatly limiting patients’ ability to access them [@Nuijten2022-yc; @Thielen2022-ud]. The provision of timely, effective and affordable care for RD patients will require substantive transformations to our existing scientific, clinical, and regulatory frameworks.\n\nA major challenge in both healthcare and scientific research is the scalable exchange of information.\nEven in the age of electronic healthcare records (EHR) much of the information about an individual’s history is currently fractured across healthcare providers, often with differing nomenclatures for the same conditions.\nThe Human Phenotype Ontology (HPO) is a hierarchically organised set of controlled clinical terms that provides a much-needed common framework by which clinicians and researchers can precisely communicate patient conditions [@Gargano2024-fc; @Kohler2019-pc; @Robinson2008-ys; @Kohler2021-wk].\nThe HPO spans all domains 
of human physiology and currently describes 18,082 phenotypes across 10,300 RDs.\nEach phenotype and disease is assigned its own unique identifier and organised as a hierarchical graph, such that higher-level terms describe broad phenotypic categories or *branches* (e.g. *HP:0033127*: 'Abnormality of the musculoskeletal system' which contains 4,495 unique phenotypes) and lower-level terms describe increasingly precise phenotypes (e.g. *HP:0030675*: \"Contracture of proximal interphalangeal joints of 2nd-5th fingers\").\nThe HPO has already been integrated into healthcare systems and clinical diagnostic tools around the world, with increasing adoption over time [@Gargano2024-fc].\nCommon ontology-controlled frameworks like the HPO open a wealth of new opportunities, especially when addressing RDs.\nServices such as the Matchmaker Exchange [@Osmond2022-ml; @Philippakis2015-dq] have enabled the discovery of hundreds of underlying genetic etiologies, and led to the diagnosis of many patients.\nThis also opens the possibility of gathering cohorts of geographically dispersed patients to run clinical trials, the only viable option for treatment in many individuals.\nTo further increase the number of individuals who qualify for these treatments, as well as the trial sample size, proposals have been made to deviate from the traditional single-disease clinical trial model and instead perform basket trials on groups of RDs with shared molecular etiologies (SaME) [@Zanello2023-zd].\nHowever, this approach, and indeed much of RD patient care, hinges upon first characterising the molecular mechanisms underlying each RD.\n\nOver 80% of RDs have a known genetic cause [@Nguengang_Wakap2020-cz; @noauthor_2022-ok].\nDespite this, our knowledge of the physiological mechanisms via which genetics cause pathogenesis is lacking for most RDs, severely hindering our ability to effectively diagnose, prognose and treat RD patients.\nThe availability of standardised, ontology-controlled databases presents 
opportunities to systematically investigate RDs at scale.\nSince 2008, the HPO has been continuously updated using knowledge from the medical literature, as well as by integrating databases of expert-validated gene-phenotype relationships, such as OMIM [@Amberger2019-vl; @Amberger2017-tg; @McKusick2007-di], Orphanet [@Maiella2013-oo; @Weinreich2008-wm], and DECIPHER [@Firth2009-qg].\nA subset of the HPO contains gene annotations for 11,047 phenotypes across 8,631 diseases.\nYet genes alone do not tell the full story of how RDs come to be, as their expression and functional relevance vary drastically across the multitude of tissues and cell types contained within the human body.\n\nOur knowledge of single-cell-resolution biology has exploded over the course of the last decade and a half, with numerous applications in both scientific and clinical practices [@Baysoy2023-vt; @Haque2017-bn; @Qi2023-ev].\nMore recently, comprehensive single-cell transcriptomic atlases across tissues have also emerged [@CZI_Single-Cell_Biology_Program2023-fs; @Svensson2020-lg].\nIn particular, the Descartes Human [@Cao2020-qz] and Human Cell Landscape [@Han2020-iq] projects provide comprehensive multi-system single-cell RNA-seq (scRNA-seq) atlases in embryonic, foetal, and adult human samples from across the human body.\nThese datasets provide data-driven gene signatures for hundreds of cell subtypes.\nThey also allow us to investigate disease mechanisms in the context of specific life stages.\n\nHere, we combine and extend several of the most comprehensive genomic and transcriptomic resources currently available to systematically uncover the cell types underlying granular phenotypes across 8,628 diseases.\nWe then go on to highlight thousands of novel phenotype-cell type associations which collectively expand our knowledge of cell type-resolved phenotypes by an estimated 567-fold.\nNext, we present several potential avenues for real-world applications of these results in the context of RD 
patient diagnosis, prognosis, treatment, and therapeutics development.\n\n## Results {#sec-results}\n\n### Phenotype-cell type associations\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Create phenotype-gene matrix filled with aggregated GenCC evidence scores\nymat <- HPOExplorer::hpo_to_matrix(formula = \"gene_symbol ~ hpo_id\")\n## Run phenomix with DescartesHuman CellTypeDataset\nlm_res1 <- MSTExplorer::run_phenomix(ctd_name = \"DescartesHuman\",\n annotLevel = 2, \n test_method = \"glm_univariate\",\n ymat = ymat)\n## Run phenomix with HumanCellLandscape CellTypeDataset\nlm_res2 <- MSTExplorer::run_phenomix(ctd_name = \"HumanCellLandscape\",\n annotLevel = 3, \n test_method = \"glm_univariate\",\n ymat = ymat)\n## Merge results\nresults <- data.table::rbindlist(list(DescartesHuman=lm_res1,\n HumanCellLandscape=lm_res2),\n idcol = \"ctd\")\n## Apply multiple testing correction\nresults[,q:=stats::p.adjust(p,method=\"fdr\")]\n```\n:::\n\n\nIn this study we systematically investigated the cell types underlying phenotypes across the HPO. \nA summary of the phenome-wide results stratified by single-cell atlas can be found in @tbl-summary.\nWithin the results using the Descartes Human single-cell atlas, 19,929/ 848,078 (2.35%) tests across 77/ 77 (100%) cell types and 7,340/11,047 (66.4%) phenotypes revealed significant phenotype-cell type associations after multiple-testing correction ($FDR_{p,c}<0.05$). Using the Human Cell Landscape single-cell atlas, 26,585/1,358,916 (1.96%) tests across 124/124 (100%) cell types and 9,049/11,047 (81.9%) phenotypes showed significant phenotype-cell type associations ($FDR_{p,c}<0.05$). 
The median number of significantly associated phenotypes per cell type was 252 (Descartes Human) and 200 (Human Cell Landscape), respectively.\n\nAcross both single-cell references, the median number of significantly associated cell types per phenotype was 3, suggesting reasonable specificity of the testing strategy.\n8,628/8,631 (\\~100%) of diseases within the HPO gene annotations showed significant cell type associations for at least one of their respective phenotypes.\n\n### Validation of expected phenotype-cell type relationships\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nplot_bar_dendro_out <- MSTExplorer::plot_bar_dendro(\n results = results, \n show_plot = FALSE) \n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: ggdendro\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAncestor columns already present. Skipping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2,206,994 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltered 'ancestor_name' : 999,488 / 2,206,994 rows dropped.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\ncardiocyte\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading cached ontology: /Users/bms20/Library/Caches/org.R-project.R/R/KGExplorer/cl.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nendocrine cell\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nphotoreceptor cellretinal cell\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nleukocyte\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\ncell of skeletal musclechondrocyte\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nneural cell\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nrespiratory epithelial cellepithelial cell of lung\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading cached ontology: /Users/bms20/Library/Caches/org.R-project.R/R/KGExplorer/cl.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nConverted ontology to: igraph 
\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nConverted ontology to: igraph_dist \n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to names.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: tol\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: Invalid .internal.selfref detected and fixed by taking a (shallow)\ncopy of the data.table so that := can add this new column by reference. At an\nearlier point, this data.table has been copied by R (or was created manually\nusing structure() or similar). Avoid names<- and attr<- which in R currently\n(and oddly) may copy the whole data.table. Use set* syntax instead to avoid\ncopying: ?set, ?setnames and ?setattr. If this message doesn't help, please\nreport your use case to the data.table issue tracker so the root cause can be\nfixed or this message improved.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRunning tests: across_branches_per_celltype\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: Invalid .internal.selfref detected and fixed by taking a (shallow)\ncopy of the data.table so that := can add this new column by reference. At an\nearlier point, this data.table has been copied by R (or was created manually\nusing structure() or similar). Avoid names<- and attr<- which in R currently\n(and oddly) may copy the whole data.table. Use set* syntax instead to avoid\ncopying: ?set, ?setnames and ?setattr. 
If this message doesn't help, please\nreport your use case to the data.table issue tracker so the root cause can be\nfixed or this message improved.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nScale for x is already present.\nAdding another scale for x, which will replace the existing scale.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nScale for y is already present.\nAdding another scale for y, which will replace the existing scale.\nAncestor columns already present. Skipping.\n\n2,206,994 associations remain after filtering.\n\nCell type columns already present. Skipping mapping.\n\ncardiocyte\n\nTranslating ontology terms to ids.\n\nLoading cached ontology: /Users/bms20/Library/Caches/org.R-project.R/R/KGExplorer/cl.rds\n\nendocrine cell\n\nTranslating ontology terms to ids.\n\nphotoreceptor cellretinal cell\n\nTranslating ontology terms to ids.\n\nleukocyte\n\nTranslating ontology terms to ids.\n\ncell of skeletal musclechondrocyte\n\nTranslating ontology terms to ids.\n\nneural cell\n\nTranslating ontology terms to ids.\n\nrespiratory epithelial cellepithelial cell of lung\n\nTranslating ontology terms to ids.\n\nProportional enrichment summary stats:\n\n - pct_min: 17.22\n\n - pct_max: 64.86\n\n - pct_max_mean: 40.3\n\n - pct_max_sd: 16.36\n\n - enrichment_mean: 6.09\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\noverrep_dat <- plot_bar_dendro_out$ggbars_out$data_summary\n\noverrep_dat[,summary:=paste0(ancestor_name,\": \", \n n_celltypes_sig,\"/\",n_celltypes,\n \" types of \",shQuote(target_celltypes),\n \" were overrepresented\",\n \" ($N_{p}$=\",phenotypes_per_ancestor,\").\")] \n```\n:::\n\n\nWithin each high-level branch in the HPO shown in [Fig. 
@fig-summary]b, we tested whether each cell type was more often associated with phenotypes in that branch relative to those in all other branches (including those not shown).\nWe then checked whether each cell type was overrepresented (at $FDR_{b,c}<0.05$) within its respective on-target HPO branch, where $N_{p}$ denotes the number of phenotypes within that branch.\nAbnormality of the cardiovascular system: 5/6 types of 'cardiocyte' were overrepresented ($N_{p}$=673). Abnormality of the endocrine system: 3/4 types of 'endocrine cell' were overrepresented ($N_{p}$=291). Abnormality of the eye: 5/5 types of 'photoreceptor cell/retinal cell' were overrepresented ($N_{p}$=721). Abnormality of the immune system: 4/4 types of 'leukocyte' were overrepresented ($N_{p}$=255). Abnormality of the musculoskeletal system: 4/4 types of 'cell of skeletal muscle/chondrocyte' were overrepresented ($N_{p}$=2155). Abnormality of the nervous system: 19/23 types of 'neural cell' were overrepresented ($N_{p}$=1647). Abnormality of the respiratory system: 2/2 types of 'respiratory epithelial cell/epithelial cell of lung' were overrepresented ($N_{p}$=292).\n\nAs an additional form of validation ([Fig. @fig-summary]d), we tested for a relationship between phenotype-cell type association significance ($-log_{e}(p_{p,c})$ where $log_{e}$ denotes natural log and $p_{p,c}$ denotes uncorrected phenotype-cell type association p-values) and the proportion of on-target cell types. The list of on-target cell types was determined by matching each high-level HPO branch to a corresponding CL branch. 
These cross-ontology mappings can be found in @tbl-celltypes.\nFor this analysis we used raw p-values ($p_{p,c}$) rather than multiple-testing corrected p-values ($FDR_{p,c}$) to provide a more dynamic range of values (as the latter can drive values to 1).\nAll 7/7 high-level HPO branches showed a consistent upwards trend towards greater proportions of on-target cell types with increasing degrees of significance.\nFurthermore, all branches also showed a proportion of on-target cell types above that expected by chance (baseline = on-target cell types / total cell types) at $-log_{e}(p_{p,c})>1$.\n\n\n::: {#cell-fig-summary .cell}\n\n```{.r .cell-code .hidden}\nplot_bar_dendro_out$plot\n```\n\n::: {.cell-output-display}\n![Summary of significant associations between phenotypes and cell types, aggregated by HPO branch. Here we show **a**, the total number of significant phenotype enrichments per cell type ($FDR_{p,c}<0.05$) across all branches of the HPO. **b**, Number of phenotype association related to several high-level branches of the HPO. Asterisks above each bar indicate whether that cell type was significantly more often enriched in that branch relative to all other HPO branches, including those not shown here, as a proxy for how specifically that cell type is associated with that branch; $FDR _{b,c}<1e-04$ (\\*\\*\\*\\*), $FDR _{b,c}<0.001$ (\\*\\*\\*), $FDR_{b,c}<0.01$ (\\*\\*), $FDR _{b,c}<0.05$ (\\*). **c**, Dendrogram derived from the Cell Ontology (CL) showing the relatedness of all tested cell types to one another. For simplicity, cell type labels shown here are aligned to the CL [@Diehl2016-gt] and can therefore encompass one or more cell types annotated by the original authors of scRNA-seq datasets [@Cao2020-qz; @Han2020-iq]. **d**, Percentage of significant phenotype associations with on-target cell types (second row of facet labels), respective to the HPO branch. 
As significance increases ($-log_{10}(p)$ along the *x-axis*) the percentage of on-target enriched cell types also increases (*y-axis*).](index_files/figure-pdf/fig-summary-1.pdf){#fig-summary fig-pos='H'}\n:::\n:::\n\n\n### Validation of inter- and intra-dataset consistency\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nlibrary(data.table)\n## Across CTD\nvalidate_associations_correlate_ctd_out <- MSTExplorer::validate_associations_correlate_ctd(\n results=results, \n group_var=\"ctd\")\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCasting results.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n13 comparable celltypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n10945 comparable phenotypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n10 comparable celltypes @FDR<0.05.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n554 comparable phenotypes @FDR<0.05.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGenerating plots.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRegistered S3 method overwritten by 'ggside':\n method from \n +.gg ggplot2\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering statistics.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\n## Replace p-values of exactly 0 with smallest number R can compute\nvalidate_associations_correlate_ctd_out$data_stats$p.all$summary_data$p.value <- max(validate_associations_correlate_ctd_out$data_stats$p.all$summary_data$p.value,\n .Machine$double.xmin)\n\n## Within CTD: across developmental stages\nvalidate_associations_correlate_ctd_out_hcl <- 
MSTExplorer::validate_associations_correlate_ctd(\n results=results,\n filters= list(ctd=c(\"HumanCellLandscape\"), \n stage=c(\"Fetus\",\"Adult\")),\n group_var=\"stage\")\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltered 'ctd' : 848,078 / 2,206,994 rows dropped.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltered 'stage' : 54,795 / 1,358,916 rows dropped.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCasting results.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n38 comparable celltypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n10959 comparable phenotypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n33 comparable celltypes @FDR<0.05.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n1584 comparable phenotypes @FDR<0.05.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGenerating plots.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering statistics.\n```\n\n\n:::\n:::\n\n\nNext, we sought to validate the consistency of our results across the two single-cell reference datasets (Descartes Human vs. Human Cell Landscape) across the subset of overlapping cell types [Fig. 
@fig-ctd-correlation].\nIn total there were 142285 phenotype-cell type associations to compare across the two datasets (across 10945 phenotypes and 13 cell types annotated to the exact same CL term).\nWe found that the correlation between p-values of the two datasets was high ($rho=0.492, p=1.08e-93$).\nWithin the subset of results that were significant in both single-cell datasets ($FDR_{p,c}<0.05$), we found that the correlation of the association effect sizes was even stronger ($rho=0.723, p=1.08e-93$).\nWe also checked for the intra-dataset consistency between the p-values of the foetal and adult samples in the Human Cell Landscape, showing a very similar degree of correlation as the inter-dataset comparison ($rho=0.436, p=2.36e-149$).\nTogether, these results suggest that our approach to identifying phenotype-cell type associations is highly replicable and generalisable to new datasets.\n\n### More specific phenotypes are associated with fewer genes and cell types\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nplot_ontology_levels_out <- MSTExplorer::plot_ontology_levels(\n results = results, \n ctd_list = ctd_list,\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: gginnards\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 
v2024-02-08\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nplot_ontology_levels_out_stats <- plot_ontology_levels_out$data_stats$summary_data|>\n data.table::setkeyv(\"parameter2\")\n## replace pvalues of exactly 0 with the minimum computable number in R\n## This avoids creating -Inf when logging values.\nplot_ontology_levels_out_stats[p.value==0, p.value:=.Machine$double.xmin]\nplot_ontology_levels_out_stats[q.value==0, q.value:=.Machine$double.xmin]\nplot_ontology_levels_out_stats[,summary:=paste0(\n \"$\",\n \"p=\",format(p.value,digits=3),\", \", \n \"q=\",format(q.value,digits=3),\", \",\n \"rho=\",format(estimate,digits=3),\n \"$\"\n )][,summary:=gsub(\"=[ ]\",\"=\",summary)]\n```\n:::\n\n\nFirst, we found that phenotype ontology level showed a significant negative correlation with the number of genes annotated to that phenotype in the HPO data ([Fig. @fig-ontology-lvl]a; $p=2.23e-308, q=2.23e-308, rho=-0.2634$).\nThis is expected as broader phenotypes tend to have larger gene set annotations.\nNext, we reasoned that higher HPO ontology levels representing more specific phenotypes were likely to be associated with fewer, more specific subsets of cell types.\nThis was indeed the case, as we observed a strongly significant negative correlation between the two variables ([Fig. @fig-ontology-lvl]b; $p=2.23e-308, q=2.23e-308, rho=-0.2927$).\nWe also found that the effect size of significant phenotype-cell type associations ($FDR_{p,c}<0.05$) increased with greater phenotype specificity, though the relationship was rather weak ([Fig. @fig-ontology-lvl]c; $p=7.30e-97, q=7.30e-97, rho=0.0966$).\nFinally, we found that the mean expression specificity of phenotype-associated genes (within the cell types significantly associated with those respective phenotypes at $FDR_{p,c}<0.05$) was positively correlated with phenotype ontology depth ([Fig. 
@fig-ontology-lvl]d; $p=2.71e-174, q=3.61e-174, rho=0.1398$).\n\n\n::: {#cell-fig-ontology-lvl .cell}\n\n```{.r .cell-code .hidden}\nplot_ontology_levels_out$plot\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`geom_smooth()` using formula = 'y ~ x'\n`geom_smooth()` using formula = 'y ~ x'\n`geom_smooth()` using formula = 'y ~ x'\n`geom_smooth()` using formula = 'y ~ x'\n```\n\n\n:::\n\n::: {.cell-output-display}\n![More specific phenotypes are associated with fewer, more specific genes and cell types. Box plots showing relationship between HPO phenotype level and **a**, the number of genes annotated to each phenotype, **b**, the number of significantly enriched cell types, **c**, the effect size of phenotype-cell type association tests at $FDR_{p,c}<0.05$, and **d**, the mean expression specificity of phenotype-associated genes in the cell types significantly associated with those respective phenotypes ($FDR_{p,c}<0.05$). Ontology level 0 represents the most inclusive HPO term 'All', while higher ontology levels (max=16) indicate progressively more specific HPO terms (e.g. 'Contracture of proximal interphalangeal joints of 2nd-5th fingers'). Boxes are coloured by the mean value (respective to the subplot) within each HPO level.](index_files/figure-pdf/fig-ontology-lvl-1.pdf){#fig-ontology-lvl fig-pos='H'}\n:::\n:::\n\n\n### Hepatoblasts have a unique role in recurrent Neisserial infections\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nresults_tmp <- HPOExplorer::add_ancestor(data.table::copy(results),\n lvl = 7,\n force_new = TRUE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nForce new. 
Removing existing ancestor columns.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding level-7 ancestor to each HPO ID.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding ancestor metadata.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGetting absolute ontology level for 18,082 IDs.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n3027 ancestors found at level 7\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to names.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2,206,994 associations remain after filtering.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ntarget_branches <- list(\"Recurrent bacterial infections\"=\"leukocyte\")\ninfections_out <- MSTExplorer::plot_bar_dendro_facets(\n results=results_tmp,\n target_branches=target_branches,\n facets = \"hpo_name\",\n legend.position=\"top\",\n lvl=9,\n ncol=2,\n vlines=\"hepatoblast\",\n fill_var=\"ancestor_name_original\",\n facets_n=NULL,\n q_threshold=0.05,\n background_full=FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nForce new. 
Removing existing ancestor columns.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding level-9 ancestor to each HPO ID.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding ancestor metadata.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGetting absolute ontology level for 18,082 IDs.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2384 ancestors found at level 9\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to names.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2,206,994 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nKeeping descendants of 1 term(s).\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n20 terms remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n4,020 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nleukocyte\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading cached ontology: /Users/bms20/Library/Caches/org.R-project.R/R/KGExplorer/cl.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRunning tests: across_branches_per_celltype\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\nSkipping tests.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: gnuplot\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nremove(results_tmp)\n\nstaph_res <- infections_out$data[hpo_name==\"Recurrent staphylococcal infections\"]\nstaph_res_top <- staph_res[,.SD[p %in% head(sort(p), 1)], by=c(\"hpo_id\")]\n \nrecurrent_infections_ids <- KGExplorer::get_ontology_descendants(ont = hpo,\n terms = \"Recurrent infections\")[[1]]\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRecurrent infections\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nhepatoblast_res <- results[q<0.05 &hpo_id %in% recurrent_infections_ids & cl_name==\"hepatoblast\"]\nhepatocyte_res <- results[q<0.05 & ancestor_name==\"Abnormality of the immune system\" & grepl(\"hepatocyte\",CellType,ignore.case = TRUE)] \n```\n:::\n\n\nWe selected the HPO term 'Recurrent bacterial infections' and all of its descendants (19 phenotypes) as an example of how investigations at the level of granular phenotypes can reveal different cell type-specific mechanisms ([Fig. @fig-rni]).\nAs expected, these phenotypes are primarily associated with immune cell types (e.g. 
macrophages, dendritic cells, T cells, monocytes, neutrophils).\nSome associations confirm relationships previously suggested in the literature, such as that between 'Recurrent staphylococcal infections' and myeloid cells [@Heim2014-du; @Pidwill2020-le; @Stoll2018-dc; @Tebartz2015-xs].\nSpecifically, our results pinpoint monocytes as the most strongly associated cell subtypes ($FDR_{p,c}= 1.03e-30,B= 1.76e-01$).\n\nIn contrast to all other recurrent infection types, 'Recurrent Neisserial infections' highlighted a novel association with hepatoblasts (Descartes Human : $FDR_{p,c}= 1.13e-06,B= 8.24e-02$).\nWhilst unexpected, a convincing explanation involves the complement system, a key driver of innate immune response to Neisserial infections.\nHepatocytes, which derive from hepatoblasts, produce the majority of complement proteins [@Zhou2016-kq], and Kupffer cells express complement receptors [@Dixon2013-ok].\nIn addition, individuals with deficits in complement are at high risk for Neisserial infections [@Ladhani2019-nf; @Rosain2017-ih], and a genome-wide association study in those with a Neisserial infection identified risk variants within complement proteins [@The_International_Meningococcal_Genetics_Consortium2010-if].\nWhile the potential of therapeutically targeting complement in RDs (including Neisserial infections) has been proposed previously [@Lung2019-il; @Reis2015-yz], performing this in a gene- and cell type-specific manner may help to improve efficacy and reduce toxicity (e.g. 
due to off-target effects).\nImportantly, there are over 56 known genes within the complement system [@Seal2023-pa], highlighting the need for a systematic, evidence-based approach to identify effective gene targets.\n\nAlso of note, despite the fact that our datasets contain both hepatoblasts and their mature counterpart, hepatocytes, only the hepatoblasts showed this association.\nThis suggests that the genetic factors that predispose individuals for risk of Neisserial infections are specifically affecting hepatoblasts before they become fully differentiated.\nIt is also notable that these phenotypes were the only ones within the 'Recurrent bacterial infections' branch, or even the broader 'Recurrent infections' branch, perhaps indicating a unique role for hepatoblasts in recurrent infectious disease.\nThe only phenotypes within the even broader 'Abnormality of the immune system' HPO branch that significantly associated with mature hepatocytes were 'Pancreatitis' ($FDR_{p,c}= 2.08e-02,B= 5.25e-02$) and 'Susceptibility to chickenpox' ($FDR_{p,c}= 1.20e-02,B= 5.49e-02$) both of which are well-known to involve the liver [@Al-Hamoudi2009-le; @Brewer2018-dg; @Eshchar1973-tz].\n\n\n::: {#cell-fig-rni .cell}\n\n```{.r .cell-code .hidden}\ninfections_out$plot + ggplot2::guides(fill=ggplot2::guide_legend(ncol=2))\n```\n\n::: {.cell-output-display}\n![Hepatoblasts have a unique role in recurrent Neisserial infections. Significant phenotype-cell type tests for phenotypes within the branch 'Recurrent bacterial infections'. Amongst all different kinds of recurrent bacterial infections, hepatoblasts (highlighted by vertical dotted lines) are exclusively enriched in 'Recurrent gram−negative bacterial infections'. Note that terms from multiple levels of the same ontology branch are shown as separate facets (e.g. 
'Recurrent bacterial infections' and 'Recurrent gram−negative bacterial infections').](index_files/figure-pdf/fig-rni-1.pdf){#fig-rni fig-pos='H'}\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Annotate results with disease/symptom-level and gene-level information\n## filtering q-values at this step yields the same results as filtering at the next step, \n## albeit with much fast computation.\nresults_annot <- HPOExplorer::add_genes(results[q<0.05],\n allow.cartesian = TRUE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nresults_annot <- MSTExplorer::add_symptom_results(results = results_annot, \n ctd_list = ctd_list)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding symptom-level results.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSubsetting results by q_threshold and effect.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2,301,874 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\n## Plot multi-scale mechanisms as an 
interactive network\nphenotype <- \"Recurrent Neisserial infections\"\nvn_rni <- MSTExplorer::prioritise_targets_network(\n top_targets = results_annot[hpo_name==phenotype], \n main = NULL, \n height = \"400px\",\n width = \"500px\",\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: visNetwork\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. 
Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//RtmpRH55Yj/file11bed3d97c051_prioritise_targets_network.html\n```\n\n\n:::\n:::\n\n\nNext, we sought to link multi-scale mechanisms at the levels of disease, phenotype, cell type, and gene and visualise these as a network ([Fig. @fig-network-rni]).\nThis revealed that genetic deficiencies in different complement system genes (*C5*, *C8*, and *C7*) are primarily mediated by different cell types (hepatoblasts, stratified epithelial cells, and stromal cells, respectively).\nWhile genes of the complement system are expressed throughout many different tissues and cell types, these results indicate that different subsets of these genes may mediate their effects through different cell types.\nThis finding suggests that investigating (during diagnosis) and targeting (during treatment) different cell types may be critical for the diagnosis and treatment of these closely related, yet mechanistically distinct, diseases.\n\n\n::: {#cell-fig-network-rni .cell}\n\n```{.r .cell-code .hidden}\nvn_rni$plot\n```\n\n![Multi-scale mechanisms of Recurrent Neisserial infections. Starting from the bottom of the plot, one can trace how causal genes (yellow boxes) mediate their effects through cell types (orange circles), phenotypes (purple cylinders) and ultimately diseases (blue cylinders). 
Cell types are connected to phenotypes via association testing ($FDR_{p,c}<0.05$), and to diseases when the symptom gene set overlap is >25%. Nodes were spatially arranged using the Sugiyama algorithm [@Sugiyama1981-ev].](index_files/figure-pdf/fig-network-rni-1.pdf){#fig-network-rni fig-pos='H'}\n:::\n\n\n### Monarch Knowledge Graph recall\n\nNext, we used the Monarch Knowledge Graph (MKG) as a proxy for the field's current state of knowledge of phenotype-cell type associations.\nWe evaluated the proportion of MKG associations that were recapitulated by our results.\nIn total, our results contained at least one significant cell type association for \\>90% of the phenotypes described in the MKG.\nOf these phenotypes, we captured \\>45% of the MKG phenotype-cell associations when only considering exact overlap of CL-aligned cell type annotations.\nThis proportion increased with greater flexibility in the matching of cell type annotations, reaching a maximum of `**!!RECOMPUTE!!**`% at an ontology graph distance of `**!!RECOMPUTE!!**` when considering the overlap of cell type annotations at the level of cell type ontology terms.\nThis suggests that our results are in line with the current state of knowledge, and that our approach can be used to identify novel phenotype-cell type associations.\n\n### Annotation of phenotypes using generative large language models\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ngpt_check <- HPOExplorer::gpt_annot_check()\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n151 phenotypes do not have matching HPO IDs.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading in GPT 
annotations for 16,982 phenotypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nNumber of phenotype hits per query group:\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n - intellectual_disability: 6\n - impaired_mobility: 292\n - physical_malformations: 78\n - blindness: 1\n - sensory_impairments: 252\n - immunodeficiency: 5\n - cancer: 695\n - reduced_fertility: 5\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: The `facets` argument of `facet_grid()` is deprecated as of ggplot2 2.2.0.\ni Please use the `rows` argument instead.\ni The deprecated feature was likely used in the HPOExplorer package.\n Please report the issue at\n .\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ngpt_annot <- HPOExplorer::gpt_annot_codify()\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n151 phenotypes do not have matching HPO IDs.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading in GPT annotations for 16,982 phenotypes.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ngpt_annot$annot_weighted[,hpo_name:=gsub(\"^obsolete \",\"\",hpo_name)]\nleast_severe_phenotype <- gpt_annot$annot_weighted[hpo_name==\"Thin toenail\" & severity_score_gpt==0,]\n```\n:::\n\n\nSeverity annotations were gathered from GPT-4 for 16982/18082 (93.9166%) HPO phenotypes.\nIn our companion study, we performed benchmarking tests of these results using ground-truth HPO branch annotations.\nFor example, phenotypes within the 'Blindness' HPO branch (*HP:0000618*) were correctly annotated as causing blindness by GPT-4.\nAcross all annotations, the recall rate of GPT-4 annotations was 91.26% 
(min=70.1%, max=100%, SD=11.84) with a mean consistency score of 91.21% (min=80.96%, max=97.48%, SD=5.739) for phenotypes whose annotations were collected more than once.\nThis clearly demonstrates the ability of GPT-4 to accurately annotate phenotypes.\nThis allowed us to begin using these annotations to compute systematically collected severity scores for all phenotypes in the HPO.\n\nFrom these annotations we computed a weighted severity score metric for each phenotype ranging from 0-100 (100 being the theoretical maximum severity of a phenotype that always causes every annotation).\nWithin our annotations, the most severe phenotype was 'Anencephaly' (*HP:0002323*) with a severity score of 58, followed by 'Atrophy/Degeneration affecting the central nervous system' (*HP:0007367*) with a severity score of 58.\nThere were 677 phenotypes with a severity score of 0 (e.g. 'Thin toenail').\nThe mean severity score across all phenotypes was 14.89 (median=14, standard deviation=8.517).\n\n### Enrichment of foetal cell types in congenital phenotypes\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nplot_congenital_annotations_out <- MSTExplorer::plot_congenital_annotations(\n results = results)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n151 phenotypes do not have matching HPO IDs.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading in GPT annotations for 16,982 phenotypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nplot_congenital_annotations_out_data <- \n data.table::data.table(plot_congenital_annotations_out$data, key=\"congenital_onset\")\nplot_congenital_annotations_out_data[,summary:=paste0(\n shQuote(congenital_onset),\"=\",.label,\n \" (n=\",counts,\" associations)\"\n)]\nplot_congenital_annotations_out_stats <- data.table::data.table(\n plot_congenital_annotations_out$data_stats$summary_data\n )[,summary:=paste0(\n \"$\",\n \"p=\",format(p.value,digits=2),\",\",\n \"\\\\chi^2_{Pearson}=\",format(statistic,digits=2),\",\",\n \"\\\\hat{V}_{Cramer}=\",format(estimate,digits=2),\n \"$\"\n)]\n```\n:::\n\n\nThe frequency of congenital onset with each phenotype (as determined by GPT-4 annotations) was strongly predictive of the proportion of significantly associated foetal cell types in our results ($p=2e-203,\\chi^2_{Pearson}=940,\\hat{V}_{Cramer}=0.14$).\nFurthermore, increasing congenital frequency annotation (on an ordinal scale) corresponded to an increase in the proportion of foetal cell types: 'always'=24% (n=1636 associations), 'often'=20% (n=2979 associations), 'rarely'=12% (n=1956 associations), 'never'=10% (n=811 associations).\nThis is consistent with the expected role of foetal cell types in development and the aetiology of congenital disorders.\n\n\n::: {#cell-fig-congenital .cell}\n\n```{.r .cell-code .hidden}\nplot_congenital_annotations_out$plot\n```\n\n::: {.cell-output-display}\n![Congenital phenotypes are more often associated with foetal cell types. 
As a phenotype is more often congenital in nature, the greater proportion of foetal cell types are significantly associated with it.](index_files/figure-pdf/fig-congenital-1.pdf){#fig-congenital fig-pos='H'}\n:::\n:::\n\n\n### Diagnosis via cell type-specific disease prediction\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Define input phenotypes/genes\nphenotypes_diagnose <- c(\"Generalized neonatal hypotonia\",\n \"Scrotal hypospadias\",\n \"Increased circulating progesterone\")\nphenotypes_diagnose <- HPOExplorer::map_phenotypes(phenotypes_diagnose,\n to=\"id\",\n hpo=hpo)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ngenes_include <- c(\"HSD3B2\",\"HERC2\")\ngenes_exclude <- c(\"SNORD115-1\")\n## Predict cell types\npredict_celltypes_out <- MSTExplorer::predict_celltypes(\n phenotypes = names(phenotypes_diagnose),\n genes_include = genes_include, \n genes_exclude = genes_exclude,\n phenotype_to_genes = p2g,\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding logFC column.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMapping cell types to cell ontology terms.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding stage information.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading ctd_DescartesHuman.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading ctd_HumanCellLandscape.rds\n```\n\n\n:::\n\n::: 
{.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating gene-disease associations with Evidence Score\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering data from GenCC.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning: Invalid .internal.selfref detected and fixed by taking a (shallow)\ncopy of the data.table so that := can add this new column by reference. At an\nearlier point, this data.table has been copied by R (or was created manually\nusing structure() or similar). Avoid names<- and attr<- which in R currently\n(and oddly) may copy the whole data.table. Use set* syntax instead to avoid\ncopying: ?set, ?setnames and ?setattr. If this message doesn't help, please\nreport your use case to the data.table issue tracker so the root cause can be\nfixed or this message improved.\n```\n\n\n:::\n:::\n\n\nUsing the function `MSTExplorer::predict_celltypes` we input 3 inclusion phenotypes ('Generalized neonatal hypotonia' (*HP:0008935*), 'Scrotal hypospadias' (*HP:0012853*), 'Increased circulating progesterone' (*HP:0031216*)), 2 genes in which the patient is known to have deleterious mutations (*HSD3B2*, *HERC2*) and 1 gene in which the patient is known not to have any deleterious mutations (*SNORD115-1*). This predicted that cortical cell of adrenal gland (score sum=1.38, score mean=0.0256, score standard deviation=0.137) were the most probable cell types underlying this combination of phenotypes and genotypes ([Fig. 
@fig-diagnosis]), which is highly consistent with existing evidence that adrenal insufficiency can cause both phenotypes via mutations in these genes [@Srivastava2023-ge; @Utsch2004-re]. This was the only cell type to receive a score two standard deviations from the mean score of all cell types (mean score: 0.000668).\n\n\n::: {#cell-fig-diagnosis .cell}\n\n```{.r .cell-code .hidden}\npredict_celltypes_out$plot\n```\n\n::: {.cell-output-display}\n![Diagnosis - Observed phenotypes/genotypes can be used to identify causal cell types in individuals. Our phenotype-cell type association results can be used to make predictions about which cell types are underlying a set of phenotypes observed in a given patient. Here we input three inclusion phenotypes, two inclusion genes, and one exclusion gene into the function `MSTExplorer::predict_celltypes`. The output is a ranked list of the top 10 most probable cell types (*x-axis*) underlying this combination of phenotypes/genotypes (highest to lowest rank from left to right). The score on the *y-axis* is computed by aggregating phenotype-celltype association summary statistics and evidence-weighted phenotype-gene associations. In this simple example, cortical cells of the adrenal gland were predicted as the most probable cell type. The mean of the score sum is shown as a dashed line, while one standard deviation (SD) above this is shown as a dotted line. 
Each bar is coloured by its mean.](index_files/figure-pdf/fig-diagnosis-1.pdf){#fig-diagnosis fig-pos='H'}\n:::\n:::\n\n\n### Prognosis via cell type-mediated differential outcomes\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Count number of diseases associated with these phenotypes\nkeep_descendants <- \"Hypotonia\" ## HP:0001252\nhypotonia_results <- HPOExplorer::filter_descendants(results, \n keep_descendants = keep_descendants) \n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nKeeping descendants of 1 term(s).\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n14 terms remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n2,814 associations remain after filtering.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nhypotonia_results <- HPOExplorer::add_death(hypotonia_results,\n allow.cartesian = TRUE,\n agg_by = c(\"disease_id\",\"hpo_id\"))\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with AgeOfDeath.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nhypotonia_results <- MSTExplorer::map_celltype(hypotonia_results) \n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\n## Generate plot\nplot_differential_outcomes_heatmap_out <- MSTExplorer::plot_differential_outcomes_heatmap( \n results = hypotonia_results, \n print_phenotypes = TRUE,\n fill_limits = c(1,8),\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding symptom-level results.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSubsetting results by q_threshold and effect.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n52,234 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading ctd_DescartesHuman.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading ctd_HumanCellLandscape.rds\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nmin_ageofdeath_score_celltypes <- unique(plot_differential_outcomes_heatmap_out$data_agg[which(AgeOfDeath_score_min==min(AgeOfDeath_score_min))]$celltype_symptom)\n```\n:::\n\n\nHypotonia (*HP:0001252*) is a very broad phenotype containing 13 subterms (e.g. 
\"Generalised neonatal hypotonia\") and is associated with 2569 unique diseases in the HPO gene annotations.\nTogether, these hypotonia phenotypes were significantly associated with 29/99 (29.29%) unique CL-aligned cell types.\nThis reflects the highly variable set of disease etiologies that can cause this broad-level phenotype.\nAcross all diseases, hypotonia phenotypes tended to be most consistently severe (lower mean age of death score) when associated with the cell type inhibitory interneuron.\nWhile other cell types were associated with lower mean age of death scores (e.g. stromal cell, astrocyte), the severity of the outcomes were more variable.\n\n\n::: {#cell-fig-prognosis .cell}\n\n```{.r .cell-code .hidden}\nplot_differential_outcomes_heatmap_out$plot\n```\n\n::: {.cell-output-display}\n![Prognosis - Cell types predict the probability of deadly diseases. The broad phenotype 'Hypotonia' and its descendants occur in many different diseases (1,832 diseases in the HPO annotations).Therefore, it can be difficult to prognose clinical outcomes of a newborn individual with hypotonia. With additional knowledge of the particular cell types underlying a patient's hypotonia phenotype, one can greatly narrow down the range of potential outcomes (e.g. age of death). **a**, Here, we show the various cell types by which hypotonia phenotypes confer disease risk. **b**, We also computed the mean age of death score for each cell type across hypotonia-associated diseases, revealing that disrupted inhibitory neurons confer the greatest risk of early death. Ordinal age of death categories from the HPO disease annotations were encoded numerically and averaged ([Table @tbl-death]) to produce mean Age of Death scores for each disease (on a scale from 1-8). 
For example, a score of 1 corresponds to prenatal death, while a score of 8 corresponds to death in late adulthood.](index_files/figure-pdf/fig-prognosis-1.pdf){#fig-prognosis fig-pos='H'}\n:::\n:::\n\n\n### Therapeutic target identification\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nprioritise_targets_out <- MSTExplorer::prioritise_targets(\n results = results, \n ctd_list = ctd_list,\n phenotype_to_genes = p2g,\n hpo = hpo,\n \n keep_deaths=NULL,\n keep_onsets=NULL,\n keep_specificity_quantiles = seq(30,40), ## NULL:70, 30-40:64 \n keep_mean_exp_quantiles = seq(30,40), ## NULL:65, 10:55\n info_content_threshold=8, ## 8:55, 5:64 \n effect_threshold=NULL, ## 1:39\n severity_score_gpt_threshold=NULL, ## 10:78, NULL:82\n symptom_intersection_threshold=.25, ## .25:57\n evidence_score_threshold=3, ## 5:47, 4:47, 3:64\n top_n = 10, ## 5:38, 20:42, 30:45, 40:52, 50:55\n group_vars = \"hpo_id\")\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritising gene targets.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. 
Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding term definitions.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding information_content scores.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='start' \n - Rows: 51,874,984 \n - Phenotypes: 11,028 \n - Diseases: 12,467 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltering @ q-value <= 0.05\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='q_threshold' \n - Rows: 2,115,670 \n - Phenotypes: 9,575 \n - Diseases: 12,467 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='effect_threshold' \n - Rows: 2,115,670 \n - Phenotypes: 9,575 \n - Diseases: 12,467 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with AgeOfDeath.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_deaths' \n - Rows: 2,115,670 \n - Phenotypes: 9,575 \n - Diseases: 12,467 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAncestor columns already present. 
Skipping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nKeeping descendants of 1 term(s).\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n17,548 terms remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n1,889,042 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_descendants' \n - Rows: 1,889,042 \n - Phenotypes: 9,499 \n - Diseases: 12,364 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_ont_levels' \n - Rows: 1,889,042 \n - Phenotypes: 9,499 \n - Diseases: 12,364 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nTranslating ontology terms to ids.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n151 phenotypes do not have matching HPO IDs.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading in GPT annotations for 16,982 phenotypes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='gpt_filters' \n - Rows: 1,889,042 \n - Phenotypes: 9,499 \n - Diseases: 12,364 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='severity_score_gpt_threshold' \n - Rows: 1,889,042 \n - Phenotypes: 9,499 \n - Diseases: 12,364 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='info_content_threshold' \n - Rows: 979,360 \n - Phenotypes: 7,691 
\n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with onset.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_onsets' \n - Rows: 979,360 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Tiers.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_tiers' \n - Rows: 979,360 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with modifiers\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='severity_threshold' \n - Rows: 979,435 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='severity_threshold_max' \n - Rows: 979,435 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with n_diseases\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: genes_to_phenotype.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: 
step='pheno_ndiseases_threshold' \n - Rows: 979,435 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenotype frequencies.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='pheno_frequency_threshold' \n - Rows: 981,649 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_celltypes' \n - Rows: 981,649 \n - Phenotypes: 7,691 \n - Diseases: 11,916 \n - Cell types: 201\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nConverting phenos to GRanges.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: ensembldb\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering metadata for 4926 unique genes.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: EnsDb.Hsapiens.v75\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='symptom_gene_overlap' \n - Rows: 1,092,098 \n - Phenotypes: 7,015 \n - Diseases: 8,102 \n - Cell types: 201 \n - Genes: 4,650\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltering by keep_chr.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_chr' \n - Rows: 1,092,098 \n - Phenotypes: 7,015 \n - Diseases: 8,102 \n - Cell types: 201 \n - Genes: 4,650\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltering by gene-disease association evidence.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating gene-disease associations with Evidence Score\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering data from GenCC.\n```\n\n\n:::\n\n::: {.cell-output 
.cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='evidence_score_threshold' \n - Rows: 825,255 \n - Phenotypes: 6,605 \n - Diseases: 6,622 \n - Cell types: 201 \n - Genes: 3,938\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltering by gene size.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n3,938 / 3,938 genes kept.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='gene_size' \n - Rows: 825,255 \n - Phenotypes: 6,605 \n - Diseases: 6,622 \n - Cell types: 201 \n - Genes: 3,938\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='keep_biotypes' \n - Rows: 825,255 \n - Phenotypes: 6,605 \n - Diseases: 6,622 \n - Cell types: 201 \n - Genes: 3,938\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='add_driver_genes' \n - Rows: 255,497 \n - Phenotypes: 6,447 \n - Diseases: 6,442 \n - Cell types: 201 \n - Genes: 3,873\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding symptom-level results.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSubsetting results by q_threshold and effect.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n255,497 associations remain after filtering.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: 
phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='symptom_intersection_threshold' \n - Rows: 255,497 \n - Phenotypes: 6,447 \n - Diseases: 6,442 \n - Cell types: 201 \n - Genes: 3,873\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating gene frequencies.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: genes_to_phenotype.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='gene_frequency_threshold' \n - Rows: 350,367 \n - Phenotypes: 6,447 \n - Diseases: 6,442 \n - Cell types: 201 \n - Genes: 3,873\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPruning ancestors.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n5287 / 6447 terms were kept after pruning.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='prune_ancestors' \n - Rows: 196,856 \n - Phenotypes: 5,287 \n - Diseases: 6,147 \n - Cell types: 201 \n - Genes: 3,774\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSorting rows.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFinding top 10 gene targets per: hpo_id\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='top_n' \n - Rows: 32,735 \n - Phenotypes: 5,287 \n - Diseases: 4,850 \n - Cell types: 201 \n - Genes: 3,180\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nPrioritised targets: step='end' \n - Rows: 32,735 \n - Phenotypes: 5,287 \n - Diseases: 4,850 \n - Cell types: 201 \n - Genes: 3,180\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ngenes_per_pheno <- 
prioritise_targets_out$top_targets[,list(n=data.table::uniqueN(gene_symbol)),by=\"hpo_id\"]\np2g <- HPOExplorer::add_ont_lvl(p2g)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGetting absolute ontology level for 18,082 IDs.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nmin_ont_lvl <- 3\ngenes_per_pheno_all <- p2g[ontLvl>min_ont_lvl,\n list(n=data.table::uniqueN(gene_symbol)),by=\"hpo_id\"]\n\n\ntop_celltypes <- prioritise_targets_out$top_targets[,list(\n np=data.table::uniqueN(hpo_id)),\n by=\"cl_name\"]|>data.table::setorderv(\"np\",-1)\ntop_ancestors <- prioritise_targets_out$top_targets[,list(\n np=data.table::uniqueN(hpo_id),\n nc=data.table::uniqueN(CellType),\n ng=data.table::uniqueN(gene_symbol)\n ),\n by=\"ancestor_name\"]|>\n data.table::setorderv(\"np\",-1)\n```\n:::\n\n\nNext, we identified putative cell type-specific gene targets for several severe disease phenotypes.\nThis yielded putative therapeutic targets for 5287 phenotypes across 4850 diseases in 201 cell types and 3180 genes ([Fig. @fig-therapy-filter]).\nWhile this constitutes a large number of genes in total, each phenotype was assigned a median of 2 gene targets (mean=3.29, min=1, max=10).\nRelative to the number of genes annotations per phenotype in the HPO overall (median=7, mean=61.95, min=1, max=5003) this represents a substantial decrease in the number of candidate target genes, even when excluding high-level phenotypes (HPO level\\>3).\nIt is also important to note that the phenotypes in the prioritised targets list are ranked by their severity, allowing us to distinguish between phenotypes with a high medical urgency (e.g. 'Hydranencephaly') from those with lower medical urgency (e.g. 
'Hyperplastic labia majora').\nThis can be useful for both clinicians, biomedical scientists, and pharmaceutical manufacturers who wish to focus their research efforts on phenotypes with the greatest need for intervention.\n\nAcross all phenotypes, epithelial cell were most commonly implicated (834 phenotypes), followed by stromal cell (627 phenotypes), stromal cell (627 phenotypes), neuron (478 phenotypes), chondrocyte (385 phenotypes), and endothelial cell (363 phenotypes).\nGrouped by higher-order ontology category, 'Abnormality of the musculoskeletal system' had the greatest number of enriched phenotypes (961 phenotypes, 863 genes), followed by 'Abnormality of the nervous system' (745 phenotypes, 1163 genes), 'Abnormality of head or neck' (545 phenotypes, 997 genes), 'Abnormality of the genitourinary system' (446 phenotypes, 710 genes), and 'Abnormality of the eye' (379 phenotypes, 572 genes).\n\n### Therapeutic target validation\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n## Gene therapy only\nttd_check_out <- MSTExplorer::ttd_check(\n top_targets=prioritise_targets_out$top_targets, \n drug_types = \"Gene therapy\",\n allow.cartesian = TRUE,\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: readxl\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRetrieving all organisms available in gprofiler.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing stored `gprofiler_orgs`.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMapping species name: hsapiens\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n1 organism identified from search: hsapiens\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n35,812 / 51,339 (69.76%) genes mapped.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output 
.cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nNon-failed gene targets enrichment p-value: 0.0104281415165849\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFailed gene targets depletion p-value: 0.364508393285371\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in ggplot2::geom_text(data = unique(dat_sub[, c(\"HIGHEST_STATUS\", : Ignoring unknown aesthetics: fill\nIgnoring unknown aesthetics: fill\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\n## All therapy types\nttd_check_all_out <- MSTExplorer::ttd_check(\n top_targets=prioritise_targets_out$top_targets, \n allow.cartesian = TRUE,\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRetrieving all organisms available in gprofiler.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing stored `gprofiler_orgs`.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMapping species name: hsapiens\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n1 organism identified from search: hsapiens\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n35,812 / 51,339 (69.76%) genes mapped.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nNon-failed gene targets enrichment p-value: 3.05778572878398e-19\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFailed gene targets depletion p-value: 2.51631585215379e-199\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in ggplot2::geom_text(data = unique(dat_sub[, c(\"HIGHEST_STATUS\", : Ignoring unknown aesthetics: fill\nIgnoring unknown aesthetics: 
fill\n```\n\n\n:::\n:::\n\n\nTo determine whether the genes prioritised by our therapeutic targets pipeline were plausible, we checked what percentage of gene therapy targets we recapitulated.\nData on therapeutic approval status was gathered from the Therapeutic Target Database (TTD; release 2024-03-22) [@Liu2011-qd].\nOverall, we prioritised 79% of all non-failed existing gene therapy targets.\nA hypergeometric test confirmed that our prioritised targets were significantly enriched for non-failed gene therapy targets ($p=0.0104$).\nImportantly, we did not prioritise any of the failed therapeutics (0%), defined as having been terminated or withdrawn from the market.\nThe hypergeometric test for depletion of failed targets did not reach significance ($p=0.365$), but this is to be expected as there was only one failed gene therapy target in the TTD database.\n\nEven when considering therapeutics of any kind ([Fig. @fig-therapy-validate-all]), not just gene therapies, we recapitulated 44% of the non-failed therapeutic targets and 0% of the terminated/withdrawn therapeutic targets (n=1255).\nHere we found that our prioritised targets were significantly enriched for non-failed therapeutics ($p=3e-19$), and highly significantly depleted for failed therapeutics ($p=3e-199$).\nThis suggests that our multi-scale evidence-based prioritisation pipeline is capable of selectively identifying genes that are likely to be effective therapeutic targets.\n\n\n::: {#cell-fig-therapy-validate .cell}\n\n```{.r .cell-code .hidden}\nttd_check_out$plot\n```\n\n::: {.cell-output-display}\n![Therapeutics - Validation of prioritised therapeutic targets. The proportion of existing gene therapy targets (documented in the Therapeutic Target Database) recapitulated by our prioritisation pipeline. 
Therapeutics are stratified by the stage of clinical development they were at during the time of writing.](index_files/figure-pdf/fig-therapy-validate-1.pdf){#fig-therapy-validate fig-pos='H'}\n:::\n:::\n\n\n### Selected example targets\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ntop_targets <- prioritise_targets_out$top_targets[,n_genes:=data.table::uniqueN(gene_symbol),\n by=\"hpo_id\"][n_genes<5 & proportion_driver_genes_symptom>.25]\n\ntop_phenotypes <- unique(top_targets$hpo_name)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nheight <- \"80vh\"\nwidth <- \"100vw\"\nphenotypes_network <- c(\"Lethal skeletal dysplasia\",\n \"GM2-ganglioside accumulation\",\n \"Alzheimer disease\",\n \"Parkinson disease\")\n\nphenotype <- phenotypes_network[1]\nvn_therapy_eg1 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[hpo_name==phenotype], \n main = NULL,\n height = height,\n width = width,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: visNetwork\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. 
Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//RtmpXiPAxF/file12af371adf511_prioritise_targets_network.html\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nphenotype <- phenotypes_network[2]\nvn_therapy_eg2 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[hpo_name==phenotype], \n main = NULL,\n height = height,\n width = width,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. 
Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//RtmpXiPAxF/file12af3697390d1_prioritise_targets_network.html\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nphenotype <- phenotypes_network[3]\nvn_therapy_eg3 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[grepl(paste(phenotype,collapse = \"|\"), disease_name,ignore.case = TRUE)], \n main = NULL, \n height = height,\n width = width,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. 
Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//RtmpXiPAxF/file12af3212f9ea0_prioritise_targets_network.html\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nphenotype <- phenotypes_network[4]\nvn_therapy_eg4 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[grepl(paste(phenotype,collapse = \"|\"), disease_name,ignore.case = TRUE)], \n main = NULL, \n height = height,\n width = width,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. 
Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//RtmpXiPAxF/file12af325e21359_prioritise_targets_network.html\n```\n\n\n:::\n:::\n\n::: {#fig-therapy-examples .cell layout=\"[[1,1], [1], [1]]\"}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg1$plot\n```\n\n![Lethal skeletal dysplasia](index_files/figure-pdf/fig-therapy-examples-1.pdf){#fig-therapy-examples-1}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg2$plot\n```\n\n![GM2-ganglioside accumulation](index_files/figure-pdf/fig-therapy-examples-2.pdf){#fig-therapy-examples-2}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg3$plot\n```\n\n![Alzheimer disease](index_files/figure-pdf/fig-therapy-examples-3.pdf){#fig-therapy-examples-3}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg4$plot\n```\n\n![Parkinson disease](index_files/figure-pdf/fig-therapy-examples-4.pdf){#fig-therapy-examples-4}\n\nExample cell type-specific gene therapy targets for several severe phenotypes and their associated diseases. Each disease (blue cylinders) is connected to its phenotype (purple cylinders) based on well-established clinical observations recorded within the HPO [@Gargano2024-fc]. Phenotypes are connected to cell types (red circles) via association testing between weighted gene sets ($FDR_{p,c}<0.05$). 
Each cell type is connected to the prioritised gene targets (yellow boxes) based on the driver gene analysis. The thickness of the edges connecting the nodes represents the (mean) fold-change from the bootstrapped enrichment tests. Nodes were spatially arranged using the Sugiyama algorithm [@Sugiyama1981-ev].\n:::\n\n\nFrom our prioritised targets, we selected the following four sets of phenotypes or diseases as examples: 'Lethal skeletal dysplasia', 'GM2-ganglioside accumulation', 'Alzheimer disease', 'Parkinson disease'.\n\nSkeletal dysplasia is a heterogeneous group of over 450 disorders that affect the growth and development of bone and cartilage.\nThis phenotype can be lethal when deficient bone growth leads to the constriction of vital organs such as the lungs.\nEven after surgical interventions, these complications continue to arise as the child develops.\nPharmacological interventions to treat this condition have largely been ineffective.\nWhile there are various cell types involved in skeletal system development, our pipeline nominated chondrocytes as the causal cell type underlying the lethal form of this condition.\nReassuringly, we found that the disease 'Achondrogenesis Type 1B' is caused by the genes *SLC26A2* and *COL2A1* via chondrocytes.\nWe also found that 'Platyspondylic lethal skeletal dysplasia, Torrance type'.\nThus, in cases where surgical intervention is insufficient, targeting these genes within chondrocytes may prove a viable long-term solution for children suffering from lethal skeletal dysplasia.\n\nTay-Sachs disease is a devastating disease in which children are born appearing healthy, but their condition gradually deteriorates, leading to death after 3-5 years.\nThe underlying cause is the toxic accumulation of gangliosides in the nervous system due to a loss of the enzyme produced by *HEXA*.\nWhile this could in theory be corrected with gene editing technologies, there remain some outstanding challenges.\nOne of these is early detection and diagnosis, 
before irreversible damage has occurred.\nOur pipeline implicated extravillous trophoblasts of the placenta in 'GM2-ganglioside accumulation'.\nWhile not necessarily a target for gene therapy, checking these cells *in utero* for an absence of *HEXA* may serve as a viable biomarker as these cells normally express the gene at high levels.\nEarly detection of Tay-Sachs disease may lengthen the window of opportunity for therapeutic intervention, especially when genetic sequencing is not available or variants of unknown significance are found within *HEXA*.\n\nAlzheimer disease (AD) is the most common neurodegenerative condition. It is characterised by a set of variably penetrant phenotypes including memory loss, cognitive decline, cerebral proteinopathy. Interestingly, we found that different forms of early onset AD (which are defined by the presence of a specific disease gene) are each associated with different cell types via different phenotypes. For example, AD 3 and AD 4 are primarily associated with cells of the digestive system ('enterocyte', 'gastric goblet cell') and are implied to be responsible for the phenotypes 'Senile plaques', 'Alzheimer disease', 'Parietal hypometabolism in FDG PET', 'Cerebral amyloid angiopathy'. Meanwhile, early-onset autosomal dominant\nAD and AD 2 are primarily associated with immune cells ('alternatively activated macrophage', 'microglial cell') and are implied to be responsible for the phenotypes 'Neurofibrillary tangles', 'Long-tract signs', 'Finger agnosia', 'Semantic dementia'. This suggests that different forms of AD may be driven by different cell types and phenotypes, which may help explain its variability in onset and clinical presentation. \n\nFinally, Parkinson disease (PD) is characterised by motor symptoms such as tremor, rigidity, and bradykinesia. However there are a number of additional phenotypes associated with the disease that span multiple physiological systems. 
PD 19a and PD 8 seemed to align most closely with the canonical understanding of PD as a disease of the central nervous system in that they implicated oligodendrocytes and neurons. Though the reference datasets being used in this study were not annotated at sufficient resolution to distinguish between different subtypes of neurons, in particular dopaminergic neurons. PD 19a/8 also suggested that risk variants in *LRRK2* mediate their effects on PD through both myeloid cells and oligodendrocytes by causing gliosis of the substantia nigra. The remaining clusters of PD mechanisms revolved around chondrocytes (PD 20), amacrine cells of the eye (hereditary late-onset PD), and the respiratory/immune system (PD 14). While the diversity in cell type-specific mechanisms is somewhat surprising, it may help to explain the wide variety of cross-system phenotypes frequently observed in PD.\n\nIt should be noted that the HPO only includes gene annotations for the monogenic forms of AD and PD. However it has previously been shown that there is at least partial overlap in their phenotypic and genetic aetiology with respect to their common forms. 
Thus, understanding the monogenic forms of these diseases may shed light onto their more common counterparts.\n\n### Experimental model translatability\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\npheno_map_genes_match <- KGExplorer::map_upheno_data()\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached data: /Users/bms20/Library/Caches/org.R-project.R/R/KGExplorer/pheno_map_genes_match.rds\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\npheno_map_targets <- pheno_map_genes_match[\n id1 %in% unique(prioritise_targets_out$top_targets$hpo_id)\n ]|>\n data.table::setorderv(\"phenotype_genotype_score\",-1)\ntaxa_count <- sort(table(pheno_map_targets$gene_taxon_label2), decreasing = TRUE)\n\npheno_map_targets_severe <- pheno_map_targets[\n id1 %in% unique(prioritise_targets_out$top_targets[severity_score_gpt>10,]$hpo_id)\n ]\n\npheno_map_targets_severe[,summary:=paste0(\n shQuote(object_label1), \n \" ($SIM_{o,g}=\",round(phenotype_genotype_score,3),\"$)\"\n )]\n```\n:::\n\n\nWe computed interspecies translatability scores using a combination of both ontological ($SIM_{o}$) and genotypic ($SIM_{g}$) similarity relative to each homologous human phenotype and its associated genes ([Fig. @fig-animal-models]). In total, we mapped 278 non-human phenotypes (in *Caenorhabditis elegans*, *Danio rerio*, *Mus musculus*, *Rattus norvegicus*) to 849 homologous human phenotypes. Amongst the 5287 phenotypes within our prioritised therapy targets, 356 had viable animal models in at least one non-human species. Per species, the number of homologous phenotypes was: *Danio rerio* (n=214), *Mus musculus* (n=152), *Caenorhabditis elegans* (n=35), *Rattus norvegicus* (n=3). 
Amongst our prioritised targets with a GPT-4 severity score of >10, the phenotypes with the greatest animal model similarity were 'Anterior vertebral fusion' ($SIM_{o,g}=0.967$), 'Disc-like vertebral bodies' ($SIM_{o,g}=0.964$), 'Metaphyseal enchondromatosis' ($SIM_{o,g}=0.946$), 'Peripheral retinal avascularization' ($SIM_{o,g}=0.943$), 'Retinal vascular malformation' ($SIM_{o,g}=0.943$).\n\n## Discussion {#sec-discussion}\n\nAcross the 201 cell types and 11,047 RD-associated phenotypes investigated, more than 46,514 significant phenotype-cell type relationships were discovered.\nThe examples we have highlighted above recapitulate well-known relationships, provide additional cellular context to many of these known relationships, and discover novel relationships at multiple biological scales.\n\nInvestigating RDs at the level of phenotypes offers several key advantages.\nFirst, the vast majority of RDs only have one associated gene (7671/8631 diseases = 89%).\nAggregating gene sets across diseases into phenotype-centric \"buckets\" permits sufficiently well-powered analyses, with an average of \\~76 genes per phenotype (median=7) [see Fig. 
@fig-diagram].\nSecond, we hypothesise that these phenotype-level gene sets converge on a limited number of molecular and cellular pathways.\nPerturbations to these pathways manifest as one or more phenotypes which, when considered together, tend to be clinically diagnosed as a certain disease.\nThird, RDs are often highly heterogeneous in their clinical presentation across individuals, leading to the creation of an ever increasing number of disease subtypes (some of which only have a single documented case).\nIn contrast, a phenotype-centric approach enables us to more accurately describe a particular individual’s version of a disease without relying on the generation of additional disease subcategories.\nBy characterising an individual’s precise phenotypes over time, we may better understand the underlying biological mechanisms that have caused their condition.\nHowever, in order to achieve a truly precision-based approach to clinical care, we must first characterise the molecular and cellular mechanisms that cause the emergence of each phenotype.\nHere, we provide a highly reproducible framework that enables this at the scale of the entire phenome.\nThis presents an opportunity to design basket trials of patients with different diseases but overlapping phenotypes and cellular mechanisms [@Zanello2023-zd].\nIt may be especially helpful for complex patients with diagnostically ambiguous sets of phenotypes who would otherwise be excluded from traditional clinical trials [@Diaz-Santiago2020-ep].\n\nIt was paramount to the success of this study to ensure our results were anchored in ground-truth benchmarks, generated falsifiable hypotheses, and rigorously guarded against false-positive associations.\nExtensive validation using multiple approaches demonstrated that our methodology consistently recapitulates expected phenotype-cell type associations ([Fig. @fig-summary]-[Fig. 
@fig-congenital]).\nThis was made possible by the existence of comprehensive, structured ontologies for all phenotypes (HPO) and cell types (CL), which provide an abundance of clear and falsifiable hypotheses against which to test our predictions.\nSeveral key examples include 1) strong enrichment of associations between cell types and phenotypes within the same anatomical systems ([Fig. @fig-summary]b-d), 2) a strong relationship between phenotype-specificity and the strength and number of cell type associations ([Fig. @fig-ontology-lvl]), 3) identification of the precise cell subtypes involved in susceptibility to various subtypes of recurrent bacterial infections ([Fig. @fig-rni]), 4) a strong positive correlation between the frequency of congenital onset of a phenotype and the proportion of developmental cell types associated with it ([Fig. @fig-congenital]), and 5) consistent phenotype-cell type associations across multiple independent single-cell datasets ([Fig. @fig-ctd-correlation]).\nHaving validated our phenotype-cell type associations, we then went on to demonstrate how these results may be used in each stage of clinical care: diagnosis ([Fig. @fig-diagnosis]), prognosis ([Fig. @fig-prognosis]), treatment, and therapeutics development ([Fig. 
@fig-therapy-examples]).\n\nDiagnosis is an essential but challenging step in RD patient care.\nAdditional phenotypes that emerge over time may assist a clinician to reach a more confident disease diagnosis.\nHowever, many of these phenotypes can have a serious impact on patient quality of life or survival and avoiding them would be far better for patient outcomes.\nOftentimes, phenotypes alone cannot clearly pinpoint the disease and thus a diagnosis is never reached.\nHaving a more complete understanding of the mechanisms underlying observed phenotypes allows clinicians to far more effectively make predictions about what additional, less obvious phenotypes they should search for to confirm or reject their hypothesis of disease diagnosis (e.g. with imaging or biomarker tests).\n\nConsider the following hypothetical scenario.\nA clinician observes that a newborn patient has several phenotypes ('Generalized neonatal hypotonia', 'Scrotal hypospadias', 'Increased circulating progesterone'), none of which conclusively point to a single disease diagnosis.\nUnder the strong suspicion that the phenotypes are genetic in origin, the clinician orders whole-genome sequencing (WGS) on the patient as well as the patient’s family.\nThe clinician finds that the patient has a number of putative causal mutations, narrowing down the number of potential diseases from hundreds to just 10.\nFurther narrowing down the possibilities at this stage can be extremely challenging even for expert clinical geneticists.\nHowever, additional knowledge of which tissues and cell types are primarily affected allows the clinician to make a series of testable hypotheses that they may begin to investigate.\nFor example, two of the putative diseases are known to cause aberrant splicing events in a gene that is only expressed in adrenocortical cells ([Fig. 
@fig-diagnosis]), providing justification to order a needle biopsy of the adrenal gland.\nRNA sequencing is performed on the tissue biopsy and it is discovered that the patient does indeed have high expression of the dysfunctional transcript, confirming the disease diagnosis [@Lord2021-rf].\nThis opens new avenues for the patient to receive timely and effective treatments for their specific condition, which is important as their version of the disease tends to lead to death in early childhood if left untreated ([Fig. @fig-prognosis]).\nFortunately, their diagnosis now qualifies them to participate in a clinical trial of a novel gene therapy with promising preliminary results.\nFurthermore, it is predicted that this patient would respond especially well to this treatment given that the mechanism of action of the gene therapy primarily acts on adrenocortical cells ([Fig. @fig-therapy-examples]).\n\nUnfortunately, treatments are currently available for fewer than 5% of RDs [@Halley2022-pd].\nNovel technologies including CRISPR, prime editing, antisense oligonucleotides, viral vectors, and/or lipid nanoparticles, have undergone significant advances in the last several years [@Bueren2023-ma; @Bulaklak2020-ta; @Godbout2023-uo; @Kohn2023-vh; @Zhao2023-qy] and demonstrated remarkable clinical success in an increasing number of clinical applications [@Darrow2019-om; @Mendell2017-kg; @Mueller2017-fz; @Russell2017-dh].\nThe U.S. 
Food and Drug Administration (FDA) recently announced a landmark program aimed towards improving the international regulatory framework to take advantage of the evolving gene/cell therapy technologies [@Lu2024-kl] with the aim of bringing dozens more therapies to patients in a substantially shorter timeframe than traditional pharmaceutical product development (typically 5-20 years with a median of 8.3 years) [@Brown2022-ye].\nWhile these technologies have the potential to revolutionise RD medicine, their successful application is dependent on first understanding the mechanisms causing each disease.\n\nTo address this critical gap in knowledge, we used our results to create a reproducible and customisable pipeline to nominate cell type-resolved therapeutic targets ([Fig. @fig-therapy-filter]-[Fig. @fig-therapy-examples]).\nTargeting cell type-specific mechanisms underlying granular RD phenotypes can improve therapeutic effectiveness by treating the causal root of an individual's conditions [@Bulaklak2020-ta; @Moffat2017-al].\nA cell type-specific approach also helps to reduce the number of harmful side effects caused by unintentionally delivering the therapeutic to off-target tissues/cell types (which may induce aberrant gene activity), especially when combined with technologies that can target cell surface antigens (e.g. viral vectors) [@Zhou2013-wx].\nThis has the additional benefit of reducing the minimal effective dose of a therapeutic, which can be both immunogenic and extremely financially costly [@Bueren2023-ma; @Kohn2023-vh; @Nuijten2022-yc; @Thielen2022-ud].\nHere, we demonstrate the utility of a high-throughput evidence-based approach to RD therapeutics discovery by highlighting several of the most promising therapeutic candidates.\nOur pipeline takes into account a myriad of factors, including the strength of the phenotype-cell type associations, symptom-cell type associations, cell type-specificity of causal genes, the severity and frequency of the 
phenotypes, suitability for gene therapy delivery systems (e.g. recombinant adeno-associated viral vectors (rAAV)), as well as a quantitative analysis of phenotypic and genetic animal model translatability ([Fig. @fig-animal-models]).\nWe validated these candidates by comparing the proportional overlap with gene therapies that are presently in the market or undergoing clinical trials, in which we recovered 79% of all active gene therapies and 0% of failed gene therapies ([Fig. @fig-therapy-validate], [Fig. @fig-therapy-validate-all]). Despite nominating a large number of putative targets, hypergeometric tests confirmed that our targets were strongly enriched for targets of existing therapies that are either approved or currently undergoing clinical trials.\n\nIt should be noted that our study has several key limitations.\nFirst, while our cell type datasets are amongst the most comprehensive human scRNA-seq references currently available, they are nevertheless missing certain tissues, cell types (e.g. spermatocytes, oocytes), and life stages (post-natal childhood, senility).\nIt is also possible that we have not captured certain cell state signatures that only occur in disease (e.g. 
disease-associated microglia \\[**CITATION**\\]).\nThough we reasoned that using only control cell type signatures would mitigate bias towards any particular disease, and avoid degradation of gene signatures due to loss of function mutations.\nSecond, the collective knowledge of gene-phenotype and gene-disease associations is far from complete and we fully anticipate that these annotations will continue to expand and change well into the future.\nIt is for this reason we designed this study to be easily reproduced within a single containerised script so that we (or others) may rerun it with updated datasets at any point.\nFinally, causality is notoriously difficult to prove definitively from associative testing alone, and our study is not exempt from this rule.\nDespite this, there are several reasons to believe that our approach is able to better approximate causal relationships than traditional approaches.\nFirst, we did not intentionally preselect any subset of phenotypes or cell types to investigate here.\nAlong with a scaling prestep during linear modelling, this means that all the results are internally consistent and can be directly compared to one another (in stark contrast to literature meta-analyses).\nFurthermore, for the phenotype gene signatures we used expert-curated GenCC annotations [@DiStefano2022-ao; @DiStefano2023-np] to weight the current strength of evidence supporting a causal relationship between each gene and phenotype.\nThis is especially important for phenotypes with large genes lists (thousands of annotations) for which some of the relationships may be tenuous.\nWithin the cell type references, we deliberately chose to use specificity scores (rather than raw gene expression) as this normalisation procedure has previously been demonstrated to better distinguish between signatures of highly similar cell types/subtypes [@Skene2016-rb].\n\nMoving forward, we are now actively seeking industry and academic partnerships to begin experimentally 
validating our multi-scale target predictions and exploring their potential for therapeutic translation.\nNevertheless, there are more promising therapeutic targets here than our research group could ever hope to pursue by ourselves.\nIn the interest of accelerating research and ensuring RD patients are able to benefit from this work as quickly as possible, we have decided to publicly release all of the results described in this study.\nThese can be accessed in multiple ways, including through a suite of R packages as well as a web app, the [Rare Disease Celltyping Portal](https://neurogenomics.github.io/rare_disease_celltyping_apps/home/).\nThe latter allows our results to be easily queried, filtered, visualised, and downloaded without any knowledge of programming.\nThrough these resources we aim to make our findings useful to a wide variety of RD stakeholders including subdomain experts, clinicians, advocacy groups, and patients.\n\n## Conclusions {#sec-conclusions}\n\nUltimately, our primary objective was to develop a methodology capable of generating high-throughput phenome-wide predictions while preserving the accuracy and clinical utility typically associated with more narrowly focused studies.\nWith the rapid advancement of gene therapy technologies, and a regulatory landscape that is evolving to better meet the needs of a large and diverse patient population, there is finally momentum to begin to realise the promise of personalised medicine.\nThis has especially important implications for the global RD community which has remained relatively neglected.\nHere, we lay out the groundwork necessary for this watershed moment by providing a scalable, cost-effective, and fully reproducible means of resolving the multi-scale, cell-type specific mechanisms of virtually all rare diseases.\n\n## Methods {#sec-methods}\n\n### Human Phenotype Ontology\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ngencc <- KGExplorer::get_gencc(agg_by = NULL)\n```\n\n::: {.cell-output 
.cell-output-stderr .hidden}\n\n```\nGathering data from GenCC.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\ngencc_version <- KGExplorer::get_version(gencc, return_version = TRUE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n:::\n\n\nThe latest version of the HPO (release 2024-02-08) was downloaded from the EMBL-EBI Ontology Lookup Service [@Cote2010-gp] and imported into R using the `HPOExplorer` package.\nThis R object was used to extract ontological relationships between phenotypes as well as to assign absolute and relative ontological levels to each phenotype.\nThe latest version of the HPO phenotype-to-gene mappings and phenotype annotations were downloaded from the official HPO GitHub repository and imported into R using `HPOExplorer`.\nThis contains lists of genes associated with phenotypes via particular diseases, formatted as three columns in a table (gene, phenotype, disease).\n\nHowever, not all genes have equally strong evidence of causality with a disease or phenotype, especially when considering that the variety of resources used to generate these annotations (OMIM, Orphanet, DECIPHER) use variable methodologies (e.g. expert-curated review of the medical literature vs. 
automated text mining of the literature).\nTherefore we imported data from the Gene Curation Coalition (GenCC) [@DiStefano2022-ao; @DiStefano2023-np], which (as of 2024-03-01) contains 21798 evidence scores across 7229 diseases and 5142 genes.\nEvidence scores are defined by GenCC using a standardised ordinal rubric which we then encoded as a semi-quantitative score ranging from 0 (no evidence of disease-gene relationship) to 6 (strongest evidence of disease-gene relationship) (see @tbl-gencc).\nWe then summed evidence scores per disease, merged this table with the HPO disease-phenotype-gene annotation table, and then cast the data into a gene-by-phenotype matrix filled with the aggregated mean evidence score.\nThis can be expressed as the following equations.\n\nLet us denote:\n\n- $D$ as the set of $d$ diseases.\n\n- $p$ as a phenotype.\n\n- $g$ as a gene.\n\nThe final evidence-weighted gene-by-phenotype matrix ($M_{g,p}$) can be expressed as:\n\n::: {#eq-evidence-scores .content-hidden unless-format=\"html\"}\n![](equations/eq1.png){height=\"300px\"}\n\nConstruction of the evidence-weighted gene-by-phenotype matrix.\n:::\n\n\\\n\\\n\n::: {.content-visible unless-format=\"html\"}\n\n```{=tex}\n\\begin{equation*}\n \\eqnmarkbox[NavyBlue]{n1}{M_{g,p}}\n =\n \\frac{\n \\eqnmarkbox[Cerulean]{n3a}{\\sum_{d \\in D}}\n \\eqnmarkbox[blue]{n4a}{R(g,p,d)} \n \\times \n \\eqnmarkbox[BlueViolet]{n5}{E(g,d)} \n }{\n \\eqnmarkbox[Cerulean]{n3b}{\\sum_{d \\in D}}\n \\eqnmarkbox[blue]{n4b}{R(g,p,d)}\n }\n\\end{equation*}\n\\annotate[yshift=1em]{left}{n1}{Weighted gene-by-phenotype \\\\evidence score matrix} \n\\annotate[yshift=-2em]{below,left}{n3a,n3b}{Iterate over all diseases}\n\\annotate[yshift=-2.5em,xshift=2.5em]{below,right}{n4a,n4b}{Binary gene-by-phenotype \\\\relationship matrix,\\\\ (1=relationship, 0=no relationship)}\n\\annotate[yshift=2em]{left}{n5}{Weighted gene-by-disease \\\\evidence score matrix}\n```\n\n:::\n\n\\\n\nHistograms of evidence score distributions at each
step in processing can be found in [Fig. @fig-evidence-histograms].\n\n### Single-cell transcriptomic atlases\n\nIn this study, the gene by cell type specificity matrix was constructed using the Descartes Human transcriptome atlas of foetal gene expression, which contains a mixture of single-nucleus and single-cell RNA-seq data (collected with sci-RNA-seq3) [@Cao2020-qz].\nThis dataset contains 377,456 cells representing 77 distinct cell types across 15 tissues.\nAll 121 human foetal samples ranged from 72 to 129 days in estimated postconceptual age.\nTo independently replicate our findings, we also used the Human Cell Landscape which contains single-cell transcriptomic data (collected with microwell-seq) from embryonic, foetal, and adult human samples across 49 tissues [@Han2020-iq].\n\nSpecificity matrices were generated separately for each transcriptomic atlas using the R package `EWCE` (v1.11.3) [@Skene2016-rb].\nWithin each atlas, cell types were defined using the authors’ original freeform annotations in order to preserve the granularity of cell subtypes as well as incorporate expert-identified rare cell types.\nCell types were only aligned and aggregated to the level of corresponding Cell Ontology (CL) [@Diehl2016-gt] annotations afterwards when generating summary figures and performing cross-atlas analyses.\nUsing the original gene-by-cell count matrices from each single-cell atlas, we computed gene-by-cell type expression specificity matrices as follows.\n\nLet us denote: $g$ as a gene, $c$ as a cell type, and $i$ as a single cell.\nGenes with no expression in any cell type were considered to be uninformative and were therefore removed from the input gene-by-cell matrix $F(g,i,c)$.\n\n::: {#eq-ctd-filter .content-hidden unless-format=\"html\"}\n![](equations/eq2.png){height=\"200px\"}\n\nFiltering of the gene-by-cell expression matrices.\n:::\n\n\\\n\n::: {.content-visible unless-format=\"html\"}\n\n```{=tex}\n\\begin{equation*}\n
\\eqnmarkbox[purple]{f1}{F(g,i,c)}\n =\n \\begin{cases}\n \\eqnmarkbox[WildStrawberry]{f2}{r_{g,i}},\n \\text{ }l_i = c\\\\0,\n \\text{ }l_i \\neq c \n \\end{cases}\n\\end{equation*}\n\\annotate[yshift=1em]{left}{f1}{Filtered gene-by-cell expression matrix} \n\\annotate[yshift=2em]{left}{f2}{Expression of gene $g$ in cell $i$} \n```\n\n:::\n\n\\\n\nNext, we calculated the mean expression per cell type and normalised the resulting matrix to transform it into a gene-by-cell type expression specificity matrix ($S_{g,c}$).\nIn other words, each gene in each cell type had a 0-1 score where 1 indicated the gene was most specifically expressed in that particular cell type relative to all other cell types.\nThis procedure was repeated separately for each of the single-cell atlases and can be summarised as:\n\n::: {#eq-ctd-specificity .content-hidden unless-format=\"html\"}\n![](equations/eq3.png){height=\"300px\"}\n\nConstruction of the gene-by-cell type specificity matrix.\n:::\n\n\\\n\n::: {.content-visible unless-format=\"html\"}\n\n```{=tex}\n\\begin{equation*}\n \\eqnmarkbox[orange]{s1}{S_{g,c}}\n =\n \\frac{\n \\eqnmarkbox[purple]{s3a}{\n \\frac{\n \\sum_{i=1}^{|L|} F(g,i,c)\n }{\n N_c \n }\n } \n }{\n \\eqnmarkbox[OrangeRed]{s6}{\\sum_{r=1}^{k}}(\n \\eqnmarkbox[purple]{s3b}{\n \\frac{\n \\sum_{i=1}^{|L|} F(g,i,r)\n }{\n N_r \n }\n } \n ) \n }\n\\end{equation*}\n\\annotate[yshift=1em]{left}{s1}{Gene-by-cell type specificity matrix} \n\\annotate[yshift=2em]{left}{s3a,s3b}{Compute mean expression of each gene per cell type} \n\\annotate{below,left}{s6}{Compute row sums of \\\\mean gene-by-cell type matrix}\n```\n\n:::\n\n\\\n\n### Phenotype-cell type associations\n\nTo test for relationships between each pairwise combination of phenotype (n=11,047) and cell type (n=201) we ran a series of univariate generalised linear models implemented via the `stats::glm` function in R.\nFirst, we filtered the gene-by-phenotype evidence score matrix ($M _{g,p}$) and the
gene-by-cell type expression specificity matrix ($S _{g,c}$) to only include genes present in both matrices (n=4,949 genes in the Descartes Human analyses; n=4,653 genes in the Human Cell Landscape analyses).\nThen, within each matrix any rows or columns with a sum of 0 were removed as these were uninformative data points that did not vary.\nTo improve interpretability of the results $\\beta$ coefficient estimates across models (i.e. effect size), we performed a scaling prestep on all dependent and independent variables.\nInitial tests showed that this had virtually no impact on the total number of significant results or any of the benchmarking metrics based on p-value thresholds [Fig. @fig-summary].\nThis scaling prestep improved our ability to rank cell types by the strength of their association with a given phenotype as determined by separate linear models.\n\nWe repeated the aforementioned procedure separately for each of the single-cell references.\nOnce all results were generated using both cell type references (2,206,994 association tests total), we applied Benjamini-Hochberg false discovery rate [@Benjamini1995-vo] (denoted as $FDR_{p,c}$) to account for multiple testing.\nOf note, we applied this correction across all results at once (as opposed to each single-cell reference separately) to ensure the $FDR_{p,c}$ was stringently controlled for across all tests performed in this study.\n\n### Symptom-cell type associations\n\nHere we define a symptom as a phenotype as it presents within the context of the specific disease.\nThe features of a given symptom can be described as the subset of genes annotated to phenotype $p$ via a particular disease $d$, denoted as $G_{d,p}$ ([see Fig. 
@fig-diagram]).\nTo attribute our phenotype-level cell type enrichment signatures to specific diseases, we first identified the gene subset that was most strongly driving the phenotype-cell type association by computing the intersect of genes that were both in the phenotype annotation and within the top 25% specificity percentile for the associated cell type.\nWe then computed the intersect between symptom genes ($G_{d,p}$) and driver genes ($G_{p,c}$), resulting in the gene subset $G_{d \\cap p \\cap c}$.\nOnly $G_{d \\cap p \\cap c}$ gene sets with 25% or greater overlap with the symptom gene subset ($G_{d,p}$) were kept.\nThis procedure was repeated for all phenotype-cell type-disease triads, which can be summarised as follows:\n\n::: {#eq-symptoms .content-hidden unless-format=\"html\"}\n![](equations/eq4.png){height=\"300px\"}\n:::\n\n\\\n\n::: {.content-visible unless-format=\"html\"}\n\n```{=tex}\n\\begin{equation*}\n \\frac{\n \\eqnmarkbox[Chartreuse3]{g1}{|G_{d \\cap p \\cap c} |}\n }{\n \\eqnmarkbox[Emerald]{g2}{|G_{d,p}|}} \n \\geq \\eqnmarkbox[SeaGreen]{g3}{.25} \n\\end{equation*}\n\\annotate[yshift=1em]{left}{g1}{Intersect between \\\\symptom genes ($G_{d,p}$) and driver genes ($G_{p,c}$)} \n\\annotate[yshift=-1em]{below,left}{g2}{Symptom genes \\\\(i.e. genes annotated to a phenotype\\\\ via a specific disease)} \n\\annotate[yshift=-1em]{below,right}{g3}{Minimum proportion of overlap \\\\between $G_{d,p,c}$ and $G_{d,p}$}\n```\n\n:::\n\n\\\n\n### Validation of expected phenotype-cell type relationships\n\nWe first sought to confirm that our tests (across both single-cell references) were able to recover expected phenotype-cell type relationships across seven high-level branches within the HPO ([Fig. 
@fig-summary]), including abnormalities of the cardiovascular system, endocrine system, eye, immune system, musculoskeletal system, nervous system, and respiratory system.\nWithin each branch the number of significant tests in a given cell type were plotted ([Fig. @fig-summary]b).\nMappings between freeform annotations (the level at which we performed our phenotype- cell type association tests) provided by the original atlas authors and their closest CL term equivalents were provided by CellxGene [@CZI_Single-Cell_Biology_Program2023-fs].\nCL terms along the *x-axis* of [Fig. @fig-summary]b were assigned colours corresponding to which HPO branch showed the greatest number of enrichments (after normalising within each branch to account for differences in scale).\nThe normalised colouring allows readers to quickly assess which HPO branch was most often associated with each cell type, while accounting for differences in the number of phenotypes across branches.\nWe then ran a series of Analysis of Variance (ANOVA) tests to determine whether (within a given branch) a given cell type was more often enriched ($FDR_{p,c}<0.05$) within that branch relative to all of the other HPO branches of an equivalent level in the ontology (including all branches not shown in [Fig. @fig-summary]b).\nAfter applying Benjamini-Hochberg multiple testing correction [@Benjamini1995-vo] (denoted as $FDR _{b,c}$), we annotated each respective branch-by-cell type bar according to the significance (\\*\\*\\*\\* : $FDR _{b,c}<1e-04$, \\*\\*\\* : $FDR _{b,c}<0.001$, \\*\\* : $FDR _{b,c}<0.01$, \\* : $FDR _{b,c}<0.05$).\nCell types in [Fig. @fig-summary]a-b were ordered along the *x-axis* according to a dendrogram derived from the CL ontology ([Fig. @fig-summary]c), which provides ground-truth semantic relationships between all cell types (e.g. 
different neuronal subtypes are grouped together).\n\nAs an additional measure of the accuracy of our phenotype-cell types test results we identified conceptually matched branches across the HPO and the CL ([Fig. @fig-summary]d and @tbl-celltypes).\nFor example, 'Abnormality of the cardiovascular system' in the HPO was matched with 'cardiocytes' in the CL which includes all cell types specific to the heart.\nAnalogously, 'Abnormality of the nervous system' in the HPO was matched with 'neural cell' in the CL which includes all descendant subtypes of neurons and glia.\nThis cross-ontology matching was repeated for each HPO branch and can be referred to as on-target cell types.\nWithin each branch, the $-log_{10}(FDR _{p,c})$ values of on-target cell types were binned by rounding to the nearest integer (*x-axis*) and the percentage of tests for on-target cell types relative to all cell types were computed at each bin (*y-axis*) ([Fig. @fig-summary]d).\nThe baseline level (dotted horizontal line) illustrates the percentage of on-target cell types relative to the total number of observed cell types.\nAny percentages above this baseline level represent greater than chance representation of the on-target cell types in the significant tests.\n\n### Monarch Knowledge Graph recall\n\nFinally, we gathered known phenotype-cell type relationships from the Monarch Knowledge Graph (MKG), a comprehensive database of links between many aspects of disease biology [@Putman2024-et].\nThis currently includes 103 links between HPO phenotypes (n=103) and CL cell types (n=79).\nOf these, we only considered the 82 phenotypes that we were able to test given that our approach was reliant on gene annotations.\nWe considered instances where we found a significant relationship between exactly matching pairs of HPO-CL terms as a hit.\n\nHowever, as the cell types in MKG were not necessarily annotated at the same level as our single-cell references, we also considered instances where the MKG cell 
type was an ancestor term of our cell type (e.g. 'myeloid cell' vs. 'monocyte'), or *vice versa*, as hits.\nUsing these criteria, we determined our results recapitulated `**!!RECOMPUTE!!**`% of known phenotype-cell type relationships in the MKG.\nWe next computed how far along the CL ontological tree we would need to travel in order to reach a common ancestor between the MKG cell type and our cell type, for each phenotype-cell type link in the MKG.\nThis provides a metric of not just whether we recapitulated the exact cell types, but how dissimilar our identified cell types were for a given phenotype-cell type association ([Fig. @fig-monarch-recall]).\n\n### Annotation of phenotypes using generative large language models\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ngpt_codes <- formals(HPOExplorer::gpt_annot_codify)\ncode_dict <- paste0(shQuote(names(eval(gpt_codes$code_dict))),\"=\",\n eval(gpt_codes$code_dict), collapse = \", \")\ntiers_dict <- paste0(shQuote(names(eval(gpt_codes$tiers_dict))),\"=\",\n eval(gpt_codes$tiers_dict), collapse = \", \") \n```\n:::\n\n\nOnly a small fraction of the phenotypes in HPO (\\<1%) have metadata annotations containing information on their time course, consequences, and severity.\nThis is due to the time-consuming nature of manually annotating thousands of phenotypes.\nTo generate such annotations at scale, we used Generative Pre-trained Transformer 4 (GPT-4), a large language model (LLM) as implemented within OpenAI’s chatGPT Application Programming Interface (API).\nAfter extensive prompt engineering and ground-truth benchmarking, we were able to acquire annotations on how often each phenotype directly causes intellectual disability, death, impaired mobility, physical malformations, blindness, sensory impairments, immunodeficiency, cancer, reduced fertility, or is associated with a congenital onset.\nThese criteria were previously defined in surveys of medical experts as a means of systematically assessing phenotype
severity [@Lazarin2014-we].\nResponses for each metric were provided in a consistent one-word format which could be one of: 'never', 'rarely', 'often', 'always'.\nThis procedure was repeated in batches (to avoid exceeding token limits) until annotations were gathered for 16982/18082 HPO phenotypes.\n\nWe then encoded these responses into a semi-quantitative scoring system ('never'=0, 'rarely'=1, 'often'=2, 'always'=3), which were then weighted by multiplying a semi-subjective scoring of the relevance of each metric to the concept of severity on a scale from 1-5, with 5 being the most severe ('intellectual_disability'=5, 'death'=5, 'impaired_mobility'=4, 'physical_malformations'=3, 'blindness'=4, 'sensory_impairments'=3, 'immunodeficiency'=3, 'cancer'=3, 'reduced_fertility'=1, 'congenital_onset'=4).\nFinally, the product of the score was normalised to a quantitative severity score ranging from 0-100, where 100 is the theoretical maximum severity score.\nThis phenotype severity scoring procedure can be expressed as follows.\n\n::: {#eq-gpt .content-hidden unless-format=\"html\"}\n![](equations/eq5.png){height=\"300px\"}\n\nComputing normalised severity score from encoded GPT-4 annotations.\n:::\n\n\\\n\\\n\n::: {.content-visible unless-format=\"html\"}\n\n```{=tex}\n\\begin{equation*}\n \\eqnmarkbox[Brown4]{nss}{NSS_p}\n =\n \\frac{ \n \\eqnmarkbox[Goldenrod]{nss2}{\\sum_{j=1}^{m}} \n (\n \\eqnmarkbox[Goldenrod4]{nss3}{F_{pj}}\n \\times \n \\eqnmarkbox[IndianRed4]{nss4}{W_j}\n )\n }{\n \\eqnmarkbox[Tan]{nss5}{\\sum_{j=1}^{m}(\\max\\{F_j\\} \\times W_j)} \n } \\times 100\n\\end{equation*}\n\\annotate[yshift=1em]{left}{nss}{Normalised Severity Score \\\\for each phenotype}\n\\annotate[yshift=3em]{left}{nss2}{Sum of weighted annotation values \\\\across all metrics}\n\\annotate[yshift=3em]{right}{nss3}{Numerically encoded annotation value \\\\of metric $j$ for phenotype $p$}\n\\annotate[yshift=1em]{right}{nss4}{Weight for metric $j$} 
\n\\annotate[yshift=-1em]{below,right}{nss5}{Theoretical maximum severity score}\n```\n\n:::\n\n\\\n\n### Enrichment of foetal cell types in congenital phenotypes\n\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\nfetal_keywords <- shQuote(eval(formals(MSTExplorer::plot_congenital_annotations)$fetal_keywords) )\n```\n:::\n\n\nThe GPT-4 annotations also enabled us to assess whether foetal cell types were more often significantly associated with congenital phenotypes in our Human Cell Landscape results as this single-cell reference contained both adult and foetal versions of cell types ([Fig. @fig-congenital]).\nTo do this, we performed a chi-squared ($\\chi^2$) test on the proportion of significantly associated cell types containing any of the substrings 'fetal', 'fetus', 'primordial', 'hESC' or 'embryonic' (within cell types annotations from the original Human Cell Landscape authors [@Han2020-iq]) vs. those associated without, stratified by how often the corresponding phenotype had a congenital onset according to the GPT phenotype annotations (including 'never', 'rarely', 'often', 'always').\nIn addition, a series of $\\chi^2$ tests were performed within each congenital onset frequency strata, to determine whether the observed proportion of foetal cell types vs. non-foetal cell types significantly deviated from the proportions expected by chance.\n\n### Diagnosis via cell type-specific disease prediction\n\nWe designed an algorithm that uses our results to predict the most likely cell types underlying a set of phenotypic and genotypic traits observed in a patient ([Fig. 
@fig-diagnosis]).\nThis is implemented within `MSTExplorer::predict_celltypes` and takes HPO phenotypes as inputs.\nIt can optionally take included risk genes, excluded risk genes, included diseases and/or excluded diseases as additional inputs.\nIt then outputs a weighted ranking of cell types, where higher ranking indicates a higher likelihood of being the underlying mechanism of the patient’s particular form of disease(s).\n\n### Prognosis via cell type-mediated differential outcomes\n\nThe phenotype hypotonia is associated with diseases that range in severity from benign to debilitating to fatal [@Ahmed2016-ag].\nIn the absence of additional information, making an accurate diagnosis is extremely challenging even for experienced physicians.\nThe magnitude of this challenge is highlighted by the fact that each disease is associated with anywhere between 1-595 unique phenotypes (median=61, mean=77.74) within the HPO.\nConversely, each phenotype is associated with 1-5404 diseases (median=6, mean=60.74).\nWe addressed this challenge by applying our phenotype-cell type association results in combination with expert-curated HPO annotations of clinical outcomes associated with each phenotype-disease pairing ([Fig.
@fig-prognosis]).\nWe first extracted results for the phenotype 'Hypotonia' (*HP:0001252*) and its 13 descendant subterms from our phenotype-cell type association analyses.\nNext, we encoded the \"Age of Death\" categories associated with each disease in an ordinal scale ranging from 1, corresponding to prenatal death, to 8, corresponding to death in late adulthood (@tbl-death).\nTo determine whether cell type identity significantly predicted the age of death, we conducted an ANOVA where cell type was the predictor and \"Age of Death score\" was the outcome.\n\n### Therapeutic target identification\n\nWe developed a systematic and automated strategy for identifying putative cell type-specific gene targets for each phenotype based on a series of filters at phenotype, cell type, and gene levels.\nThe entire target prioritisation procedure can be replicated with a single function: `MSTExplorer::prioritise_targets`.\nThis function automates all of the reference data gathering (e.g. phenotype metadata, cell type metadata, cell type signature reference, gene lengths, severity tiers) and takes a variety of arguments at each step for greater customisability.\n\n### Therapeutic target validation\n\nTo assess whether our prioritised therapeutic targets were likely to be viable, we computed the overlap between our gene targets and those of existing gene therapies at various stages of clinical development ([Fig. 
@fig-therapy-validate]).\nGene targets were obtained for each therapy from the Therapeutic Target Database (TTD; release 2024-03-22) and mapped onto standardised HUGO Gene Nomenclature Committee (HGNC) gene symbols using the `orthogene` R package.\nWe stratified our overlap metrics according to whether the therapies had failed (unsuccessful clinical trials or withdrawn), or were non-failed (successful or ongoing clinical trials).\nWe then conducted hypergeometric tests to determine whether the observed overlap between our prioritised targets and the non-failed therapy targets was significantly greater than expected by chance (i.e. enrichment).\nWe also conducted a second hypergeometric test to determine whether the observed overlap between our prioritised targets and the failed therapy targets was significantly less than expected by chance (i.e. depletion).\nFinally, we repeated the analysis against all therapeutic targets, not just those of gene therapies, to determine whether our prioritised targets had relevance to other therapeutic modalities.\n\n### Experimental model translatability\n\nTo improve the likelihood of successful translation between preclinical animal models and human patients, we created an interspecies translatability prediction tool for each phenotype nominated by our gene therapy prioritisation pipeline ([Fig.
@fig-animal-models]).\nFirst, we extracted ontological similarity scores of homologous phenotypes across species from the MKG [@Putman2024-et].\nBriefly, the ontological similarity scores ($SIM_o$) are computed for each homologous pair of phenotypes across two ontologies by calculating the overlap in homologous phenotypes that are ancestors or descendants of the target phenotype.\nNext, we generated genotypic similarity scores ($SIM_g$) for each homologous phenotype pair by computing the proportion of 1:1 orthologous genes using gene annotation from their respective ontologies.\nInterspecies orthologs were also obtained from the MKG.\nFinally, both scores are multiplied together to yield a unified ontological-genotypic similarity score ($SIM_{o,g}$).\n\n### Novel R packages\n\nTo facilitate all analyses described in this study and to make them more easily reproducible by others, we created several open-source R packages.\n[`KGExplorer`](https://github.com/neurogenomics/KGExplorer) imports and analyses large-scale biomedical knowledge graphs and ontologies.\n[`HPOExplorer`](https://github.com/neurogenomics/HPOExplorer) aids in managing and querying the directed acyclic ontology graph within the HPO.\n[`MSTExplorer`](https://github.com/neurogenomics/MSTExplorer) facilitates the efficient analysis of many thousands of phenotype-cell type association tests, and provides a suite of multi-scale therapeutic target prioritisation and visualisation functions.\nThese R packages also include various functions for distributing the post-processed results from this study in an organised, tabular format.\nOf note, `MSTExplorer::load_example_results` loads all summary statistics from our phenotype-cell type tests performed here.\n\n### Rare Disease Celltyping Portal\n\nTo further increase the ease of access for stakeholders in the RD community without the need for programmatic experience, we developed a series of web apps to interactively explore, visualise, and download the 
results from our study.\nCollectively, these web apps are called the Rare Disease Celltyping Portal.\nThe landing page for the website was made using HTML, CSS, and JavaScript and the web apps were created using the Shiny Web application framework for R and deployed on the [shinyapps.io](https://www.shinyapps.io) server.\nThe website can be accessed [here](https://neurogenomics.github.io/rare_disease_celltyping_apps/home). All code used to generate the website can be found [here](https://github.com/neurogenomics/rare_disease_celltyping_apps).\n\n## Data and Code Availability\n\nAll data and code are made freely available through preexisting databases and/or GitHub repositories / software associated with this publication.\n\n- [Human Phenotype Ontology](https://hpo.jax.org)\n- [GenCC](https://thegencc.org/)\n- [Descartes Human scRNA-seq atlas](https://cellxgene.cziscience.com/collections/c114c20f-1ef4-49a5-9c2e-d965787fb90c)\n- [Human Cell Landscape scRNA-seq atlas](https://cellxgene.cziscience.com/collections/38833785-fac5-48fd-944a-0f62a4c23ed1)\n- [Rare Disease Celltyping Portal](https://neurogenomics.github.io/rare_disease_celltyping_apps/home)\n- [`KGExplorer`](https://github.com/neurogenomics/KGExplorer)\n- [`HPOExplorer`](https://github.com/neurogenomics/HPOExplorer)\n- [`MSTExplorer`](https://github.com/neurogenomics/MSTExplorer)\n- [Code to replicate analyses](https://github.com/neurogenomics/rare_disease_celltyping)\n- [Cell type-specific gene target prioritisation](https://neurogenomics.github.io/RareDiseasePrioritisation/reports/prioritise_targets) \n- [Complement system gene list](https://www.genenames.org/data/genegroup/#!/group/492)\n\n\n## Acknowledgements\n\nWe would like to thank the following individuals for their insightful feedback and assistance with data resources: Sarah J. Marzi, Gerton Lunter, Peter Robinson, Melissa Haendel, Ben Coleman, Nico Matentzoglu, Shawn T. O'Neil, Alan E. 
Murphy, Sarada Gurung.\n\n### Funding\n\nThis work was supported by a UK Dementia Research Institute (UK DRI) Future Leaders Fellowship \\[MR/T04327X/1\\] and the UK DRI which receives its funding from UK DRI Ltd, funded by the UK Medical Research Council, Alzheimer’s Society and Alzheimer's Research UK.\n\n\n## References {.unnumbered}\n\n:::{#refs}\n\n:::\n\n\\\n\n\n{{< pagebreak >}}\n\n\n\n## Supplementary Materials\n\n### Supplementary Figures\n\n\n::: {#cell-fig-evidence-histograms .cell}\n\n```{.r .cell-code .hidden}\nevidence_plot <- HPOExplorer::plot_evidence(phenotype_to_genes = p2g,\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering data from GenCC.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering data from GenCC.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating gene-disease associations with Evidence Score\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering data from 
GenCC.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nConstructing HPO gene x phenotype matrix.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating gene-disease associations with Evidence Score\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAnnotating phenos with Disease\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nGathering data from GenCC.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nImporting cached file.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nEvidence scores for: \n - 10390 diseases \n - 5142 genes\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: 2024-03-01\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nevidence_plot$plot\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.\n`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.\n`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.\n`stat_bin()` using `bins = 30`. 
Pick better value with `binwidth`.\n```\n\n\n:::\n\n::: {.cell-output-display}\n![Distribution of evidence scores at each processing step.](index_files/figure-pdf/fig-evidence-histograms-1.pdf){#fig-evidence-histograms fig-pos='H'}\n:::\n:::\n\n\n\n::: {#fig-diagram}\n![](img/fig-diagram.png)\n\nDiagrammatic overview of multi-scale disease investigation strategy.\nHere we provide an abstract example of differential disease aetiology across multiple scales: diseases ($D$), phenotypes ($P$), cell types ($C$), genes ($G$), and clinical outcomes ($O$).\nIn the HPO, genes are assigned to phenotypes via particular diseases ($G_{d,p}$).\nTherefore, the final gene list for each phenotype is aggregated from across multiple diseases ($G_{p}$).\nWe performed association tests for all pairwise combinations of cell types and phenotypes and filtered results after multiple testing corrections ($FDR_{p,c}<0.05$).\nEach phenotype in the context of a given disease is referred to here as a symptom.\nLinks were established between symptoms and cell types through proportional gene set overlap at a minimum threshold of 25%.\n:::\n\n\n::: {#fig-ctd-correlation .cell layout-ncol=\"2\"}\n\n```{.r .cell-code .hidden}\nvalidate_associations_correlate_ctd_out$plot$p.all\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.\n`stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nvalidate_associations_correlate_ctd_out$plot$logFC.significant\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.\n`stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nvalidate_associations_correlate_ctd_out_hcl$plot$p.all\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`stat_xsidebin()` using `bins = 30`. 
Pick better value with `binwidth`.\n`stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nvalidate_associations_correlate_ctd_out_hcl$plot$logFC.significant\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n`stat_xsidebin()` using `bins = 30`. Pick better value with `binwidth`.\n`stat_ysidebin()` using `bins = 30`. Pick better value with `binwidth`.\n```\n\n\n:::\n\n::: {.cell-output-display}\n![Correlation between the uncorrected p-values from all phenotype-cell type association tests using the Descartes Human vs. Human Cell Landscape CTDs.](index_files/figure-pdf/fig-ctd-correlation-1.pdf){#fig-ctd-correlation-1}\n:::\n\n::: {.cell-output-display}\n![Correlation between the $log_{10}(fold-change)$ from significant phenotype-cell type association tests ($FDR_{p,c}<0.05$) using the Descartes Human vs. Human Cell Landscape CTDs.](index_files/figure-pdf/fig-ctd-correlation-2.pdf){#fig-ctd-correlation-2}\n:::\n\n::: {.cell-output-display}\n![Correlation between the uncorrected p-values from all phenotype-cell type association tests using the Human Cell Landscape fetal samples vs. Human Cell Landscape adult samples.](index_files/figure-pdf/fig-ctd-correlation-3.pdf){#fig-ctd-correlation-3}\n:::\n\n::: {.cell-output-display}\n![Correlation between the $log_{10}(fold-change)$ from significant phenotype-cell type association tests ($FDR_{p,c}<0.05$) using the Human Cell Landscape fetal samples vs. Human Cell Landscape adult samples.](index_files/figure-pdf/fig-ctd-correlation-4.pdf){#fig-ctd-correlation-4}\n:::\n\nInter- and intra-dataset validation across the different CellTypeDataset (CTD) and developmental stages. Correlations are computed using Pearson's correlation coefficient. 
Point density is plotted using a 2D kernel density estimate.\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\n### Generate filtering report plot\nplot_report_out <- MSTExplorer::plot_report(\n results = results,\n rep_dt = prioritise_targets_out$report,\n show_plot = FALSE)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nplot_report:: Preparing data.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype.hpoa\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nReading cached RDS file: phenotype_to_genes.txt\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n+ Version: v2024-02-08\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nplot_report:: Preparing plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot ==> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//RtmpQjrQLG/file73fc5fbe045d_plot_report.pdf\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving 5.5 x 3.5 in image\n```\n\n\n:::\n:::\n\n::: {#cell-fig-therapy-filter .cell}\n\n```{.r .cell-code .hidden}\nplot_report_out\n```\n\n::: {.cell-output-display}\n![Therapeutics - Prioritised target filtering steps. This plot visualises the number of unique phenotype-cell type associations, cell types, genes, and phenotypes (*y-axis*) at each filtering step (*x-axis*) within the multi-scale therapeutic target prioritisation pipeline. Each step in the pipeline can be easily adjusted according to user preference and use case. See **Methods** for descriptions and criteria of each filtering step. **a**, The percentage of phenotypes belonging to each severity Tier after each filtering step (Tier 1 being the most severe). 
**b**, The number of phenotypes, cell types, associated diseases and genes remaining after each filtering step during the gene prioritisation pipeline.](index_files/figure-pdf/fig-therapy-filter-1.pdf){#fig-therapy-filter fig-pos='H'}\n:::\n:::\n\n::: {#cell-fig-monarch-recall .cell}\n\n```{.r .cell-code .hidden}\ncat(\"!!!RECOMPUTE!!!\")\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n!!!RECOMPUTE!!!\n```\n\n\n:::\n:::\n\n::: {#cell-fig-therapy-validate-all .cell}\n\n```{.r .cell-code .hidden}\nttd_check_all_out$plot\n```\n\n::: {.cell-output-display}\n![Therapeutics - Validation of prioritised therapeutic targets. Proportion of all existing therapy targets (documented in the Therapeutic Target Database) recapitulated by our prioritisation pipeline.](index_files/figure-pdf/fig-therapy-validate-all-1.pdf){#fig-therapy-validate-all fig-pos='H'}\n:::\n:::\n\n::: {#cell-fig-animal-models .cell}\n\n```{.r .cell-code .hidden}\nlibrary(ggplot2) # <-- Necessary due to bug in one of the plotting dependencies \n\ntop_ids <- unique(prioritise_targets_out$top_targets$hpo_id)[1:1000]\nplot_upheno_out <- KGExplorer::plot_upheno(\n pheno_map_genes_match = pheno_map_genes_match, \n filters=list(id1=top_ids)\n )\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nFiltered 'id1' : 883 / 987 rows dropped.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n3 / 4 species remain after filtering by `subset_db1`.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: ggdist\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: tidyquant\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nRegistered S3 method overwritten by 'quantmod':\n method from\n as.zoo.data.frame zoo \n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding ancestor metadata.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr 
.hidden}\n\n```\nLoading cached ontology: /Users/bms20/Library/Caches/org.R-project.R/R/KGExplorer/upheno.rds\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAncestor metadata already present. Use force_new=TRUE to overwrite.\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nplot_upheno_out$heatmap\n```\n\n::: {.cell-output-display}\n![Identification of translatable experimental models. Interspecies translatability of human phenotypes nominated by our gene therapy prioritised pipeline. Above, our combined ontological-genotypic similarity score ($SIM_{o,g}$) is displayed as the heatmap fill colour stratified by the model organism (*x-axis*). An additional column (“n_genes_db1” on the far left) displays the total number of unique genes annotated to the phenotype within the HPO. Phenotypes are clustered according to their ontological similarity in the HPO (*y-axis*).](index_files/figure-pdf/fig-animal-models-1.pdf){#fig-animal-models fig-pos='H'}\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code .hidden}\ntop_targets <- prioritise_targets_out$top_targets[,n_genes:=data.table::uniqueN(gene_symbol),\n by=\"hpo_id\"][n_genes<5 & proportion_driver_genes_symptom>.25]\n\nheight <- \"60vh\"\n\nphenotype <- \"respiratory failure\"\nvn_therapy_eg1 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[grepl(paste(phenotype,collapse = \"|\"), disease_name,ignore.case = TRUE)],\n main = phenotype, \n height = height,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nLoading required namespace: visNetwork\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//Rtmpc8VtV7/file7a42436e8b7a_prioritise_targets_network.html\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nphenotype <- \"amyotrophic lateral sclerosis\"\nvn_therapy_eg2 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[grepl(paste(phenotype,collapse = \"|\"), disease_name,ignore.case = TRUE)],\n main = phenotype,\n height = height,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//Rtmpc8VtV7/file7a4247859f27_prioritise_targets_network.html\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nphenotype <- \"neurodegeneration\"\nvn_therapy_eg3 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[grepl(paste(phenotype,collapse = \"|\"), disease_name,ignore.case = TRUE)], \n main = phenotype, \n height = height,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//Rtmpc8VtV7/file7a421edc11f1_prioritise_targets_network.html\n```\n\n\n:::\n\n```{.r .cell-code .hidden}\nphenotype <- \"small vessel disease\"\nvn_therapy_eg4 <- MSTExplorer::prioritise_targets_network(\n top_targets = top_targets[grepl(paste(phenotype,collapse = \"|\"), disease_name,ignore.case = TRUE)], \n main = phenotype, \n height = height,\n submain = NULL)\n```\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nlogFC already exists in results. Use `force_new=TRUE` to overwrite.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCell type columns already present. 
Skipping mapping.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating network.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nMaking hoverboxes from: 'node', 'node_type', 'effect', 'q', 'CellType', 'ancestor_name', 'disease_id', 'ontLvl', 'definition', 'hpo_name', 'hpo_id', 'disease_name', 'shape', 'name'\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nAdding hoverboxes to data.table.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nCreating visNetwork plot.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\n'title' column already exists. Skipping hoverbox creation.\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nUsing palette: kovesi.linear_bmy_10_95_c78\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nWarning in\nvisNetwork::visOptions(visNetwork::visInteraction(visNetwork::visEdges(visNetwork::visNodes(visNetwork::visPhysics(visNetwork::visIgraphLayout(visNetwork::toVisNetworkData(g)\n%>% : Can't find 'name' in node data.frame\n```\n\n\n:::\n\n::: {.cell-output .cell-output-stderr .hidden}\n\n```\nSaving plot --> /var/folders/rd/rbc_wrdj4k3djf3brk6z0_dc0000gp/T//Rtmpc8VtV7/file7a427cc2a642_prioritise_targets_network.html\n```\n\n\n:::\n:::\n\n::: {#fig-therapy-examples-supp .cell layout=\"[[1,1], [1], [1]]\"}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg1$plot\n```\n\n![Respiratory failure](index_files/figure-pdf/fig-therapy-examples-supp-1.pdf){#fig-therapy-examples-supp-1}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg2$plot\n```\n\n![Amyotrophic lateral sclerosis](index_files/figure-pdf/fig-therapy-examples-supp-2.pdf){#fig-therapy-examples-supp-2}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg3$plot\n```\n\n![Neurodegeneration](index_files/figure-pdf/fig-therapy-examples-supp-3.pdf){#fig-therapy-examples-supp-3}\n\n```{.r .cell-code .hidden}\nvn_therapy_eg4$plot\n```\n\n![Small vessel 
disease](index_files/figure-pdf/fig-therapy-examples-supp-4.pdf){#fig-therapy-examples-supp-4}\n\nExample cell type-specific gene therapy targets for several severe phenotypes and their associated diseases. Each disease (blue cylinders) is connected to its phenotype (purple cylinders) based on well-established clinical observations recorded within the HPO [@Gargano2024-fc]. Phenotypes are connected to cell types (red circles) via association testing between weighted gene sets ($FDR_{p,c}<0.05$). Each cell type is connected to the prioritised gene targets (yellow boxes) based on the driver gene analysis. The thickness of the edges connecting the nodes represents the (mean) fold-change from the bootstrapped enrichment tests. Nodes were spatially arranged using the Sugiyama algorithm [@Sugiyama1981-ev].\n:::\n\n\n### Supplementary Methods\n\n#### Therapeutics: Gene therapy target identification\n\nDescriptions of each step in the prioritisation pipeline are as follows:\n\n1. **start**: All phenotype-cell type association results.\n\n2. **q_threshold**: Keep only results that were significant after multiple-testing correction (q\\<0.05).\n\n3. **fold_threshold**: Keep only results with fold change\\>=1.\n\n4. **keep_ont_levels**: Keep only phenotypes at certain absolute ontology levels within the HPO.\n\n5. **keep_onsets**: Keep only phenotypes with postnatal ages of onset to circumvent technical and ethical challenges associated with antenatal gene therapeutics delivery.\n\n6. **keep_tiers**: Keep only phenotypes with high severity Tiers.\n\n 1. We used a combination of manual curation and automated text-based substring queries to assign each phenotype a severity Tier as characterised in a survey of healthcare professionals [@Lazarin2014-we].\n\n 2. Tier 1: Diseases that shortened life span in adolescence or earlier or resulted in intellectual disability.\n\n 3. 
Tier 2: Diseases that shortened lifespan prematurely in adulthood, or resulted in impaired mobility or internal physical malformation.\n\n 4. Tier 3: Diseases causing sensory impairments (hearing, vision, touch, pain, or other), immunodeficiency/cancer, mental illness, or dysmorphic features.\n\n 5. Tier 4: Diseases that reduce fertility.\n Of the 49 phenotypes that were available in this severity ranking, we selected three that were classified as Tier 1 (the most severe disease category): mental deterioration, coma and respiratory failure.\n\n7. **severity_threshold**: Keep only phenotypes with mean severity score equal to or below the threshold.\n\n 1. Severity scores were computed by assigning each severity modifier term found in the HPO annotations a numerical value.\n In order of increasing severity:\n\n 2. HP:0012825 \"Mild\" (Severity_score=4)\n\n 3. HP:0012827 \"Borderline\" (Severity_score=3)\n\n 4. HP:0012828 \"Severe\" (Severity_score=2)\n\n 5. HP:0012829 \"Profound\" (Severity_score=1)\n\n8. **pheno_frequency_threshold**: Keep only phenotypes with mean frequency equal to or above the threshold (i.e. how frequently a phenotype is associated with any diseases in which it occurs).\n\n 1. Keep phenotypes with a mean frequency ≥10% or are NA by default.\n\n9. **keep_celltypes**: Keep only terminally differentiated cell types.\n\n 1. Of the 77 cell types tested in the Descartes cell type reference, the 40 terminally differentiated cell types were identified through a literature search. Of these, three (extravillous trophoblasts, syncytiotrophoblasts and trophoblast giant cells) were excluded as they only played a role in pregnancy [@Chang2018-qj; @Fogarty2011-ph; @Hu2010-eh], which would raise additional technical and ethical challenges as rAAV therapy has not yet been used to target foetuses in clinical trials.\n\n10. **keep_seqnames**: Remove genes on non-standard chromosomes.\n\n 1. Only keep chromosomes 1-22, X, and Y.\n\n11. 
**gene_size**: Keep only genes \\<4.3kb in length.\n\n 1. Due to limitations in the length of the gene that can be carried by the rAAV vector, genes with a length of \\>4.3kb were excluded.\n\n12. **keep_biotypes**: Keep only genes belonging to certain biotypes (e.g. \"protein_coding\", \"processed_transcript\", \"snRNA\", \"lincRNA\", \"snoRNA\", \"IG_C_gene\").\n\n 1. Keep all biotypes by default.\n\n13. **gene_frequency_threshold**: Keep only genes at or above a certain mean frequency threshold (i.e. how frequently a gene is associated with a given phenotype when observed within a disease).\n\n 1. Keep genes with a mean frequency ≥10% or are NA by default.\n\n14. **keep_specificity_quantiles**: Keep only genes in top specificity quantiles from the cell type dataset.\n\n 1. To further narrow down genes, we extracted relevant metrics from the Descartes reference for each gene in each cell type. These included mean expression, specificity, and specificity quantiles (using 40 bins). Only genes with the most specific quantiles (39-40) were included for further analysis, as cell type-specific genes may be less likely to have off-target effects in other cell types.\n\n15. **keep_mean_exp_quantiles**: Keep only genes in top mean expression quantiles from the cell type dataset\n\n16. 
**end**: Final table of prioritised cell type- / phenotype-specific gene targets.\n\nFinally, for a more comprehensive target search, we removed the filters for onsets (keep_onsets=NULL), Tier (keep_tiers=NULL), severity (severity_threshold=NULL), as well as relaxed the filters for phenotype frequency threshold (pheno_frequency_threshold=c(10,NA)), gene frequency threshold (gene_frequency_threshold = c(10,NA)), gene specificity quantiles (keep_specificity_quantiles = seq(20,40)), and gene expression quantiles (keep_mean_exp_quantiles = seq(20,40)).\n \n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/manuscript/_manuscript/_tex/img/fig-diagram.png b/manuscript/_manuscript/_tex/img/fig-diagram.png new file mode 100644 index 0000000..0dbac7d Binary files /dev/null and b/manuscript/_manuscript/_tex/img/fig-diagram.png differ diff --git a/manuscript/_manuscript/_tex/index.tex b/manuscript/_manuscript/_tex/index.tex new file mode 100644 index 0000000..edb8988 --- /dev/null +++ b/manuscript/_manuscript/_tex/index.tex @@ -0,0 +1,2716 @@ +% Options for packages loaded elsewhere +\PassOptionsToPackage{unicode}{hyperref} +\PassOptionsToPackage{hyphens}{url} +\PassOptionsToPackage{dvipsnames,svgnames,x11names}{xcolor} +% +\documentclass[ +sn-nature +]{sn-jnl} + +\usepackage{amsmath,amssymb} +\usepackage{iftex} +\ifPDFTeX + \usepackage[T1]{fontenc} + \usepackage[utf8]{inputenc} + \usepackage{textcomp} % provide euro and other symbols +\else % if luatex or xetex + \usepackage{unicode-math} + \defaultfontfeatures{Scale=MatchLowercase} + \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} +\fi +\usepackage{lmodern} +\ifPDFTeX\else + % xetex/luatex font selection +\fi +% Use upquote if available, for straight quotes in verbatim environments +\IfFileExists{upquote.sty}{\usepackage{upquote}}{} +\IfFileExists{microtype.sty}{% use microtype if available + \usepackage[]{microtype} + \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts 
+}{} +\makeatletter +\@ifundefined{KOMAClassName}{% if non-KOMA class + \IfFileExists{parskip.sty}{% + \usepackage{parskip} + }{% else + \setlength{\parindent}{0pt} + \setlength{\parskip}{6pt plus 2pt minus 1pt}} +}{% if KOMA class + \KOMAoptions{parskip=half}} +\makeatother +\usepackage{xcolor} +\setlength{\emergencystretch}{3em} % prevent overfull lines +\setcounter{secnumdepth}{-\maxdimen} % remove section numbering +% Make \paragraph and \subparagraph free-standing +\ifx\paragraph\undefined\else + \let\oldparagraph\paragraph + \renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}} +\fi +\ifx\subparagraph\undefined\else + \let\oldsubparagraph\subparagraph + \renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}} +\fi + + +\providecommand{\tightlist}{% + \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}\usepackage{longtable,booktabs,array} +\usepackage{calc} % for calculating minipage widths +% Correct order of tables after \paragraph or \subparagraph +\usepackage{etoolbox} +\makeatletter +\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{} +\makeatother +% Allow footnotes in longtable head/foot +\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}} +\makesavenoteenv{longtable} +\usepackage{graphicx} +\makeatletter +\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} +\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} +\makeatother +% Scale images if necessary, so that they will not overflow the page +% margins by default, and it is still possible to overwrite the defaults +% using explicit options in \includegraphics[width, height, ...]{} +\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} +% Set default figure placement to htbp +\makeatletter +\def\fps@figure{htbp} +\makeatother +% definitions for citeproc citations +\NewDocumentCommand\citeproctext{}{} +\NewDocumentCommand\citeproc{mm}{% + \begingroup\def\citeproctext{#2}\cite{#1}\endgroup} 
+\makeatletter + % allow citations to break across lines + \let\@cite@ofmt\@firstofone + % avoid brackets around text for \cite: + \def\@biblabel#1{} + \def\@cite#1#2{{#1\if@tempswa , #2\fi}} +\makeatother +\newlength{\cslhangindent} +\setlength{\cslhangindent}{1.5em} +\newlength{\csllabelwidth} +\setlength{\csllabelwidth}{3em} +\newenvironment{CSLReferences}[2] % #1 hanging-indent, #2 entry-spacing + {\begin{list}{}{% + \setlength{\itemindent}{0pt} + \setlength{\leftmargin}{0pt} + \setlength{\parsep}{0pt} + % turn on hanging indent if param 1 is 1 + \ifodd #1 + \setlength{\leftmargin}{\cslhangindent} + \setlength{\itemindent}{-1\cslhangindent} + \fi + % set entry spacing + \setlength{\itemsep}{#2\baselineskip}}} + {\end{list}} +\usepackage{calc} +\newcommand{\CSLBlock}[1]{\hfill\break\parbox[t]{\linewidth}{\strut\ignorespaces#1\strut}} +\newcommand{\CSLLeftMargin}[1]{\parbox[t]{\csllabelwidth}{\strut#1\strut}} +\newcommand{\CSLRightInline}[1]{\parbox[t]{\linewidth - \csllabelwidth}{\strut#1\strut}} +\newcommand{\CSLIndent}[1]{\hspace{\cslhangindent}#1} + +%%%% Standard Packages + +\usepackage{graphicx}% +\usepackage{multirow}% +\usepackage{amsmath,amssymb,amsfonts}% +\usepackage{amsthm}% +\usepackage{mathrsfs}% +\usepackage[title]{appendix}% +\usepackage{xcolor}% +\usepackage{textcomp}% +\usepackage{manyfoot}% +\usepackage{booktabs}% +\usepackage{algorithm}% +\usepackage{algorithmicx}% +\usepackage{algpseudocode}% +\usepackage{listings}% + +%%%% + +\raggedbottom +\usepackage{annotate-equations} +\makeatletter +\@ifpackageloaded{caption}{}{\usepackage{caption}} +\AtBeginDocument{% +\ifdefined\contentsname + \renewcommand*\contentsname{Table of contents} +\else + \newcommand\contentsname{Table of contents} +\fi +\ifdefined\listfigurename + \renewcommand*\listfigurename{List of Figures} +\else + \newcommand\listfigurename{List of Figures} +\fi +\ifdefined\listtablename + \renewcommand*\listtablename{List of Tables} +\else + \newcommand\listtablename{List of Tables} 
+\fi +\ifdefined\figurename + \renewcommand*\figurename{Figure} +\else + \newcommand\figurename{Figure} +\fi +\ifdefined\tablename + \renewcommand*\tablename{Table} +\else + \newcommand\tablename{Table} +\fi +} +\@ifpackageloaded{float}{}{\usepackage{float}} +\floatstyle{ruled} +\@ifundefined{c@chapter}{\newfloat{codelisting}{h}{lop}}{\newfloat{codelisting}{h}{lop}[chapter]} +\floatname{codelisting}{Listing} +\newcommand*\listoflistings{\listof{codelisting}{List of Listings}} +\makeatother +\makeatletter +\makeatother +\makeatletter +\@ifpackageloaded{caption}{}{\usepackage{caption}} +\@ifpackageloaded{subcaption}{}{\usepackage{subcaption}} +\makeatother +\ifLuaTeX + \usepackage{selnolig} % disable illegal ligatures +\fi +\usepackage{bookmark} + +\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available +\urlstyle{same} % disable monospaced font for URLs +\hypersetup{ + pdftitle={Cell type-specific contextualisation of the phenomic landscape: a comprehensive and scalable approach towards the diagnosis, prognosis and treatment of all rare diseases}, + pdfauthor={Brian M. Schilder; Kitty B. Murphy; Robert Gordon-Smith; Jai Chapman; Momoko Otani; Nathan G. 
Skene}, + pdfkeywords={rare disease, phenotype, single-cell, gene therapy}, + colorlinks=true, + linkcolor={blue}, + filecolor={Maroon}, + citecolor={Blue}, + urlcolor={Blue}, + pdfcreator={LaTeX via pandoc}} + +\title[Cell type-specific contextualisation of the phenomic landscape: a +comprehensive and scalable approach towards the diagnosis, prognosis and +treatment of all rare diseases]{Cell type-specific contextualisation of +the phenomic landscape: a comprehensive and scalable approach towards +the diagnosis, prognosis and treatment of all rare diseases} + +% author setup +\author*[aff-1]{\fnm{Brian M.} \sur{Schilder}}\email{brian\_schilder@alumni.brown.edu}\author[aff-1]{\fnm{Kitty B.} \sur{Murphy}}\author[aff-1]{\fnm{Robert} \sur{Gordon-Smith}}\author[aff-1]{\fnm{Jai} \sur{Chapman}}\author[aff-1]{\fnm{Momoko} \sur{Otani}}\author*[aff-1]{\fnm{Nathan G.} \sur{Skene}}\email{n.skene@imperial.ac.uk} +% affil setup +\affil[aff-1]{, \orgname{Imperial College London}} + +% abstract + + +% keywords +\keywords{rare disease, phenotype, single-cell, gene therapy} + +\begin{document} +\maketitle + +\subsection{Abstract}\label{abstract} + +Rare diseases (RDs) are an extremely heterogeneous and underserved +category of medical conditions. While the majority of RDs are strongly +genetic, it remains largely unknown via which physiological mechanisms +genetics cause RD. Therefore, we sought to systematically characterise +the cell type-specific mechanisms underlying all RD phenotypes with a +known genetic cause by leveraging the Human Phenotype Ontology and +transcriptomic single-cell atlases of the entire human body from +embryonic, foetal, and adult samples. In total we identified significant +associations between 201 cell types and 9,575/11,028 (86.7\%) unique +phenotypes across 8,628 RDs. We estimate that this represents an over +500-fold increase in the collective knowledge of RD phenotype-cell type +mechanisms. 
+ +Next, we demonstrated how these results may be used for personalised +patient diagnosis and prognosis, as well as the development of novel +therapeutics. Finally, we take a data-driven approach to highlight +several of the most promising gene/cell therapy candidates with the +highest probability of animal model-to-human patient translation. +Furthermore, we have made these results entirely reproducible and freely +accessible to the global community to maximise their impact. To +summarise, this work represents a significant step forward in the +mission to treat patients across an extremely diverse spectrum of +serious RDs. + +\subsection{Introduction}\label{sec-introduction} + +While rare diseases (RDs) are individually uncommon, they collectively +account for an enormous global disease burden with over 10,000 +recognised RDs affecting at least 300-400 million people +globally\textsuperscript{1} (1 in 10-20 people)\textsuperscript{2} . +Over 75\% of RDs primarily affect children with a 30\% mortality rate by +5 years of age\textsuperscript{3}. Despite the prevalence and severity +of RDs, patients suffering from these conditions are vastly underserved +due to several contributing factors. First, diagnosis is extremely +challenging due to the highly variable clinical presentations of many of +these diseases. The diagnostic odyssey can take patients and their +families decades, with an average time to diagnosis of 5 +years\textsuperscript{4}. Of those, \textasciitilde46\% receive at least +one incorrect diagnosis and over 75\% of all patients never receive any +diagnosis\textsuperscript{5}. Second, prognosis is also made difficult +by high variability in disease course and outcomes which makes matching +patients with effective and timely treatment plans even more +challenging. Finally, even for patients who receive an accurate +diagnosis/prognosis, treatments are currently only available for less +than 5\% of all RDs\textsuperscript{6}. 
In addition to the scientific +challenges of understanding RDs, there are strong financial +disincentives for pharmaceutical and biotechnology companies to develop +expensive therapeutics for exceedingly small RD patient populations with +little or no return on investment\textsuperscript{7,8}. Those that have +been produced are amongst the world's most expensive drugs, greatly +limiting patients' ability to access it\textsuperscript{9,10}, The +provision of timely, effective and affordable care for RD patients will +require substantive transformations to our existing scientific, +clinical, and regulatory frameworks. + +A major challenge in both healthcare and scientific research is the +scalable exchange of information. Even in the age of electronic +healthcare records (EHR) much of the information about an individual's +history is currently fractured across healthcare providers, often with +differing nomenclatures for the same conditions. The Human Phenotype +Ontology (HPO) is a hierarchically organised set of controlled clinical +terms that provides a much needed common framework by which clinicians +and researchers can precisely communicate patient +conditions\textsuperscript{14}. The HPO spans all domains of human +physiology and currently describes 18082 phenotypes across 10,300 RDs. +Each phenotype and disease is assigned its own unique identifier and +organised as a hierarchical graph, such that higher-level terms describe +broad phenotypic categories or \emph{branches} (e.g.~\emph{HP:0033127}: +`Abnormality of the musculoskeletal system' which contains 4495 unique +phenotypes) and lower-level terms describe increasingly precise +phenotypes (e.g.~\emph{HP:0030675}: ``Contracture of proximal +interphalangeal joints of 2nd-5th fingers''). It has already been +integrated into healthcare systems and clinical diagnostic tools around +the world, with increasing adoption over time\textsuperscript{11}. 
+Common ontology-controlled frameworks like the HPO open a wealth of new +opportunities, especially when addressing RDs. Services such as the +Matchmaker Exchange\textsuperscript{15,16} have enabled the discovery of +hundreds of underlying genetic etiologies, and led to the diagnosis of +many patients. This also opens the possibility of gathering cohorts of +geographically dispersed patients to run clinical trials, the only +viable option for treatment in many individuals. To further increase the +number of individuals who qualify for these treatments, as well as the +trial sample size, proposals have been made deviate from the traditional +single-disease clinical trial model and instead perform basket trials on +groups of RDs with shared molecular etiologies +(SaME)\textsuperscript{17}. However this approach, and indeed much of RD +patient care, hinges upon first characterising the molecular mechanisms +underlying each RD. + +Over 80\% of RDs have a known genetic cause\textsuperscript{18,19}. +Despite this our knowledge of the physiological mechanisms via which +genetics cause pathogenesis is lacking for most RDs, severely hindering +our ability to effectively diagnose, prognose and treat RD patients. The +availability of standardised, ontology-controlled databases presents +opportunities to systematically investigate RDs at scale. Since 2008, +the HPO has been continuously updated using knowledge from the medical +literature, as well as by integrating databases of expert validated +gene-phenotype relationships, such as OMIM\textsuperscript{20--22}, +Orphanet\textsuperscript{23,24}, and DECIPHER\textsuperscript{25}. A +subset of the HPO contains gene annotations for 11,047 phenotypes across +8,631 diseases. Yet genes alone do not tell the full story of how RDs +come to be, as their expression and functional relevance varies +drastically across the multitude of tissues and cell types contained +within the human body. 
+ +Our knowledge of single-cell-resolution biology has exploded over the +course of the last decade and a half, with numerous applications in both +scientific and clinical practices\textsuperscript{26--28}. More +recently, comprehensive single-cell transcriptomic atlases across +tissues have also emerged\textsuperscript{29,30}. In particular, the +Descartes Human\textsuperscript{31} and Human Cell +Landscape\textsuperscript{32} projects provide comprehensive +multi-system single-cell RNA-seq (scRNA-seq) atlases in embryonic, +foetal, and adult human samples from across the human body. These +datasets provide data-driven gene signatures for hundreds of cell +subtypes. They also allow us to investigate disease mechanisms in the +context of specific life stages. + +Here, we combine and extend several of the most comprehensive genomic +and transcriptomic resources currently available to systematically +uncover the cell types underlying granular phenotypes across 8,628 +diseases. We then go on to highlight thousands of novel phenotype-cell +type associations which collectively expand our knowledge of cell +type-resolved phenotypes by an estimated 567-fold. Next, we present +several potential avenues for real world applications of these results +in the context of RD patient diagnosis, prognosis, treatment, and +therapeutics development. + +\subsection{Results}\label{sec-results} + +\subsubsection{Phenotype-cell type +associations}\label{phenotype-cell-type-associations} + +In this study we systematically investigated the cell types underlying +phenotypes across the HPO. A summary of the phenome-wide results +stratified by single-cell atlas can be found in \textbf{?@tbl-summary}. +Within the results using the Descartes Human single-cell atlas, 19,929/ +848,078 (2.35\%) tests across 77/ 77 (100\%) cell types and 7,340/11,047 +(66.4\%) phenotypes revealed significant phenotype-cell type +associations after multiple-testing correction (\(FDR_{p,c}<0.05\)). 
+Using the Human Cell Landscape single-cell atlas, 26,585/1,358,916 +(1.96\%) tests across 124/124 (100\%) cell types and 9,049/11,047 +(81.9\%) phenotypes showed significant phenotype-cell type associations +(\(FDR_{p,c}<0.05\)). The median number of significantly associated +phenotypes per cell type was 252 (Descartes Human) and 200 (Human Cell +Landscape), respectively. + +Across both single-cell references, the median number of significantly +associated cell types per phenotype was 3, suggesting reasonable +specificity of the testing strategy. 8,628/8,631 (\textasciitilde100\%) +of diseases within the HPO gene annotations showed significant cell type +associations for at least one of their respective phenotypes. + +\subsubsection{Validation of expected phenotype-cell type +relationships}\label{validation-of-expected-phenotype-cell-type-relationships} + +Within each high-level branch in the HPO shown in +Fig.~\ref{fig-summary}b, we tested whether each cell type was more often +associated with phenotypes in that branch relative to those in all other +branches (including those not shown). We then checked whether each cell +type was overrepresented (at \(FDR_{b,c}<0.05\)) within its respective +on-target HPO branch, where the number of phenotypes within that branch +(\(N_{p}\)). Abnormality of the cardiovascular system: 5/6 types of +`cardiocyte' were overrepresented (\(N_{p}\)=673). Abnormality of the +endocrine system: 3/4 types of `endocrine cell' were overrepresented +(\(N_{p}\)=291). Abnormality of the eye: 5/5 types of `photoreceptor +cell/retinal cell' were overrepresented (\(N_{p}\)=721). Abnormality of +the immune system: 4/4 types of `leukocyte' were overrepresented +(\(N_{p}\)=255). Abnormality of the musculoskeletal system: 4/4 types of +`cell of skeletal muscle/chondrocyte' were overrepresented +(\(N_{p}\)=2155). Abnormality of the nervous system: 19/23 types of +`neural cell' were overrepresented (\(N_{p}\)=1647). 
Abnormality of the +respiratory system: 2/2 types of `respiratory epithelial cell/epithelial +cell of lung' were overrepresented (\(N_{p}\)=292). + +As an additional form of validation (Fig.~\ref{fig-summary}d), we tested +for a relationship between phenotype-cell type association significance +(\(-log_{e}(p_{p,c})\) where \(log_{e}\) denotes natural log and +\(p_{p,c}\) denotes uncorrected phenotype-cell type association +p-values) and the proportion of on-target cell types. The list of +on-target cell types was determined by matching each high-level HPO +branch to a corresponding CL branch. These cross-ontology mappings can +be found in \textbf{?@tbl-celltypes}. For this analysis we used raw +p-values (\(p_{p,c}\)) rather than multiple-testing corrected p-values +(\(FDR_{p,c}\)) to provide a more dynamic range of values (as the latter +can drive values to 1). All 7/7 high-level HPO branches showed a +consistent upwards trend towards greater proportions of on-target cell +types with increasing degrees of significance. Furthermore, all branches +also showed a proportion of on-target cell types above that expected by +chance (baseline = on-target cell types / total cell types) at +\(-log_{e}(p_{p,c})>1\). + +\phantomsection\label{cell-fig-summary} +\begin{figure}[H] + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-summary-1.pdf} + +} + +\caption{\label{fig-summary}Summary of significant associations between +phenotypes and cell types, aggregated by HPO branch. Here we show +\textbf{a}, the total number of significant phenotype enrichments per +cell type (\(FDR_{p,c}<0.05\)) across all branches of the HPO. +\textbf{b}, the number of phenotype associations related to several +high-level branches of the HPO. 
Asterisks above each bar indicate +whether that cell type was significantly more often enriched in that +branch relative to all other HPO branches, including those not shown +here, as a proxy for how specifically that cell type is associated with +that branch; \(FDR _{b,c}<1e-04\) (****), \(FDR _{b,c}<0.001\) (***), +\(FDR_{b,c}<0.01\) (**), \(FDR _{b,c}<0.05\) (*). \textbf{c}, Dendrogram +derived from the Cell Ontology (CL) showing the relatedness of all +tested cell types to one another. For simplicity, cell type labels shown +here are aligned to the CL\textsuperscript{33} and can therefore +encompass one or more cell types annotated by the original authors of +scRNA-seq datasets\textsuperscript{31,32}. \textbf{d}, Percentage of +significant phenotype associations with on-target cell types (second row +of facet labels), respective to the HPO branch. As significance +increases (\(-log_{10}(p)\) along the \emph{x-axis}) the percentage of +on-target enriched cell types also increases (\emph{y-axis}).} + +\end{figure}% + +\subsubsection{Validation of inter- and intra-dataset +consistency}\label{validation-of-inter--and-intra-dataset-consistency} + +Next, we sought to validate the consistency of our results across the +two single-cell reference datasets (Descartes Human vs.~Human Cell +Landscape) across the subset of overlapping cell types +(Fig.~\ref{fig-ctd-correlation}). In total there were 142285 +phenotype-cell type associations to compare across the two datasets +(across 10945 phenotypes and 13 cell types annotated to the exact same +CL term). We found that the correlation between p-values of the two +datasets was high (\(rho=0.492, p=1.08e-93\)). Within the subset of +results that were significant in both single-cell datasets +(\(FDR_{p,c}<0.05\)), we found that the correlation of the association +effect sizes was even stronger (\(rho=0.723, p=1.08e-93\)). 
We also +checked for the intra-dataset consistency between the p-values of the +foetal and adult samples in the Human Cell Landscape, showing a very +similar degree of correlation as the inter-dataset comparison +(\(rho=0.436, p=2.36e-149\)). Together, these results suggest that our +approach to identifying phenotype-cell type associations is highly +replicable and generalisable to new datasets. + +\subsubsection{More specific phenotypes are associated with fewer genes +and cell +types}\label{more-specific-phenotypes-are-associated-with-fewer-genes-and-cell-types} + +First, we found that phenotype ontology level showed a significant negative +correlation with the number of genes annotated to that phenotype in the +HPO data (Fig.~\ref{fig-ontology-lvl}a; +\(p=2.23e-308, q=2.23e-308, rho=-0.2634\)). This is expected as broader +phenotypes tend to have larger gene set annotations. Next, we reasoned +that deeper HPO ontology levels, representing more specific phenotypes, +were likely to be associated with fewer, more specific subsets of cell +types. This was indeed the case, as we observed a strongly significant +negative correlation between the two variables +(Fig.~\ref{fig-ontology-lvl}b; +\(p=2.23e-308, q=2.23e-308, rho=-0.2927\)). We also found that the +effect size of significant phenotype-cell type associations +(\(FDR_{p,c}<0.05\)) increased with greater phenotype specificity, +though the relationship was rather weak (Fig.~\ref{fig-ontology-lvl}c; +\(p=7.30e-97, q=7.30e-97, rho=0.0966\)). Finally, we found that the mean +expression specificity of phenotype-associated genes (within the cell +types significantly associated with those respective phenotypes at +\(FDR_{p,c}<0.05\)) was positively correlated with phenotype ontology depth +(Fig.~\ref{fig-ontology-lvl}d; +\(p=2.71e-174, q=3.61e-174, rho=0.1398\)). 
+ +\phantomsection\label{cell-fig-ontology-lvl} +\begin{figure}[H] + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-ontology-lvl-1.pdf} + +} + +\caption{\label{fig-ontology-lvl}More specific phenotypes are associated +with fewer, more specific genes and cell types. Box plots showing +relationship between HPO phenotype level and \textbf{a}, the number of +genes annotated to each phenotype, \textbf{b}, the number of +significantly enriched cell types, \textbf{c}, the effect size of +phenotype-cell type association tests at \(FDR_{p,c}<0.05\), and +\textbf{d}, the mean expression specificity of phenotype-associated +genes in the cell types significantly associated with those respective +phenotypes (\(FDR_{p,c}<0.05\)). Ontology level 0 represents the most +inclusive HPO term `All', while higher ontology levels (max=16) indicate +progressively more specific HPO terms (e.g.~`Contracture of proximal +interphalangeal joints of 2nd-5th fingers'). Boxes are coloured by the +mean value (respective to the subplot) within each HPO level.} + +\end{figure}% + +\subsubsection{Hepatoblasts have a unique role in recurrent Neisserial +infections}\label{hepatoblasts-have-a-unique-role-in-recurrent-neisserial-infections} + +We selected the HPO term `Recurrent bacterial infections' and all of its +descendants (19 phenotypes) as an example of how investigations at the +level of granular phenotypes can reveal different cell type-specific +mechanisms (Fig.~\ref{fig-rni}). As expected, these phenotypes are +primarily associated with immune cell types (e.g.~macrophages, dendritic +cells, T cells, monocytes, neutrophils). Some associations confirm +relationships previously suggested in the literature, such as that +between `Recurrent staphylococcal infections' and myeloid +cells\textsuperscript{34--37}. Specifically, our results pinpoint +monocytes as the most strongly associated cell subtypes +(\(FDR_{p,c}= 1.03e-30,B= 1.76e-01\)). 
+ +In contrast to all other recurrent infection types, `Recurrent +Neisserial infections' highlighted a novel association with hepatoblasts +(Descartes Human : \(FDR_{p,c}= 1.13e-06,B= 8.24e-02\)). Whilst +unexpected, a convincing explanation involves the complement system, a +key driver of innate immune response to Neisserial infections. +Hepatocytes, which derive from hepatoblasts, produce the majority of +complement proteins\textsuperscript{38}, and Kupffer cells express +complement receptors\textsuperscript{39}. In addition, individuals with +deficits in complement are at high risk for Neisserial +infections\textsuperscript{40,41}, and a genome-wide association study +in those with a Neisserial infection identified risk variants within +complement proteins\textsuperscript{42} . While the potential of +therapeutically targeting complement in RDs (including Neisserial +infections) has been proposed previously\textsuperscript{43,44}, +performing this in a gene- and cell type-specific manner may help to +improve efficacy and reduce toxicity (e.g.~due to off-target effects). +Importantly, there are over 56 known genes within the complement +system\textsuperscript{45}, highlighting the need for a systematic, +evidence-based approach to identify effective gene targets. + +Also of note, despite the fact that our datasets contain both +hepatoblasts and their mature counterpart, hepatocytes, only the +hepatoblasts showed this association. This suggests that the genetic +factors that predispose individuals for risk of Neisserial infections +are specifically affecting hepatoblasts before they become fully +differentiated. It is also notable that these phenotypes were the only +ones within the `Recurrent bacterial infections' branch, or even the +broader `Recurrent infections' branch, perhaps indicating a unique role +for hepatoblasts in recurrent infectious disease. 
The only phenotypes +within the even broader `Abnormality of the immune system' HPO branch +that significantly associated with mature hepatocytes were +`Pancreatitis' (\(FDR_{p,c}= 2.08e-02,B= 5.25e-02\)) and `Susceptibility +to chickenpox' (\(FDR_{p,c}= 1.20e-02,B= 5.49e-02\)) both of which are +well-known to involve the liver\textsuperscript{46--48}. + +\phantomsection\label{cell-fig-rni} +\begin{figure}[H] + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-rni-1.pdf} + +} + +\caption{\label{fig-rni}Hepatoblasts have a unique role in recurrent +Neisserial infections. Significant phenotype-cell type tests for +phenotypes within the branch `Recurrent bacterial infections'. Amongst +all different kinds of recurrent bacterial infections, hepatoblasts +(highlighted by vertical dotted lines) are exclusively enriched in +`Recurrent gram−negative bacterial infections'. Note that terms from +multiple levels of the same ontology branch are shown as separate facets +(e.g.~`Recurrent bacterial infections' and `Recurrent gram−negative +bacterial infections').} + +\end{figure}% + +Next, we sought to link multi-scale mechanisms at the levels of disease, +phenotype, cell type, and gene and visualise these as a network +(Fig.~\ref{fig-network-rni}). This revealed that genetic deficiencies in +different complement system genes (\emph{C5}, \emph{C8}, and \emph{C7}) +are primarily mediated by different cell types (hepatoblasts, stratified +epithelial cells, and stromal cells, respectively). While genes of the +complement system are expressed throughout many different tissues and +cell types, these results indicate that different subsets of these genes +may mediate their effects through different cell types. This finding +suggests that investigating (during diagnosis) and targeting (during +treatment) different cell types may be critical for the diagnosis and +treatment of these closely related, yet mechanistically distinct, +diseases. 
+ +\phantomsection\label{cell-fig-network-rni} +\begin{figure}[H] + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-network-rni-1.pdf} + +} + +\caption{\label{fig-network-rni}Multi-scale mechanisms of Recurrent +Neisserial infections. Starting from the bottom of the plot, one can +trace how causal genes (yellow boxes) mediate their effects through cell +types (orange circles), phenotypes (purple cylinders) and ultimately +diseases (blue cylinders). Cell types are connected to phenotypes via +association testing (\(FDR_{p,c}<0.05\)), and to diseases when the +symptom gene set overlap is \textgreater25\%. Nodes were spatially +arranged using the Sugiyama algorithm\textsuperscript{49}.} + +\end{figure}% + +\subsubsection{Monarch Knowledge Graph +recall}\label{monarch-knowledge-graph-recall} + +Next, we used the Monarch Knowledge Graph (MKG) as a proxy for the +field's current state of knowledge of phenotype-cell type associations. +We evaluated the proportion of MKG associations that were recapitulated +by our results. In total, our results contained at least one significant +cell type association for \textgreater90\% of the phenotypes described +in the MKG. Of these phenotypes, we captured \textgreater45\% of the MKG +phenotype-cell associations when only considering exact overlap of +CL-aligned cell type annotations. This proportion increased with greater +flexibility in the matching of cell type annotations, reaching a maximum +of \texttt{**!!RECOMPUTE!!**}\% at an ontology graph distance of +\texttt{**!!RECOMPUTE!!**} when considering the overlap of cell type +annotations at the level of cell type ontology terms. This suggests that +our results are in line with the current state of knowledge, and that +our approach can be used to identify novel phenotype-cell type +associations. 
+ +\subsubsection{Annotation of phenotypes using generative large language +models}\label{annotation-of-phenotypes-using-generative-large-language-models} + +Severity annotations were gathered from GPT-4 for 16982/18082 +(93.9166\%) HPO phenotypes. In our companion study, we performed benchmarking tests +of these results using ground-truth HPO branch annotations. For example, +phenotypes within the `Blindness' HPO branch (\emph{HP:0000618}) were +correctly annotated as causing blindness by GPT-4. Across all +annotations, the recall rate of GPT-4 annotations was 91.26\% +(min=70.1\%, max=100\%, SD=11.84) with a mean consistency score of +91.21\% (min=80.96\%, max=97.48\%, SD=5.739) for phenotypes whose +annotations were collected more than once. This clearly demonstrates the +ability of GPT-4 to accurately annotate phenotypes. This allowed us to +begin using these annotations to compute systematically collected +severity scores for all phenotypes in the HPO. + +From these annotations we computed a weighted severity score metric for +each phenotype ranging from 0-100 (100 being the theoretical maximum +severity of a phenotype that always causes every annotation). Within our +annotations, the most severe phenotype was `Anencephaly' +(\emph{HP:0002323}) with a severity score of 58, followed by +`Atrophy/Degeneration affecting the central nervous system' +(\emph{HP:0007367}) with a severity score of 58. There were 677 +phenotypes with a severity score of 0 (e.g.~`Thin toenail'). The mean +severity score across all phenotypes was 14.89 (median=14, standard +deviation=8.517). + +\subsubsection{Enrichment of foetal cell types in congenital +phenotypes}\label{enrichment-of-foetal-cell-types-in-congenital-phenotypes} + +The frequency of congenital onset for each phenotype (as determined by +GPT-4 annotations) was strongly predictive of the proportion of +significantly associated foetal cell types in our results +(\(p=2e-203,\chi^2_{Pearson}=940,\hat{V}_{Cramer}=0.14\)). 
Furthermore, +increasing congenital frequency annotation (on an ordinal scale) +corresponded to an increase in the proportion of foetal cell types: +`always'=24\% (n=1636 associations), `often'=20\% (n=2979 associations), +`rarely'=12\% (n=1956 associations), `never'=10\% (n=811 associations). +This is consistent with the expected role of foetal cell types in +development and the aetiology of congenital disorders. + +\phantomsection\label{cell-fig-congenital} +\begin{figure}[H] + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-congenital-1.pdf} + +} + +\caption{\label{fig-congenital}Congenital phenotypes are more often +associated with foetal cell types. The more often a phenotype is +congenital in nature, the greater the proportion of foetal cell types that are +significantly associated with it.} + +\end{figure}% + +\subsubsection{Diagnosis via cell type-specific disease +prediction}\label{diagnosis-via-cell-type-specific-disease-prediction} + +Using the function \texttt{MSTExplorer::predict\_celltypes} we input 3 +inclusion phenotypes (`Generalized neonatal hypotonia' +(\emph{HP:0008935}), `Scrotal hypospadias' (\emph{HP:0012853}), +`Increased circulating progesterone' (\emph{HP:0031216})), 2 genes in +which the patient is known to have deleterious mutations (\emph{HSD3B2}, +\emph{HERC2}) and 1 gene in which the patient is known not to have any +deleterious mutations (\emph{SNORD115-1}). This predicted that cortical +cells of the adrenal gland (score sum=1.38, score mean=0.0256, score standard +deviation=0.137) were the most probable cell type underlying this +combination of phenotypes and genotypes (Fig.~\ref{fig-diagnosis}), +which is highly consistent with existing evidence that adrenal +insufficiency can cause both phenotypes via mutations in these +genes\textsuperscript{50,51}. This was the only cell type to receive a +score two standard deviations from the mean score of all cell types +(mean score: 0.000668). 
+ +\phantomsection\label{cell-fig-diagnosis} +\begin{figure}[H] + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-diagnosis-1.pdf} + +} + +\caption{\label{fig-diagnosis}Diagnosis - Observed phenotypes/genotypes +can be used to identify causal cell types in individuals. Our +phenotype-cell type association results can be used to make predictions +about which cell types are underlying a set of phenotypes observed in a +given patient. Here we input three inclusion phenotypes, two inclusion +genes, and one exclusion gene into the function +\texttt{MSTExplorer::predict\_celltypes}. The output is a ranked list of +the top 10 most probable cell types (\emph{x-axis}) underlying this +combination of phenotypes/genotypes (highest to lowest rank from left to +right). The score on the \emph{y-axis} is computed by aggregating +phenotype-celltype association summary statistics and evidence-weighted +phenotype-gene associations. In this simple example, cortical cells of +the adrenal gland were predicted as the most probable cell type. The +mean of the score sum is shown as a dashed line, while one standard +deviation (SD) above this is shown as a dotted line. Each bar is +coloured by its mean.} + +\end{figure}% + +\subsubsection{Prognosis via cell type-mediated differential +outcomes}\label{prognosis-via-cell-type-mediated-differential-outcomes} + +Hypotonia (\emph{HP:0001252}) is a very broad phenotype containing 13 +subterms (e.g.~``Generalised neonatal hypotonia'') and is associated +with 2569 unique diseases in the HPO gene annotations. Together, these +hypotonia phenotypes were significantly associated with 29/99 (29.29\%) +unique CL-aligned cell types. This reflects the highly variable set of +disease etiologies that can cause this broad-level phenotype. Across all +diseases, hypotonia phenotypes tended to be most consistently severe +(lower mean age of death score) when associated with the cell type +inhibitory interneuron. 
While other cell types were associated with +lower mean age of death scores (e.g.~stromal cell, astrocyte), the +severity of the outcomes was more variable. + +\phantomsection\label{cell-fig-prognosis} +\begin{figure}[H] + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-prognosis-1.pdf} + +} + +\caption{\label{fig-prognosis}Prognosis - Cell types predict the +probability of deadly diseases. The broad phenotype `Hypotonia' and its +descendants occur in many different diseases (1,832 diseases in the HPO +annotations). Therefore, it can be difficult to prognose clinical +outcomes of a newborn individual with hypotonia. With additional +knowledge of the particular cell types underlying a patient's hypotonia +phenotype, one can greatly narrow down the range of potential outcomes +(e.g.~age of death). \textbf{a}, Here, we show the various cell types by +which hypotonia phenotypes confer disease risk. \textbf{b}, We also +computed the mean age of death score for each cell type across +hypotonia-associated diseases, revealing that disrupted inhibitory +neurons confer the greatest risk of early death. Ordinal age of death +categories from the HPO disease annotations were encoded numerically and +averaged (\textbf{?@tbl-death}) to produce mean Age of Death scores for +each disease (on a scale from 1-8). For example, a score of 1 +corresponds to prenatal death, while a score of 8 corresponds to death +in late adulthood.} + +\end{figure}% + +\subsubsection{Therapeutic target +identification}\label{therapeutic-target-identification} + +Next, we identified putative cell type-specific gene targets for several +severe disease phenotypes. This yielded putative therapeutic targets for +5287 phenotypes across 4850 diseases in 201 cell types and 3180 genes +(Fig.~\ref{fig-therapy-filter}). While this constitutes a large number +of genes in total, each phenotype was assigned a median of 2 gene +targets (mean=3.29, min=1, max=10). 
Relative to the number of gene +annotations per phenotype in the HPO overall (median=7, mean=61.95, +min=1, max=5003) this represents a substantial decrease in the number of +candidate target genes, even when excluding high-level phenotypes (HPO +level\textgreater3). It is also important to note that the phenotypes in +the prioritised targets list are ranked by their severity, allowing us +to distinguish phenotypes with a high medical urgency +(e.g.~`Hydranencephaly') from those with lower medical urgency +(e.g.~`Hyperplastic labia majora'). This can be useful for +clinicians, biomedical scientists, and pharmaceutical manufacturers who +wish to focus their research efforts on phenotypes with the greatest +need for intervention. + +Across all phenotypes, epithelial cell were most commonly implicated +(834 phenotypes), followed by stromal cell (627 phenotypes), +neuron (478 phenotypes), chondrocyte (385 +phenotypes), and endothelial cell (363 phenotypes). Grouped by +higher-order ontology category, `Abnormality of the musculoskeletal +system' had the greatest number of enriched phenotypes (961 phenotypes, +863 genes), followed by `Abnormality of the nervous system' (745 +phenotypes, 1163 genes), `Abnormality of head or neck' (545 phenotypes, +997 genes), `Abnormality of the genitourinary system' (446 phenotypes, +710 genes), and `Abnormality of the eye' (379 phenotypes, 572 genes). + +\subsubsection{Therapeutic target +validation}\label{therapeutic-target-validation} + +To determine whether the genes prioritised by our therapeutic targets +pipeline were plausible, we checked what percentage of gene therapy +targets we recapitulated. Data on therapeutic approval status was +gathered from the Therapeutic Target Database (TTD; release +2024-03-22)\textsuperscript{52}. Overall, we prioritised 79\% of all +non-failed existing gene therapy targets. 
A hypergeometric test +confirmed that our prioritised targets were significantly enriched for +non-failed gene therapy targets (\(p=0.0104\)). Importantly, we did not +prioritise any of the failed therapeutics (0\%), defined as having been +terminated or withdrawn from the market. The hypergeometric test for +depletion of failed targets did not reach significance (\(p=0.365\)), +but this is to be expected as there was only one failed gene therapy +target in the TTD database. + +Even when considering therapeutics of any kind +(Fig.~\ref{fig-therapy-validate-all}), not just gene therapies, we +recapitulated 44\% of the non-failed therapeutic targets and 0\% of the +terminated/withdrawn therapeutic targets (n=1255). Here we found that +our prioritised targets were significantly enriched for non-failed +therapeutics (\(p=3e-19\)), and highly significantly depleted for failed +therapeutics (\(p=3e-199\)). This suggests that our multi-scale +evidence-based prioritisation pipeline is capable of selectively +identifying genes that are likely to be effective therapeutic targets. + +\phantomsection\label{cell-fig-therapy-validate} +\begin{figure}[H] + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-therapy-validate-1.pdf} + +} + +\caption{\label{fig-therapy-validate}Therapeutics - Validation of +prioritised therapeutic targets. The proportion of existing gene therapy +targets (documented in the Therapeutic Target Database) recapitulated by +our prioritisation pipeline. 
Therapeutics are stratified by the stage of +clinical development they were in at the time of writing.} + +\end{figure}% + +\subsubsection{Selected example targets}\label{selected-example-targets} + +\begin{figure} + +\begin{minipage}{0.50\linewidth} + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-therapy-examples-1.pdf} + +} + +\subcaption{\label{fig-therapy-examples-1}Lethal skeletal dysplasia} + +\end{minipage}% +% +\begin{minipage}{0.50\linewidth} + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-therapy-examples-2.pdf} + +} + +\subcaption{\label{fig-therapy-examples-2}GM2-ganglioside accumulation} + +\end{minipage}% +\newline +\begin{minipage}{\linewidth} + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-therapy-examples-3.pdf} + +} + +\subcaption{\label{fig-therapy-examples-3}Alzheimer disease} + +\end{minipage}% +\newline +\begin{minipage}{\linewidth} + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-therapy-examples-4.pdf} + +} + +\subcaption{\label{fig-therapy-examples-4}Parkinson disease} + +\end{minipage}% + +\caption{\label{fig-therapy-examples}Example cell type-specific gene +therapy targets for several severe phenotypes and their associated +diseases. Each disease (blue cylinders) is connected to its phenotype +(purple cylinders) based on well-established clinical observations +recorded within the HPO\textsuperscript{11}. Phenotypes are connected to +cell types (red circles) via association testing between weighted gene +sets (\(FDR_{p,c}<0.05\)). Each cell type is connected to the +prioritised gene targets (yellow boxes) based on the driver gene +analysis. The thickness of the edges connecting the nodes represents the +(mean) fold-change from the bootstrapped enrichment tests. 
Nodes were +spatially arranged using the Sugiyama algorithm\textsuperscript{49}.} + +\end{figure}% + +From our prioritised targets, we selected the following four sets of +phenotypes or diseases as examples: `Lethal skeletal dysplasia', +`GM2-ganglioside accumulation', `Alzheimer disease', `Parkinson +disease'. + +Skeletal dysplasia is a heterogeneous group of over 450 disorders that +affect the growth and development of bone and cartilage. This phenotype +can be lethal when deficient bone growth leads to the constriction of +vital organs such as the lungs. Even after surgical interventions, these +complications continue to arise as the child develops. Pharmacological +interventions to treat this condition have largely been ineffective. +While there are various cell types involved in skeletal system +development, our pipeline nominated chondrocytes as the causal cell type +underlying the lethal form of this condition. Assuringly, we found that +the disease `Achondrogenesis Type 1B' is caused by the genes +\emph{SLC26A2} and \emph{COL2A1} via chondrocytes. We also found that +`Platyspondylic lethal skeletal dysplasia, Torrance type'. Thus, in +cases where surgical intervention is insufficient, targeting these genes +within chondrocytes may prove a viable long-term solution for children +suffering from lethal skeletal dysplasia. + +Tay-Sachs disease is a devastating disease in which children are born +appearing healthy, which gradually degrades leading to death after 3-5 +years. The underlying cause is the toxic accumulation of gangliosides in +the nervous system due to a loss of the enzyme produced by \emph{HEXA}. +While this could in theory be corrected with gene editing technologies, +there remain some outstanding challenges. One of which is early +detection and diagnosis, before irreversible damage has occurred. Our +pipeline implicated extravillous trophoblasts of the placenta in +`GM2-ganglioside accumulation'. 
While not necessarily a target for gene +therapy, checking these cells \emph{in utero} for an absence of +\emph{HEXA} may serve as a viable biomarker as these cells normally +express the gene at high levels. Early detection of Tay-Sachs disease +may lengthen the window of opportunity for therapeutic intervention, +especially when genetic sequencing is not available or variants of +unknown significance are found within \emph{HEXA}. + +Alzheimer disease (AD) is the most common neurodegenerative condition. +It is characterised by a set of variably penetrant phenotypes including +memory loss, cognitive decline, cerebral proteinopathy. Interestingly, +we found that different forms of early onset AD (which are defined by +the presence of a specific disease gene) are each associated with +different cell types via different phenotypes. For example, AD 3 and AD +4 are primarily associated with cells of the digestive system +(`enterocyte', `gastric goblet cell') and are implied to be responsible +for the phenotypes `Senile plaques', `Alzheimer disease', `Parietal +hypometabolism in FDG PET', `Cerebral amyloid angiopathy'. Meanwhile, +early-onset autosomal dominant AD and AD 2 are primarily associated with +immune cells (`alternatively activated macrophage', `microglial cell') +and are implied to be responsible for the phenotypes `Neurofibrillary +tangles', `Long-tract signs', `Finger agnosia', `Semantic dementia'. +This suggests that different forms of AD may be driven by different cell +types and phenotypes, which may help explain its variability in onset +and clinical presentation. + +Finally, Parkinson disease (PD) is characterised by motor symptoms such +as tremor, rigidity, and bradykinesia. However there are a number of +additional phenotypes associated with the disease that span multiple +physiological systems. 
PD 19a and PD 8 seemed to align most closely with +the canonical understanding of PD as a disease of the central nervous +system in that they implicated oligodendrocytes and neurons. However, the +reference datasets used in this study were not annotated at +sufficient resolution to distinguish between different subtypes of +neurons, in particular dopaminergic neurons. PD 19a/8 also suggested +that risk variants in \emph{LRRK2} mediate their effects on PD through +both myeloid cells and oligodendrocytes by causing gliosis of the +substantia nigra. The remaining clusters of PD mechanisms revolved +around chondrocytes (PD 20), amacrine cells of the eye (hereditary +late-onset PD), and the respiratory/immune system (PD 14). While the +diversity in cell type-specific mechanisms is somewhat surprising, it +may help to explain the wide variety of cross-system phenotypes +frequently observed in PD. + +It should be noted that the HPO only includes gene annotations for the +monogenic forms of AD and PD. However, it has previously been shown that +there is at least partial overlap in their phenotypic and genetic +aetiology with respect to their common forms. Thus, understanding the +monogenic forms of these diseases may shed light onto their more common +counterparts. + +\subsubsection{Experimental model +translatability}\label{experimental-model-translatability} + +We computed interspecies translatability scores using a combination of +both ontological (\(SIM_{o}\)) and genotypic (\(SIM_{g}\)) similarity +relative to each homologous human phenotype and its associated genes +(Fig.~\ref{fig-animal-models}). In total, we mapped 278 non-human +phenotypes (in \emph{Caenorhabditis elegans}, \emph{Danio rerio}, +\emph{Mus musculus}, \emph{Rattus norvegicus}) to 849 homologous human +phenotypes. Amongst the 5287 phenotypes within our prioritised therapy +targets, 356 had viable animal models in at least one non-human species. 
+Per species, the number of homologous phenotypes was: \emph{Danio rerio} +(n=214), \emph{Mus musculus} (n=152), \emph{Caenorhabditis elegans} +(n=35), \emph{Rattus norvegicus} (n=3). Amongst our prioritised targets +with a GPT-4 severity score of \textgreater10, the phenotypes with the +greatest animal model similarity were `Anterior vertebral fusion' +(\(SIM_{o,g}=0.967\)), `Disc-like vertebral bodies' +(\(SIM_{o,g}=0.964\)), `Metaphyseal enchondromatosis' +(\(SIM_{o,g}=0.946\)), `Peripheral retinal avascularization' +(\(SIM_{o,g}=0.943\)), `Retinal vascular malformation' +(\(SIM_{o,g}=0.943\)). + +\subsection{Discussion}\label{sec-discussion} + +Across the 201 cell types and 11,047 RD-associated phenotypes +investigated, more than 46,514 significant phenotype-cell type +relationships were discovered. The examples we have highlighted above +recapitulate well-known relationships, provide additional cellular +context to many of these known relationships, and discover novel +relationships at multiple biological scales. + +Investigating RDs at the level of phenotypes offers several key +advantages. First, the vast majority of RDs only have one associated +gene (7671/8631 diseases = 89\%). Aggregating gene sets across diseases +into phenotype-centric ``buckets'' permits sufficiently well-powered +analyses, with an average of \textasciitilde76 genes per phenotype +(median=7) see Fig.~\ref{fig-diagram}. Second, we hypothesise that these +phenotype-level gene sets converge on a limited number of molecular and +cellular pathways. Perturbations to these pathways manifest as one or +more phenotypes which, when considered together, tend to be clinically +diagnosed as a certain disease. Third, RDs are often highly +heterogeneous in their clinical presentation across individuals, leading +to the creation of an ever increasing number of disease subtypes (some +of which only have a single documented case). 
In contrast, a +phenotype-centric approach enables us to more accurately describe a +particular individual's version of a disease without relying on the +generation of additional disease subcategories. By characterising an +individual's precise phenotypes over time, we may better understand the +underlying biological mechanisms that have caused their condition. +However, in order to achieve a truly precision-based approach to +clinical care, we must first characterise the molecular and cellular +mechanisms that cause the emergence of each phenotype. Here, we provide +a highly reproducible framework that enables this at the scale of the +entire phenome. This presents an opportunity to design basket trials of +patients with different diseases but overlapping phenotypes and cellular +mechanisms\textsuperscript{17}. It may be especially helpful for complex +patients with diagnostically ambiguous sets of phenotypes who would +otherwise be excluded from traditional clinical +trials\textsuperscript{53}. + +It was paramount to the success of this study to ensure our results were +anchored in ground-truth benchmarks, generated falsifiable hypotheses, +and rigorously guarded against false-positive associations. Extensive +validation using multiple approaches demonstrated that our methodology +consistently recapitulates expected phenotype-cell type associations +(Fig.~\ref{fig-summary}-Fig.~\ref{fig-congenital}). This was made +possible by the existence of comprehensive, structured ontologies for +all phenotypes (HPO) and cell types (CL), which provide an abundance of +clear and falsifiable hypotheses for which to test our predictions +against. 
Several key examples include 1) strong enrichment of +associations between cell types and phenotypes within the same +anatomical systems (Fig.~\ref{fig-summary}b-d), 2) a strong relationship +between phenotype-specificity and the strength and number of cell type +associations (Fig.~\ref{fig-ontology-lvl}), 3) identification of the +precise cell subtypes involved in susceptibility to various subtypes of +recurrent bacterial infections (Fig.~\ref{fig-rni}), 4) a strong +positive correlation between the frequency of congenital onset of a +phenotype and the proportion of developmental cell types associated with +it (Fig.~\ref{fig-congenital}), and 5) consistent phenotype-cell type +associations across multiple independent single-cell datasets +(Fig.~\ref{fig-ctd-correlation}). Having validated our phenotype-cell +type associations, we then went on to demonstrate how these results may +be used in each stage of clinical care: diagnosis +(Fig.~\ref{fig-diagnosis}), prognosis (Fig.~\ref{fig-prognosis}), +treatment, and therapeutics development +(Fig.~\ref{fig-therapy-examples}). + +Diagnosis is an essential but challenging step in RD patient care. +Additional phenotypes that emerge over time may assist a clinician to +reach a more confident disease diagnosis. However, many of these +phenotypes can have a serious impact on patient quality of life or +survival and avoiding them would be far better for patient outcomes. +Oftentimes phenotypes alone cannot clearly pinpoint the disease and +thus a diagnosis is never reached. Having a more complete understanding +of the mechanisms underlying observed phenotypes allows clinicians to +far more effectively make predictions about what additional, less +obvious phenotypes they should search for to confirm or reject their +hypothesis of disease diagnosis (e.g.~with imaging or biomarker tests). + +Consider the following hypothetical scenario. 
A clinician observes that +a newborn patient has several phenotypes (`Generalized neonatal +hypotonia', `Scrotal hypospadias', `Increased circulating +progesterone'), none of which conclusively point to a single disease +diagnosis. Under the strong suspicion that the phenotypes are genetic in +origin, the clinician orders whole-genome sequencing (WGS) on the +patient as well as the patient's family. The clinician finds that the +patient has a number of putative causal mutations, narrowing down the +number of potential diseases from hundreds to just 10. Further narrowing +down the possibilities at this stage can be extremely challenging even +for expert clinical geneticists. However, additional knowledge of which +tissues and cell types are primarily affected allows the clinician to +make a series of testable hypotheses that they may begin to investigate. +For example, two of the putative diseases are known to cause aberrant +splicing events in a gene that is only expressed in adrenocortical cells +(Fig.~\ref{fig-diagnosis}), providing justification to order a needle +biopsy of the adrenal gland. RNA sequencing is performed on the tissue +biopsy and it is discovered that the patient does indeed have high +expression of the dysfunctional transcript, confirming the disease +diagnosis\textsuperscript{54}. This opens new avenues for the patient to +receive timely and effective treatments for their specific condition, +which is important as their version of the disease tends to lead to +death in early childhood if left untreated (Fig.~\ref{fig-prognosis}). +Fortunately, their diagnosis now qualifies them to participate in a +clinical trial of a novel gene therapy with promising preliminary +results. Furthermore, it is predicted that this patient would respond +especially well to this treatment given that the mechanism of action of +the gene therapy primarily acts on adrenocortical cells +(Fig.~\ref{fig-therapy-examples}). 
+ +Unfortunately, there are currently only treatments available for less +than 5\% of RDs\textsuperscript{6}. Novel technologies including CRISPR, +prime editing, antisense oligonucleotides, viral vectors, and/or lipid +nanoparticles, have undergone significant advances in the last +several years\textsuperscript{55--59} and shown remarkable clinical +success in an increasing number of clinical +applications\textsuperscript{60--63}. The U.S. Food and Drug +Administration (FDA) recently announced a landmark program aimed +towards improving the international regulatory framework to take +advantage of the evolving gene/cell therapy +technologies\textsuperscript{64} with the aim of bringing dozens more +therapies to patients in a substantially shorter timeframe than +traditional pharmaceutical product development (typically 5-20 years +with a median of 8.3 years)\textsuperscript{65}. While these +technologies have the potential to revolutionise RD medicine, their +successful application is dependent on first understanding the +mechanisms causing each disease. + +To address this critical gap in knowledge, we used our results to create +a reproducible and customisable pipeline to nominate cell type-resolved +therapeutic targets +(Fig.~\ref{fig-therapy-filter}-Fig.~\ref{fig-therapy-examples}). +Targeting cell type-specific mechanisms underlying granular RD +phenotypes can improve therapeutic effectiveness by treating the causal +root of an individual's conditions\textsuperscript{56,66}. A cell +type-specific approach also helps to reduce the number of harmful side +effects caused by unintentionally delivering the therapeutic to +off-target tissues/cell types (which may induce aberrant gene activity), +especially when combined with technologies that can target cell surface +antigens (e.g.~viral vectors)\textsuperscript{67}. 
This has the +additional benefit of reducing the minimal effective dose of a +therapeutic, which can be both immunogenic and extremely financially +costly\textsuperscript{9,10,55,58}. Here, we demonstrate the utility of +a high-throughput evidence-based approach to RD therapeutics discovery +by highlighting several of the most promising therapeutic candidates. +Our pipeline takes into account a myriad of factors, including the +strength of the phenotype-cell type associations, symptom-cell type +associations, cell type-specificity of causal genes, the severity and +frequency of the phenotypes, suitability for gene therapy delivery +systems (e.g.~recombinant adeno-associated viral vectors (rAAV)), as +well as a quantitative analysis of phenotypic and genetic animal model +translatability (Fig.~\ref{fig-animal-models}). We validated these +candidates by comparing the proportional overlap with gene therapies +that are presently in the market or undergoing clinical trials, in which +we recovered 79\% of all active gene therapies and 0\% of failed gene +therapies (Fig.~\ref{fig-therapy-validate}, +Fig.~\ref{fig-therapy-validate-all}). Despite nominating a large number +of putative targets, hypergeometric tests confirmed that our targets +were strongly enriched for targets of existing therapies that are either +approved or currently undergoing clinical trials. + +It should be noted that our study has several key limitations. First, +while our cell type datasets are amongst the most comprehensive human +scRNA-seq references currently available, they are nevertheless missing +certain tissues, cell types (e.g.~spermatocytes, oocytes), and life +stages (post-natal childhood, senility). It is also possible that we +have not captured certain cell state signatures that only occur in +disease (e.g.~disease-associated microglia {[}\textbf{CITATION}{]}). 
+However, we reasoned that using only control cell type signatures would +mitigate bias towards any particular disease, and avoid degradation of +gene signatures due to loss of function mutations. Second, the +collective knowledge of gene-phenotype and gene-disease associations is +far from complete and we fully anticipate that these annotations will +continue to expand and change well into the future. It is for this +reason we designed this study to be easily reproduced within a single +containerised script so that we (or others) may rerun it with updated +datasets at any point. Finally, causality is notoriously difficult to +prove definitively from associative testing alone, and our study is not +exempt from this rule. Despite this, there are several reasons to +believe that our approach is able to better approximate causal +relationships than traditional approaches. First, we did not +intentionally preselect any subset of phenotypes or cell types to +investigate here. Along with a scaling prestep during linear modelling, +this means that all the results are internally consistent and can be +directly compared to one another (in stark contrast to literature +meta-analyses). Furthermore, for the phenotype gene signatures we used +expert-curated GenCC annotations\textsuperscript{68,69} to weight the +current strength of evidence supporting a causal relationship between +each gene and phenotype. This is especially important for phenotypes +with large gene lists (thousands of annotations) for which some of the +relationships may be tenuous. Within the cell type references, we +deliberately chose to use specificity scores (rather than raw gene +expression) as this normalisation procedure has previously been +demonstrated to better distinguish between signatures of highly similar +cell types/subtypes\textsuperscript{70}. 
+ +Moving forward, we are now actively seeking industry and academic +partnerships to begin experimentally validating our multi-scale target +predictions and exploring their potential for therapeutic translation. +Nevertheless, there are more promising therapeutic targets here than our +research group could ever hope to pursue by ourselves. In the interest +of accelerating research and ensuring RD patients are able to benefit +from this work as quickly as possible, we have decided to publicly +release all of the results described in this study. These can be +accessed in multiple ways, including through a suite of R packages as +well as a web app, the +\href{https://neurogenomics.github.io/rare_disease_celltyping_apps/home/}{Rare +Disease Celltyping Portal}. The latter allows our results to be easily +queried, filtered, visualised, and downloaded without any knowledge of +programming. Through these resources we aim to make our findings useful +to a wide variety of RD stakeholders including subdomain experts, +clinicians, advocacy groups, and patients. + +\subsection{Conclusions}\label{sec-conclusions} + +Ultimately, our primary objective was to develop a methodology capable +of generating high-throughput phenome-wide predictions while preserving +the accuracy and clinical utility typically associated with more +narrowly focused studies. With the rapid advancement of gene therapy +technologies, and a regulatory landscape that is evolving to better meet +the needs of a large and diverse patient population, there is finally +momentum to begin to realise the promise of personalised medicine. This +has especially important implications for the global RD community which +has remained relatively neglected. Here, we lay out the groundwork +necessary for this watershed moment by providing a scalable, +cost-effective, and fully reproducible means of resolving the +multi-scale, cell-type specific mechanisms of virtually all rare +diseases. 
+ +\subsection{Methods}\label{sec-methods} + +\subsubsection{Human Phenotype Ontology}\label{human-phenotype-ontology} + +The latest version of the HPO (release 2024-02-08) was downloaded from +the EMBL-EBI Ontology Lookup Service\textsuperscript{71} and imported +into R using the \texttt{HPOExplorer} package. This R object was used to +extract ontological relationships between phenotypes as well as to +assign absolute and relative ontological levels to each phenotype. The +latest version of the HPO phenotype-to-gene mappings and phenotype +annotations were downloaded from the official HPO GitHub repository and +imported into R using \texttt{HPOExplorer}. This contains lists of genes +associated with phenotypes via particular diseases, formatted as three +columns in a table (gene, phenotype, disease). + +However, not all genes have equally strong evidence of causality with a +disease or phenotype, especially when considering that the variety of +resources used to generate these annotations (OMIM, Orphanet, DECIPHER) +use variable methodologies (e.g.~expert-curated review of the medical +literature vs.~automated text mining of the literature). Therefore we +imported data from the Gene Curation Coalition +(GenCC)\textsuperscript{68,69}, which (as of 2024-03-01) contained 21798 evidence +scores across 7229 diseases and 5142 genes. Evidence scores are defined +by GenCC using a standardised ordinal rubric which we then encoded as a +semi-quantitative score ranging from 0 (no evidence of disease-gene +relationship) to 6 (strongest evidence of disease-gene relationship) +(see \textbf{?@tbl-gencc}). We then summed evidence scores per disease, +merged this table with the HPO disease-phenotype-gene annotation table, +and then cast the data into a gene-by-phenotype matrix filled with the +aggregated mean evidence score. This can be expressed as the following +equations. + +Let us denote: + +\begin{itemize} +\item + \(D\) as the set of \(d\) diseases. +\item + \(p\) as a phenotype. 
+\item + \(g\) as a gene. +\end{itemize} + +The final evidence-weighted gene-by-phenotype matrix (\(M_{g,p}\)) can +be expressed as: + +\hfill\break +\hfill\break + +\begin{equation*} + \eqnmarkbox[NavyBlue]{n1}{M_{g,p}} + = + \frac{ + \eqnmarkbox[Cerulean]{n3a}{\sum_{d \in D}} + \eqnmarkbox[blue]{n4a}{R(g,p,d)} + \times + \eqnmarkbox[BlueViolet]{n5}{E(g,d)} + }{ + \eqnmarkbox[Cerulean]{n3b}{\sum_{d \in D}} + \eqnmarkbox[blue]{n4b}{R(g,p,d)} + } +\end{equation*} +\annotate[yshift=1em]{left}{n1}{Weighted gene-by-phenotype \\evidence score matrix} +\annotate[yshift=-2em]{below,left}{n3a,n3b}{Iterate over all diseases} +\annotate[yshift=-2.5em,xshift=2.5em]{below,right}{n4a,n4b}{Binary gene-by-phenotype \\relationship matrix,\\ (1=relationship, 0=no relationship)} +\annotate[yshift=2em]{left}{n5}{Weighted gene-by-disease \\evidence score matrix} + +\hfill\break + +Histograms of evidence score distributions at each step in processing +can be found in Fig.~\ref{fig-evidence-histograms}. + +\subsubsection{Single-cell transcriptomic +atlases}\label{single-cell-transcriptomic-atlases} + +In this study, the gene by cell type specificity matrix was constructed +using the Descartes Human transcriptome atlas of foetal gene expression, +which contains a mixture of single-nucleus and single-cell RNA-seq data +(collected with sci-RNA-seq3)\textsuperscript{31}. This dataset contains +377,456 cells representing 77 distinct cell types across 15 tissues. All +121 human foetal samples ranged from 72 to 129 days in estimated +postconceptual age. To independently replicate our findings, we also +used the Human Cell Landscape which contains single-cell transcriptomic +data (collected with microwell-seq) from embryonic, foetal, and adult +human samples across 49 tissues\textsuperscript{32}. + +Specificity matrices were generated separately for each transcriptomic +atlas using the R package \texttt{EWCE} (v1.11.3)\textsuperscript{70}. 
+Within each atlas, cell types were defined using the authors' original +freeform annotations in order to preserve the granularity of cell +subtypes as well as incorporate expert-identified rare cell types. Cell +types were only aligned and aggregated to the level of corresponding +Cell Ontology (CL)\textsuperscript{33} annotations afterwards when +generating summary figures and performing cross-atlas analyses. Using +the original gene-by-cell count matrices from each single-cell atlas, we +computed gene-by-cell type expression specificity matrices as follows. + +Let us denote: \(g\) as a gene, \(c\) as a cell type, and \(i\) as a +single cell. Genes with no expression across any cell types were +considered to be uninformative and were therefore removed from the input +gene-by-cell matrix \(F(g,i,c)\). + +\hfill\break + +\begin{equation*} + \eqnmarkbox[purple]{f1}{F(g,i,c)} + = + \begin{cases} + \eqnmarkbox[WildStrawberry]{f2}{r_{g,i}}, + \text{ }l_i = c\\0, + \text{ }l_i \neq c + \end{cases} +\end{equation*} +\annotate[yshift=1em]{left}{f1}{Filtered gene-by-cell expression matrix} +\annotate[yshift=2em]{left}{f2}{Expression of gene $g$ in cell $i$} + +\hfill\break + +Next, we calculated the mean expression per cell type and normalised the +resulting matrix to transform it into a gene-by-cell type expression +specificity matrix (\(S_{g,c}\)). In other words, each gene in each cell +type had a 0-1 score where 1 indicated the gene was most specifically +expressed in that particular cell type relative to all other cell types. 
+This procedure was repeated separately for each of the single-cell +atlases and can be summarised as: + +\hfill\break + +\begin{equation*} + \eqnmarkbox[orange]{s1}{S_{g,c}} + = + \frac{ + \eqnmarkbox[purple]{s3a}{ + \frac{ + \sum_{i=1}^{|L|} F(g,i,c) + }{ + N_c + } + } + }{ + \eqnmarkbox[OrangeRed]{s6}{\sum_{r=1}^{k}}( + \eqnmarkbox[purple]{s3b}{ + \frac{ + \sum_{i=1}^{|L|} F(g,i,r) + }{ + N_r + } + } + ) + } +\end{equation*} +\annotate[yshift=1em]{left}{s1}{Gene-by-cell type specificity matrix} +\annotate[yshift=2em]{left}{s3a,s3b}{Compute mean expression of each gene per cell type} +\annotate{below,left}{s6}{Compute row sums of \\mean gene-by-cell type matrix} + +\hfill\break + +\subsubsection{Phenotype-cell type +associations}\label{phenotype-cell-type-associations-1} + +To test for relationships between each pairwise combination of phenotype +(n=11,047) and cell type (n=201) we ran a series of univariate +generalised linear models implemented via the \texttt{stats::glm} +function in R. First, we filtered the gene-by-phenotype evidence score +matrix (\(M _{g,p}\)) and the gene-by-cell type expression specificity +matrix (\(S _{g,c}\)) to only include genes present in both matrices +(n=4,949 genes in the Descartes Human analyses; n=4,653 genes in the +Human Cell Landscape analyses). Then, within each matrix any rows or +columns with a sum of 0 were removed as these were uninformative data +points that did not vary. To improve interpretability of the resulting +\(\beta\) coefficient estimates across models (i.e.~effect size), we +performed a scaling prestep on all dependent and independent variables. +Initial tests showed that this had virtually no impact on the total +number of significant results or any of the benchmarking metrics based +on p-value thresholds (Fig.~\ref{fig-summary}). This scaling prestep +improved our ability to rank cell types by the strength of their +association with a given phenotype as determined by separate linear +models. 
+ +We repeated the aforementioned procedure separately for each of the +single-cell references. Once all results were generated using both cell +type references (2,206,994 association tests total), we applied +Benjamini-Hochberg false discovery rate\textsuperscript{72} (denoted as +\(FDR_{p,c}\)) to account for multiple testing. Of note, we applied this +correction across all results at once (as opposed to each single-cell +reference separately) to ensure the \(FDR_{p,c}\) was stringently +controlled for across all tests performed in this study. + +\subsubsection{Symptom-cell type +associations}\label{symptom-cell-type-associations} + +Here we define a symptom as a phenotype as it presents within the +context of the specific disease. The features of a given symptom can be +described as the subset of genes annotated to phenotype \(p\) via a +particular disease \(d\), denoted as \(G_{d,p}\) (see +Fig.~\ref{fig-diagram}). To attribute our phenotype-level cell type +enrichment signatures to specific diseases, we first identified the gene +subset that was most strongly driving the phenotype-cell type +association by computing the intersect of genes that were both in the +phenotype annotation and within the top 25\% specificity percentile for +the associated cell type. We then computed the intersect between symptom +genes (\(G_{d,p}\)) and driver genes (\(G_{p,c}\)), resulting in the +gene subset \(G_{d \cap p \cap c}\). Only \(G_{d \cap p \cap c}\) gene +sets with 25\% or greater overlap with the symptom gene subset +(\(G_{d,p}\)) were kept. 
This procedure was repeated for all +phenotype-cell type-disease triads, which can be summarised as follows: + +\hfill\break + +\begin{equation*} + \frac{ + \eqnmarkbox[Chartreuse3]{g1}{|G_{d \cap p \cap c} |} + }{ + \eqnmarkbox[Emerald]{g2}{|G_{d,p}|}} + \geq \eqnmarkbox[SeaGreen]{g3}{.25} +\end{equation*} +\annotate[yshift=1em]{left}{g1}{Intersect between \\symptom genes ($G_{d,p}$) and driver genes ($G_{p,c}$)} +\annotate[yshift=-1em]{below,left}{g2}{Symptom genes \\(i.e. genes annotated to a phenotype\\ via a specific disease)} +\annotate[yshift=-1em]{below,right}{g3}{Minimum proportion of overlap \\between $G_{d \cap p \cap c}$ and $G_{d,p}$} + +\hfill\break + +\subsubsection{Validation of expected phenotype-cell type +relationships}\label{validation-of-expected-phenotype-cell-type-relationships-1} + +We first sought to confirm that our tests (across both single-cell +references) were able to recover expected phenotype-cell type +relationships across seven high-level branches within the HPO +(Fig.~\ref{fig-summary}), including abnormalities of the cardiovascular +system, endocrine system, eye, immune system, musculoskeletal system, +nervous system, and respiratory system. Within each branch the number of +significant tests in a given cell type were plotted +(Fig.~\ref{fig-summary}b). Mappings between freeform annotations (the +level at which we performed our phenotype-cell type association tests) +provided by the original atlas authors and their closest CL term +equivalents were provided by CellxGene\textsuperscript{29}. CL terms +along the \emph{x-axis} of Fig.~\ref{fig-summary}b were assigned colours +corresponding to which HPO branch showed the greatest number of +enrichments (after normalising within each branch to account for +differences in scale). The normalised colouring allows readers to +quickly assess which HPO branch was most often associated with each cell +type, while accounting for differences in the number of phenotypes +across branches. 
We then ran a series of Analysis of Variance (ANOVA) +tests to determine whether (within a given branch) a given cell type was +more often enriched (\(FDR_{p,c}<0.05\)) within that branch relative to +all of the other HPO branches of an equivalent level in the ontology +(including all branches not shown in Fig.~\ref{fig-summary}b). After +applying Benjamini-Hochberg multiple testing +correction\textsuperscript{72} (denoted as \(FDR _{b,c}\)), we annotated +each respective branch-by-cell type bar according to the significance +(**** : \(FDR _{b,c}<1e-04\), *** : \(FDR _{b,c}<0.001\), ** : +\(FDR _{b,c}<0.01\), * : \(FDR _{b,c}<0.05\)). Cell types in +Fig.~\ref{fig-summary}a-b were ordered along the \emph{x-axis} according +to a dendrogram derived from the CL ontology (Fig.~\ref{fig-summary}c), +which provides ground-truth semantic relationships between all cell +types (e.g.~different neuronal subtypes are grouped together). + +As an additional measure of the accuracy of our phenotype-cell types +test results we identified conceptually matched branches across the HPO +and the CL (Fig.~\ref{fig-summary}d and \textbf{?@tbl-celltypes}). For +example, `Abnormality of the cardiovascular system' in the HPO was +matched with `cardiocytes' in the CL which includes all cell types +specific to the heart. Analogously, `Abnormality of the nervous system' +in the HPO was matched with `neural cell' in the CL which includes all +descendant subtypes of neurons and glia. This cross-ontology matching +was repeated for each HPO branch and can be referred to as on-target +cell types. Within each branch, the \(-log_{10}(FDR _{p,c})\) values of +on-target cell types were binned by rounding to the nearest integer +(\emph{x-axis}) and the percentage of tests for on-target cell types +relative to all cell types were computed at each bin (\emph{y-axis}) +(Fig.~\ref{fig-summary}d). 
The baseline level (dotted horizontal line) +illustrates the percentage of on-target cell types relative to the total +number of observed cell types. Any percentages above this baseline level +represent greater than chance representation of the on-target cell types +in the significant tests. + +\subsubsection{Monarch Knowledge Graph +recall}\label{monarch-knowledge-graph-recall-1} + +Finally, we gathered known phenotype-cell type relationships from the +Monarch Knowledge Graph (MKG), a comprehensive database of links between +many aspects of disease biology\textsuperscript{73}. This currently +includes 103 links between HPO phenotypes (n=103) and CL cell types +(n=79). Of these, we only considered the 82 phenotypes that we were able +to test given that our approach was reliant on gene annotations. We +considered instances where we found a significant relationship between +exactly matching pairs of HPO-CL terms as a hit. + +However, as the cell types in MKG were not necessarily annotated at the +same level as our single-cell references, we also considered instances +where the MKG cell type was an ancestor term of our cell type +(e.g.~`myeloid cell' vs.~`monocyte'), or \emph{vice versa}, as hits. +Using these criteria, we determined our results recapitulated +\texttt{**!!RECOMPUTE!!**}\% of known phenotype-cell type relationships +in the MKG. We next computed how far along the CL ontological tree we +would need to travel in order to reach a common ancestor between the MKG +cell type and our cell type, for each phenotype-cell type link in the +MKG. This provides a metric of not just whether we recapitulated the +exact cell types, but how dissimilar our identified cell types were for +a given phenotype-cell type association (\textbf{?@fig-monarch-recall}). 
+ +\subsubsection{Annotation of phenotypes using generative large language +models}\label{annotation-of-phenotypes-using-generative-large-language-models-1} + +Only a small fraction of the phenotypes in HPO (\textless1\%) have +metadata annotations containing information on their time course, +consequences, and severity. This is due to the time-consuming nature of +manually annotating thousands of phenotypes. To generate such +annotations at scale, we used Generative Pre-trained Transformer 4 +(GPT-4), a large language model (LLM) as implemented within OpenAI's +ChatGPT Application Programming Interface (API). After extensive prompt +engineering and ground-truth benchmarking, we were able to acquire +annotations on how often each phenotype directly causes intellectual +disability, death, impaired mobility, physical malformations, blindness, +sensory impairments, immunodeficiency, cancer, reduced fertility, or is +associated with a congenital onset. These criteria were previously +defined in surveys of medical experts as a means of systematically +assessing phenotype severity\textsuperscript{74}. Responses for each +metric were provided in a consistent one-word format which could be one +of: `never', `rarely', `often', `always'. This procedure was repeated in +batches (to avoid exceeding token limits) until annotations were +gathered for 16982/18082 HPO phenotypes. + +We then encoded these responses into a semi-quantitative scoring system +(`never'=0, `rarely'=1, `often'=2, `always'=3), which were then weighted +by multiplying by a semi-subjective score of the relevance of each metric +to the concept of severity on a scale from 1-5, with 5 being the most +severe (`intellectual\_disability'=5, `death'=5, `impaired\_mobility'=4, +`physical\_malformations'=3, `blindness'=4, `sensory\_impairments'=3, +`immunodeficiency'=3, `cancer'=3, `reduced\_fertility'=1, +`congenital\_onset'=4). 
Finally, the product of the score was normalised +to a quantitative severity score ranging from 0-100, where 100 is the +theoretical maximum severity score. This phenotype severity scoring +procedure can be expressed as follows. + +\hfill\break +\hfill\break + +\begin{equation*} + \eqnmarkbox[Brown4]{nss}{NSS_p} + = + \frac{ + \eqnmarkbox[Goldenrod]{nss2}{\sum_{j=1}^{m}} + ( + \eqnmarkbox[Goldenrod4]{nss3}{F_{pj}} + \times + \eqnmarkbox[IndianRed4]{nss4}{W_j} + ) + }{ + \eqnmarkbox[Tan]{nss5}{\sum_{j=1}^{m}(\max\{F_j\} \times W_j)} + } \times 100 +\end{equation*} +\annotate[yshift=1em]{left}{nss}{Normalised Severity Score \\for each phenotype} +\annotate[yshift=3em]{left}{nss2}{Sum of weighted annotation values \\across all metrics} +\annotate[yshift=3em]{right}{nss3}{Numerically encoded annotation value \\of metric $j$ for phenotype $p$} +\annotate[yshift=1em]{right}{nss4}{Weight for metric $j$} +\annotate[yshift=-1em]{below,right}{nss5}{Theoretical maximum severity score} + +\hfill\break + +\subsubsection{Enrichment of foetal cell types in congenital +phenotypes}\label{enrichment-of-foetal-cell-types-in-congenital-phenotypes-1} + +The GPT-4 annotations also enabled us to assess whether foetal cell +types were more often significantly associated with congenital +phenotypes in our Human Cell Landscape results as this single-cell +reference contained both adult and foetal versions of cell types +(Fig.~\ref{fig-congenital}). To do this, we performed a chi-squared +(\(\chi^2\)) test on the proportion of significantly associated cell +types containing any of the substrings `fetal', `fetus', `primordial', +`hESC' or `embryonic' (within cell types annotations from the original +Human Cell Landscape authors\textsuperscript{32}) vs.~those associated +without, stratified by how often the corresponding phenotype had a +congenital onset according to the GPT phenotype annotations (including +`never', `rarely', `often', `always'). 
In addition, a series of +\(\chi^2\) tests were performed within each congenital onset frequency +stratum, to determine whether the observed proportion of foetal cell +types vs.~non-foetal cell types significantly deviated from the +proportions expected by chance. + +\subsubsection{Diagnosis via cell type-specific disease +prediction}\label{diagnosis-via-cell-type-specific-disease-prediction-1} + +We designed an algorithm that uses our results to predict the most +likely cell types underlying a set of phenotypic and genotypic traits +observed in a patient (Fig.~\ref{fig-diagnosis}). This is implemented +within \texttt{MSTExplorer::predict\_celltypes} and takes HPO phenotypes +as inputs. It can optionally take included risk genes, excluded risk +genes, included diseases and/or excluded diseases as additional inputs. +It then outputs a weighted ranking of cell types, +where higher ranking indicates a higher likelihood of being the +underlying mechanism of the patient's particular form of disease(s). + +\subsubsection{Prognosis via cell type-mediated differential +outcomes}\label{prognosis-via-cell-type-mediated-differential-outcomes-1} + +The phenotype hypotonia is associated with diseases that range in +severity from benign to debilitating to fatal\textsuperscript{75}. In +the absence of additional information, making an accurate diagnosis is +extremely challenging even for experienced physicians. The magnitude of +this challenge is highlighted by the fact that each disease is +associated with anywhere between 1-595 unique phenotypes (median=61, +mean=77.74) within the HPO. Conversely, each phenotype is associated +with 1-5404 diseases (median=6, mean=60.74). We addressed this challenge +by applying our phenotype-cell type association results in combination +with expert-curated HPO annotations of clinical outcomes associated with +each phenotype-disease pairing (Fig.~\ref{fig-prognosis}). 
We first +extracted results for the phenotype `Hypotonia' (\emph{HP:0001252}) and +its 13 descendant subterms from our phenotype-cell type association +analyses. Next, we encoded the ``Age of Death'' categories associated +with each disease in an ordinal scale ranging from 1, corresponding to +prenatal death, to 8, corresponding to death in late adulthood +(\textbf{?@tbl-death}). To determine whether cell type identity +significantly predicted the age of death, we conducted an ANOVA where +cell type was the predictor and ``Age of Death score'' was the outcome. + +\subsubsection{Therapeutic target +identification}\label{therapeutic-target-identification-1} + +We developed a systematic and automated strategy for identifying +putative cell type-specific gene targets for each phenotype based on a +series of filters at phenotype, cell type, and gene levels. The entire +target prioritisation procedure can be replicated with a single +function: \texttt{MSTExplorer::prioritise\_targets}. This function +automates all of the reference data gathering (e.g.~phenotype metadata, +cell type metadata, cell type signature reference, gene lengths, +severity tiers) and takes a variety of arguments at each step for +greater customisability. + +\subsubsection{Therapeutic target +validation}\label{therapeutic-target-validation-1} + +To assess whether our prioritised therapeutic targets were likely to be +viable, we computed the overlap between our gene targets and those of +existing gene therapies at various stages of clinical development +(Fig.~\ref{fig-therapy-validate}). Gene targets were obtained for each +therapy from the Therapeutic Target Database (TTD; release 2024-03-22) +and mapped onto standardised HUGO Gene Nomenclature Committee (HGNC) +gene symbols using the \texttt{orthogene} R package. We stratified our +overlap metrics according to whether the therapies had failed +(unsuccessful clinical trials or withdrawn), or were non-failed +(successful or ongoing clinical trials). 
We then conducted +hypergeometric tests to determine whether the observed overlap between +our prioritised targets and the non-failed therapy targets was +significantly greater than expected by chance (i.e.~enrichment). We also +conducted a second hypergeometric test to determine whether the observed +overlap between our prioritised targets and the failed therapy targets +was significantly less than expected by chance (i.e.~depletion). +Finally, we repeated the analysis against all therapeutic targets, not +just those of gene therapies, to determine whether our prioritised +targets had relevance to other therapeutic modalities. + +\subsubsection{Experimental model +translatability}\label{experimental-model-translatability-1} + +To improve the likelihood of successful translation between preclinical +animal models and human patients, we created an interspecies +translatability prediction tool for each phenotype nominated by our gene +therapy prioritisation pipeline (Fig.~\ref{fig-animal-models}). First, we +extracted ontological similarity scores of homologous phenotypes across +species from the MKG\textsuperscript{73}. Briefly, the ontological +similarity scores (\(SIM_o\)) are computed for each homologous pair of +phenotypes across two ontologies by calculating the overlap in +homologous phenotypes that are ancestors or descendants of the target +phenotype. Next, we generated genotypic similarity scores (\(SIM_g\)) +for each homologous phenotype pair by computing the proportion of 1:1 +orthologous genes using gene annotation from their respective +ontologies. Interspecies orthologs were also obtained from the MKG. +Finally, both scores are multiplied together to yield a unified +ontological-genotypic similarity score (\(SIM_{o,g}\)). + +\subsubsection{Novel R packages}\label{novel-r-packages} + +To facilitate all analyses described in this study and to make them more +easily reproducible by others, we created several open-source R +packages. 
+\href{https://github.com/neurogenomics/KGExplorer}{\texttt{KGExplorer}} +imports and analyses large-scale biomedical knowledge graphs and +ontologies. +\href{https://github.com/neurogenomics/HPOExplorer}{\texttt{HPOExplorer}} +aids in managing and querying the directed acyclic ontology graph within +the HPO. +\href{https://github.com/neurogenomics/MSTExplorer}{\texttt{MSTExplorer}} +facilitates the efficient analysis of many thousands of phenotype-cell +type association tests, and provides a suite of multi-scale therapeutic +target prioritisation and visualisation functions. These R packages also +include various functions for distributing the post-processed results +from this study in an organised, tabular format. Of note, +\texttt{MSTExplorer::load\_example\_results} loads all summary +statistics from our phenotype-cell type tests performed here. + +\subsubsection{Rare Disease Celltyping +Portal}\label{rare-disease-celltyping-portal} + +To further increase the ease of access for stakeholders in the RD +community without the need for programmatic experience, we developed a +series of web apps to interactively explore, visualise, and download the +results from our study. Collectively, these web apps are called the Rare +Disease Celltyping Portal. The landing page for the website was made +using HTML, CSS, and javascript and the web apps were created using the +Shiny Web application framework for R and deployed on the +\href{https://www.shinyapps.io}{shinyapps.io} server. The website can be +accessed +\href{https://neurogenomics.github.io/rare_disease_celltyping_apps/home}{here}. +All code used to generate the website can be found +\href{https://github.com/neurogenomics/rare_disease_celltyping_apps}{here}. + +\subsection{Data and Code +Availability}\label{data-and-code-availability} + +All data and code is made freely available through preexisting databases +and/or GitHub repositories / software associated with this publication. 
+ +\begin{itemize} +\tightlist +\item + \href{https://hpo.jax.org}{Human Phenotype Ontology} +\item + \href{https://thegencc.org/}{GenCC} +\item + \href{https://cellxgene.cziscience.com/collections/c114c20f-1ef4-49a5-9c2e-d965787fb90c}{Descartes + Human scRNA-seq atlas} +\item + \href{https://cellxgene.cziscience.com/collections/38833785-fac5-48fd-944a-0f62a4c23ed1}{Human + Cell Landscape scRNA-seq atlas} +\item + \href{https://neurogenomics.github.io/rare_disease_celltyping_apps/home}{Rare + Disease Celltyping Portal} +\item + \href{https://github.com/neurogenomics/KGExplorer}{\texttt{KGExplorer}} +\item + \href{https://github.com/neurogenomics/HPOExplorer}{\texttt{HPOExplorer}} +\item + \href{https://github.com/neurogenomics/MSTExplorer}{\texttt{MSTExplorer}} +\item + \href{https://github.com/neurogenomics/rare_disease_celltyping}{Code + to replicate analyses} +\item + \href{https://neurogenomics.github.io/RareDiseasePrioritisation/reports/prioritise_targets}{Cell + type-specific gene target prioritisation}\\ +\item + \href{https://www.genenames.org/data/genegroup/\#!/group/492}{Complement + system gene list} +\end{itemize} + +\subsection{Acknowledgements}\label{acknowledgements} + +We would like to thank the following individuals for their insightful +feedback and assistance with data resources: Sarah J. Marzi, Gerton +Lunter, Peter Robinson, Melissa Haendel, Ben Coleman, Nico Matentzoglu, +Shawn T. O'Neil, Alan E. Murphy, Sarada Gurung. + +\subsubsection{Funding}\label{funding} + +This work was supported by a UK Dementia Research Institute (UK DRI) +Future Leaders Fellowship {[}MR/T04327X/1{]} and the UK DRI which +receives its funding from UK DRI Ltd, funded by the UK Medical Research +Council, Alzheimer's Society and Alzheimer's Research UK. + +\subsection*{References}\label{references} +\addcontentsline{toc}{subsection}{References} + +\phantomsection\label{refs} +\begin{CSLReferences}{0}{0} +\bibitem[\citeproctext]{ref-Ferreira2019-jp} +\CSLLeftMargin{1. 
}% +\CSLRightInline{Ferreira, C. R. The burden of rare diseases. \emph{Am. +J. Med. Genet. A} \textbf{179}, 885--892 (2019).} + +\bibitem[\citeproctext]{ref-Zhu2020-vo} +\CSLLeftMargin{2. }% +\CSLRightInline{Zhu, Q. \emph{et al.} An integrative knowledge graph for +rare diseases, derived from the genetic and rare diseases information +center ({GARD}). \emph{J. Biomed. Semantics} \textbf{11}, 13 (2020).} + +\bibitem[\citeproctext]{ref-noauthor_undated-kp} +\CSLLeftMargin{3. }% +\CSLRightInline{Rare diseases {BioResource}.} + +\bibitem[\citeproctext]{ref-Marwaha2022-uy} +\CSLLeftMargin{4. }% +\CSLRightInline{Marwaha, S., Knowles, J. W. \& Ashley, E. A. A guide for +the diagnosis of rare and undiagnosed disease: Beyond the exome. +\emph{Genome Med.} \textbf{14}, 23 (2022).} + +\bibitem[\citeproctext]{ref-Molster2016-da} +\CSLLeftMargin{5. }% +\CSLRightInline{Molster, C. \emph{et al.} Survey of healthcare +experiences of australian adults living with rare diseases. +\emph{Orphanet J. Rare Dis.} \textbf{11}, 30 (2016).} + +\bibitem[\citeproctext]{ref-Halley2022-pd} +\CSLLeftMargin{6. }% +\CSLRightInline{Halley, M. C., Smith, H. S., Ashley, E. A., Goldenberg, +A. J. \& Tabor, H. K. A call for an integrated approach to improve +efficiency, equity and sustainability in rare disease research in the +united states. \emph{Nat. Genet.} \textbf{54}, 219--222 (2022).} + +\bibitem[\citeproctext]{ref-Institute_of_Medicine_US_Committee_on_Accelerating_Rare_Diseases_Research_and_Orphan_Product_Development2010-vj} +\CSLLeftMargin{7. }% +\CSLRightInline{Institute of Medicine (US) Committee on Accelerating +Rare Diseases Research and Orphan Product Development, Field, M. J. \& +Boat, T. F. \emph{Coverage and Reimbursement: Incentives and +Disincentives for Product Development}. (National Academies Press (US), +2010).} + +\bibitem[\citeproctext]{ref-Yates2022-ra} +\CSLLeftMargin{8. }% +\CSLRightInline{Yates, N. \& Hinkel, J. 
The economics of moonshots: +Value in rare disease drug development. \emph{Clin. Transl. Sci.} +\textbf{15}, 809--812 (2022).} + +\bibitem[\citeproctext]{ref-Nuijten2022-yc} +\CSLLeftMargin{9. }% +\CSLRightInline{Nuijten, M. Pricing zolgensma - the world's most +expensive drug. \emph{J Mark Access Health Policy} \textbf{10}, 2022353 +(2022).} + +\bibitem[\citeproctext]{ref-Thielen2022-ud} +\CSLLeftMargin{10. }% +\CSLRightInline{Thielen, F. W., Heine, R. J. S. D., Berg, S. van den, +Ham, R. M. T. T. \& Groot, C. A. U. Towards sustainability and +affordability of expensive cell and gene therapies? Applying a +cost-based pricing model to estimate prices for libmeldy and zolgensma. +\emph{Cytotherapy} \textbf{24}, 1245--1258 (2022).} + +\bibitem[\citeproctext]{ref-Gargano2024-fc} +\CSLLeftMargin{11. }% +\CSLRightInline{Gargano, M. A. \emph{et al.} The human phenotype +ontology in 2024: Phenotypes around the world. \emph{Nucleic Acids Res.} +\textbf{52}, D1333--D1346 (2024).} + +\bibitem[\citeproctext]{ref-Kohler2019-pc} +\CSLLeftMargin{12. }% +\CSLRightInline{Köhler, S. \emph{et al.} Expansion of the human +phenotype ontology ({HPO}) knowledge base and resources. \emph{Nucleic +Acids Res.} \textbf{47}, D1018--D1027 (2019).} + +\bibitem[\citeproctext]{ref-Kohler2021-wk} +\CSLLeftMargin{13. }% +\CSLRightInline{Köhler, S. \emph{et al.} The human phenotype ontology in +2021. \emph{Nucleic Acids Res.} \textbf{49}, D1207--D1217 (2021).} + +\bibitem[\citeproctext]{ref-Robinson2008-ys} +\CSLLeftMargin{14. }% +\CSLRightInline{Robinson, P. N. \emph{et al.} The human phenotype +ontology: A tool for annotating and analyzing human hereditary disease. +\emph{Am. J. Hum. Genet.} \textbf{83}, 610--615 (2008).} + +\bibitem[\citeproctext]{ref-Osmond2022-ml} +\CSLLeftMargin{15. }% +\CSLRightInline{Osmond, M. \emph{et al.} Outcome of over 1500 matches +through the matchmaker exchange for rare disease gene discovery: The +2-year experience of {Care4Rare} canada. \emph{Genet. 
Med.} \textbf{24}, +100--108 (2022).} + +\bibitem[\citeproctext]{ref-Philippakis2015-dq} +\CSLLeftMargin{16. }% +\CSLRightInline{Philippakis, A. A. \emph{et al.} The matchmaker +exchange: A platform for rare disease gene discovery. \emph{Hum. Mutat.} +\textbf{36}, 915--921 (2015).} + +\bibitem[\citeproctext]{ref-Zanello2023-zd} +\CSLLeftMargin{17. }% +\CSLRightInline{Zanello, G. \emph{et al.} Targeting shared molecular +etiologies to accelerate drug development for rare diseases. \emph{EMBO +Mol. Med.} \textbf{15}, e17159 (2023).} + +\bibitem[\citeproctext]{ref-Nguengang_Wakap2020-cz} +\CSLLeftMargin{18. }% +\CSLRightInline{Nguengang Wakap, S. \emph{et al.} Estimating cumulative +point prevalence of rare diseases: Analysis of the orphanet database. +\emph{Eur. J. Hum. Genet.} \textbf{28}, 165--173 (2020).} + +\bibitem[\citeproctext]{ref-noauthor_2022-ok} +\CSLLeftMargin{19. }% +\CSLRightInline{Rare diseases, common challenges. \emph{Nat. Genet.} +\textbf{54}, 215 (2022).} + +\bibitem[\citeproctext]{ref-Amberger2019-vl} +\CSLLeftMargin{20. }% +\CSLRightInline{Amberger, J. S., Bocchini, C. A., Scott, A. F. \& +Hamosh, A. {OMIM.org}: Leveraging knowledge across phenotype-gene +relationships. \emph{Nucleic Acids Res.} \textbf{47}, D1038--D1043 +(2019).} + +\bibitem[\citeproctext]{ref-Amberger2017-tg} +\CSLLeftMargin{21. }% +\CSLRightInline{Amberger, J. S. \& Hamosh, A. Searching online mendelian +inheritance in man ({OMIM)}: A knowledgebase of human genes and genetic +phenotypes. \emph{Curr. Protoc. Bioinformatics} \textbf{58}, +1.2.1--1.2.12 (2017).} + +\bibitem[\citeproctext]{ref-McKusick2007-di} +\CSLLeftMargin{22. }% +\CSLRightInline{McKusick, V. A. Mendelian inheritance in man and its +online version, {OMIM}. \emph{Am. J. Hum. Genet.} \textbf{80}, 588--604 +(2007).} + +\bibitem[\citeproctext]{ref-Maiella2013-oo} +\CSLLeftMargin{23. }% +\CSLRightInline{Maiella, S., Rath, A., Angin, C., Mousson, F. \& Kremp, +O. 
{[}Orphanet and its consortium: Where to find expert-validated +information on rare diseases{]}. \emph{Rev. Neurol.} \textbf{169 Suppl +1}, S3--8 (2013).} + +\bibitem[\citeproctext]{ref-Weinreich2008-wm} +\CSLLeftMargin{24. }% +\CSLRightInline{Weinreich, S. S., Mangon, R., Sikkens, J. J., Teeuw, M. +E. en \& Cornel, M. C. {[}Orphanet: A european database for rare +diseases{]}. \emph{Ned. Tijdschr. Geneeskd.} \textbf{152}, 518--519 +(2008).} + +\bibitem[\citeproctext]{ref-Firth2009-qg} +\CSLLeftMargin{25. }% +\CSLRightInline{Firth, H. V. \emph{et al.} {DECIPHER}: Database of +chromosomal imbalance and phenotype in humans using ensembl resources. +\emph{Am. J. Hum. Genet.} \textbf{84}, 524--533 (2009).} + +\bibitem[\citeproctext]{ref-Baysoy2023-vt} +\CSLLeftMargin{26. }% +\CSLRightInline{Baysoy, A., Bai, Z., Satija, R. \& Fan, R. The +technological landscape and applications of single-cell multi-omics. +\emph{Nat. Rev. Mol. Cell Biol.} \textbf{24}, 695--713 (2023).} + +\bibitem[\citeproctext]{ref-Haque2017-bn} +\CSLLeftMargin{27. }% +\CSLRightInline{Haque, A., Engel, J., Teichmann, S. A. \& Lönnberg, T. A +practical guide to single-cell {RNA-sequencing} for biomedical research +and clinical applications. \emph{Genome Med.} \textbf{9}, 75 (2017).} + +\bibitem[\citeproctext]{ref-Qi2023-ev} +\CSLLeftMargin{28. }% +\CSLRightInline{Qi, R. \& Zou, Q. Trends and potential of machine +learning and deep learning in drug study at {Single-Cell} level. +\emph{Research} \textbf{6}, 0050 (2023).} + +\bibitem[\citeproctext]{ref-CZI_Single-Cell_Biology_Program2023-fs} +\CSLLeftMargin{29. }% +\CSLRightInline{CZI Single-Cell Biology Program \emph{et al.} {CZ} +{CELL\(\times\)GENE} discover: A single-cell data platform for scalable +exploration, analysis and modeling of aggregated data. \emph{bioRxiv} +2023.10.30.563174 (2023).} + +\bibitem[\citeproctext]{ref-Svensson2020-lg} +\CSLLeftMargin{30. }% +\CSLRightInline{Svensson, V., Veiga Beltrame, E. da \& Pachter, L. 
A +curated database reveals trends in single-cell transcriptomics. +\emph{Database} \textbf{2020}, (2020).} + +\bibitem[\citeproctext]{ref-Cao2020-qz} +\CSLLeftMargin{31. }% +\CSLRightInline{Cao, J. \emph{et al.} A human cell atlas of fetal gene +expression. \emph{Science} \textbf{370}, (2020).} + +\bibitem[\citeproctext]{ref-Han2020-iq} +\CSLLeftMargin{32. }% +\CSLRightInline{Han, X. \emph{et al.} Construction of a human cell +landscape at single-cell level. \emph{Nature} \textbf{581}, 303--309 +(2020).} + +\bibitem[\citeproctext]{ref-Diehl2016-gt} +\CSLLeftMargin{33. }% +\CSLRightInline{Diehl, A. D. \emph{et al.} The cell ontology 2016: +Enhanced content, modularization, and ontology interoperability. +\emph{J. Biomed. Semantics} \textbf{7}, 44 (2016).} + +\bibitem[\citeproctext]{ref-Heim2014-du} +\CSLLeftMargin{34. }% +\CSLRightInline{Heim, C. E. \emph{et al.} Myeloid-derived suppressor +cells contribute to staphylococcus aureus orthopedic biofilm infection. +\emph{J. Immunol.} \textbf{192}, 3778--3792 (2014).} + +\bibitem[\citeproctext]{ref-Pidwill2020-le} +\CSLLeftMargin{35. }% +\CSLRightInline{Pidwill, G. R., Gibson, J. F., Cole, J., Renshaw, S. A. +\& Foster, S. J. The role of macrophages in staphylococcus aureus +infection. \emph{Front. Immunol.} \textbf{11}, 620339 (2020).} + +\bibitem[\citeproctext]{ref-Stoll2018-dc} +\CSLLeftMargin{36. }% +\CSLRightInline{Stoll, H. \emph{et al.} Staphylococcal enterotoxins +{Dose-Dependently} modulate the generation of {Myeloid-Derived} +suppressor cells. \emph{Front. Cell. Infect. Microbiol.} \textbf{8}, 321 +(2018).} + +\bibitem[\citeproctext]{ref-Tebartz2015-xs} +\CSLLeftMargin{37. }% +\CSLRightInline{Tebartz, C. \emph{et al.} A major role for +myeloid-derived suppressor cells and a minor role for regulatory {T} +cells in immunosuppression during staphylococcus aureus infection. +\emph{J. Immunol.} \textbf{194}, 1100--1111 (2015).} + +\bibitem[\citeproctext]{ref-Zhou2016-kq} +\CSLLeftMargin{38. 
}% +\CSLRightInline{Zhou, Z., Xu, M.-J. \& Gao, B. Hepatocytes: A key cell +type for innate immunity. \emph{Cell. Mol. Immunol.} \textbf{13}, +301--315 (2016).} + +\bibitem[\citeproctext]{ref-Dixon2013-ok} +\CSLLeftMargin{39. }% +\CSLRightInline{Dixon, L. J., Barnes, M., Tang, H., Pritchard, M. T. \& +Nagy, L. E. Kupffer cells in the liver. \emph{Compr. Physiol.} +\textbf{3}, 785--797 (2013).} + +\bibitem[\citeproctext]{ref-Ladhani2019-nf} +\CSLLeftMargin{40. }% +\CSLRightInline{Ladhani, S. N. \emph{et al.} Invasive meningococcal +disease in patients with complement deficiencies: A case series +(2008-2017). \emph{BMC Infect. Dis.} \textbf{19}, 522 (2019).} + +\bibitem[\citeproctext]{ref-Rosain2017-ih} +\CSLLeftMargin{41. }% +\CSLRightInline{Rosain, J. \emph{et al.} Strains responsible for +invasive meningococcal disease in patients with terminal complement +pathway deficiencies. \emph{J. Infect. Dis.} \textbf{215}, 1331--1338 +(2017).} + +\bibitem[\citeproctext]{ref-The_International_Meningococcal_Genetics_Consortium2010-if} +\CSLLeftMargin{42. }% +\CSLRightInline{The International Meningococcal Genetics Consortium. +Genome-wide association study identifies variants in the {CFH} region +associated with host susceptibility to meningococcal disease. +\emph{Nature Genetics} \textbf{42}, 772--776 (2010).} + +\bibitem[\citeproctext]{ref-Lung2019-il} +\CSLLeftMargin{43. }% +\CSLRightInline{Lung, T. \emph{et al.} The complement system in liver +diseases: Evidence-based approach and therapeutic options. \emph{J +Transl Autoimmun} \textbf{2}, 100017 (2019).} + +\bibitem[\citeproctext]{ref-Reis2015-yz} +\CSLLeftMargin{44. }% +\CSLRightInline{Reis, E. S. \emph{et al.} Applying complement +therapeutics to rare diseases. \emph{Clin. Immunol.} \textbf{161}, +225--240 (2015).} + +\bibitem[\citeproctext]{ref-Seal2023-pa} +\CSLLeftMargin{45. }% +\CSLRightInline{Seal, R. L. \emph{et al.} Genenames.org: The {HGNC} +resources in 2023. 
\emph{Nucleic Acids Res.} \textbf{51}, D1003--D1009 +(2023).} + +\bibitem[\citeproctext]{ref-Al-Hamoudi2009-le} +\CSLLeftMargin{46. }% +\CSLRightInline{Al-Hamoudi, W. K. Severe autoimmune hepatitis triggered +by varicella zoster infection. \emph{World J. Gastroenterol.} +\textbf{15}, 1004--1006 (2009).} + +\bibitem[\citeproctext]{ref-Brewer2018-dg} +\CSLLeftMargin{47. }% +\CSLRightInline{Brewer, E. C. \& Hunter, L. Acute liver failure due to +disseminated varicella zoster infection. \emph{Case Reports Hepatol} +\textbf{2018}, 1269340 (2018).} + +\bibitem[\citeproctext]{ref-Eshchar1973-tz} +\CSLLeftMargin{48. }% +\CSLRightInline{Eshchar, J., Reif, L., Waron, M. \& Alkan, W. J. Hepatic +lesion in chickenpox. A case report. \emph{Gastroenterology} +\textbf{64}, 462--466 (1973).} + +\bibitem[\citeproctext]{ref-Sugiyama1981-ev} +\CSLLeftMargin{49. }% +\CSLRightInline{Sugiyama, K., Tagawa, S. \& Toda, M. Methods for visual +understanding of hierarchical system structures. \emph{IEEE Trans. Syst. +Man Cybern.} \textbf{11}, 109--125 (1981).} + +\bibitem[\citeproctext]{ref-Srivastava2023-ge} +\CSLLeftMargin{50. }% +\CSLRightInline{Srivastava, P., Tenney, J., Lodish, M., Slavotinek, A. +\& Baskin, L. Utility of genetic work-up for 46, {XY} patients with +severe hypospadias. \emph{J. Pediatr. Urol.} \textbf{19}, 261--272 +(2023).} + +\bibitem[\citeproctext]{ref-Utsch2004-re} +\CSLLeftMargin{51. }% +\CSLRightInline{Utsch, B., Albers, N. \& Ludwig, M. Genetic and +molecular aspects of hypospadias. \emph{Eur. J. Pediatr. Surg.} +\textbf{14}, 297--302 (2004).} + +\bibitem[\citeproctext]{ref-Liu2011-qd} +\CSLLeftMargin{52. }% +\CSLRightInline{Liu, X. \emph{et al.} The therapeutic target database: +An internet resource for the primary targets of approved, clinical trial +and experimental drugs. \emph{Expert Opin. Ther. Targets} \textbf{15}, +903--912 (2011).} + +\bibitem[\citeproctext]{ref-Diaz-Santiago2020-ep} +\CSLLeftMargin{53. }% +\CSLRightInline{Dı́az-Santiago, E. 
\emph{et al.} Phenotype-genotype +comorbidity analysis of patients with rare disorders provides insight +into their pathological and molecular bases. \emph{PLoS Genet.} +\textbf{16}, e1009054 (2020).} + +\bibitem[\citeproctext]{ref-Lord2021-rf} +\CSLLeftMargin{54. }% +\CSLRightInline{Lord, J. \& Baralle, D. Splicing in the diagnosis of +rare disease: Advances and challenges. \emph{Front. Genet.} \textbf{12}, +689892 (2021).} + +\bibitem[\citeproctext]{ref-Bueren2023-ma} +\CSLLeftMargin{55. }% +\CSLRightInline{Bueren, J. A. \& Auricchio, A. Advances and challenges +in the development of gene therapy medicinal products for rare diseases. +\emph{Hum. Gene Ther.} \textbf{34}, 763--775 (2023).} + +\bibitem[\citeproctext]{ref-Bulaklak2020-ta} +\CSLLeftMargin{56. }% +\CSLRightInline{Bulaklak, K. \& Gersbach, C. A. The once and future gene +therapy. \emph{Nat. Commun.} \textbf{11}, 5820 (2020).} + +\bibitem[\citeproctext]{ref-Godbout2023-uo} +\CSLLeftMargin{57. }% +\CSLRightInline{Godbout, K. \& Tremblay, J. P. Prime editing for human +gene therapy: Where are we now? \emph{Cells} \textbf{12}, (2023).} + +\bibitem[\citeproctext]{ref-Kohn2023-vh} +\CSLLeftMargin{58. }% +\CSLRightInline{Kohn, D. B., Chen, Y. Y. \& Spencer, M. J. Successes and +challenges in clinical gene therapy. \emph{Gene Ther.} \textbf{30}, +738--746 (2023).} + +\bibitem[\citeproctext]{ref-Zhao2023-qy} +\CSLLeftMargin{59. }% +\CSLRightInline{Zhao, Z., Shang, P., Mohanraju, P. \& Geijsen, N. Prime +editing: Advances and therapeutic applications. \emph{Trends +Biotechnol.} \textbf{41}, 1000--1012 (2023).} + +\bibitem[\citeproctext]{ref-Darrow2019-om} +\CSLLeftMargin{60. }% +\CSLRightInline{Darrow, J. J. Luxturna: {FDA} documents reveal the value +of a costly gene therapy. \emph{Drug Discov. Today} \textbf{24}, +949--954 (2019).} + +\bibitem[\citeproctext]{ref-Mendell2017-kg} +\CSLLeftMargin{61. }% +\CSLRightInline{Mendell, J. R. 
\emph{et al.} {Single-Dose} +{Gene-Replacement} therapy for spinal muscular atrophy. \emph{N. Engl. +J. Med.} \textbf{377}, 1713--1722 (2017).} + +\bibitem[\citeproctext]{ref-Mueller2017-fz} +\CSLLeftMargin{62. }% +\CSLRightInline{Mueller, C. \emph{et al.} 5 year expression and +neutrophil defect repair after gene therapy in alpha-1 antitrypsin +deficiency. \emph{Mol. Ther.} \textbf{25}, 1387--1394 (2017).} + +\bibitem[\citeproctext]{ref-Russell2017-dh} +\CSLLeftMargin{63. }% +\CSLRightInline{Russell, S. \emph{et al.} Efficacy and safety of +voretigene neparvovec ({AAV2-hRPE65v2}) in patients with +{RPE65-mediated} inherited retinal dystrophy: A randomised, controlled, +open-label, phase 3 trial. \emph{Lancet} \textbf{390}, 849--860 (2017).} + +\bibitem[\citeproctext]{ref-Lu2024-kl} +\CSLLeftMargin{64. }% +\CSLRightInline{Lu, C.-F. {FDA} takes first step toward international +regulation of gene therapies to treat rare diseases. (2024).} + +\bibitem[\citeproctext]{ref-Brown2022-ye} +\CSLLeftMargin{65. }% +\CSLRightInline{Brown, D. G., Wobst, H. J., Kapoor, A., Kenna, L. A. \& +Southall, N. Clinical development times for innovative drugs. \emph{Nat. +Rev. Drug Discov.} \textbf{21}, 793--794 (2022).} + +\bibitem[\citeproctext]{ref-Moffat2017-al} +\CSLLeftMargin{66. }% +\CSLRightInline{Moffat, J. G., Vincent, F., Lee, J. A., Eder, J. \& +Prunotto, M. Opportunities and challenges in phenotypic drug discovery: +An industry perspective. \emph{Nat. Rev. Drug Discov.} \textbf{16}, +531--543 (2017).} + +\bibitem[\citeproctext]{ref-Zhou2013-wx} +\CSLLeftMargin{67. }% +\CSLRightInline{Zhou, Q. \& Buchholz, C. J. Cell type specific gene +delivery by lentiviral vectors: New options in immunotherapy. +\emph{Oncoimmunology} \textbf{2}, e22566 (2013).} + +\bibitem[\citeproctext]{ref-DiStefano2022-ao} +\CSLLeftMargin{68. }% +\CSLRightInline{DiStefano, M. T. \emph{et al.} The gene curation +coalition: A global effort to harmonize gene-disease evidence resources. +\emph{Genet. 
Med.} \textbf{24}, 1732--1742 (2022).} + +\bibitem[\citeproctext]{ref-DiStefano2023-np} +\CSLLeftMargin{69. }% +\CSLRightInline{DiStefano, M. \emph{et al.} P451: The gene curation +coalition works to resolve discrepancies in gene-disease validity +assertions. \emph{Genetics in Medicine Open} \textbf{1}, 100498 (2023).} + +\bibitem[\citeproctext]{ref-Skene2016-rb} +\CSLLeftMargin{70. }% +\CSLRightInline{Skene, N. G. \& Grant, S. G. N. Identification of +vulnerable cell types in major brain disorders using single cell +transcriptomes and expression weighted cell type enrichment. +\emph{Front. Neurosci.} \textbf{10}, 16 (2016).} + +\bibitem[\citeproctext]{ref-Cote2010-gp} +\CSLLeftMargin{71. }% +\CSLRightInline{Côté, R. \emph{et al.} The ontology lookup service: +Bigger and better. \emph{Nucleic Acids Res.} \textbf{38}, W155--60 +(2010).} + +\bibitem[\citeproctext]{ref-Benjamini1995-vo} +\CSLLeftMargin{72. }% +\CSLRightInline{Benjamini, Y. \& Hochberg, Y. Controlling the false +discovery rate: A practical and powerful approach to multiple testing. +\emph{J. R. Stat. Soc.} (1995).} + +\bibitem[\citeproctext]{ref-Putman2024-et} +\CSLLeftMargin{73. }% +\CSLRightInline{Putman, T. E. \emph{et al.} The monarch initiative in +2024: An analytic platform integrating phenotypes, genes and diseases +across species. \emph{Nucleic Acids Res.} \textbf{52}, D938--D949 +(2024).} + +\bibitem[\citeproctext]{ref-Lazarin2014-we} +\CSLLeftMargin{74. }% +\CSLRightInline{Lazarin, G. A. \emph{et al.} Systematic classification +of disease severity for evaluation of expanded carrier screening panels. +\emph{PLoS One} \textbf{9}, e114391 (2014).} + +\bibitem[\citeproctext]{ref-Ahmed2016-ag} +\CSLLeftMargin{75. }% +\CSLRightInline{Ahmed, M. I., Iqbal, M. \& Hussain, N. A structured +approach to the assessment of a floppy neonate. \emph{J. Pediatr. +Neurosci.} \textbf{11}, 2--6 (2016).} + +\bibitem[\citeproctext]{ref-Chang2018-qj} +\CSLLeftMargin{76. }% +\CSLRightInline{Chang, C.-W., Wakeland, A. 
K. \& Parast, M. M. +Trophoblast lineage specification, differentiation and their regulation +by oxygen tension. \emph{J. Endocrinol.} \textbf{236}, R43--R56 (2018).} + +\bibitem[\citeproctext]{ref-Fogarty2011-ph} +\CSLLeftMargin{77. }% +\CSLRightInline{Fogarty, N. M. E., Mayhew, T. M., Ferguson-Smith, A. C. +\& Burton, G. J. A quantitative analysis of transcriptionally active +syncytiotrophoblast nuclei across human gestation. \emph{J. Anat.} +\textbf{219}, 601--610 (2011).} + +\bibitem[\citeproctext]{ref-Hu2010-eh} +\CSLLeftMargin{78. }% +\CSLRightInline{Hu, D. \& Cross, J. C. Development and function of +trophoblast giant cells in the rodent placenta. \emph{Int. J. Dev. +Biol.} \textbf{54}, 341--354 (2010).} + +\end{CSLReferences} + +\hfill\break + +\newpage{} + +\subsection{Supplementary Materials}\label{supplementary-materials} + +\subsubsection{Supplementary Figures}\label{supplementary-figures} + +\phantomsection\label{cell-fig-evidence-histograms} +\begin{figure}[H] + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-evidence-histograms-1.pdf} + +} + +\caption{\label{fig-evidence-histograms}Distribution of evidence scores +at each processing step.} + +\end{figure}% + +\begin{figure} + +\centering{ + +\includegraphics{img/fig-diagram.png} + +} + +\caption{\label{fig-diagram}Diagrammatic overview of multi-scale disease +investigation strategy. Here we provide an abstract example of +differential disease aetiology across multiple scales: diseases (\(D\)), +phenotypes (\(P\)), cell types (\(C\)), genes (\(G\)), and clinical +outcomes (\(O\)). In the HPO, genes are assigned to phenotypes via +particular diseases (\(G_{d,p}\)). Therefore, the final gene list for +each phenotype is aggregated from across multiple diseases (\(G_{p}\)). +We performed association tests for all pairwise combinations of cell +types and phenotypes and filtered results after multiple testing +corrections (\(FDR_{p,c}<0.05\)). 
Each phenotype in the context of a +given disease is referred to here as a symptom. Links were established +between symptoms and cell types through proportional gene set overlap at +a minimum threshold of 25\%.} + +\end{figure}% + +\begin{figure} + +\begin{minipage}{0.50\linewidth} + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-ctd-correlation-1.pdf} + +} + +\subcaption{\label{fig-ctd-correlation-1}Correlation between the +uncorrected p-values from all phenotype-cell type association tests +using the Descartes Human vs.~Human Cell Landscape CTDs.} + +\end{minipage}% +% +\begin{minipage}{0.50\linewidth} + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-ctd-correlation-2.pdf} + +} + +\subcaption{\label{fig-ctd-correlation-2}Correlation between the +\(log_{10}(fold-change)\) from significant phenotype-cell type +association tests (\(FDR_{p,c}<0.05\)) using the Descartes Human +vs.~Human Cell Landscape CTDs.} + +\end{minipage}% +\newline +\begin{minipage}{0.50\linewidth} + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-ctd-correlation-3.pdf} + +} + +\subcaption{\label{fig-ctd-correlation-3}Correlation between the +uncorrected p-values from all phenotype-cell type association tests +using the Human Cell Landscape fetal samples vs.~Human Cell Landscape +adult samples.} + +\end{minipage}% +% +\begin{minipage}{0.50\linewidth} + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-ctd-correlation-4.pdf} + +} + +\subcaption{\label{fig-ctd-correlation-4}Correlation between the +\(log_{10}(fold-change)\) from significant phenotype-cell type +association tests (\(FDR_{p,c}<0.05\)) using the Human Cell Landscape +fetal samples vs.~Human Cell Landscape adult samples.} + +\end{minipage}% + +\caption{\label{fig-ctd-correlation}Inter- and intra-dataset validation +across the different CellTypeDataset (CTD) and developmental stages. +Correlations are computed using Pearson's correlation coefficient. 
Point +density is plotted using a 2D kernel density estimate.} + +\end{figure}% + +\phantomsection\label{cell-fig-therapy-filter} +\begin{figure}[H] + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-therapy-filter-1.pdf} + +} + +\caption{\label{fig-therapy-filter}Therapeutics - Prioritised target +filtering steps. This plot visualises the number of unique +phenotype-cell type associations, cell types, genes, and phenotypes +(\emph{y-axis}) at each filtering step (\emph{x-axis}) within the +multi-scale therapeutic target prioritisation pipeline. Each step in the +pipeline can be easily adjusted according to user preference and use +case. See \textbf{Methods} for descriptions and criteria of each +filtering step. \textbf{a}, The percentage of phenotypes belonging to +each severity Tier after each filtering step (Tier 1 being the most +severe). \textbf{b}, The number of phenotypes, cell types, associated +diseases and genes remaining after each filtering step during the gene +prioritisation pipeline.} + +\end{figure}% + +\phantomsection\label{cell-fig-monarch-recall} +\begin{verbatim} +!!!RECOMPUTE!!! +\end{verbatim} + +\phantomsection\label{cell-fig-therapy-validate-all} +\begin{figure}[H] + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-therapy-validate-all-1.pdf} + +} + +\caption{\label{fig-therapy-validate-all}Therapeutics - Validation of +prioritised therapeutic targets. Proportion of all existing therapy +targets (documented in the Therapeutic Target Database) recapitulated by +our prioritisation pipeline.} + +\end{figure}% + +\phantomsection\label{cell-fig-animal-models} +\begin{figure}[H] + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-animal-models-1.pdf} + +} + +\caption{\label{fig-animal-models}Identification of translatable +experimental models. Interspecies translatability of human phenotypes +nominated by our gene therapy prioritisation pipeline. 
Above, our combined +ontological-genotypic similarity score (\(SIM_{o,g}\)) is displayed as +the heatmap fill colour stratified by the model organism +(\emph{x-axis}). An additional column (``n\_genes\_db1'' on the far +left) displays the total number of unique genes annotated to the +phenotype within the HPO. Phenotypes are clustered according to their +ontological similarity in the HPO (\emph{y-axis}).} + +\end{figure}% + +\begin{figure} + +\begin{minipage}{0.50\linewidth} + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-therapy-examples-supp-1.pdf} + +} + +\subcaption{\label{fig-therapy-examples-supp-1}Respiratory failure} + +\end{minipage}% +% +\begin{minipage}{0.50\linewidth} + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-therapy-examples-supp-2.pdf} + +} + +\subcaption{\label{fig-therapy-examples-supp-2}Amyotrophic lateral +sclerosis} + +\end{minipage}% +\newline +\begin{minipage}{\linewidth} + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-therapy-examples-supp-3.pdf} + +} + +\subcaption{\label{fig-therapy-examples-supp-3}Neurodegeneration} + +\end{minipage}% +\newline +\begin{minipage}{\linewidth} + +\centering{ + +\includegraphics{index_files/figure-pdf/fig-therapy-examples-supp-4.pdf} + +} + +\subcaption{\label{fig-therapy-examples-supp-4}Small vessel disease} + +\end{minipage}% + +\caption{\label{fig-therapy-examples-supp}Example cell type-specific +gene therapy targets for several severe phenotypes and their associated +diseases. Each disease (blue cylinders) is connected to its phenotype +(purple cylinders) based on well-established clinical observations +recorded within the HPO\textsuperscript{11}. Phenotypes are connected to +cell types (red circles) via association testing between weighted gene +sets (\(FDR_{p,c}<0.05\)). 
Each cell type is connected to the +prioritised gene targets (yellow boxes) based on the driver gene +analysis. The thickness of the edges connecting the nodes represents the +(mean) fold-change from the bootstrapped enrichment tests. Nodes were +spatially arranged using the Sugiyama algorithm\textsuperscript{49}.} + +\end{figure}% + +\subsubsection{Supplementary Methods}\label{supplementary-methods} + +\paragraph{Therapeutics: Gene therapy target +identification}\label{therapeutics-gene-therapy-target-identification} + +Descriptions of each step in the prioritisation pipeline are as follows: + +\begin{enumerate} +\def\labelenumi{\arabic{enumi}.} +\item + \textbf{start}: All phenotype-cell type association results. +\item + \textbf{q\_threshold}: Keep only results that were significant after + multiple-testing correction (q\textless0.05). +\item + \textbf{fold\_threshold}: Keep only results with fold + change\textgreater=1. +\item + \textbf{keep\_ont\_levels}: Keep only phenotypes at certain absolute + ontology levels within the HPO. +\item + \textbf{keep\_onsets}: Keep only phenotypes with postnatal ages of + onset to circumvent technical and ethical challenges associated with + antenatal gene therapeutics delivery. +\item + \textbf{keep\_tiers}: Keep only phenotypes with high severity Tiers. + + \begin{enumerate} + \def\labelenumii{\arabic{enumii}.} + \item + We used a combination of manual curation and automated text-based + substring queries to assign each phenotype a severity Tier as + characterised in a survey of healthcare + professionals\textsuperscript{74}. + \item + Tier 1: Diseases that shortened lifespan in adolescence or earlier + or resulted in intellectual disability. + \item + Tier 2: Diseases that shortened lifespan prematurely in adulthood, + or resulted in impaired mobility or internal physical malformation. 
+ \item + Tier 3: Diseases causing sensory impairments (hearing, vision, + touch, pain, or other), immunodeficiency/cancer, mental illness, or + dysmorphic features. + \item + Tier 4: Diseases that reduce fertility. Of the 49 phenotypes that + were available in this severity ranking, we selected three that were + classified as Tier 1 (the most severe disease category): mental + deterioration, coma and respiratory failure. + \end{enumerate} +\item + \textbf{severity\_threshold}: Keep only phenotypes with mean severity + score equal to or below the threshold. + + \begin{enumerate} + \def\labelenumii{\arabic{enumii}.} + \item + Severity scores were computed by assigning each severity modifier + term found in the HPO annotations a numerical value. In order of + increasing severity: + \item + HP:0012825 ``Mild'' (Severity\_score=4) + \item + HP:0012827 ``Borderline'' (Severity\_score=3) + \item + HP:0012828 ``Severe'' (Severity\_score=2) + \item + HP:0012829 ``Profound'' (Severity\_score=1) + \end{enumerate} +\item + \textbf{pheno\_frequency\_threshold}: Keep only phenotypes with mean + frequency equal to or above the threshold (i.e.~how frequently a + phenotype is associated with any diseases in which it occurs). + + \begin{enumerate} + \def\labelenumii{\arabic{enumii}.} + \tightlist + \item + Keep phenotypes with a mean frequency ≥10\% or are NA by default. + \end{enumerate} +\item + \textbf{keep\_celltypes}: Keep only terminally differentiated cell + types. + + \begin{enumerate} + \def\labelenumii{\arabic{enumii}.} + \tightlist + \item + Of the 77 cell types tested in the Descartes cell type reference, + the 40 terminally differentiated cell types were identified through + a literature search. 
Of these, three (extravillous trophoblasts, + syncytiotrophoblasts and trophoblast giant cells) were excluded as + they only played a role in pregnancy\textsuperscript{76--78}, which + would raise additional technical and ethical challenges as rAAV + therapy has not yet been used to target foetuses in clinical trials. + \end{enumerate} +\item + \textbf{keep\_seqnames}: Remove genes on non-standard chromosomes. + + \begin{enumerate} + \def\labelenumii{\arabic{enumii}.} + \tightlist + \item + Only keep chromosomes 1-22, X, and Y. + \end{enumerate} +\item + \textbf{gene\_size}: Keep only genes \textless4.3kb in length. + + \begin{enumerate} + \def\labelenumii{\arabic{enumii}.} + \tightlist + \item + Due to limitations in the length of the gene that can be carried by + the rAAV vector, genes with a length of \textgreater4.3kb were + excluded. + \end{enumerate} +\item + \textbf{keep\_biotypes}: Keep only genes belonging to certain biotypes + (e.g.~``protein\_coding'', ``processed\_transcript'', ``snRNA'', + ``lincRNA'', ``snoRNA'', ``IG\_C\_gene''). + + \begin{enumerate} + \def\labelenumii{\arabic{enumii}.} + \tightlist + \item + Keep all biotypes by default. + \end{enumerate} +\item + \textbf{gene\_frequency\_threshold}: Keep only genes at or above a + certain mean frequency threshold (i.e.~how frequently a gene is + associated with a given phenotype when observed within a disease). + + \begin{enumerate} + \def\labelenumii{\arabic{enumii}.} + \tightlist + \item + Keep genes with a mean frequency ≥10\% or are NA by default. + \end{enumerate} +\item + \textbf{keep\_specificity\_quantiles}: Keep only genes in top + specificity quantiles from the cell type dataset. + + \begin{enumerate} + \def\labelenumii{\arabic{enumii}.} + \tightlist + \item + To further narrow down genes, we extracted relevant metrics from the + Descartes reference for each gene in each cell type. These included + mean expression, specificity, and specificity quantiles (using 40 + bins). 
Only genes with the most specific quantiles (39-40) were + included for further analysis, as cell type-specific genes may be + less likely to have off-target effects in other cell types. + \end{enumerate} +\item + \textbf{keep\_mean\_exp\_quantiles}: Keep only genes in top mean + expression quantiles from the cell type dataset. +\item + \textbf{end}: Final table of prioritised cell type- / + phenotype-specific gene targets. +\end{enumerate} + +Finally, for a more comprehensive target search, we removed the +filters for onsets (keep\_onsets=NULL), Tier (keep\_tiers=NULL), +severity (severity\_threshold=NULL), and relaxed the filters for +phenotype frequency threshold (pheno\_frequency\_threshold=c(10,NA)), +gene frequency threshold (gene\_frequency\_threshold = c(10,NA)), gene +specificity quantiles (keep\_specificity\_quantiles = seq(20,40)), and +gene expression quantiles (keep\_mean\_exp\_quantiles = seq(20,40)). + + + +\end{document} diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-animal-models-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-animal-models-1.pdf new file mode 100644 index 0000000..3e29712 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-animal-models-1.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-congenital-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-congenital-1.pdf new file mode 100644 index 0000000..cae0c36 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-congenital-1.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ctd-correlation-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ctd-correlation-1.pdf new file mode 100644 index 0000000..c17b3b9 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ctd-correlation-1.pdf differ diff --git 
a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ctd-correlation-2.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ctd-correlation-2.pdf new file mode 100644 index 0000000..f86409f Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ctd-correlation-2.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ctd-correlation-3.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ctd-correlation-3.pdf new file mode 100644 index 0000000..eefcdd7 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ctd-correlation-3.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ctd-correlation-4.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ctd-correlation-4.pdf new file mode 100644 index 0000000..837ce45 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ctd-correlation-4.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-diagnosis-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-diagnosis-1.pdf new file mode 100644 index 0000000..3fe5026 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-diagnosis-1.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-evidence-histograms-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-evidence-histograms-1.pdf new file mode 100644 index 0000000..357ff18 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-evidence-histograms-1.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-network-rni-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-network-rni-1.pdf new file mode 100644 index 0000000..d26dc13 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-network-rni-1.pdf differ diff --git 
a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ontology-lvl-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ontology-lvl-1.pdf new file mode 100644 index 0000000..4cb62ce Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-ontology-lvl-1.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-prognosis-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-prognosis-1.pdf new file mode 100644 index 0000000..7c6ee9b Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-prognosis-1.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-rni-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-rni-1.pdf new file mode 100644 index 0000000..78cdea6 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-rni-1.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-summary-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-summary-1.pdf new file mode 100644 index 0000000..7c2b913 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-summary-1.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-1.pdf new file mode 100644 index 0000000..8175b39 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-1.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-2.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-2.pdf new file mode 100644 index 0000000..6b7bd4c Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-2.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-3.pdf 
b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-3.pdf new file mode 100644 index 0000000..f67c5c2 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-3.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-4.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-4.pdf new file mode 100644 index 0000000..b1d92c3 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-4.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-supp-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-supp-1.pdf new file mode 100644 index 0000000..9542354 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-supp-1.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-supp-2.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-supp-2.pdf new file mode 100644 index 0000000..8e4d58e Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-supp-2.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-supp-3.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-supp-3.pdf new file mode 100644 index 0000000..baf4daa Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-supp-3.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-supp-4.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-supp-4.pdf new file mode 100644 index 0000000..626b8bb Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-examples-supp-4.pdf differ diff --git 
a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-filter-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-filter-1.pdf new file mode 100644 index 0000000..ca68446 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-filter-1.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-validate-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-validate-1.pdf new file mode 100644 index 0000000..699d645 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-validate-1.pdf differ diff --git a/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-validate-all-1.pdf b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-validate-all-1.pdf new file mode 100644 index 0000000..2c19362 Binary files /dev/null and b/manuscript/_manuscript/_tex/index_files/figure-pdf/fig-therapy-validate-all-1.pdf differ diff --git a/manuscript/_manuscript/_tex/references.bib b/manuscript/_manuscript/_tex/references.bib new file mode 100644 index 0000000..1288de8 --- /dev/null +++ b/manuscript/_manuscript/_tex/references.bib @@ -0,0 +1,4055 @@ +% Generated by Paperpile. Check out https://paperpile.com for more information. +% BibTeX export options can be customized via Settings -> BibTeX. + +@ARTICLE{noauthor_2022-ok, + title = "Rare diseases, common challenges", + journal = "Nat. Genet.", + volume = 54, + number = 3, + pages = "215", + month = mar, + year = 2022, + language = "en" +} + +@ARTICLE{Hughes2018-yy, + title = "Regulation of islet glucagon secretion: Beyond calcium", + author = "Hughes, Jing W and Ustione, Alessandro and Lavagnino, Zeno and + Piston, David W", + abstract = "The islet of Langerhans plays a key role in glucose homeostasis + through regulated secretion of the hormones insulin and glucagon. 
+ Islet research has focused on the insulin-secreting + $\beta$-cells, even though aberrant glucagon secretion from + $\alpha$-cells also contributes to the aetiology of diabetes. + Despite its importance, the mechanisms controlling glucagon + secretion remain controversial. Proper $\alpha$-cell function + requires the islet milieu, where $\beta$- and $\delta$-cells + drive and constrain $\alpha$-cell dynamics. The response of + glucagon to glucose is similar between isolated islets and that + measured in vivo, so it appears that the glucose dependence + requires only islet-intrinsic factors and not input from blood + flow or the nervous system. Elevated intracellular free Ca2+ is + needed for $\alpha$-cell exocytosis, but interpreting Ca2+ data + is tricky since it is heterogeneous among $\alpha$-cells at all + physiological glucose levels. Total Ca2+ activity in + $\alpha$-cells increases slightly with glucose, so Ca2+ may serve + a permissive, rather than regulatory, role in glucagon secretion. + On the other hand, cAMP is a more promising candidate for + controlling glucagon secretion and is itself driven by paracrine + signalling from $\beta$- and $\delta$-cells. Another pathway, + juxtacrine signalling through the $\alpha$-cell EphA receptors, + stimulated by $\beta$-cell ephrin ligands, leads to a tonic + inhibition of glucagon secretion. We discuss potential + combinations of Ca2+ , cAMP, paracrine and juxtacrine factors in + the regulation of glucagon secretion, focusing on recent data in + the literature that might unify the field towards a quantitative + understanding of $\alpha$-cell function.", + journal = "Diabetes Obes. 
Metab.", + volume = "20 Suppl 2", + number = "Suppl 2", + pages = "127--136", + month = sep, + year = 2018, + keywords = "EphA receptor; cAMP; calcium; cellular heterogeneity; ephrin; + glucagon; insulin; islet of Langerhans; microscopy; somatostatin; + $\alpha$-cell", + language = "en" +} + +@ARTICLE{Eshchar1973-tz, + title = "Hepatic lesion in chickenpox. A case report", + author = "Eshchar, J and Reif, L and Waron, M and Alkan, W J", + journal = "Gastroenterology", + volume = 64, + number = 3, + pages = "462--466", + month = mar, + year = 1973, + language = "en" +} + +@ARTICLE{Zhao2023-qy, + title = "Prime editing: advances and therapeutic applications", + author = "Zhao, Zhihan and Shang, Peng and Mohanraju, Prarthana and + Geijsen, Niels", + abstract = "Clustered regularly interspaced short palindromic + repeats-associated protein 9 (CRISPR-Cas)-mediated genome editing + has revolutionized biomedical research and will likely change the + therapeutic and diagnostic landscape. However, CRISPR-Cas9, which + edits DNA by activating DNA double-strand break (DSB) repair + pathways, is not always sufficient for gene therapy applications + where precise mutation repair is required. Prime editing, the + latest revolution in genome-editing technologies, can achieve any + possible base substitution, insertion, or deletion without the + requirement for DSBs. However, prime editing is still in its + infancy, and further development is needed to improve editing + efficiency and delivery strategies for therapeutic applications. + We summarize latest developments in the optimization of prime + editor (PE) variants with improved editing efficiency and + precision. 
Moreover, we highlight some potential therapeutic + applications.", + journal = "Trends Biotechnol.", + volume = 41, + number = 8, + pages = "1000--1012", + month = aug, + year = 2023, + keywords = "CRIPSR--Cas; genome editing; prime editing; therapeutics", + language = "en" +} + +@ARTICLE{Bernardinelli2022-ih, + title = "Novel {POU3F4} variants identified in patients with inner ear + malformations exhibit aberrant cellular distribution and lack of + {SLC6A20} transcriptional upregulation", + author = "Bernardinelli, Emanuele and Roesch, Sebastian and Simoni, Edi and + Marino, Angela and Rasp, Gerd and Astolfi, Laura and Sarikas, + Antonio and Dossena, Silvia", + abstract = "Hearing loss (HL) is the most common sensory defect and affects + 450 million people worldwide in a disabling form. Pathogenic + sequence alterations in the POU3F4 gene, which encodes a + transcription factor, are causative of the most common type of + X-linked deafness (X-linked deafness type 3, DFN3, DFNX2). + POU3F4-related deafness is characterized by a typical inner ear + malformation, namely an incomplete partition of the cochlea type + 3 (IP3), with or without an enlargement of the vestibular + aqueduct (EVA). The pathomechanism underlying POU3F4-related + deafness and the corresponding transcriptional targets are + largely uncharacterized. Two male patients belonging to a + Caucasian cohort with HL and EVA who presented with an IP3 were + submitted to genetic analysis. Two novel sequence variants in + POU3F4 were identified by Sanger sequencing. In cell-based + assays, the corresponding protein variants (p.S74Afs*8 and + p.C327*) showed an aberrant expression and subcellular + distribution and lack of transcriptional activity. These two + protein variants failed to upregulate the transcript levels of + the amino acid transporter gene SLC6A20, which was identified as + a novel transcriptional target of POU3F4 by RNA sequencing and + RT-qPCR. 
Accordingly, POU3F4 silencing by siRNA resulted in + downregulation of SLC6A20 in mouse embryonic fibroblasts. + Moreover, we showed for the first time that SLC6A20 is expressed + in the mouse cochlea, and co-localized with POU3F4 in the spiral + ligament. The findings presented here point to a novel role of + amino acid transporters in the inner ear and pave the way for + mechanistic studies of POU3F4-related HL.", + journal = "Front. Mol. Neurosci.", + volume = 15, + pages = "999833", + month = sep, + year = 2022, + keywords = "DFN3; DFNX2; POU3F4; SLC6A20; enlarged vestibular aqueduct; + hearing loss; incomplete partition 3", + language = "en" +} + +@BOOK{Institute_of_Medicine_US_Committee_on_Accelerating_Rare_Diseases_Research_and_Orphan_Product_Development2010-vj, + title = "Coverage and Reimbursement: Incentives and Disincentives for + Product Development", + author = "{Institute of Medicine (US) Committee on Accelerating Rare + Diseases Research and Orphan Product Development} and Field, + Marilyn J and Boat, Thomas F", + abstract = "A small market is generally viewed as a disincentive for the + development of drugs. Many of the costs of developing a new drug + are incurred regardless of the size of the potential market. If, + however, a company can set a price that is high enough to + recover its costs and generate profits because enough public and + private health insurance plans and patients and families will + pay that price, then a manufacturer may not be deterred by a + small target market. 
Some orphan drugs are among the most + expensive drugs in the world, costing as much as \$400,000 per + year.", + publisher = "National Academies Press (US)", + year = 2010, + language = "en" +} + +@ARTICLE{Stoll2018-dc, + title = "Staphylococcal Enterotoxins {Dose-Dependently} Modulate the + Generation of {Myeloid-Derived} Suppressor Cells", + author = "Stoll, Hartmut and Ost, Michael and Singh, Anurag and Mehling, + Roman and Neri, Davide and Sch{\"a}fer, Iris and Velic, Ana and + Macek, Boris and Kretschmer, Dorothee and Weidenmaier, + Christopher and Hector, Andreas and Handgretinger, Rupert and + G{\"o}tz, Friedrich and Peschel, Andreas and Hartl, Dominik and + Rieber, Nikolaus", + abstract = "Staphylococcus aureus is one of the major human bacterial + pathogens causing a broad spectrum of serious infections. + Myeloid-derived suppressor cells (MDSC) represent an innate + immune cell subset capable of regulating host-pathogen + interactions, yet their role in the pathogenesis of S. aureus + infections remains incompletely defined. The aim of this study + was to determine the influence of different S. aureus strains and + associated virulence factors on human MDSC generation. Using an + in vitro MDSC generation assay we demonstrate that low + concentrations of supernatants of different S. aureus strains led + to an induction of functional MDSC, whereas increased + concentrations, conversely, reduced MDSC numbers. The + concentration-dependent reduction of MDSC correlated with T cell + proliferation and cytotoxicity. Several findings supported a role + for staphylococcal enterotoxins in modulating MDSC generation. + Staphylococcal enterotoxins recapitulated concentration-dependent + MDSC induction and inhibition, T cell proliferation and + cytotoxicity, while an enterotoxin-deficient S. aureus strain + largely failed to alter MDSC. Taken together, we identified + staphylococcal enterotoxins as main modulators of MDSC + generation. 
The inhibition of MDSC generation by staphylococcal + enterotoxins might represent a novel therapeutic target in S. + aureus infections and beyond in non-infectious conditions, such + as cancer.", + journal = "Front. Cell. Infect. Microbiol.", + volume = 8, + pages = "321", + month = sep, + year = 2018, + keywords = "MDSC; S. aureus; Staphylococcus aureus; T cells; enterotoxin; + granulocytes; immunomodulation; myeloid-derived suppressor cells", + language = "en" +} + +@ARTICLE{Dixon2013-ok, + title = "Kupffer cells in the liver", + author = "Dixon, Laura J and Barnes, Mark and Tang, Hui and Pritchard, + Michele T and Nagy, Laura E", + abstract = "Kupffer cells are a critical component of the mononuclear + phagocytic system and are central to both the hepatic and + systemic response to pathogens. Kupffer cells are reemerging as + critical mediators of both liver injury and repair. Kupffer cells + exhibit a tremendous plasticity; depending on the local metabolic + and immune environment, then can express a range of polarized + phenotypes, from the proinflammatory M1 phenotype to the + alternative/M2 phenotype. Multiple M2 phenotypes can be + distinguished, each involved in the resolution of inflammation + and wound healing. Here, we have provided an update on recent + research that has contributed to the developing delineation of + the contribution of Kupffer cells to different types of liver + injury, with an emphasis on alcoholic and nonalcoholic liver + diseases. These recent advances in our understanding of Kupffer + cell function and regulation will likely provide new insights + into the potential for therapeutic manipulation of Kupffer cells + to promote the resolution of inflammation and enhance wound + healing in liver disease.", + journal = "Compr. 
Physiol.", + volume = 3, + number = 2, + pages = "785--797", + month = apr, + year = 2013, + language = "en" +} + +@ARTICLE{Godbout2023-uo, + title = "Prime Editing for Human Gene Therapy: Where Are We Now?", + author = "Godbout, Kelly and Tremblay, Jacques P", + abstract = "Gene therapy holds tremendous potential in the treatment of + inherited diseases. Unlike traditional medicines, which only + treat the symptoms, gene therapy has the potential to cure the + disease by addressing the root of the problem: genetic mutations. + The discovery of CRISPR/Cas9 in 2012 paved the way for the + development of those therapies. Improvement of this system led to + the recent development of an outstanding technology called prime + editing. This system can introduce targeted insertions, + deletions, and all 12 possible base-to-base conversions in the + human genome. Since the first publication on prime editing in + 2019, groups all around the world have worked on this promising + technology to develop a treatment for genetic diseases. To date, + prime editing has been attempted in preclinical studies for + liver, eye, skin, muscular, and neurodegenerative hereditary + diseases, in addition to cystic fibrosis, beta-thalassemia, + X-linked severe combined immunodeficiency, and cancer. 
In this + review, we portrayed where we are now on prime editing for human + gene therapy and outlined the best strategies for correcting + pathogenic mutations by prime editing.", + journal = "Cells", + volume = 12, + number = 4, + month = feb, + year = 2023, + keywords = "CRISPR/Cas9; gene therapy; genetic diseases; inherited diseases; + prime editing", + language = "en" +} + +@ARTICLE{Nuijten2022-yc, + title = "Pricing Zolgensma - the world's most expensive drug", + author = "Nuijten, Mark", + journal = "J Mark Access Health Policy", + volume = 10, + number = 1, + pages = "2022353", + year = 2022, + keywords = "Price valuation reimbursement; orphan disease", + language = "en" +} + +@ARTICLE{Kamada1989-qf, + title = "An algorithm for drawing general undirected graphs", + author = "Kamada, Tomihisa and Kawai, Satoru", + journal = "Inf. Process. Lett.", + volume = 31, + number = 1, + pages = "7--15", + month = apr, + year = 1989, + keywords = "Graph; network structure; layout; drawing algorithm" +} + +@ARTICLE{Haque2017-bn, + title = "A practical guide to single-cell {RNA-sequencing} for biomedical + research and clinical applications", + author = "Haque, Ashraful and Engel, Jessica and Teichmann, Sarah A and + L{\"o}nnberg, Tapio", + abstract = "RNA sequencing (RNA-seq) is a genomic approach for the detection + and quantitative analysis of messenger RNA molecules in a + biological sample and is useful for studying cellular responses. + RNA-seq has fueled much discovery and innovation in medicine over + recent years. For practical reasons, the technique is usually + conducted on samples comprising thousands to millions of cells. + However, this has hindered direct assessment of the fundamental + unit of biology-the cell. Since the first single-cell + RNA-sequencing (scRNA-seq) study was published in 2009, many more + have been conducted, mostly by specialist laboratories with + unique skills in wet-lab single-cell genomics, bioinformatics, + and computation. 
However, with the increasing commercial + availability of scRNA-seq platforms, and the rapid ongoing + maturation of bioinformatics approaches, a point has been reached + where any biomedical researcher or clinician can use scRNA-seq to + make exciting discoveries. In this review, we present a practical + guide to help researchers design their first scRNA-seq studies, + including introductory information on experimental hardware, + protocol choice, quality control, data analysis and biological + interpretation.", + journal = "Genome Med.", + volume = 9, + number = 1, + pages = "75", + month = aug, + year = 2017, + language = "en" +} + +@ARTICLE{Zhou2013-wx, + title = "Cell type specific gene delivery by lentiviral vectors: New + options in immunotherapy", + author = "Zhou, Qi and Buchholz, Christian J", + abstract = "Many cells of the immune system are defined by distinct surface + markers, which can be used to restrict gene delivery exclusively + to a cell type of choice. This article explains recent findings + about a CD8-specific vector that enhances the killing of tumor + cells in TCR-based gene transfer strategies.", + journal = "Oncoimmunology", + volume = 2, + number = 1, + pages = "e22566", + month = jan, + year = 2013, + keywords = "CD8+ T lymphocytes; gene transfer; lentiviral vector; targeting; + tumor", + language = "en" +} + +@ARTICLE{Hu2010-eh, + title = "Development and function of trophoblast giant cells in the rodent + placenta", + author = "Hu, Dong and Cross, James C", + abstract = "Trophoblast giant cells (TGCs) are the first cell type to + terminally differentiate during embryogenesis and are of vital + importance for implantation and modulation of post-implantation + placentation. TGCs are mononuclear and polyploid but are + heterogenous and dynamic. At least four different subtypes of + TGCs are present within the mature placenta that have distinct + cell lineage origins. 
The development of TGCs is complex and + requires transition from the mitotic to the endoreduplication + cell cycle and is regulated by a wide variety of factors. During + early gestation, TGCs mediate blastocyst attachment and invasion + into the uterine epithelium, regulate uterus decidualization, and + anatomosis with maternal blood spaces to form the transient yolk + sac placenta. During later gestation, TGCs secrete a wide array + of hormones and paracrine factors, including steroid hormones and + Prolactin-related cytokines, to target the maternal physiological + systems for proper maternal adaptations to pregnancy and the + fetal-maternal interface to ensure vasculature remodeling. The + large number of mouse mutants with defects in TGC development and + function are giving us significant new insights into the biology + of these fascinating cells.", + journal = "Int. J. Dev. Biol.", + volume = 54, + number = "2-3", + pages = "341--354", + year = 2010, + language = "en" +} + +@ARTICLE{Karslioglu_French2019-yi, + title = "Diabetic ketoacidosis and hyperosmolar hyperglycemic syndrome: + review of acute decompensated diabetes in adult patients", + author = "Karslioglu French, Esra and Donihi, Amy C and Korytkowski, Mary T", + abstract = "Diabetic ketoacidosis and hyperosmolar hyperglycemic syndrome + (HHS) are life threatening complications that occur in patients + with diabetes. In addition to timely identification of the + precipitating cause, the first step in acute management of these + disorders includes aggressive administration of intravenous + fluids with appropriate replacement of electrolytes (primarily + potassium). In patients with diabetic ketoacidosis, this is + always followed by administration of insulin, usually via an + intravenous insulin infusion that is continued until resolution + of ketonemia, but potentially via the subcutaneous route in mild + cases. 
Careful monitoring by experienced physicians is needed + during treatment for diabetic ketoacidosis and HHS. Common + pitfalls in management include premature termination of + intravenous insulin therapy and insufficient timing or dosing of + subcutaneous insulin before discontinuation of intravenous + insulin. This review covers recommendations for acute management + of diabetic ketoacidosis and HHS, the complications associated + with these disorders, and methods for preventing recurrence. It + also discusses why many patients who present with these disorders + are at high risk for hospital readmissions, early morbidity, and + mortality well beyond the acute presentation.", + journal = "BMJ", + volume = 365, + pages = "l1114", + month = may, + year = 2019, + language = "en" +} + +@ARTICLE{Smeds2022-fm, + title = "X-linked Malformation Deafness: Neurodevelopmental Symptoms Are + Common in Children With {IP3} Malformation and Mutation in + {POU3F4}", + author = "Smeds, Henrik and Wales, Jeremy and Karltorp, Eva and Anderlid, + Britt-Marie and Henricson, Cecilia and Asp, Filip and Anmyr, Lena + and Lagerstedt-Robinson, Kristina and L{\"o}fkvist, Ulrika", + abstract = "OBJECTIVE: Incomplete partition type 3 (IP3) malformation + deafness is a rare hereditary cause of congenital or rapid + progressive hearing loss. The children present with a severe to + profound mixed hearing loss and temporal bone imaging show a + typical inner ear malformation classified as IP3. Cochlear + implantation is one option of hearing restoration in severe + cases. Little is known about other specific difficulties these + children might exhibit, for instance possible neurodevelopmental + symptoms. MATERIAL AND METHODS: Ten 2; 0 to 9; 6-year-old + children with IP3 malformation deafness (nine boys and one girl) + with cochlear implants were evaluated with a retrospective chart + review in combination with an additional extensive + multidisciplinary assessment day. 
Hearing, language, cognition, + and mental ill-health were compared with a control group of ten + 1; 6 to 14; 5-year-old children with cochlear implants (seven + boys and three girls) with another genetic cause of deafness, + mutations in the GJB2 gene. RESULTS: Mutations in POU3F4 were + found in nine of the 10 children with IP3 malformation. Children + with IP3 malformation deafness had an atypical outcome with low + level of speech recognition (especially in noise), executive + functioning deficits, delayed or impaired speech as well as + atypical lexical-semantic and pragmatic abilities, and exhibited + mental ill-health issues. Parents of children with IP3 + malformation were more likely to report that they were worried + about their child's psychosocial wellbeing. Controls, however, + had more age-typical results in all these domains. Eight of 10 + children in the experimental group had high nonverbal cognitive + ability despite their broad range of neurodevelopmental symptoms. + CONCLUSIONS: While cochlear implantation is a feasible + alternative for children with IP3 malformation deafness, + co-occurring neurodevelopmental anomalies, such as attention + deficit hyperactivity or developmental language disorder, and + mental ill-health issues require an extensive and consistent + multidisciplinary team approach during childhood to support their + overall habilitation.", + journal = "Ear Hear.", + volume = 43, + number = 1, + pages = "53--69", + year = 2022, + language = "en" +} + +@ARTICLE{De_Kok1995-qy, + title = "Association between X-linked mixed deafness and mutations in the + {POU} domain gene {POU3F4}", + author = "de Kok, Y J and van der Maarel, S M and Bitner-Glindzicz, M and + Huber, I and Monaco, A P and Malcolm, S and Pembrey, M E and + Ropers, H H and Cremers, F P", + abstract = "Deafness with fixation of the stapes (DFN3) is the most frequent + X-linked form of hearing impairment. 
The underlying gene has been + localized to a 500-kilobase segment of the Xq21 band. Here, it is + reported that a candidate gene for this disorder, Brain 4 + (POU3F4), which encodes a transcription factor with a POU domain, + maps to the same interval. In five unrelated patients with DFN3 + but not in 50 normal controls, small mutations were found that + result in truncation of the predicted protein or in + nonconservative amino acid substitutions. These findings indicate + that POU3F4 mutations are a molecular cause of DFN3.", + journal = "Science", + volume = 267, + number = 5198, + pages = "685--688", + month = feb, + year = 1995, + language = "en" +} + +@ARTICLE{Mendell2017-kg, + title = "{Single-Dose} {Gene-Replacement} Therapy for Spinal Muscular + Atrophy", + author = "Mendell, Jerry R and Al-Zaidy, Samiah and Shell, Richard and + Arnold, W Dave and Rodino-Klapac, Louise R and Prior, Thomas W + and Lowes, Linda and Alfano, Lindsay and Berry, Katherine and + Church, Kathleen and Kissel, John T and Nagendran, Sukumar and + L'Italien, James and Sproule, Douglas M and Wells, Courtney and + Cardenas, Jessica A and Heitzer, Marjet D and Kaspar, Allan and + Corcoran, Sarah and Braun, Lyndsey and Likhite, Shibi and + Miranda, Carlos and Meyer, Kathrin and Foust, K D and Burghes, + Arthur H M and Kaspar, Brian K", + abstract = "BACKGROUND: Spinal muscular atrophy type 1 (SMA1) is a + progressive, monogenic motor neuron disease with an onset during + infancy that results in failure to achieve motor milestones and + in death or the need for mechanical ventilation by 2 years of + age. We studied functional replacement of the mutated gene + encoding survival motor neuron 1 (SMN1) in this disease. METHODS: + Fifteen patients with SMA1 received a single dose of intravenous + adeno-associated virus serotype 9 carrying SMN complementary DNA + encoding the missing SMN protein. 
Three of the patients received + a low dose (6.7$\times$1013 vg per kilogram of body weight), and + 12 received a high dose (2.0$\times$1014 vg per kilogram). The + primary outcome was safety. The secondary outcome was the time + until death or the need for permanent ventilatory assistance. In + exploratory analyses, we compared scores on the CHOP INTEND + (Children's Hospital of Philadelphia Infant Test of Neuromuscular + Disorders) scale of motor function (ranging from 0 to 64, with + higher scores indicating better function) in the two cohorts and + motor milestones in the high-dose cohort with scores in studies + of the natural history of the disease (historical cohorts). + RESULTS: As of the data cutoff on August 7, 2017, all 15 patients + were alive and event-free at 20 months of age, as compared with a + rate of survival of 8\% in a historical cohort. In the high-dose + cohort, a rapid increase from baseline in the score on the CHOP + INTEND scale followed gene delivery, with an increase of 9.8 + points at 1 month and 15.4 points at 3 months, as compared with a + decline in this score in a historical cohort. Of the 12 patients + who had received the high dose, 11 sat unassisted, 9 rolled over, + 11 fed orally and could speak, and 2 walked independently. + Elevated serum aminotransferase levels occurred in 4 patients and + were attenuated by prednisolone. CONCLUSIONS: In patients with + SMA1, a single intravenous infusion of adeno-associated viral + vector containing DNA coding for SMN resulted in longer survival, + superior achievement of motor milestones, and better motor + function than in historical cohorts. Further studies are + necessary to confirm the safety and efficacy of this gene + therapy. (Funded by AveXis and others; ClinicalTrials.gov number, + NCT02122952 .).", + journal = "N. Engl. J. 
Med.", + volume = 377, + number = 18, + pages = "1713--1722", + month = nov, + year = 2017, + language = "en" +} + +@ARTICLE{Jackson2021-cp, + title = "{OBO} Foundry in 2021: operationalizing open data principles to + evaluate ontologies", + author = "Jackson, Rebecca and Matentzoglu, Nicolas and Overton, James A + and Vita, Randi and Balhoff, James P and Buttigieg, Pier Luigi + and Carbon, Seth and Courtot, Melanie and Diehl, Alexander D and + Dooley, Damion M and Duncan, William D and Harris, Nomi L and + Haendel, Melissa A and Lewis, Suzanna E and Natale, Darren A and + Osumi-Sutherland, David and Ruttenberg, Alan and Schriml, Lynn M + and Smith, Barry and Stoeckert, Jr, Christian J and Vasilevsky, + Nicole A and Walls, Ramona L and Zheng, Jie and Mungall, + Christopher J and Peters, Bjoern", + abstract = "Biological ontologies are used to organize, curate and interpret + the vast quantities of data arising from biological experiments. + While this works well when using a single ontology, integrating + multiple ontologies can be problematic, as they are developed + independently, which can lead to incompatibilities. The Open + Biological and Biomedical Ontologies (OBO) Foundry was created to + address this by facilitating the development, harmonization, + application and sharing of ontologies, guided by a set of + overarching principles. One challenge in reaching these goals was + that the OBO principles were not originally encoded in a precise + fashion, and interpretation was subjective. Here, we show how we + have addressed this by formally encoding the OBO principles as + operational rules and implementing a suite of automated + validation checks and a dashboard for objectively evaluating each + ontology's compliance with each principle. This entailed a + substantial effort to curate metadata across all ontologies and + to coordinate with individual stakeholders. 
We have applied these + checks across the full OBO suite of ontologies, revealing areas + where individual ontologies require changes to conform to our + principles. Our work demonstrates how a sizable, federated + community can be organized and evaluated on objective criteria + that help improve overall quality and interoperability, which is + vital for the sustenance of the OBO project and towards the + overall goals of making data Findable, Accessible, Interoperable, + and Reusable (FAIR). Database URL http://obofoundry.org/.", + journal = "Database", + volume = 2021, + month = oct, + year = 2021, + language = "en" +} + +@ARTICLE{Bueren2023-ma, + title = "Advances and Challenges in the Development of Gene Therapy + Medicinal Products for Rare Diseases", + author = "Bueren, Juan A and Auricchio, Alberto", + abstract = "The development of viral vectors and recombinant DNA technology + since the 1960s has enabled gene therapy to become a real + therapeutic option for several inherited and acquired diseases. + After several ups and downs in the gene therapy field, we are + currently living a new era in the history of medicine in which + several ex vivo and in vivo gene therapies have reached maturity. + This is testified by the recent marketing authorization of + several gene therapy medicinal products. In addition, many others + are currently under evaluation after exhaustive investigation in + human clinical trials. In this review, we summarize some of the + most significant milestones in the development of gene therapy + medicinal products that have already facilitated the treatment of + a significant number of rare diseases. Despite progresses in the + gene therapy field, the transfer of these innovative therapies to + clinical practice is also finding important restrictions. 
+ Advances and also challenges in the progress of gene therapy for + rare diseases are discussed in this opening review of a Human + Gene Therapy issue dedicated to the 30th annual Congress of the + European Society for Gene and Cell Therapy.", + journal = "Hum. Gene Ther.", + volume = 34, + number = "17-18", + pages = "763--775", + month = sep, + year = 2023, + keywords = "ATMPs; gene therapy; rare diseases", + language = "en" +} + +@ARTICLE{Diehl2016-gt, + title = "The Cell Ontology 2016: enhanced content, modularization, and + ontology interoperability", + author = "Diehl, Alexander D and Meehan, Terrence F and Bradford, Yvonne M + and Brush, Matthew H and Dahdul, Wasila M and Dougall, David S + and He, Yongqun and Osumi-Sutherland, David and Ruttenberg, Alan + and Sarntivijai, Sirarat and Van Slyke, Ceri E and Vasilevsky, + Nicole A and Haendel, Melissa A and Blake, Judith A and Mungall, + Christopher J", + abstract = "BACKGROUND: The Cell Ontology (CL) is an OBO Foundry candidate + ontology covering the domain of canonical, natural biological + cell types. Since its inception in 2005, the CL has undergone + multiple rounds of revision and expansion, most notably in its + representation of hematopoietic cells. For in vivo cells, the CL + focuses on vertebrates but provides general classes that can be + used for other metazoans, which can be subtyped in + species-specific ontologies. CONSTRUCTION AND CONTENT: Recent + work on the CL has focused on extending the representation of + various cell types, and developing new modules in the CL itself, + and in related ontologies in coordination with the CL. For + example, the Kidney and Urinary Pathway Ontology was used as a + template to populate the CL with additional cell types. In + addition, subtypes of the class 'cell in vitro' have received + improved definitions and labels to provide for modularity with + the representation of cells in the Cell Line Ontology and Reagent + Ontology. 
Recent changes in the ontology development methodology + for CL include a switch from OBO to OWL for the primary encoding + of the ontology, and an increasing reliance on logical + definitions for improved reasoning. UTILITY AND DISCUSSION: The + CL is now mandated as a metadata standard for large functional + genomics and transcriptomics projects, and is used extensively + for annotation, querying, and analyses of cell type specific data + in sequencing consortia such as FANTOM5 and ENCODE, as well as + for the NIAID ImmPort database and the Cell Image Library. The CL + is also a vital component used in the modular construction of + other biomedical ontologies-for example, the Gene Ontology and + the cross-species anatomy ontology, Uberon, use CL to support the + consistent representation of cell types across different levels + of anatomical granularity, such as tissues and organs. + CONCLUSIONS: The ongoing improvements to the CL make it a + valuable resource to both the OBO Foundry community and the wider + scientific community, and we continue to experience increased + interest in the CL both among developers and within the user + community.", + journal = "J. Biomed. Semantics", + volume = 7, + number = 1, + pages = "44", + month = jul, + year = 2016, + language = "en" +} + +@ARTICLE{Paff2021-fc, + title = "Current and Future Treatments in Primary Ciliary Dyskinesia", + author = "Paff, Tamara and Omran, Heymut and Nielsen, Kim G and Haarman, + Eric G", + abstract = "Primary ciliary dyskinesia (PCD) is a rare genetic ciliopathy in + which mucociliary clearance is disturbed by the abnormal motion + of cilia or there is a severe reduction in the generation of + multiple motile cilia. Lung damage ensues due to recurrent airway + infections, sometimes even resulting in respiratory failure. So + far, no causative treatment is available and treatment efforts + are primarily aimed at improving mucociliary clearance and early + treatment of bacterial airway infections. 
Treatment guidelines + are largely based on cystic fibrosis (CF) guidelines, as few + studies have been performed on PCD. In this review, we give a + detailed overview of the clinical studies performed investigating + PCD to date, including three trials and several case reports. In + addition, we explore precision medicine approaches in PCD, + including gene therapy, mRNA transcript and read-through therapy.", + journal = "Int. J. Mol. Sci.", + volume = 22, + number = 18, + month = sep, + year = 2021, + keywords = "genetic; primary ciliary dyskinesia; treatment", + language = "en" +} + +@ARTICLE{Zhu2020-vo, + title = "An integrative knowledge graph for rare diseases, derived from + the Genetic and Rare Diseases Information Center ({GARD})", + author = "Zhu, Qian and Nguyen, Dac-Trung and Grishagin, Ivan and Southall, + Noel and Sid, Eric and Pariser, Anne", + abstract = "BACKGROUND: The Genetic and Rare Diseases (GARD) Information + Center was established by the National Institutes of Health (NIH) + to provide freely accessible consumer health information on over + 6500 genetic and rare diseases. As the cumulative scientific + understanding and underlying evidence for these diseases have + expanded over time, existing practices to generate knowledge from + these publications and resources have not been able to keep pace. + Through determining the applicability of computational approaches + to enhance or replace manual curation tasks, we aim to both + improve the sustainability and relevance of consumer health + information, but also to develop a foundational database, from + which translational science researchers may start to unravel + disease characteristics that are vital to the research process. + RESULTS: We developed a meta-ontology based integrative knowledge + graph for rare diseases in Neo4j. 
This integrative knowledge + graph includes a total of 3,819,623 nodes and 84,223,681 + relations from 34 different biomedical data resources, including + curated drug and rare disease associations. Semi-automatic + mappings were generated for 2154 unique FDA orphan designations + to 776 unique GARD diseases, and 3322 unique FDA designated drugs + to UNII, as well as 180,363 associations between drug and + indication from Inxight Drugs, which were integrated into the + knowledge graph. We conducted four case studies to demonstrate + the capabilities of this integrative knowledge graph in + accelerating the curation of scientific understanding on rare + diseases through the generation of disease mappings/profiles and + pathogenesis associations. CONCLUSIONS: By integrating + well-established database resources, we developed an integrative + knowledge graph containing a large volume of biomedical and + research data. Demonstration of several immediate use cases and + limitations of this process reveal both the potential feasibility + and barriers of utilizing graph-based resources and approaches to + support their use by providers of consumer health information, + such as GARD, that may struggle with the needs of maintaining + knowledge reliant on an evolving and growing evidence-base. + Finally, the successful integration of these datasets into a + freely accessible knowledge graph highlights an opportunity to + take a translational science view on the field of rare diseases + by enabling researchers to identify disease characteristics, + which may play a role in the translation of discover across + different research domains.", + journal = "J. Biomed. 
Semantics", + volume = 11, + number = 1, + pages = "13", + month = nov, + year = 2020, + keywords = "Data integration; GARD; Knowledge graph; Ontology; Rare diseases", + language = "en" +} + +@MISC{Lu2024-kl, + title = "{FDA} takes first step toward international regulation of + gene therapies to treat rare diseases", + booktitle = "National Law Review", + author = "Lu, Chia-Feng", + abstract = "Go-To Guide: FDA announces launch of its Collaboration on Gene + Therapies Global Pilot (CoGenT Global) to encourage + ``regulatory convergence'' across countries,", + month = jan, + year = 2024, + howpublished = "\url{https://www.natlawreview.com/c/s/www.natlawreview.com/article/fda-takes-first-step-toward-international-regulation-gene-therapies-treat-rare?amp}", + note = "Accessed: 2024-2-17" +} + +@ARTICLE{Gu2016-op, + title = "Complex heatmaps reveal patterns and correlations in + multidimensional genomic data", + author = "Gu, Zuguang and Eils, Roland and Schlesner, Matthias", + abstract = "UNLABELLED: Parallel heatmaps with carefully designed annotation + graphics are powerful for efficient visualization of patterns and + relationships among high dimensional genomic data. Here we + present the ComplexHeatmap package that provides rich + functionalities for customizing heatmaps, arranging multiple + parallel heatmaps and including user-defined annotation graphics. + We demonstrate the power of ComplexHeatmap to easily reveal + patterns and correlations among multiple sources of information + with four real-world datasets.
AVAILABILITY AND IMPLEMENTATION: + The ComplexHeatmap package and documentation are freely available + from the Bioconductor project: + http://www.bioconductor.org/packages/devel/bioc/html/ComplexHeatmap.html + CONTACT: m.schlesner@dkfz.de SUPPLEMENTARY INFORMATION: + Supplementary data are available at Bioinformatics online.", + journal = "Bioinformatics", + volume = 32, + number = 18, + pages = "2847--2849", + month = sep, + year = 2016, + language = "en" +} + +@ARTICLE{Diaz-Santiago2020-ep, + title = "Phenotype-genotype comorbidity analysis of patients with rare + disorders provides insight into their pathological and molecular + bases", + author = "D{\'\i}az-Santiago, Elena and Jabato, Fernando M and Rojano, + Elena and Seoane, Pedro and Pazos, Florencio and Perkins, James R + and Ranea, Juan A G", + abstract = "Genetic and molecular analysis of rare disease is made difficult + by the small numbers of affected patients. Phenotypic comorbidity + analysis can help rectify this by combining information from + individuals with similar phenotypes and looking for overlap in + terms of shared genes and underlying functional systems. However, + few studies have combined comorbidity analysis with genomic data. + We present a computational approach that connects patient + phenotypes based on phenotypic co-occurence and uses genomic + information related to the patient mutations to assign genes to + the phenotypes, which are used to detect enriched functional + systems. These phenotypes are clustered using network analysis to + obtain functionally coherent phenotype clusters. We applied the + approach to the DECIPHER database, containing phenotypic and + genomic information for thousands of patients with heterogeneous + rare disorders and copy number variants. Validity was + demonstrated through overlap with known diseases, co-mention + within the biomedical literature, semantic similarity measures, + and patient cluster membership. 
These connected pairs formed + multiple phenotype clusters, showing functional coherence, and + mapped to genes and systems involved in similar pathological + processes. Examples include claudin genes from the 22q11 genomic + region associated with a cluster of phenotypes related to + DiGeorge syndrome and genes related to the GO term + anterior/posterior pattern specification associated with abnormal + development. The clusters generated can help with the diagnosis + of rare diseases, by suggesting additional phenotypes for a given + patient and potential underlying functional systems. Other tools + to find causal genes based on phenotype were also investigated. + The approach has been implemented as a workflow, named PhenCo, + which can be adapted to any set of patients for which phenomic + and genomic data is available. Full details of the analysis, + including the clusters formed, their constituent functional + systems and underlying genes are given. Code to implement the + workflow is available from GitHub.", + journal = "PLoS Genet.", + volume = 16, + number = 10, + pages = "e1009054", + month = oct, + year = 2020, + language = "en" +} + +@ARTICLE{Heim2014-du, + title = "Myeloid-derived suppressor cells contribute to Staphylococcus + aureus orthopedic biofilm infection", + author = "Heim, Cortney E and Vidlak, Debbie and Scherr, Tyler D and Kozel, + Jessica A and Holzapfel, Melissa and Muirhead, David E and + Kielian, Tammy", + abstract = "Myeloid-derived suppressor cells (MDSCs) are a heterogeneous + population of immature monocytes and granulocytes that are potent + inhibitors of T cell activation. A role for MDSCs in bacterial + infections has only recently emerged, and nothing is known about + MDSC function in the context of Staphylococcus aureus infection. + Because S. aureus biofilms are capable of subverting + immune-mediated clearance, we examined whether MDSCs could play a + role in this process. 
CD11b(+)Gr-1(+) MDSCs represented the main + cellular infiltrate during S. aureus orthopedic biofilm + infection, accounting for >75\% of the CD45+ population. + Biofilm-associated MDSCs inhibited T cell proliferation and + cytokine production, which correlated with a paucity of T cell + infiltrates at the infection site. Analysis of FACS-purified + MDSCs recovered from S. aureus biofilms revealed increased + arginase-1, inducible NO synthase, and IL-10 expression, key + mediators of MDSC suppressive activity. Targeted depletion of + MDSCs and neutrophils using the mAb 1A8 (anti-Ly6G) improved + bacterial clearance by enhancing the intrinsic proinflammatory + attributes of infiltrating monocytes and macrophages. + Furthermore, the ability of monocytes/macrophages to promote + biofilm clearance in the absence of MDSC action was revealed with + RB6-C85 (anti-Gr-1 or anti-Ly6G/Ly6C) administration, which + resulted in significantly increased S. aureus burdens both + locally and in the periphery, because effector Ly 6C monocytes + and, by extension, mature macrophages were also depleted. + Collectively, these results demonstrate that MDSCs are key + contributors to the chronicity of S. aureus biofilm infection, as + their immunosuppressive function prevents monocyte/macrophage + proinflammatory activity, which facilitates biofilm persistence.", + journal = "J. Immunol.", + volume = 192, + number = 8, + pages = "3778--3792", + month = apr, + year = 2014, + language = "en" +} + +@ARTICLE{Brewer2018-dg, + title = "Acute Liver Failure due to Disseminated Varicella Zoster + Infection", + author = "Brewer, Elizabeth Caitlin and Hunter, Leigh", + abstract = "Acute liver failure (ALF) can be due to numerous causes and + result in fatality or necessitate liver transplantation if left + untreated. 
Possible etiologies of ALF include ischemia, venous + obstruction, medications, toxins, autoimmune hepatitis, metabolic + and infectious causes including hepatitis A-E, varicella-zoster + virus (VZV), cytomegalovirus (CMV), herpes simplex virus (HSV), + Epstein-Barr virus (EBV), and adenovirus with VZV being the most + rarely reported. Pathognomonic skin lesions facilitate diagnosis + of VZV hepatitis, but definitive diagnosis is secured with liver + biopsy, tissue histopathology, culture, and specific VZV + polymerase chain reaction (PCR). Antiviral treatment with + intravenous acyclovir can be effective if initiated in a timely + manner; however, comorbidities and complications frequently + result in high mortality, especially in immunocompromised hosts + as exemplified in this case presentation.", + journal = "Case Reports Hepatol", + volume = 2018, + pages = "1269340", + month = sep, + year = 2018, + language = "en" +} + +@ARTICLE{Al-Hamoudi2009-le, + title = "Severe autoimmune hepatitis triggered by varicella zoster + infection", + author = "Al-Hamoudi, Waleed K", + abstract = "Autoimmune hepatitis (AIH) is a chronic disease of unknown + etiology that is characterized by the presence of circulatory + autoantibodies and inflammatory histological changes in the + liver. Although the pathogenesis of AIH is not known, it is + thought that, in a genetically predisposed individual, + environmental factors such as viruses can trigger the autoimmune + process. Herpes simplex virus, Epstein-Barr virus, measles virus, + and hepatitis viruses are thought to play a role in the etiology + of AIH. Proteins belonging to these viruses may be similar to the + amino acid chains of different autoantigens in the liver, this + causes immune cross reactions and liver tissue damage. 
We report + a case of severe AIH following varicella zoster infection in a + 23-year-old man, and speculate that, based on the molecular + mimicry hypothesis, the liver damage was caused by an immune + cross reaction to the viral proteins. Varicella-zoster-induced + AIH has not been reported previously.", + journal = "World J. Gastroenterol.", + volume = 15, + number = 8, + pages = "1004--1006", + month = feb, + year = 2009, + language = "en" +} + +@ARTICLE{Svensson2020-lg, + title = "A curated database reveals trends in single-cell transcriptomics", + author = "Svensson, Valentine and da Veiga Beltrame, Eduardo and Pachter, + Lior", + abstract = "The more than 1000 single-cell transcriptomics studies that have + been published to date constitute a valuable and vast resource + for biological discovery. While various 'atlas' projects have + collated some of the associated datasets, most questions related + to specific tissue types, species or other attributes of studies + require identifying papers through manual and challenging + literature search. To facilitate discovery with published + single-cell transcriptomics data, we have assembled a near + exhaustive, manually curated database of single-cell + transcriptomics studies with key information: descriptions of the + type of data and technologies used, along with descriptors of the + biological systems studied. Additionally, the database contains + summarized information about analysis in the papers, allowing for + analysis of trends in the field. As an example, we show that the + number of cell types identified in scRNA-seq studies is + proportional to the number of cells analysed. 
Database URL: + www.nxn.se/single-cell-studies/gui.", + journal = "Database", + volume = 2020, + month = nov, + year = 2020, + language = "en" +} + +@ARTICLE{Philippakis2015-dq, + title = "The Matchmaker Exchange: a platform for rare disease gene + discovery", + author = "Philippakis, Anthony A and Azzariti, Danielle R and Beltran, + Sergi and Brookes, Anthony J and Brownstein, Catherine A and + Brudno, Michael and Brunner, Han G and Buske, Orion J and Carey, + Knox and Doll, Cassie and Dumitriu, Sergiu and Dyke, Stephanie O + M and den Dunnen, Johan T and Firth, Helen V and Gibbs, Richard A + and Girdea, Marta and Gonzalez, Michael and Haendel, Melissa A + and Hamosh, Ada and Holm, Ingrid A and Huang, Lijia and Hurles, + Matthew E and Hutton, Ben and Krier, Joel B and Misyura, Andriy + and Mungall, Christopher J and Paschall, Justin and Paten, + Benedict and Robinson, Peter N and Schiettecatte, Fran{\c c}ois + and Sobreira, Nara L and Swaminathan, Ganesh J and Taschner, + Peter E and Terry, Sharon F and Washington, Nicole L and + Z{\"u}chner, Stephan and Boycott, Kym M and Rehm, Heidi L", + abstract = "There are few better examples of the need for data sharing than + in the rare disease community, where patients, physicians, and + researchers must search for ``the needle in a haystack'' to + uncover rare, novel causes of disease within the genome. Impeding + the pace of discovery has been the existence of many small siloed + datasets within individual research or clinical laboratory + databases and/or disease-specific organizations, hoping for + serendipitous occasions when two distant investigators happen to + learn they have a rare phenotype in common and can ``match'' + these cases to build evidence for causality. However, serendipity + has never proven to be a reliable or scalable approach in + science. 
As such, the Matchmaker Exchange (MME) was launched to + provide a robust and systematic approach to rare disease gene + discovery through the creation of a federated network connecting + databases of genotypes and rare phenotypes using a common + application programming interface (API). The core building blocks + of the MME have been defined and assembled. Three MME services + have now been connected through the API and are available for + community use. Additional databases that support internal + matching are anticipated to join the MME network as it continues + to grow.", + journal = "Hum. Mutat.", + volume = 36, + number = 10, + pages = "915--921", + month = oct, + year = 2015, + keywords = "GA4GH, IRDiRC; Matchmaker Exchange; gene discovery; genomic API; + matchmaking; rare disease", + language = "en" +} + +@ARTICLE{Srivastava2023-ge, + title = "Utility of genetic work-up for 46, {XY} patients with severe + hypospadias", + author = "Srivastava, Priya and Tenney, Jessica and Lodish, Maya and + Slavotinek, Anna and Baskin, Laurence", + abstract = "Summary. Objective: Hypospadias is a common congenital abnormality + that has been increasing in prevalence over the last decades. + Historically, 46, XY patients with severe hypospadias and + descended scrotal testes at birth have frequently lacked a + genetic diagnosis. Platforms for molecular genetic testing have + become more readily available and can offer an insight into + underlying genetic causes of severe hypospadias. The goal of + this study was to define the anatomical characteristics of + severe hypospadias that can accurately define patients with 46, + XY severe hypospadias and determine the practical utility of + performing molecular genetic testing in this group of + patients. Methods: Patients who met the criteria for 46, XY severe + hypospadias were offered a molecular genetic work-up in + consultation with pediatric genetics. Patients were identified + through chart review.
Data extracted included karyotype, + hypospadias phenotype including stretched penile length at + diagnosis, age at genetic diagnosis, molecular genetic testing, + pathogenic gene variant(s), gender identity, and clinical + course. All patients underwent clinical genetic testing via 46, + XY Disorders of Sexual Development (DSD) panels offered by + Invitae\textregistered{}, GeneDx\textregistered{}, or Blueprint + Genetics\textregistered{}. Results: Of the 14 patients that + underwent genetic testing, there were 5 previously published and + 3 novel pathogenic or likely pathogenic variants in genes + associated with 46, XY severe hypospadias (Table). Pathogenic + variants were identified in AR (3), SRD5A2 [1], NR5A1 [2], WT1 + [1], and ARTX [1]. Two patients had a variant of unknown + significance, one in FREM2 and another in CEP41. Four had + negative gene panels. The patient with the WT1 pathogenic + variant was subsequently found to have developed a Wilms tumor + and the patients with NR5A1 pathogenic variants are now + undergoing adrenal insufficiency + surveillance. Discussion/Conclusion: Patients with 46,XY severe + hypospadias and descended testes in the scrotum at birth can + benefit from molecular genetic testing as their underlying + disorders may reveal pathogenic variants that could have + potentially life-altering consequences and change surveillance + and monitoring.", + journal = "J. Pediatr.
Urol.", + publisher = "Elsevier", + volume = 19, + number = 3, + pages = "261--272", + month = jun, + year = 2023, + keywords = "Hypospadias; Genetics; 46,XY gene platforms; Whole exome + sequencing", + language = "en" +} + +@ARTICLE{Halley2022-pd, + title = "A call for an integrated approach to improve efficiency, equity + and sustainability in rare disease research in the United States", + author = "Halley, Meghan C and Smith, Hadley Stevens and Ashley, Euan A and + Goldenberg, Aaron J and Tabor, Holly K", + abstract = "To build a more efficient, equitable, and sustainable approach to + rare disease research in the United States, we must prioritize + integrated research infrastructure and approaches that focus on + understanding connections across rare diseases.", + journal = "Nat. Genet.", + volume = 54, + number = 3, + pages = "219--222", + month = mar, + year = 2022, + language = "en" +} + +@ARTICLE{Bulaklak2020-ta, + title = "The once and future gene therapy", + author = "Bulaklak, Karen and Gersbach, Charles A", + abstract = "Gene therapy is at an inflection point. Recent successes in + genetic medicine have paved the path for a broader second wave of + therapies and laid the foundation for next-generation + technologies. This comment summarizes recent advances and + expectations for the near future.", + journal = "Nat. 
Commun.", + volume = 11, + number = 1, + pages = "5820", + month = nov, + year = 2020, + language = "en" +} + +@ARTICLE{Cao2020-qz, + title = "A human cell atlas of fetal gene expression", + author = "Cao, Junyue and O'Day, Diana R and Pliner, Hannah A and Kingsley, + Paul D and Deng, Mei and Daza, Riza M and Zager, Michael A and + Aldinger, Kimberly A and Blecher-Gonen, Ronnie and Zhang, Fan and + Spielmann, Malte and Palis, James and Doherty, Dan and Steemers, + Frank J and Glass, Ian A and Trapnell, Cole and Shendure, Jay", + abstract = "The gene expression program underlying the specification of human + cell types is of fundamental interest. We generated human cell + atlases of gene expression and chromatin accessibility in fetal + tissues. For gene expression, we applied three-level + combinatorial indexing to >110 samples representing 15 organs, + ultimately profiling ~4 million single cells. We leveraged the + literature and other atlases to identify and annotate hundreds of + cell types and subtypes, both within and across tissues. Our + analyses focused on organ-specific specializations of broadly + distributed cell types (such as blood, endothelial, and + epithelial), sites of fetal erythropoiesis (which notably + included the adrenal gland), and integration with mouse + developmental atlases (such as conserved specification of blood + cells). 
These data represent a rich resource for the exploration + of in vivo human gene expression in diverse tissues and cell + types.", + journal = "Science", + volume = 370, + number = 6518, + month = nov, + year = 2020, + language = "en" +} + +@ARTICLE{Amberger2017-tg, + title = "Searching Online Mendelian Inheritance in Man ({OMIM}): A + Knowledgebase of Human Genes and Genetic Phenotypes", + author = "Amberger, Joanna S and Hamosh, Ada", + abstract = "Online Mendelian Inheritance in Man (OMIM) at OMIM.org is the + primary repository of comprehensive, curated information on genes + and genetic phenotypes and the relationships between them. This + unit provides an overview of the types of information in OMIM and + optimal strategies for searching and retrieving the information. + OMIM.org has links to many related and complementary databases, + providing easy access to more information on a topic. The + relationship between genes and genetic disorders is highlighted + in this unit. The basic protocol explains searching OMIM both + from a gene perspective and a clinical features perspective. Two + alternate protocols provide strategies for viewing gene-phenotype + relationships: a gene map table and Quick View or Side-by-Side + format for clinical features. OMIM.org is updated nightly, and + the MIMmatch service, described in the support protocol, provides + a convenient way to follow updates to entries, gene-phenotype + relationships, and collaborate with other researchers. + \copyright{} 2017 by John Wiley \& Sons, Inc.", + journal = "Curr. Protoc.
Bioinformatics", + volume = 58, + pages = "1.2.1--1.2.12", + month = jun, + year = 2017, + keywords = "OMIM; disease gene discovery; human genetic disorders; molecular + genetics", + language = "en" +} + +@ARTICLE{Robinson2008-ys, + title = "The Human Phenotype Ontology: a tool for annotating and analyzing + human hereditary disease", + author = "Robinson, Peter N and K{\"o}hler, Sebastian and Bauer, Sebastian + and Seelow, Dominik and Horn, Denise and Mundlos, Stefan", + abstract = "There are many thousands of hereditary diseases in humans, each + of which has a specific combination of phenotypic features, but + computational analysis of phenotypic data has been hampered by + lack of adequate computational data structures. Therefore, we + have developed a Human Phenotype Ontology (HPO) with over 8000 + terms representing individual phenotypic anomalies and have + annotated all clinical entries in Online Mendelian Inheritance in + Man with the terms of the HPO. We show that the HPO is able to + capture phenotypic similarities between diseases in a useful and + highly significant fashion.", + journal = "Am. J. Hum. Genet.", + volume = 83, + number = 5, + pages = "610--615", + month = nov, + year = 2008, + language = "en" +} + +@ARTICLE{Osmond2022-ml, + title = "Outcome of over 1500 matches through the Matchmaker Exchange for + rare disease gene discovery: The 2-year experience of {Care4Rare} + Canada", + author = "Osmond, Matthew and Hartley, Taila and Dyment, David A and + Kernohan, Kristin D and Brudno, Michael and Buske, Orion J and + Innes, A Micheil and Boycott, Kym M and {Care4Rare Canada + Consortium}", + abstract = "PURPOSE: Matchmaking has emerged as a useful strategy for + building evidence toward causality of novel disease genes in + patients with undiagnosed rare diseases. The Matchmaker Exchange + (MME) is a collaborative initiative that facilitates + international data sharing for matchmaking purposes; however, + data on user experience is limited. 
METHODS: Patients enrolled as + part of the Finding of Rare Disease Genes in Canada (FORGE) and + Care4Rare Canada research programs had their exome sequencing + data reanalyzed by a multidisciplinary research team over a + 2-year period. Compelling variants in genes not previously + associated with a human phenotype were submitted through the MME + node PhenomeCentral, and outcomes were collected. RESULTS: In + this study, 194 novel candidate genes were submitted to the MME, + resulting in 1514 matches, and 15\% of the genes submitted + resulted in collaborations. Most submissions resulted in at least + 1 match, and most matches were with GeneMatcher (82\%), where + additional email exchange was required to evaluate the match + because of the lack of phenotypic or inheritance information. + CONCLUSION: Matchmaking through the MME is an effective way to + investigate novel candidate genes; however, it is a + labor-intensive process. Engagement from the community to + contribute phenotypic, genotypic, and inheritance data will + ensure that matchmaking continues to be a useful approach in the + future.", + journal = "Genet. Med.", + volume = 24, + number = 1, + pages = "100--108", + month = jan, + year = 2022, + keywords = "Data sharing; GeneMatcher; Matchmaker exchange; PhenomeCentral; + Rare diseases", + language = "en" +} + +@ARTICLE{Yates2022-ra, + title = "The economics of moonshots: Value in rare disease drug + development", + author = "Yates, Nathan and Hinkel, Jennifer", + journal = "Clin. Transl. Sci.", + volume = 15, + number = 4, + pages = "809--812", + month = apr, + year = 2022, + language = "en" +} + +@ARTICLE{Warburg2001-li, + title = "Visual impairment in adult people with intellectual disability: + literature review", + author = "Warburg, M", + abstract = "The present paper reviews studies on the prevalence of visual + impairment (VI) in adults with intellectual disability (ID). 
+ Every publication describes an alarming prevalence of blindness + and VI. Cataract and keratoconus were common. Many cases of poor + distance vision were treatable by ordinary spherical or + astigmatic glasses, but few people had had such prescriptions. + Elderly residents in community and institutional care often did + not receive glasses for near vision. Professional assessments + disclosed higher prevalences of VI than questionnaires mailed to + the care personnel. The prevalence of VI increased dramatically + with the severity of ID and with age. Regular professional + assessment of eye disorders, visual acuity and refraction are + warranted in residents in both hospital and community care.", + journal = "J. Intellect. Disabil. Res.", + volume = 45, + number = "Pt 5", + pages = "424--438", + month = oct, + year = 2001, + language = "en" +} + +@ARTICLE{Thielen2022-ud, + title = "Towards sustainability and affordability of expensive cell and + gene therapies? Applying a cost-based pricing model to estimate + prices for Libmeldy and Zolgensma", + author = "Thielen, Frederick W and Heine, Renaud J S D and van den Berg, + Sibren and Ham, Renske M T Ten and Groot, Carin A Uyl-de", + abstract = "BACKGROUND AIMS: Drug prices are regarded as one of the most + influential factors in determining accessibility and + affordability to novel therapies. Cell and gene therapies such as + OTL-200 (brand name: Libmeldy) and AVXS-101 (brand name: + Zolgensma) with (expected) list prices of 3.0 million EUR and 1.9 + million EUR per treatment, respectively, spark a global debate on + the affordability of such therapies. The aim of this study was to + use a recently published cost-based pricing model to calculate + prices for cell and gene therapies, with OTL-200 and AVXS-101 as + case study examples. METHODS: Using the pricing model proposed by + Uyl-de Groot and L{\"o}wenberg, we estimated a price for both + therapies. 
We searched the literature and online public sources + to estimate (i) research and development (R\&D) expenses adjusted + for risk of failure and cost of capital, (ii) the eligible + patient population and (iii) costs of drug manufacturing to + calculate a base-case price for OTL-200 and AVXS-101. All model + input parameters were varied in a stepwise, deterministic + sensitivity analysis and scenario analyses to assess their impact + on the calculated prices. RESULTS: Prices for OTL-200 and + AVXS-101 were estimated at 1 048 138 EUR and 380 444 EUR per + treatment, respectively. In deterministic sensitivity analyses, + varying R\&D estimates had the greatest impact on the price for + OTL-200, whereas for AVXS-101, changes in the profit margin + changed the calculated price substantially. Highest prices in + scenario analyses were achieved when assuming the lowest number + of patients for OTL-200 and highest R\&D expenses for AVXS-101. + The lowest R\&D expenses scenario resulted in lowest prices for + either therapy. CONCLUSIONS: Our results show that, using the + proposed model, prices for both OTL-200 and AVXS-101 lie + substantially below the currently (proposed) list prices for both + therapies. Nevertheless, the uncertainty of the used model input + parameters is considerable, which translates in a wide range of + estimated prices. This is mainly because of a lack of + transparency from pharmaceutical companies regarding R\&D + expenses and the costs of drug manufacturing. Simultaneously, the + disease indications for both therapies remain heavily + understudied in terms of their epidemiological profile. 
Despite + the considerable variation in the estimated prices, our results + may support the public debate on value-based and cost-based + pricing models, and on ``fair'' drug prices in general.", + journal = "Cytotherapy", + volume = 24, + number = 12, + pages = "1245--1258", + month = dec, + year = 2022, + keywords = "AVXS-101; Libmeldy; OTL-200; Zolgensma; cost-based pricing; drug + pricing; gene therapy", + language = "en" +} + +@ARTICLE{Fogarty2011-ph, + title = "A quantitative analysis of transcriptionally active + syncytiotrophoblast nuclei across human gestation", + author = "Fogarty, N M E and Mayhew, T M and Ferguson-Smith, A C and + Burton, G J", + abstract = "The syncytiotrophoblast (STB) epithelial covering of the human + placenta is a unique terminally differentiated, multi-nucleated + syncytium. No mitotic bodies are observed in the STB, which is + sustained by continuous fusion of underlying cytotrophoblast + cells (CTB). As a result, STB nuclei are of different ages. + Morphologically, they display varying degrees of chromatin + compaction, suggesting progressive maturational changes. Until + recently, it was thought that STB nuclei were transcriptionally + inactive, with all the mRNAs required by the syncytium being + incorporated upon fusion of CTB. However, recent research has + shown the presence of the active form of RNA polymerase II (RNA + Pol II) in some STB nuclei. In this study, we confirm the + presence of transcriptional activity in STB nuclei by + demonstrating immunoreactivity for a transcription factor and an + RNA polymerase I (RNA Pol I) co-factor, phospho-cAMP response + element-binding protein and phospho-upstream binding factor, + respectively. We also show, through immunoco-localisation + studies, that a proportion of STB nuclei are both RNA Pol I and + II transcriptionally active. 
Finally, we quantify the numerical + densities of nuclei immunopositive and immunonegative for RNA Pol + II in the STB of normal placentas of 11-39 weeks gestational age + using an unbiased stereological counting tool, the physical + disector. These data were combined with estimates of the volume + of trophoblast to calculate total numbers of both types of nuclei + at each gestational age. We found no correlation between + gestational age and the numerical density of RNA Pol II-positive + nuclei in the villous trophoblast (r = 0.39, P > 0.05). As the + number of STB nuclei increases exponentially during gestation, we + conclude that the number of transcriptionally active nuclei + increases in proportion to trophoblast volume. The ratio of + active to inactive nuclei remains constant at 3.9:1. These + findings confirm that the majority of STB nuclei have intrinsic + transcriptional activity, and that the STB is not dependent on + CTB fusion for the provision of transcripts.", + journal = "J. Anat.", + volume = 219, + number = 5, + pages = "601--610", + month = nov, + year = 2011, + language = "en" +} + +@ARTICLE{Ferreira2019-jp, + title = "The burden of rare diseases", + author = "Ferreira, Carlos R", + abstract = "The subject of rare disease numbers is rife with misconceptions, + not just in websites and other layman's literature, but also in + the medical literature. Various websites mention numbers that are + not validated by any solid data, while in turn the medical + literature cites the aforementioned websites as sources, thus + perpetuating a number of myths about rare diseases and their + burden. We review the existing literature on rare disease + numbers, in an attempt to demystify the subject. 
Specifically, we + summarize data pertaining to: (a) known number and cumulative + prevalence of rare diseases; (b) rare disease-associated + mortality; (c) rare disease-associated morbidity, including + numbers on health care services related to rare diseases; and (d) + orphan drug numbers.", + journal = "Am. J. Med. Genet. A", + volume = 179, + number = 6, + pages = "885--892", + month = jun, + year = 2019, + keywords = "burden of disease; orphan drugs; prevalence; rare disease", + language = "en" +} + +@ARTICLE{Weinreich2008-wm, + title = "[Orphanet: a European database for rare diseases]", + author = "Weinreich, S S and Mangon, R and Sikkens, J J and Teeuw, M E + and Cornel, M C", + abstract = "Orphanet is a European initiative that aims to improve the + management and treatment of rare diseases. It comprises a + database dedicated to information on rare diseases and orphan + drugs, and offers services adapted to the needs of patients and + their families, health professionals, and researchers. The + database can be accessed through the website (www.orpha.net) and + has some interesting options for searching, for example research + projects, support groups or searching by clinical signs. Health + professionals are encouraged to add activities concerning rare + diseases to the database.", + journal = "Ned. Tijdschr. Geneeskd.", + volume = 152, + number = 9, + pages = "518--519", + month = mar, + year = 2008, + language = "nl" +} + +% The entry below contains non-ASCII chars that could not be converted +% to a LaTeX equivalent. 
+@ARTICLE{Lung2019-il, + title = "The complement system in liver diseases: Evidence-based approach + and therapeutic options", + author = "Lung, Thomas and Sakem, Benjamin and Risch, Lorenz and + W{\"u}rzner, Reinhard and Colucci, Giuseppe and Cerny, Andreas + and Nydegger, Urs", + abstract = "Complement is usually seen to largely originate from the liver to + accomplish its tasks systemically - its return to the production + site has long been underestimated. Recent progress in genomics, + therapeutic effects on complement, standardised possibilities in + medical laboratory tests and involvement of complosome brings the + complement system with its three major functions of opsonization, + cytolysis and phagocytosis back to liver biology and pathology. + The LOINC™ system features 20 entries for the C3 component of + complement to anticipate the application of artificial + intelligence data banks algorythms of which are fed with + patient-specific data connected to standard lab assays for liver + function. These advancements now lead to increased vigilance by + clinicians. This reassessment article will further elucidate the + distribution of synthesis sites to the three germ layer-derived + cell systems and the role complement now known to play in + embryogenesis, senescence, allotransplantation and autoimmune + disease. This establishes the liver as part of the + gastro-intestinal system in connection with nosological entities + never thought of, such as the microbiota-liver-brain axis. In + neurological disease etiology infectious and autoimmune hepatitis + play an important role in the context of causative viz reactive + complement activation. The mosaic of autoimmunity, i.e. 
multiple + combinations of the many factors producing varying clinical + pictures, leads to the manifold facets of liver autoimmunity.", + journal = "J Transl Autoimmun", + volume = 2, + pages = "100017", + month = dec, + year = 2019, + keywords = "Autoantibody; Complement system; Complosome; Diagnostic tests; + Hepatocyte", + language = "en" +} + +@MISC{noauthor_undated-kp, + title = "Rare Diseases {BioResource}", + abstract = "We want to identify genetic causes of rare diseases, which + affect one in 17 people. We work to improve diagnosis and to + support work to develop new treatments.", + howpublished = "\url{https://bioresource.nihr.ac.uk/centres-programmes/rare-diseases-bioresource/}", + note = "Accessed: 2024-2-17", + language = "en" +} + +@ARTICLE{Amirav2016-bi, + title = "Systematic Analysis of {CCNO} Variants in a Defined Population: + Implications for Clinical Phenotype and Differential Diagnosis", + author = "Amirav, Israel and Wallmeier, Julia and Loges, Niki T and + Menchen, Tabea and Pennekamp, Petra and Mussaffi, Huda and + Abitbul, Revital and Avital, Avraham and Bentur, Lea and + Dougherty, Gerard W and Nael, Elias and Lavie, Moran and Olbrich, + Heike and Werner, Claudius and Kintner, Chris and Omran, Heymut + and {Israeli PCD Consortium Investigators}", + abstract = "Reduced generation of multiple motile cilia (RGMC) is a novel + chronic destructive airway disease within the group of + mucociliary clearance disorders with only few cases reported. + Mutations in two genes, CCNO and MCIDAS, have been identified as + a cause of this disease, both leading to a greatly reduced number + of cilia and causing impaired mucociliary clearance. This study + was designed to identify the prevalence of CCNO mutations in + Israel and further delineate the clinical characteristics of + RGMC. 
We analyzed 170 families with mucociliary clearance + disorders originating from Israel for mutations in CCNO and + identified two novel mutations (c.165delC, p.Gly56Alafs*38; + c.638T>C, p.Leu213Pro) and two known mutations in 15 individuals + from 10 families (6\% prevalence). Pathogenicity of the missense + mutation (c.638T>C, p.Leu213Pro) was demonstrated by functional + analyses in Xenopus. Combining these 15 patients with the + previously reported CCNO case reports revealed rapid + deterioration in lung function, an increased prevalence of + hydrocephalus (10\%) as well as increased female infertility + (22\%). Consistent with these findings, we demonstrate that CCNO + expression is present in murine ependyma and fallopian tubes. + CCNO is mutated more frequently than expected from the rare + previous clinical case reports, leads to severe clinical + manifestations, and should therefore be considered an important + differential diagnosis of mucociliary clearance disorders.", + journal = "Hum. Mutat.", + volume = 37, + number = 4, + pages = "396--405", + month = apr, + year = 2016, + keywords = "CCNO; PCD; RGMC; mucociliary clearance disorder; primary ciliary + dyskinesia", + language = "en" +} + +@ARTICLE{Chen2002-bn, + title = "{TTD}: Therapeutic Target Database", + author = "Chen, X and Ji, Z L and Chen, Y Z", + abstract = "A number of proteins and nucleic acids have been explored as + therapeutic targets. These targets are subjects of interest in + different areas of biomedical and pharmaceutical research and in + the development and evaluation of bioinformatics, molecular + modeling, computer-aided drug design and analytical tools. A + publicly accessible database that provides comprehensive + information about these targets is therefore helpful to the + relevant communities. 
The Therapeutic Target Database (TTD) is + designed to provide information about the known therapeutic + protein and nucleic acid targets described in the literature, the + targeted disease conditions, the pathway information and the + corresponding drugs/ligands directed at each of these targets. + Cross-links to other databases are also introduced to facilitate + the access of information about the sequence, 3D structure, + function, nomenclature, drug/ligand binding properties, drug + usage and effects, and related literature for each target. This + database can be accessed at + http://xin.cz3.nus.edu.sg/group/ttd/ttd.asp and it currently + contains entries for 433 targets covering 125 disease conditions + along with 809 drugs/ligands directed at each of these targets. + Each entry can be retrieved through multiple methods including + target name, disease name, drug/ligand name, drug/ligand function + and drug therapeutic classification.", + journal = "Nucleic Acids Res.", + volume = 30, + number = 1, + pages = "412--415", + month = jan, + year = 2002, + language = "en" +} + +@ARTICLE{Lazarin2014-we, + title = "Systematic Classification of Disease Severity for Evaluation of + Expanded Carrier Screening Panels", + author = "Lazarin, Gabriel A and Hawthorne, Felicia and Collins, Nicholas S + and Platt, Elizabeth A and Evans, Eric A and Haque, Imran S", + abstract = "Professional guidelines dictate that disease severity is a key + criterion for carrier screening. Expanded carrier screening, + which tests for hundreds to thousands of mutations + simultaneously, requires an objective, systematic means of + describing a given disease's severity to build screening panels. + We hypothesized that diseases with characteristics deemed to be + of highest impact would likewise be rated as most severe, and + diseases with characteristics of lower impact would be rated as + less severe. 
We describe a pilot test of this hypothesis in which + we surveyed 192 health care professionals to determine the impact + of specific disease phenotypic characteristics on perceived + severity, and asked the same group to rate the severity of + selected inherited diseases. The results support the hypothesis: + we identified four ``Tiers'' of disease characteristics (1-4). + Based on these responses, we developed an algorithm that, based + on the combination of characteristics normally seen in an + affected individual, classifies the disease as Profound, Severe, + Moderate, or Mild. This algorithm allows simple classification of + disease severity that is replicable and not labor intensive.", + journal = "PLoS One", + volume = 9, + number = 12, + pages = "e114391", + month = dec, + year = 2014, + language = "en" +} + +@ARTICLE{Ladhani2019-nf, + title = "Invasive meningococcal disease in patients with complement + deficiencies: a case series (2008-2017)", + author = "Ladhani, Shamez N and Campbell, Helen and Lucidarme, Jay and + Gray, Steve and Parikh, Sydel and Willerton, Laura and Clark, + Stephen A and Lekshmi, Aiswarya and Walker, Andrew and Patel, + Sima and Bai, Xilian and Ramsay, Mary and Borrow, Ray", + abstract = "BACKGROUND: To describe patients with inherited and acquired + complement deficiency who developed invasive meningococcal + disease (IMD) in England over the last decade. METHODS: Public + Health England conducts enhanced surveillance of IMD in England. + We retrospectively identified patients with complement deficiency + who developed IMD in England during 2008-2017 and retrieved + information on their clinical presentation, vaccination status, + medication history, recurrence of infection and outcomes, as well + as characteristics of the infecting meningococcal strain. + RESULTS: A total of 16 patients with 20 IMD episodes were + identified, including four with two episodes. 
Six patients had + inherited complement deficiencies, two had immune-mediated + conditions associated with complement deficiency + (glomerulonephritis and vasculitis), and eight others were on + Eculizumab therapy, five for paroxysmal nocturnal haemoglobinuria + and three for atypical haemolytic uraemic syndrome. Cultures were + available for 7 of 11 episodes among those with inherited + complement deficiencies/immune-mediated conditions and the + predominant capsular group was Y (7/11), followed by B (3/11) and + non-groupable (1/11) strains. Among patients receiving Eculizumab + therapy, 3 of the 9 episodes were due to group B (3/9), three + others were NG but genotypically group B, and one case each of + groups E, W and Y. CONCLUSIONS: In England, complement deficiency + is rare among IMD cases and includes inherited disorders of the + late complement pathway, immune-mediated disorders associated + with low complement levels and patients on Eculizumab therapy. + IMD due to capsular group Y predominates in patient with + inherited complement deficiency, whilst those on Eculizumab + therapy develop IMD due to more diverse capsular groups including + non-encapsulated strains.", + journal = "BMC Infect. 
Dis.", + volume = 19, + number = 1, + pages = "522", + month = jun, + year = 2019, + keywords = "Complement deficiency; Eculizumab; Invasive meningococcal + disease; Risk factors", + language = "en" +} + +@ARTICLE{Lee2009-ag, + title = "Clinical and molecular characterizations of novel {POU3F4} + mutations reveal that {DFN3} is due to null function of {POU3F4} + protein", + author = "Lee, Hee Keun and Song, Mee Hyun and Kang, Myengmo and Lee, Jung + Tae and Kong, Kyoung-Ah and Choi, Su-Jin and Lee, Kyu Yup and + Venselaar, Hanka and Vriend, Gert and Lee, Won-Sang and Park, + Hong-Joon and Kwon, Taeg Kyu and Bok, Jinwoong and Kim, Un-Kyung", + abstract = "X-linked deafness type 3 (DFN3), the most prevalent X-linked form + of hereditary deafness, is caused by mutations in the POU3F4 + locus, which encodes a member of the POU family of transcription + factors. Despite numerous reports on clinical evaluations and + genetic analyses describing novel POU3F4 mutations, little is + known about how such mutations affect normal functions of the + POU3F4 protein and cause inner ear malformations and deafness. + Here we describe three novel mutations of the POU3F4 gene and + their clinical characterizations in three Korean families + carrying deafness segregating at the DFN3 locus. The three + mutations cause a substitution (p.Arg329Pro) or a deletion + (p.Ser310del) of highly conserved amino acid residues in the POU + homeodomain or a truncation that eliminates both DNA-binding + domains (p.Ala116fs). In an attempt to better understand the + molecular mechanisms underlying their inner ear defects, we + examined the behavior of the normal and mutant forms of the + POU3F4 protein in C3H/10T1/2 mesodermal cells. Protein modeling + as well as in vitro assays demonstrated that these mutations are + detrimental to the tertiary structure of the POU3F4 protein and + severely affect its ability to bind DNA. 
All three mutated POU3F4 + proteins failed to transactivate expression of a reporter gene. + In addition, all three failed to inhibit the transcriptional + activity of wild-type proteins when both wild-type and mutant + proteins were coexpressed. Since most of the mutations reported + for DFN3 thus far are associated with regions that encode the DNA + binding domains of POU3F4, our results strongly suggest that the + deafness in DFN3 patients is largely due to the null function of + POU3F4.", + journal = "Physiol. Genomics", + volume = 39, + number = 3, + pages = "195--201", + month = nov, + year = 2009, + language = "en" +} + +@ARTICLE{Dixit2021-uf, + title = "Functional analysis of novel genetic variants of {NKX2-5} + associated with nonsyndromic congenital heart disease", + author = "Dixit, Ritu and Narasimhan, Chitra and Balekundri, Vijayalakshmi + I and Agrawal, Damyanti and Kumar, Ashok and Mohapatra, + Bhagyalaxmi", + abstract = "NKX2-5, a master cardiac regulatory transcription factor was the + first known genetic cause of congenital heart diseases (CHDs). To + further investigate its role in CHD pathogenesis, we performed + mutational screening of 285 CHD probands and 200 healthy + controls. Five coding sequence variants were identified in six + CHD cases (2.1\%), including three in the N-terminal region + (p.A61G, p.R95L, and p.E131K) and one each in homeodomain (HD) + (p.A148E) and tyrosine-rich domain (p.P247A). Variant-p.A148E + showed tertiary structure changes and differential DNA binding + affinity of mutant compared to wild type. Two N-terminal + variants-p.A61G and p.E131K along with HD variant p.A148E + demonstrated significantly reduced transcriptional activity of + Nppa and Actc1 promoters in dual luciferase promoter assay + supported by their reduced expression in qRT-PCR. 
Nonetheless, + variant p.R95L affected the synergy of NKX2-5 with serum response + factor and TBX5 leading to significantly decreased Actc1 promoter + activity depicting a distinctive role of this region. The + aberrant expression of other target genes-Irx4, Mef2c, Bmp10, + Myh6, Myh7, and Myocd is also observed in response to NKX2-5 + variants, possibly due to the defective gene regulatory network. + Severely impaired downstream promoter activities and abnormal + expression of target genes due to N-terminal variants supports + the emerging role of this region during cardiac-developmental + pathways.", + journal = "Am. J. Med. Genet. A", + volume = 185, + number = 12, + pages = "3644--3663", + month = dec, + year = 2021, + keywords = "NKX2-5; congenital heart disease; in vitro studies; mutations; + transcription factors", + language = "en" +} + +@ARTICLE{Schott1998-yo, + title = "Congenital heart disease caused by mutations in the transcription + factor {NKX2-5}", + author = "Schott, J J and Benson, D W and Basson, C T and Pease, W and + Silberbach, G M and Moak, J P and Maron, B J and Seidman, C E and + Seidman, J G", + abstract = "Mutations in the gene encoding the homeobox transcription factor + NKX2-5 were found to cause nonsyndromic, human congenital heart + disease. A dominant disease locus associated with cardiac + malformations and atrioventricular conduction abnormalities was + mapped to chromosome 5q35, where NKX2-5, a Drosophila tinman + homolog, is located. Three different NKX2-5 mutations were + identified. Two are predicted to impair binding of NKX2-5 to + target DNA, resulting in haploinsufficiency, and a third + potentially augments target-DNA binding. 
These data indicate that + NKX2-5 is important for regulation of septation during cardiac + morphogenesis and for maturation and maintenance of + atrioventricular node function throughout life.", + journal = "Science", + volume = 281, + number = 5373, + pages = "108--111", + month = jul, + year = 1998, + language = "en" +} + +@ARTICLE{Liu2011-qd, + title = "The Therapeutic Target Database: an internet resource for the + primary targets of approved, clinical trial and experimental + drugs", + author = "Liu, Xin and Zhu, Feng and Ma, Xiaohua and Tao, Lin and Zhang, + Jingxian and Yang, Shengyong and Wei, Yuquan and Chen, Yu Zong", + abstract = "Increasing numbers of proteins, nucleic acids and other molecular + entities have been explored as therapeutic targets. A challenge + in drug discovery is to decide which targets to pursue from an + increasing pool of potential targets, given the fact that few + innovative targets have made it to the approval list each year. + Knowledge of existing drug targets (both approved and within + clinical trials) is highly useful for facilitating target + discovery, selection, exploration and tool development. The + Therapeutic Target Database (TTD) has been developed and updated + to provide information on 358 successful targets, 251 clinical + trial targets and 1254 research targets in addition to 1511 + approved drugs, 1118 clinical trials drugs and 2331 experimental + drugs linked to their primary targets (3257 drugs with available + structure data). This review briefly describes the TTD database + and illustrates how its data can be explored for facilitating + target and drug searches, the study of the mechanism of + multi-target drugs and the development of in silico target + discovery tools.", + journal = "Expert Opin. Ther. 
Targets", + volume = 15, + number = 8, + pages = "903--912", + month = aug, + year = 2011, + language = "en" +} + +@ARTICLE{Mueller2017-fz, + title = "5 Year Expression and Neutrophil Defect Repair after Gene Therapy + in Alpha-1 Antitrypsin Deficiency", + author = "Mueller, Christian and Gernoux, Gwladys and Gruntman, Alisha M + and Borel, Florie and Reeves, Emer P and Calcedo, Roberto and + Rouhani, Farshid N and Yachnis, Anthony and Humphries, Margaret + and Campbell-Thompson, Martha and Messina, Louis and Chulay, + Jeffrey D and Trapnell, Bruce and Wilson, James M and McElvaney, + Noel G and Flotte, Terence R", + abstract = "Alpha-1 antitrypsin deficiency is a monogenic disorder resulting + in emphysema due principally to the unopposed effects of + neutrophil elastase. We previously reported achieving plasma + wild-type alpha-1 antitrypsin concentrations at 2.5\%-3.8\% of + the purported therapeutic level at 1 year after a single + intramuscular administration of recombinant adeno-associated + virus serotype 1 alpha-1 antitrypsin vector in alpha-1 + antitrypsin deficient patients. We analyzed blood and muscle for + alpha-1 antitrypsin expression and immune cell response. We also + assayed previously reported markers of neutrophil function known + to be altered in alpha-1 antitrypsin deficient patients. Here, we + report sustained expression at 2.0\%-2.5\% of the target level + from years 1-5 in these same patients without any additional + recombinant adeno-associated virus serotype-1 alpha-1 antitrypsin + vector administration. In addition, we observed partial + correction of disease-associated neutrophil defects, including + neutrophil elastase inhibition, markers of degranulation, and + membrane-bound anti-neutrophil antibodies. There was also + evidence of an active T regulatory cell response (similar to the + 1 year data) and an exhausted cytotoxic T cell response to + adeno-associated virus serotype-1 capsid. 
These findings suggest + that muscle-based alpha-1 antitrypsin gene replacement is + tolerogenic and that stable levels of M-AAT may exert beneficial + neutrophil effects at lower concentrations than previously + anticipated.", + journal = "Mol. Ther.", + volume = 25, + number = 6, + pages = "1387--1394", + month = jun, + year = 2017, + keywords = "A1AT; AAT; AAV; PD-1; Tregs; alpha-1 antitrypsin; clinical trial; + exhausted T cells; gene therapy; rAAV", + language = "en" +} + +@ARTICLE{Sell2008-zp, + title = "Alpha-fetoprotein, stem cells and cancer: how study of the + production of alpha-fetoprotein during chemical + hepatocarcinogenesis led to reaffirmation of the stem cell theory + of cancer", + author = "Sell, Stewart", + abstract = "Identification of the cells in the liver that produce + alpha-fetoprotein during development, in response to liver injury + and during the early stages of chemical hepatocarcinogenesis led + to the conclusion that maturation arrest of liver-determined + tissue stem cells was the cellular process that gives rise to + hepatocellular carcinomas. When the cellular changes in these + processes were compared to that of the formation of + teratocarcinomas, the hypothesis arose that all cancers arise + from maturation arrest of tissue-determined stem cells. This was + essentially a reinterpretation of the embryonal rest theory of + cancer whereby tissue stem cells take the role of embryonal + rests. A corollary of the stem cell theory of the origin of + cancer is that cancers contain the same functional cell + populations as normal tissues: stem cells, transit-amplifying + cells and mature cells. Cancer stem cells retain the essential + feature of normal stem cells: the ability to self-renew. Growth + of cancers is due to continued proliferation of cancer + transit-amplifying cells that do not differentiate to mature + cells (maturation arrest). 
On the other hand, cancer stem cells + generally divide very rarely and contribute little to tumor + growth. However, the presence of cancer stem cells in tumors is + believed to be responsible for the properties of immortalization, + transplantability and resistance to therapy characteristic of + cancers. Current therapies for cancer (chemotherapy, + radiotherapy, antiangiogenesis and differentiation therapy) are + directed against the cancer transit-amplifying cells. When these + therapies are discontinued, the cancer reforms from the cancer + stem cells. Therapy directed toward interruption of the cell + signaling pathways that maintain cancer stem cells could lead to + new modalities to the prevention of regrowth of the cancer.", + journal = "Tumour Biol.", + volume = 29, + number = 3, + pages = "161--180", + month = jul, + year = 2008, + language = "en" +} + +@ARTICLE{Qi2023-ev, + title = "Trends and Potential of Machine Learning and Deep Learning in + Drug Study at {Single-Cell} Level", + author = "Qi, Ren and Zou, Quan", + abstract = "Cancer treatments always face challenging problems, particularly + drug resistance due to tumor cell heterogeneity. The existing + datasets include the relationship between gene expression and + drug sensitivities; however, the majority are based on + tissue-level studies. Study drugs at the single-cell level are + perspective to overcome minimal residual disease caused by + subclonal resistant cancer cells retained after initial curative + therapy. Fortunately, machine learning techniques can help us + understand how different types of cells respond to different + cancer drugs from the perspective of single-cell gene expression. + Good modeling using single-cell data and drug response + information will not only improve machine learning for cell-drug + outcome prediction but also facilitate the discovery of drugs for + specific cancer subgroups and specific cancer treatments. 
In this + paper, we review machine learning and deep learning approaches in + drug research. By analyzing the application of these methods on + cancer cell lines and single-cell data and comparing the + technical gap between single-cell sequencing data analysis and + single-cell drug sensitivity analysis, we hope to explore the + trends and potential of drug research at the single-cell data + level and provide more inspiration for drug research at the + single-cell level. We anticipate that this review will stimulate + the innovative use of machine learning methods to address new + challenges in precision medicine more broadly.", + journal = "Research", + volume = 6, + pages = "0050", + month = mar, + year = 2023, + language = "en" +} + +% The entry below contains non-ASCII chars that could not be converted +% to a LaTeX equivalent. +@ARTICLE{Russell2017-dh, + title = "Efficacy and safety of voretigene neparvovec ({AAV2-hRPE65v2}) in + patients with {RPE65-mediated} inherited retinal dystrophy: a + randomised, controlled, open-label, phase 3 trial", + author = "Russell, Stephen and Bennett, Jean and Wellman, Jennifer A and + Chung, Daniel C and Yu, Zi-Fan and Tillman, Amy and Wittes, Janet + and Pappas, Julie and Elci, Okan and McCague, Sarah and Cross, + Dominique and Marshall, Kathleen A and Walshire, Jean and Kehoe, + Taylor L and Reichert, Hannah and Davis, Maria and Raffini, + Leslie and George, Lindsey A and Hudson, F Parker and Dingfield, + Laura and Zhu, Xiaosong and Haller, Julia A and Sohn, Elliott H + and Mahajan, Vinit B and Pfeifer, Wanda and Weckmann, Michelle + and Johnson, Chris and Gewaily, Dina and Drack, Arlene and Stone, + Edwin and Wachtel, Katie and Simonelli, Francesca and Leroy, Bart + P and Wright, J Fraser and High, Katherine A and Maguire, Albert + M", + abstract = "BACKGROUND: Phase 1 studies have shown potential benefit of gene + replacement in RPE65-mediated inherited retinal dystrophy. 
This + phase 3 study assessed the efficacy and safety of voretigene + neparvovec in participants whose inherited retinal dystrophy + would otherwise progress to complete blindness. METHODS: In this + open-label, randomised, controlled phase 3 trial done at two + sites in the USA, individuals aged 3 years or older with, in each + eye, best corrected visual acuity of 20/60 or worse, or visual + field less than 20 degrees in any meridian, or both, with + confirmed genetic diagnosis of biallelic RPE65 mutations, + sufficient viable retina, and ability to perform standardised + multi-luminance mobility testing (MLMT) within the luminance + range evaluated, were eligible. Participants were randomly + assigned (2:1) to intervention or control using a permuted block + design, stratified by age (<10 years and $\geq$10 years) and + baseline mobility testing passing level (pass at $\geq$125 lux vs + <125 lux). Graders assessing primary outcome were masked to + treatment group. Intervention was bilateral, subretinal injection + of 1·5 $\times$ $10^{11}$ vector genomes of voretigene neparvovec in + 0·3 mL total volume. The primary efficacy endpoint was 1-year + change in MLMT performance, measuring functional vision at + specified light levels. The intention-to-treat (ITT) and modified + ITT populations were included in primary and safety analyses. + This trial is registered with ClinicalTrials.gov, number + NCT00999609, and enrolment is complete. FINDINGS: Between Nov 15, + 2012, and Nov 21, 2013, 31 individuals were enrolled and randomly + assigned to intervention (n=21) or control (n=10). One + participant from each group withdrew after consent, before + intervention, leaving an mITT population of 20 intervention and + nine control participants. At 1 year, mean bilateral MLMT change + score was 1·8 (SD 1·1) light levels in the intervention group + versus 0·2 (1·0) in the control group (difference of 1·6, 95\% CI + 0·72-2·41, p=0·0013). 
13 (65\%) of 20 intervention participants, + but no control participants, passed MLMT at the lowest luminance + level tested (1 lux), demonstrating maximum possible improvement. + No product-related serious adverse events or deleterious immune + responses occurred. Two intervention participants, one with a + pre-existing complex seizure disorder and another who experienced + oral surgery complications, had serious adverse events unrelated + to study participation. Most ocular events were mild in severity. + INTERPRETATION: Voretigene neparvovec gene replacement improved + functional vision in RPE65-mediated inherited retinal dystrophy + previously medically untreatable. FUNDING: Spark Therapeutics.", + journal = "Lancet", + volume = 390, + number = 10097, + pages = "849--860", + month = aug, + year = 2017, + language = "en" +} + +@ARTICLE{Utsch2004-re, + title = "Genetic and molecular aspects of hypospadias", + author = "Utsch, B and Albers, N and Ludwig, M", + abstract = "Hypospadias, a midline fusion defect of the male ventral urethra, + is a relatively common genital anomaly occurring in 0.3 - 7 of + 1000 live male births. The anatomical location of the misplaced + urethral meatus determines the severity of this anomaly with the + severity increasing from distal to proximal. Glandular and penile + hypospadias, the most common forms, often appear as an isolated + anomaly and account for the majority of hypospadias, whereas + about 20 \% are classified as scrotal and perineal types. These + latter forms frequently occur in association with other genital + anomalies such as microphallus, bifid scrotum, penoscrotal + transposition, and cryptorchidism, and may represent an intersex + phenotype. Besides a higher incidence in consanguineous families + and a suggested recessive inheritance, in other families a + dominant transmission is likely. The recurrence risk in the next + generation seems to be correlated with the severity of + hypospadias. 
Only 30 \% of severe hypospadias can be attributed + to defects in the synthesis of testosterone or adrenal steroid + hormones, receptor defects, syndrome-associated hypospadias, + chromosomal anomalies, defects in other genetic factors, or + exogenous forms. To identify the underlying causes of the + remaining 70 \% ``idiopathic'' hypospadias, familial and twin + studies were performed. Familial studies can help identify gene + loci and, subsequently, candidate genes by mutational analysis. + Either linkage analysis in large families with many affected + individuals suspicious for a monogenic trait or association + studies in cases of a complex inheritance in many families with a + few affected individuals can be performed. Microarrays and + proteomics can help detect gene expression or protein + differences. Furthermore, genetically modified animal models can + be used to detect phylogenetically homologous genes in man. In + addition to an optimal documentation and acquisition of blood and + tissue samples this requires a close cooperation between + clinicians in the operative and non-operative specialties as well + as geneticists.", + journal = "Eur. J. Pediatr. Surg.", + volume = 14, + number = 5, + pages = "297--302", + month = oct, + year = 2004, + language = "en" +} + +@ARTICLE{Sugiyama1981-ev, + title = "Methods for Visual Understanding of Hierarchical System + Structures", + author = "Sugiyama, Kozo and Tagawa, Shojiro and Toda, Mitsuhiko", + abstract = "Two kinds of new methods are developed to obtain effective + representations of hierarchies automatically: theoretical and + heuristic methods. The methods determine the positions of + vertices in two steps. First the order of the vertices in each + level is determined to reduce the number of crossings of edges. + Then horizontal positions of the vertices are determined to + improve further the readability of drawings. 
The theoretical + methods are useful in recognizing the nature of the problem, and + the heuristic methods make it possible to enlarge the size of + hierarchies with which we can deal. Performance tests of the + heuristic methods and several applications are presented.", + journal = "IEEE Trans. Syst. Man Cybern.", + publisher = "IEEE", + volume = 11, + number = 2, + pages = "109--125", + month = feb, + year = 1981 +} + +% The entry below contains non-ASCII chars that could not be converted +% to a LaTeX equivalent. +@ARTICLE{De_Franco2015-mv, + title = "The effect of early, comprehensive genomic testing on clinical + care in neonatal diabetes: an international cohort study", + author = "De Franco, Elisa and Flanagan, Sarah E and Houghton, Jayne A L + and Lango Allen, Hana and Mackay, Deborah J G and Temple, I Karen + and Ellard, Sian and Hattersley, Andrew T", + abstract = "BACKGROUND: Traditional genetic testing focusses on analysis of + one or a few genes according to clinical features; this approach + is changing as improved sequencing methods enable simultaneous + analysis of several genes. Neonatal diabetes is the presenting + feature of many discrete clinical phenotypes defined by different + genetic causes. Genetic subtype defines treatment, with improved + glycaemic control on sulfonylurea treatment for most patients + with potassium channel mutations. We investigated the effect of + early, comprehensive testing of all known genetic causes of + neonatal diabetes. METHODS: In this large, international, cohort + study, we studied patients with neonatal diabetes diagnosed with + diabetes before 6 months of age who were referred from 79 + countries. We identified mutations by comprehensive genetic + testing including Sanger sequencing, 6q24 methylation analysis, + and targeted next-generation sequencing of all known neonatal + diabetes genes. FINDINGS: Between January, 2000, and August, + 2013, genetic testing was done in 1020 patients (571 boys, 449 + girls). 
Mutations in the potassium channel genes were the most + common cause (n=390) of neonatal diabetes, but were identified + less frequently in consanguineous families (12\% in + consanguineous families vs 46\% in non-consanguineous families; + p4 years; p<0·0001), in whom skeletal and liver involvement was + common. Similarly, for patients with genetically diagnosed + transient neonatal diabetes, the diabetes had remitted in only + ten (10\%) of 101 patients tested early (<3 months) compared with + 60 (100\%) of the 60 later referrals (p<0·0001). INTERPRETATION: + Patients are now referred for genetic testing closer to their + presentation with neonatal diabetes. Comprehensive testing of all + causes identified causal mutations in more than 80\% of cases. + The genetic result predicts the best diabetes treatment and + development of related features. This model represents a new + framework for clinical care with genetic diagnosis preceding + development of clinical features and guiding clinical management. + FUNDING: Wellcome Trust and Diabetes UK.", + journal = "Lancet", + volume = 386, + number = 9997, + pages = "957--963", + month = sep, + year = 2015, + language = "en" +} + +@ARTICLE{Guthrie2004-fk, + title = "Pathophysiology of diabetes mellitus", + author = "Guthrie, Richard A and Guthrie, Diana W", + abstract = "As we learn more about the pathophysiology of diabetes mellitus, + we find that there is more yet to be learned. This may sound like + a trite statement, but in reality it is true. The following + article reviews the basic pathophysiology of both type 1 diabetes + mellitus and type 2 diabetes mellitus as we understand it today. + It continues on to reveal the ``things that go wrong'' when there + is too much or too little glucose available to the body organs + and especially to the brain. 
The article points out the signs and + symptoms to be aware of when the person is in the acute state of + diabetic ketoacidosis, hyperglycemic hyperosmolar nonketotic coma + (or state), and severe hypoglycemia. It concludes with important + considerations when the individual is in one of these acute + states and contributes key points related to the control of + diabetes when the person is in the state of compromise.", + journal = "Crit. Care Nurs. Q.", + volume = 27, + number = 2, + pages = "113--125", + year = 2004, + language = "en" +} + +@ARTICLE{Zhou2016-kq, + title = "Hepatocytes: a key cell type for innate immunity", + author = "Zhou, Zhou and Xu, Ming-Jiang and Gao, Bin", + abstract = "Hepatocytes, the major parenchymal cells in the liver, play + pivotal roles in metabolism, detoxification, and protein + synthesis. Hepatocytes also activate innate immunity against + invading microorganisms by secreting innate immunity proteins. + These proteins include bactericidal proteins that directly kill + bacteria, opsonins that assist in the phagocytosis of foreign + bacteria, iron-sequestering proteins that block iron uptake by + bacteria, several soluble factors that regulate + lipopolysaccharide signaling, and the coagulation factor + fibrinogen that activates innate immunity. In this review, we + summarize the wide variety of innate immunity proteins produced + by hepatocytes and discuss liver-enriched transcription factors + (e.g. hepatocyte nuclear factors and CCAAT/enhancer-binding + proteins), pro-inflammatory mediators (e.g. interleukin (IL)-6, + IL-22, IL-1$\beta$ and tumor necrosis factor-$\alpha$), and + downstream signaling pathways (e.g. signal transducer and + activator of transcription factor 3 and nuclear factor-$\kappa$B) + that regulate the expression of these innate immunity proteins. 
+ We also briefly discuss the dysregulation of these innate + immunity proteins in chronic liver disease, which may contribute + to an increased susceptibility to bacterial infection in patients + with cirrhosis.", + journal = "Cell. Mol. Immunol.", + volume = 13, + number = 3, + pages = "301--315", + month = may, + year = 2016, + language = "en" +} + +@ARTICLE{Seal2023-pa, + title = "Genenames.org: the {HGNC} resources in 2023", + author = "Seal, Ruth L and Braschi, Bryony and Gray, Kristian and Jones, + Tamsin E M and Tweedie, Susan and Haim-Vilmovsky, Liora and + Bruford, Elspeth A", + abstract = "The HUGO Gene Nomenclature Committee (HGNC) assigns unique + symbols and names to human genes. The HGNC database + (www.genenames.org) currently contains over 43 000 approved gene + symbols, over 19 200 of which are assigned to protein-coding + genes, 14 000 to pseudogenes and nearly 9000 to non-coding RNA + genes. The public website, www.genenames.org, displays all + approved nomenclature within Symbol Reports that contain data + curated by HGNC nomenclature advisors and links to related + genomic, clinical, and proteomic information. Here, we describe + updates to our resource, including improvements to our search + facility and new download features.", + journal = "Nucleic Acids Res.", + volume = 51, + number = "D1", + pages = "D1003--D1009", + month = jan, + year = 2023, + language = "en" +} + +@INCOLLECTION{Tahira2021-zj, + title = "Chapter 13 - Linking {SOX3}, {SRY}, and disorders of + neurodevelopment", + booktitle = "Factors Affecting Neurodevelopment", + author = "Tahira, Ana Carolina and Calegari de Toledo, Victor Hugo and + Feltrin, Arthur Sant'anna and Barbosa, Andr{\'e} Rocha and Vale + Euclydes Colovati, Ver{\^o}nica Luiza and Maschietto, Mariana + and Brentani, Helena", + editor = "Martin, Colin R and Preedy, Victor R and Rajendram, Rajkumar", + abstract = "SRY and SOX3 proteins are transcription factors from the SOX + family. 
All the SOX genes contain a structurally homologous HMG + domain sequence that binds and regulates DNA. In different + literature examples, it has been shown that this HMG box is + functionally interchangeable between SRY and other SOX proteins. + In this chapter, we review actions and interactions of SOX3 and + SRY in brain tissue and possible contributions to + neurodevelopment. We suggest that, at early stages of + neurodevelopment, the action of both transcription factors on + the same gene targets and/or coactivators could interfere in + cell fate decisions, rendering alterations in brain development. + Thereby, boys will be more vulnerable to toxic stress exposition + and/or other genetic factors acting on the epigenetic regulation + of SRY and/or SOX3 proteins as well as acting on their gene + targets.", + publisher = "Academic Press", + pages = "143--156", + month = jan, + year = 2021, + keywords = "Gene interaction network; Neurodevelopmental disorders; Sexual + dimorphism; SOX3; SRY" +} + +@UNPUBLISHED{CZI_Single-Cell_Biology_Program2023-fs, + title = "{CZ} {CELL$\times$GENE} Discover: A single-cell data platform for + scalable exploration, analysis and modeling of aggregated data", + author = "{CZI Single-Cell Biology Program} and Abdulla, Shibla and + Aevermann, Brian and Assis, Pedro and Badajoz, Seve and Bell, + Sidney M and Bezzi, Emanuele and Cakir, Batuhan and Chaffer, Jim + and Chambers, Signe and Michael Cherry, J and Chi, Tiffany and + Chien, Jennifer and Dorman, Leah and Garcia-Nieto, Pablo and + Gloria, Nayib and Hastie, Mim and Hegeman, Daniel and Hilton, + Jason and Huang, Timmy and Infeld, Amanda and Istrate, Ana-Maria + and Jelic, Ivana and Katsuya, Kuni and Kim, Yang Joon and Liang, + Karen and Lin, Mike and Lombardo, Maximilian and Marshall, Bailey + and Martin, Bruce and McDade, Fran and Megill, Colin and Patel, + Nikhil and Predeus, Alexander and Raymor, Brian and Robatmili, + Behnam and Rogers, Dave and Rutherford, Erica and Sadgat, 
Dana + and Shin, Andrew and Small, Corinn and Smith, Trent and + Sridharan, Prathap and Tarashansky, Alexander and Tavares, + Norbert and Thomas, Harley and Tolopko, Andrew and Urisko, Meghan + and Yan, Joyce and Yeretssian, Garabet and Zamanian, Jennifer and + Mani, Arathi and Cool, Jonah and Carr, Ambrose", + abstract = "Hundreds of millions of single cells have been analyzed to date + using high throughput transcriptomic methods, thanks to + technological advances driving the increasingly rapid generation + of single-cell data. This provides an exciting opportunity for + unlocking new insights into health and disease, made possible by + meta-analysis that span diverse datasets building on recent + advances in large language models and other machine learning + approaches. Despite the promise of these and emerging analytical + tools for analyzing large amounts of data, a major challenge + remains the sheer number of datasets and inconsistent format, + data models and accessibility. Many datasets are available via + unique portals platforms that often lack interoperability. Here, + we present CZ CellxGene Discover ( cellxgene.cziscience.com), a + data platform that provides curated and interoperable data. This + single-cell data resource, available via a free-to-use online + data portal, hosts a growing corpus of community contributed data + that spans more than 50 million unique cells. Curated, + standardized, and associated with consistent cell-level metadata, + this collection of interoperable single-cell transcriptomic data + is the largest of its kind. A suite of tools and features enables + accessibility and reusability of the data via both computational + and visual interfaces to allow researchers to rapidly explore + individual datasets and perform cross-corpus analysis. This + functionality is enabling meta-analyses of tens of millions of + cells across studies and tissues and providing global views of + human cells at the resolution of single cells. 
\#\#\# Competing + Interest Statement The authors have declared no competing + interest.", + journal = "bioRxiv", + pages = "2023.10.30.563174", + month = nov, + year = 2023, + language = "en" +} + +@ARTICLE{Rosain2017-ih, + title = "Strains Responsible for Invasive Meningococcal Disease in + Patients With Terminal Complement Pathway Deficiencies", + author = "Rosain, J{\'e}r{\'e}mie and Hong, Eva and Fieschi, Claire and + Martins, Paula Vieira and El Sissy, Carine and Deghmane, + Ala-Eddine and Ouach{\'e}e, Marie and Thomas, Caroline and + Launay, David and de Pontual, Lo{\"\i}c and Suarez, Felipe and + Moshous, Despina and Picard, Capucine and Taha, Muhamed-Kheir and + Fr{\'e}meaux-Bacchi, V{\'e}ronique", + abstract = "Background: Patients with terminal complement pathway deficiency + (TPD) are susceptible to recurrent invasive meningococcal disease + (IMD). Neisseria meningitidis (Nm) strains infecting these + patients are poorly documented in the literature. Methods: We + identified patients with TPD and available Nm strains isolated + during IMD. We investigated the genetic basis of the different + TPDs and the characteristics of the Nm strains. Results: We + included 56 patients with C5 (n = 8), C6 (n = 20), C7 (n = 18), + C8 (n = 9), or C9 (n = 1) deficiency. Genetic study was performed + in 47 patients and 30 pathogenic variants were identified in the + genes coding for C5 (n = 4), C6 (n = 5), C7 (n = 12), C8 (n = 7), + and C9 (n = 2). We characterized 61 Nm strains responsible for + IMD in the 56 patients with TPD. The most frequent strains + belonged to groups Y (n = 27 [44\%]), B (n = 18 [30\%]), and W (n + = 8 [13\%]). Hyperinvasive clonal complexes (CC11, CC32, CC41/44, + and CC269) were responsible for 21\% of IMD cases. The CC23 + predominates and represented 26\% of all invasive isolates. + Eleven of the 15 clonal complexes identified fit to 12 different + clonal complexes belonging to carriage strains. 
Conclusions: + Unusual meningococcal strains with low level of virulence similar + to carriage strains are most frequently responsible for IMD in + patients with TPD.", + journal = "J. Infect. Dis.", + volume = 215, + number = 8, + pages = "1331--1338", + month = apr, + year = 2017, + keywords = "Neisseria meningitidis; complement; membrane attack complex.; + primary immunodeficiency; terminal complement pathway", + language = "en" +} + +@ARTICLE{Whetzel2011-jf, + title = "{BioPortal}: enhanced functionality via new Web services from the + National Center for Biomedical Ontology to access and use + ontologies in software applications", + author = "Whetzel, Patricia L and Noy, Natalya F and Shah, Nigam H and + Alexander, Paul R and Nyulas, Csongor and Tudorache, Tania and + Musen, Mark A", + abstract = "The National Center for Biomedical Ontology (NCBO) is one of the + National Centers for Biomedical Computing funded under the NIH + Roadmap Initiative. Contributing to the national computing + infrastructure, NCBO has developed BioPortal, a web portal that + provides access to a library of biomedical ontologies and + terminologies (http://bioportal.bioontology.org) via the NCBO Web + services. BioPortal enables community participation in the + evaluation and evolution of ontology content by providing + features to add mappings between terms, to add comments linked to + specific ontology terms and to provide ontology reviews. The NCBO + Web services + (http://www.bioontology.org/wiki/index.php/NCBO\_REST\_services) + enable this functionality and provide a uniform mechanism to + access ontologies from a variety of knowledge representation + formats, such as Web Ontology Language (OWL) and Open Biological + and Biomedical Ontologies (OBO) format. The Web services provide + multi-layered access to the ontology content, from getting all + terms in an ontology to retrieving metadata about a term. 
Users + can easily incorporate the NCBO Web services into software + applications to generate semantically aware applications and to + facilitate structured data collection.", + journal = "Nucleic Acids Res.", + volume = 39, + number = "Web Server issue", + pages = "W541--5", + month = jul, + year = 2011, + language = "en" +} + +@ARTICLE{Skene2016-rb, + title = "Identification of Vulnerable Cell Types in Major Brain Disorders + Using Single Cell Transcriptomes and Expression Weighted Cell + Type Enrichment", + author = "Skene, Nathan G and Grant, Seth G N", + abstract = "The cell types that trigger the primary pathology in many brain + diseases remain largely unknown. One route to understanding the + primary pathological cell type for a particular disease is to + identify the cells expressing susceptibility genes. Although this + is straightforward for monogenic conditions where the causative + mutation may alter expression of a cell type specific marker, + methods are required for the common polygenic disorders. We + developed the Expression Weighted Cell Type Enrichment (EWCE) + method that uses single cell transcriptomes to generate the + probability distribution associated with a gene list having an + average level of expression within a cell type. Following + validation, we applied EWCE to human genetic data from cases of + epilepsy, Schizophrenia, Autism, Intellectual Disability, + Alzheimer's disease, Multiple Sclerosis and anxiety disorders. + Genetic susceptibility primarily affected microglia in + Alzheimer's and Multiple Sclerosis; was shared between + interneurons and pyramidal neurons in Autism and Schizophrenia; + while intellectual disabilities and epilepsy were attributable to + a range of cell-types, with the strongest enrichment in + interneurons. We hypothesized that the primary cell type + pathology could trigger secondary changes in other cell types and + these could be detected by applying EWCE to transcriptome data + from diseased tissue. 
In Autism, Schizophrenia and Alzheimer's + disease we find evidence of pathological changes in all of the + major brain cell types. These findings give novel insight into + the cellular origins and progression in common brain disorders. + The methods can be applied to any tissue and disorder and have + applications in validating mouse models.", + journal = "Front. Neurosci.", + volume = 10, + pages = "16", + month = jan, + year = 2016, + keywords = "Alzheimer's Disease; RNA-seq; anxiety; autism; genetics; + schizophrenia; single cell genomics; transcriptome", + language = "en" +} + +% The entry below contains non-ASCII chars that could not be converted +% to a LaTeX equivalent. +@ARTICLE{The_International_Meningococcal_Genetics_Consortium2010-if, + title = "Genome-wide association study identifies variants in the {CFH} + region associated with host susceptibility to meningococcal + disease", + author = "{The International Meningococcal Genetics Consortium}", + abstract = "Meningococcal disease is an infection caused by Neisseria + meningitidis. Genetic factors contribute to host susceptibility + and progression to disease, but the genes responsible for + disease development are largely unknown1,2,3. We report here a + genome-wide association study for host susceptibility to + meningococcal disease using 475 individuals with meningococcal + disease (cases) and 4,703 population controls from the UK. We + performed, in Western European and South European cohorts + (consisting of 968 cases and 1,376 controls), two replication + studies for the most significant SNPs. A cluster of complement + factor SNPs replicated independently in both cohorts, including + SNPs within complement factor H (CFH) (rs1065489 (p.936D").addClass(errClass); + errorSpan.text(err.message); + $el.after(errorSpan); + } + } else if (display === "block") { + // If block, add an error just after the el, set visibility:none on the + // el, and position the error to be on top of the el. 
+ // Mark it with a unique ID and CSS class so we can remove it later. + $el.css("visibility", "hidden"); + if (err.message !== "") { + var errorDiv = $("
").addClass(errClass).css("position", "absolute") + .css("top", el.offsetTop) + .css("left", el.offsetLeft) + // setting width can push out the page size, forcing otherwise + // unnecessary scrollbars to appear and making it impossible for + // the element to shrink; so use max-width instead + .css("maxWidth", el.offsetWidth) + .css("height", el.offsetHeight); + errorDiv.text(err.message); + $el.after(errorDiv); + + // Really dumb way to keep the size/position of the error in sync with + // the parent element as the window is resized or whatever. + var intId = setInterval(function() { + if (!errorDiv[0].parentElement) { + clearInterval(intId); + return; + } + errorDiv + .css("top", el.offsetTop) + .css("left", el.offsetLeft) + .css("maxWidth", el.offsetWidth) + .css("height", el.offsetHeight); + }, 500); + } + } + }, + clearError: function(el) { + var $el = $(el); + var display = $el.data("restore-display-mode"); + $el.data("restore-display-mode", null); + + if (display === "inline" || display === "inline-block") { + if (display) + $el.css("display", display); + $(el.nextSibling).filter(".htmlwidgets-error").remove(); + } else if (display === "block"){ + $el.css("visibility", "inherit"); + $(el.nextSibling).filter(".htmlwidgets-error").remove(); + } + }, + sizing: {} + }; + + // Called by widget bindings to register a new type of widget. The definition + // object can contain the following properties: + // - name (required) - A string indicating the binding name, which will be + // used by default as the CSS classname to look for. + // - initialize (optional) - A function(el) that will be called once per + // widget element; if a value is returned, it will be passed as the third + // value to renderValue. + // - renderValue (required) - A function(el, data, initValue) that will be + // called with data. Static contexts will cause this to be called once per + // element; Shiny apps will cause this to be called multiple times per + // element, as the data changes. 
+ window.HTMLWidgets.widget = function(definition) { + if (!definition.name) { + throw new Error("Widget must have a name"); + } + if (!definition.type) { + throw new Error("Widget must have a type"); + } + // Currently we only support output widgets + if (definition.type !== "output") { + throw new Error("Unrecognized widget type '" + definition.type + "'"); + } + // TODO: Verify that .name is a valid CSS classname + + // Support new-style instance-bound definitions. Old-style class-bound + // definitions have one widget "object" per widget per type/class of + // widget; the renderValue and resize methods on such widget objects + // take el and instance arguments, because the widget object can't + // store them. New-style instance-bound definitions have one widget + // object per widget instance; the definition that's passed in doesn't + // provide renderValue or resize methods at all, just the single method + // factory(el, width, height) + // which returns an object that has renderValue(x) and resize(w, h). + // This enables a far more natural programming style for the widget + // author, who can store per-instance state using either OO-style + // instance fields or functional-style closure variables (I guess this + // is in contrast to what can only be called C-style pseudo-OO which is + // what we required before). + if (definition.factory) { + definition = createLegacyDefinitionAdapter(definition); + } + + if (!definition.renderValue) { + throw new Error("Widget must have a renderValue function"); + } + + // For static rendering (non-Shiny), use a simple widget registration + // scheme. We also use this scheme for Shiny apps/documents that also + // contain static widgets. + window.HTMLWidgets.widgets = window.HTMLWidgets.widgets || []; + // Merge defaults into the definition; don't mutate the original definition. 
+ var staticBinding = extend({}, defaults, definition); + overrideMethod(staticBinding, "find", function(superfunc) { + return function(scope) { + var results = superfunc(scope); + // Filter out Shiny outputs, we only want the static kind + return filterByClass(results, "html-widget-output", false); + }; + }); + window.HTMLWidgets.widgets.push(staticBinding); + + if (shinyMode) { + // Shiny is running. Register the definition with an output binding. + // The definition itself will not be the output binding, instead + // we will make an output binding object that delegates to the + // definition. This is because we foolishly used the same method + // name (renderValue) for htmlwidgets definition and Shiny bindings + // but they actually have quite different semantics (the Shiny + // bindings receive data that includes lots of metadata that it + // strips off before calling htmlwidgets renderValue). We can't + // just ignore the difference because in some widgets it's helpful + // to call this.renderValue() from inside of resize(), and if + // we're not delegating, then that call will go to the Shiny + // version instead of the htmlwidgets version. + + // Merge defaults with definition, without mutating either. + var bindingDef = extend({}, defaults, definition); + + // This object will be our actual Shiny binding. + var shinyBinding = new Shiny.OutputBinding(); + + // With a few exceptions, we'll want to simply use the bindingDef's + // version of methods if they are available, otherwise fall back to + // Shiny's defaults. NOTE: If Shiny's output bindings gain additional + // methods in the future, and we want them to be overrideable by + // HTMLWidget binding definitions, then we'll need to add them to this + // list. 
+ delegateMethod(shinyBinding, bindingDef, "getId"); + delegateMethod(shinyBinding, bindingDef, "onValueChange"); + delegateMethod(shinyBinding, bindingDef, "onValueError"); + delegateMethod(shinyBinding, bindingDef, "renderError"); + delegateMethod(shinyBinding, bindingDef, "clearError"); + delegateMethod(shinyBinding, bindingDef, "showProgress"); + + // The find, renderValue, and resize are handled differently, because we + // want to actually decorate the behavior of the bindingDef methods. + + shinyBinding.find = function(scope) { + var results = bindingDef.find(scope); + + // Only return elements that are Shiny outputs, not static ones + var dynamicResults = results.filter(".html-widget-output"); + + // It's possible that whatever caused Shiny to think there might be + // new dynamic outputs, also caused there to be new static outputs. + // Since there might be lots of different htmlwidgets bindings, we + // schedule execution for later--no need to staticRender multiple + // times. + if (results.length !== dynamicResults.length) + scheduleStaticRender(); + + return dynamicResults; + }; + + // Wrap renderValue to handle initialization, which unfortunately isn't + // supported natively by Shiny at the time of this writing. 
+ + shinyBinding.renderValue = function(el, data) { + Shiny.renderDependencies(data.deps); + // Resolve strings marked as javascript literals to objects + if (!(data.evals instanceof Array)) data.evals = [data.evals]; + for (var i = 0; data.evals && i < data.evals.length; i++) { + window.HTMLWidgets.evaluateStringMember(data.x, data.evals[i]); + } + if (!bindingDef.renderOnNullValue) { + if (data.x === null) { + el.style.visibility = "hidden"; + return; + } else { + el.style.visibility = "inherit"; + } + } + if (!elementData(el, "initialized")) { + initSizing(el); + + elementData(el, "initialized", true); + if (bindingDef.initialize) { + var rect = el.getBoundingClientRect(); + var result = bindingDef.initialize(el, rect.width, rect.height); + elementData(el, "init_result", result); + } + } + bindingDef.renderValue(el, data.x, elementData(el, "init_result")); + evalAndRun(data.jsHooks.render, elementData(el, "init_result"), [el, data.x]); + }; + + // Only override resize if bindingDef implements it + if (bindingDef.resize) { + shinyBinding.resize = function(el, width, height) { + // Shiny can call resize before initialize/renderValue have been + // called, which doesn't make sense for widgets. 
+ if (elementData(el, "initialized")) { + bindingDef.resize(el, width, height, elementData(el, "init_result")); + } + }; + } + + Shiny.outputBindings.register(shinyBinding, bindingDef.name); + } + }; + + var scheduleStaticRenderTimerId = null; + function scheduleStaticRender() { + if (!scheduleStaticRenderTimerId) { + scheduleStaticRenderTimerId = setTimeout(function() { + scheduleStaticRenderTimerId = null; + window.HTMLWidgets.staticRender(); + }, 1); + } + } + + // Render static widgets after the document finishes loading + // Statically render all elements that are of this widget's class + window.HTMLWidgets.staticRender = function() { + var bindings = window.HTMLWidgets.widgets || []; + forEach(bindings, function(binding) { + var matches = binding.find(document.documentElement); + forEach(matches, function(el) { + var sizeObj = initSizing(el, binding); + + var getSize = function(el) { + if (sizeObj) { + return {w: sizeObj.getWidth(), h: sizeObj.getHeight()} + } else { + var rect = el.getBoundingClientRect(); + return {w: rect.width, h: rect.height} + } + }; + + if (hasClass(el, "html-widget-static-bound")) + return; + el.className = el.className + " html-widget-static-bound"; + + var initResult; + if (binding.initialize) { + var size = getSize(el); + initResult = binding.initialize(el, size.w, size.h); + elementData(el, "init_result", initResult); + } + + if (binding.resize) { + var lastSize = getSize(el); + var resizeHandler = function(e) { + var size = getSize(el); + if (size.w === 0 && size.h === 0) + return; + if (size.w === lastSize.w && size.h === lastSize.h) + return; + lastSize = size; + binding.resize(el, size.w, size.h, initResult); + }; + + on(window, "resize", resizeHandler); + + // This is needed for cases where we're running in a Shiny + // app, but the widget itself is not a Shiny output, but + // rather a simple static widget. 
One example of this is + // an rmarkdown document that has runtime:shiny and widget + // that isn't in a render function. Shiny only knows to + // call resize handlers for Shiny outputs, not for static + // widgets, so we do it ourselves. + if (window.jQuery) { + window.jQuery(document).on( + "shown.htmlwidgets shown.bs.tab.htmlwidgets shown.bs.collapse.htmlwidgets", + resizeHandler + ); + window.jQuery(document).on( + "hidden.htmlwidgets hidden.bs.tab.htmlwidgets hidden.bs.collapse.htmlwidgets", + resizeHandler + ); + } + + // This is needed for the specific case of ioslides, which + // flips slides between display:none and display:block. + // Ideally we would not have to have ioslide-specific code + // here, but rather have ioslides raise a generic event, + // but the rmarkdown package just went to CRAN so the + // window to getting that fixed may be long. + if (window.addEventListener) { + // It's OK to limit this to window.addEventListener + // browsers because ioslides itself only supports + // such browsers. 
+ on(document, "slideenter", resizeHandler); + on(document, "slideleave", resizeHandler); + } + } + + var scriptData = document.querySelector("script[data-for='" + el.id + "'][type='application/json']"); + if (scriptData) { + var data = JSON.parse(scriptData.textContent || scriptData.text); + // Resolve strings marked as javascript literals to objects + if (!(data.evals instanceof Array)) data.evals = [data.evals]; + for (var k = 0; data.evals && k < data.evals.length; k++) { + window.HTMLWidgets.evaluateStringMember(data.x, data.evals[k]); + } + binding.renderValue(el, data.x, initResult); + evalAndRun(data.jsHooks.render, initResult, [el, data.x]); + } + }); + }); + + invokePostRenderHandlers(); + } + + + function has_jQuery3() { + if (!window.jQuery) { + return false; + } + var $version = window.jQuery.fn.jquery; + var $major_version = parseInt($version.split(".")[0]); + return $major_version >= 3; + } + + /* + / Shiny 1.4 bumped jQuery from 1.x to 3.x which means jQuery's + / on-ready handler (i.e., $(fn)) is now asyncronous (i.e., it now + / really means $(setTimeout(fn)). + / https://jquery.com/upgrade-guide/3.0/#breaking-change-document-ready-handlers-are-now-asynchronous + / + / Since Shiny uses $() to schedule initShiny, shiny>=1.4 calls initShiny + / one tick later than it did before, which means staticRender() is + / called renderValue() earlier than (advanced) widget authors might be expecting. + / https://github.com/rstudio/shiny/issues/2630 + / + / For a concrete example, leaflet has some methods (e.g., updateBounds) + / which reference Shiny methods registered in initShiny (e.g., setInputValue). 
+ / Since leaflet is privy to this life-cycle, it knows to use setTimeout() to + / delay execution of those methods (until Shiny methods are ready) + / https://github.com/rstudio/leaflet/blob/18ec981/javascript/src/index.js#L266-L268 + / + / Ideally widget authors wouldn't need to use this setTimeout() hack that + / leaflet uses to call Shiny methods on a staticRender(). In the long run, + / the logic initShiny should be broken up so that method registration happens + / right away, but binding happens later. + */ + function maybeStaticRenderLater() { + if (shinyMode && has_jQuery3()) { + window.jQuery(window.HTMLWidgets.staticRender); + } else { + window.HTMLWidgets.staticRender(); + } + } + + if (document.addEventListener) { + document.addEventListener("DOMContentLoaded", function() { + document.removeEventListener("DOMContentLoaded", arguments.callee, false); + maybeStaticRenderLater(); + }, false); + } else if (document.attachEvent) { + document.attachEvent("onreadystatechange", function() { + if (document.readyState === "complete") { + document.detachEvent("onreadystatechange", arguments.callee); + maybeStaticRenderLater(); + } + }); + } + + + window.HTMLWidgets.getAttachmentUrl = function(depname, key) { + // If no key, default to the first item + if (typeof(key) === "undefined") + key = 1; + + var link = document.getElementById(depname + "-" + key + "-attachment"); + if (!link) { + throw new Error("Attachment " + depname + "/" + key + " not found in document"); + } + return link.getAttribute("href"); + }; + + window.HTMLWidgets.dataframeToD3 = function(df) { + var names = []; + var length; + for (var name in df) { + if (df.hasOwnProperty(name)) + names.push(name); + if (typeof(df[name]) !== "object" || typeof(df[name].length) === "undefined") { + throw new Error("All fields must be arrays"); + } else if (typeof(length) !== "undefined" && length !== df[name].length) { + throw new Error("All fields must be arrays of the same length"); + } + length = 
df[name].length; + } + var results = []; + var item; + for (var row = 0; row < length; row++) { + item = {}; + for (var col = 0; col < names.length; col++) { + item[names[col]] = df[names[col]][row]; + } + results.push(item); + } + return results; + }; + + window.HTMLWidgets.transposeArray2D = function(array) { + if (array.length === 0) return array; + var newArray = array[0].map(function(col, i) { + return array.map(function(row) { + return row[i] + }) + }); + return newArray; + }; + // Split value at splitChar, but allow splitChar to be escaped + // using escapeChar. Any other characters escaped by escapeChar + // will be included as usual (including escapeChar itself). + function splitWithEscape(value, splitChar, escapeChar) { + var results = []; + var escapeMode = false; + var currentResult = ""; + for (var pos = 0; pos < value.length; pos++) { + if (!escapeMode) { + if (value[pos] === splitChar) { + results.push(currentResult); + currentResult = ""; + } else if (value[pos] === escapeChar) { + escapeMode = true; + } else { + currentResult += value[pos]; + } + } else { + currentResult += value[pos]; + escapeMode = false; + } + } + if (currentResult !== "") { + results.push(currentResult); + } + return results; + } + // Function authored by Yihui/JJ Allaire + window.HTMLWidgets.evaluateStringMember = function(o, member) { + var parts = splitWithEscape(member, '.', '\\'); + for (var i = 0, l = parts.length; i < l; i++) { + var part = parts[i]; + // part may be a character or 'numeric' member name + if (o !== null && typeof o === "object" && part in o) { + if (i == (l - 1)) { // if we are at the end of the line then evalulate + if (typeof o[part] === "string") + o[part] = tryEval(o[part]); + } else { // otherwise continue to next embedded object + o = o[part]; + } + } + } + }; + + // Retrieve the HTMLWidget instance (i.e. the return value of an + // HTMLWidget binding's initialize() or factory() function) + // associated with an element, or null if none. 
+ window.HTMLWidgets.getInstance = function(el) { + return elementData(el, "init_result"); + }; + + // Finds the first element in the scope that matches the selector, + // and returns the HTMLWidget instance (i.e. the return value of + // an HTMLWidget binding's initialize() or factory() function) + // associated with that element, if any. If no element matches the + // selector, or the first matching element has no HTMLWidget + // instance associated with it, then null is returned. + // + // The scope argument is optional, and defaults to window.document. + window.HTMLWidgets.find = function(scope, selector) { + if (arguments.length == 1) { + selector = scope; + scope = document; + } + + var el = scope.querySelector(selector); + if (el === null) { + return null; + } else { + return window.HTMLWidgets.getInstance(el); + } + }; + + // Finds all elements in the scope that match the selector, and + // returns the HTMLWidget instances (i.e. the return values of + // an HTMLWidget binding's initialize() or factory() function) + // associated with the elements, in an array. If elements that + // match the selector don't have an associated HTMLWidget + // instance, the returned array will contain nulls. + // + // The scope argument is optional, and defaults to window.document. + window.HTMLWidgets.findAll = function(scope, selector) { + if (arguments.length == 1) { + selector = scope; + scope = document; + } + + var nodes = scope.querySelectorAll(selector); + var results = []; + for (var i = 0; i < nodes.length; i++) { + results.push(window.HTMLWidgets.getInstance(nodes[i])); + } + return results; + }; + + var postRenderHandlers = []; + function invokePostRenderHandlers() { + while (postRenderHandlers.length) { + var handler = postRenderHandlers.shift(); + if (handler) { + handler(); + } + } + } + + // Register the given callback function to be invoked after the + // next time static widgets are rendered. 
+ window.HTMLWidgets.addPostRenderHandler = function(callback) { + postRenderHandlers.push(callback); + }; + + // Takes a new-style instance-bound definition, and returns an + // old-style class-bound definition. This saves us from having + // to rewrite all the logic in this file to accomodate both + // types of definitions. + function createLegacyDefinitionAdapter(defn) { + var result = { + name: defn.name, + type: defn.type, + initialize: function(el, width, height) { + return defn.factory(el, width, height); + }, + renderValue: function(el, x, instance) { + return instance.renderValue(x); + }, + resize: function(el, width, height, instance) { + return instance.resize(width, height); + } + }; + + if (defn.find) + result.find = defn.find; + if (defn.renderError) + result.renderError = defn.renderError; + if (defn.clearError) + result.clearError = defn.clearError; + + return result; + } +})(); diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/add_css.txt b/manuscript/_manuscript/site_libs/vis-9.1.0/add_css.txt new file mode 100644 index 0000000..e6a7cd0 --- /dev/null +++ b/manuscript/_manuscript/site_libs/vis-9.1.0/add_css.txt @@ -0,0 +1 @@ +.rPartvisNetwork{margin: 0.5em auto; } \ No newline at end of file diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/acceptDeleteIcon.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/acceptDeleteIcon.png new file mode 100644 index 0000000..02a0628 Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/acceptDeleteIcon.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/addNodeIcon.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/addNodeIcon.png new file mode 100644 index 0000000..6fa3061 Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/addNodeIcon.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/backIcon.png 
b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/backIcon.png new file mode 100644 index 0000000..e2f9912 Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/backIcon.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/connectIcon.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/connectIcon.png new file mode 100644 index 0000000..4164da1 Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/connectIcon.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/cross.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/cross.png new file mode 100644 index 0000000..9cbd189 Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/cross.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/cross2.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/cross2.png new file mode 100644 index 0000000..9fc4b95 Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/cross2.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/deleteIcon.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/deleteIcon.png new file mode 100644 index 0000000..5402564 Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/deleteIcon.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/downArrow.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/downArrow.png new file mode 100644 index 0000000..e77d5e6 Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/downArrow.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/editIcon.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/editIcon.png new file mode 100644 index 0000000..494d0f0 Binary files /dev/null and 
b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/editIcon.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/leftArrow.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/leftArrow.png new file mode 100644 index 0000000..3823536 Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/leftArrow.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/minus.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/minus.png new file mode 100644 index 0000000..3069807 Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/minus.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/plus.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/plus.png new file mode 100644 index 0000000..f7ab2a3 Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/plus.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/rightArrow.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/rightArrow.png new file mode 100644 index 0000000..c3a209d Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/rightArrow.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/upArrow.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/upArrow.png new file mode 100644 index 0000000..8aedced Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/upArrow.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/zoomExtends.png b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/zoomExtends.png new file mode 100644 index 0000000..74595c6 Binary files /dev/null and b/manuscript/_manuscript/site_libs/vis-9.1.0/img/network/zoomExtends.png differ diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/vis-network.min.css 
b/manuscript/_manuscript/site_libs/vis-9.1.0/vis-network.min.css new file mode 100644 index 0000000..fc0bd25 --- /dev/null +++ b/manuscript/_manuscript/site_libs/vis-9.1.0/vis-network.min.css @@ -0,0 +1 @@ +.vis-overlay{bottom:0;left:0;position:absolute;right:0;top:0;z-index:10}.vis-active{box-shadow:0 0 10px #86d5f8}.vis [class*=span]{min-height:0;width:auto}div.vis-color-picker{background-color:#fff;border-radius:15px;box-shadow:0 0 10px 0 rgba(0,0,0,.5);display:none;height:444px;left:30px;margin-left:30px;margin-top:-140px;padding:10px;position:absolute;top:0;width:310px;z-index:1}div.vis-color-picker div.vis-arrow{left:5px;position:absolute;top:147px}div.vis-color-picker div.vis-arrow:after,div.vis-color-picker div.vis-arrow:before{border:solid transparent;content:" ";height:0;pointer-events:none;position:absolute;right:100%;top:50%;width:0}div.vis-color-picker div.vis-arrow:after{border-color:hsla(0,0%,100%,0) #fff hsla(0,0%,100%,0) hsla(0,0%,100%,0);border-width:30px;margin-top:-30px}div.vis-color-picker div.vis-color{cursor:pointer;height:289px;position:absolute;width:289px}div.vis-color-picker div.vis-brightness{position:absolute;top:313px}div.vis-color-picker div.vis-opacity{position:absolute;top:350px}div.vis-color-picker div.vis-selector{background:#4c4c4c;background:-moz-linear-gradient(top,#4c4c4c 0,#595959 12%,#666 25%,#474747 39%,#2c2c2c 50%,#000 51%,#111 60%,#2b2b2b 76%,#1c1c1c 91%,#131313 100%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#4c4c4c),color-stop(12%,#595959),color-stop(25%,#666),color-stop(39%,#474747),color-stop(50%,#2c2c2c),color-stop(51%,#000),color-stop(60%,#111),color-stop(76%,#2b2b2b),color-stop(91%,#1c1c1c),color-stop(100%,#131313));background:-webkit-linear-gradient(top,#4c4c4c,#595959 12%,#666 25%,#474747 39%,#2c2c2c 50%,#000 51%,#111 60%,#2b2b2b 76%,#1c1c1c 91%,#131313);background:-o-linear-gradient(top,#4c4c4c 0,#595959 12%,#666 25%,#474747 39%,#2c2c2c 50%,#000 51%,#111 60%,#2b2b2b 76%,#1c1c1c 
91%,#131313 100%);background:-ms-linear-gradient(top,#4c4c4c 0,#595959 12%,#666 25%,#474747 39%,#2c2c2c 50%,#000 51%,#111 60%,#2b2b2b 76%,#1c1c1c 91%,#131313 100%);background:linear-gradient(180deg,#4c4c4c 0,#595959 12%,#666 25%,#474747 39%,#2c2c2c 50%,#000 51%,#111 60%,#2b2b2b 76%,#1c1c1c 91%,#131313);border:1px solid #fff;border-radius:15px;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr="#4c4c4c",endColorstr="#131313",GradientType=0);height:15px;left:137px;position:absolute;top:137px;width:15px}div.vis-color-picker div.vis-new-color{left:159px;padding-right:2px;text-align:right}div.vis-color-picker div.vis-initial-color,div.vis-color-picker div.vis-new-color{border:1px solid rgba(0,0,0,.1);border-radius:5px;color:rgba(0,0,0,.4);font-size:10px;height:20px;line-height:20px;position:absolute;top:380px;vertical-align:middle;width:140px}div.vis-color-picker div.vis-initial-color{left:10px;padding-left:2px;text-align:left}div.vis-color-picker div.vis-label{left:10px;position:absolute;width:300px}div.vis-color-picker div.vis-label.vis-brightness{top:300px}div.vis-color-picker div.vis-label.vis-opacity{top:338px}div.vis-color-picker div.vis-button{background-color:#f7f7f7;border:2px solid #d9d9d9;border-radius:10px;cursor:pointer;height:25px;line-height:25px;position:absolute;text-align:center;top:410px;vertical-align:middle;width:68px}div.vis-color-picker div.vis-button.vis-cancel{left:5px}div.vis-color-picker div.vis-button.vis-load{left:82px}div.vis-color-picker div.vis-button.vis-apply{left:159px}div.vis-color-picker div.vis-button.vis-save{left:236px}div.vis-color-picker input.vis-range{height:20px;width:290px}div.vis-configuration{display:block;float:left;font-size:12px;position:relative}div.vis-configuration-wrapper{display:block;width:700px}div.vis-configuration-wrapper:after{clear:both;content:"";display:block}div.vis-configuration.vis-config-option-container{background-color:#fff;border:2px solid 
#f7f8fa;border-radius:4px;display:block;left:10px;margin-top:20px;padding-left:5px;width:495px}div.vis-configuration.vis-config-button{background-color:#f7f8fa;border:2px solid #ceced0;border-radius:4px;cursor:pointer;display:block;height:25px;left:10px;line-height:25px;margin-bottom:30px;margin-top:20px;padding-left:5px;vertical-align:middle;width:495px}div.vis-configuration.vis-config-button.hover{background-color:#4588e6;border:2px solid #214373;color:#fff}div.vis-configuration.vis-config-item{display:block;float:left;height:25px;line-height:25px;vertical-align:middle;width:495px}div.vis-configuration.vis-config-item.vis-config-s2{background-color:#f7f8fa;border-radius:3px;left:10px;padding-left:5px}div.vis-configuration.vis-config-item.vis-config-s3{background-color:#e4e9f0;border-radius:3px;left:20px;padding-left:5px}div.vis-configuration.vis-config-item.vis-config-s4{background-color:#cfd8e6;border-radius:3px;left:30px;padding-left:5px}div.vis-configuration.vis-config-header{font-size:18px;font-weight:700}div.vis-configuration.vis-config-label{height:25px;line-height:25px;width:120px}div.vis-configuration.vis-config-label.vis-config-s3{width:110px}div.vis-configuration.vis-config-label.vis-config-s4{width:100px}div.vis-configuration.vis-config-colorBlock{border:1px solid #444;border-radius:2px;cursor:pointer;height:19px;margin:0;padding:0;top:1px;width:30px}input.vis-configuration.vis-config-checkbox{left:-5px}input.vis-configuration.vis-config-rangeinput{margin:0;padding:1px;pointer-events:none;position:relative;top:-5px;width:60px}input.vis-configuration.vis-config-range{-webkit-appearance:none;background-color:transparent;border:0 solid #fff;height:20px;width:300px}input.vis-configuration.vis-config-range::-webkit-slider-runnable-track{background:#dedede;background:-moz-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-webkit-gradient(linear,left top,left 
bottom,color-stop(0,#dedede),color-stop(99%,#c8c8c8));background:-webkit-linear-gradient(top,#dedede,#c8c8c8 99%);background:-o-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-ms-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:linear-gradient(180deg,#dedede 0,#c8c8c8 99%);border:1px solid #999;border-radius:3px;box-shadow:0 0 3px 0 #aaa;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr="#dedede",endColorstr="#c8c8c8",GradientType=0);height:5px;width:300px}input.vis-configuration.vis-config-range::-webkit-slider-thumb{-webkit-appearance:none;background:#3876c2;background:-moz-linear-gradient(top,#3876c2 0,#385380 100%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#3876c2),color-stop(100%,#385380));background:-webkit-linear-gradient(top,#3876c2,#385380);background:-o-linear-gradient(top,#3876c2 0,#385380 100%);background:-ms-linear-gradient(top,#3876c2 0,#385380 100%);background:linear-gradient(180deg,#3876c2 0,#385380);border:1px solid #14334b;border-radius:50%;box-shadow:0 0 1px 0 #111927;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr="#3876c2",endColorstr="#385380",GradientType=0);height:17px;margin-top:-7px;width:17px}input.vis-configuration.vis-config-range:focus{outline:none}input.vis-configuration.vis-config-range:focus::-webkit-slider-runnable-track{background:#9d9d9d;background:-moz-linear-gradient(top,#9d9d9d 0,#c8c8c8 99%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#9d9d9d),color-stop(99%,#c8c8c8));background:-webkit-linear-gradient(top,#9d9d9d,#c8c8c8 99%);background:-o-linear-gradient(top,#9d9d9d 0,#c8c8c8 99%);background:-ms-linear-gradient(top,#9d9d9d 0,#c8c8c8 99%);background:linear-gradient(180deg,#9d9d9d 0,#c8c8c8 99%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr="#9d9d9d",endColorstr="#c8c8c8",GradientType=0)}input.vis-configuration.vis-config-range::-moz-range-track{background:#dedede;background:-moz-linear-gradient(top,#dedede 
0,#c8c8c8 99%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#dedede),color-stop(99%,#c8c8c8));background:-webkit-linear-gradient(top,#dedede,#c8c8c8 99%);background:-o-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:-ms-linear-gradient(top,#dedede 0,#c8c8c8 99%);background:linear-gradient(180deg,#dedede 0,#c8c8c8 99%);border:1px solid #999;border-radius:3px;box-shadow:0 0 3px 0 #aaa;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr="#dedede",endColorstr="#c8c8c8",GradientType=0);height:10px;width:300px}input.vis-configuration.vis-config-range::-moz-range-thumb{background:#385380;border:none;border-radius:50%;height:16px;width:16px}input.vis-configuration.vis-config-range:-moz-focusring{outline:1px solid #fff;outline-offset:-1px}input.vis-configuration.vis-config-range::-ms-track{background:transparent;border-color:transparent;border-width:6px 0;color:transparent;height:5px;width:300px}input.vis-configuration.vis-config-range::-ms-fill-lower{background:#777;border-radius:10px}input.vis-configuration.vis-config-range::-ms-fill-upper{background:#ddd;border-radius:10px}input.vis-configuration.vis-config-range::-ms-thumb{background:#385380;border:none;border-radius:50%;height:16px;width:16px}input.vis-configuration.vis-config-range:focus::-ms-fill-lower{background:#888}input.vis-configuration.vis-config-range:focus::-ms-fill-upper{background:#ccc}.vis-configuration-popup{background:rgba(57,76,89,.85);border:2px solid #f2faff;border-radius:4px;color:#fff;font-size:14px;height:30px;line-height:30px;position:absolute;text-align:center;-webkit-transition:opacity .3s ease-in-out;-moz-transition:opacity .3s ease-in-out;transition:opacity .3s ease-in-out;width:150px}.vis-configuration-popup:after,.vis-configuration-popup:before{border:solid transparent;content:" ";height:0;left:100%;pointer-events:none;position:absolute;top:50%;width:0}.vis-configuration-popup:after{border-color:rgba(136,183,213,0) rgba(136,183,213,0) 
rgba(136,183,213,0) rgba(57,76,89,.85);border-width:8px;margin-top:-8px}.vis-configuration-popup:before{border-color:rgba(194,225,245,0) rgba(194,225,245,0) rgba(194,225,245,0) #f2faff;border-width:12px;margin-top:-12px}div.vis-tooltip{background-color:#f5f4ed;border:1px solid #808074;-moz-border-radius:3px;-webkit-border-radius:3px;border-radius:3px;box-shadow:3px 3px 10px rgba(0,0,0,.2);color:#000;font-family:verdana;font-size:14px;padding:5px;pointer-events:none;position:absolute;visibility:hidden;white-space:nowrap;z-index:5}div.vis-network div.vis-navigation div.vis-button{-webkit-touch-callout:none;background-position:2px 2px;background-repeat:no-repeat;-moz-border-radius:17px;border-radius:17px;cursor:pointer;display:inline-block;height:34px;position:absolute;-webkit-user-select:none;-khtml-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;width:34px}div.vis-network div.vis-navigation div.vis-button:hover{box-shadow:0 0 3px 3px rgba(56,207,21,.3)}div.vis-network div.vis-navigation div.vis-button:active{box-shadow:0 0 1px 3px rgba(56,207,21,.95)}div.vis-network div.vis-navigation div.vis-button.vis-up{background-image:url("");bottom:50px;left:55px}div.vis-network div.vis-navigation div.vis-button.vis-down{background-image:url("");bottom:10px;left:55px}div.vis-network div.vis-navigation div.vis-button.vis-left{background-image:url("");bottom:10px;left:15px}div.vis-network div.vis-navigation div.vis-button.vis-right{background-image:url("");bottom:10px;left:95px}div.vis-network div.vis-navigation div.vis-button.vis-zoomIn{background-image:url("");bottom:10px;right:15px}div.vis-network div.vis-navigation div.vis-button.vis-zoomOut{background-image:url("");bottom:10px;right:55px}div.vis-network div.vis-navigation div.vis-button.vis-zoomExtends{background-image:url("");bottom:50px;right:15px}div.vis-network div.vis-manipulation{background:#fff;background:-moz-linear-gradient(top,#fff 0,#fcfcfc 48%,#fafafa 50%,#fcfcfc 
100%);background:-webkit-gradient(linear,left top,left bottom,color-stop(0,#fff),color-stop(48%,#fcfcfc),color-stop(50%,#fafafa),color-stop(100%,#fcfcfc));background:-webkit-linear-gradient(top,#fff,#fcfcfc 48%,#fafafa 50%,#fcfcfc);background:-o-linear-gradient(top,#fff 0,#fcfcfc 48%,#fafafa 50%,#fcfcfc 100%);background:-ms-linear-gradient(top,#fff 0,#fcfcfc 48%,#fafafa 50%,#fcfcfc 100%);background:linear-gradient(180deg,#fff 0,#fcfcfc 48%,#fafafa 50%,#fcfcfc);border:0 solid #d6d9d8;border-bottom:1px;box-sizing:content-box;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr="#ffffff",endColorstr="#fcfcfc",GradientType=0);height:28px;left:0;padding-top:4px;position:absolute;top:0;width:100%}div.vis-network button.vis-edit-mode,div.vis-network div.vis-edit-mode{height:30px;left:0;position:absolute;top:5px}div.vis-network button.vis-close{-webkit-touch-callout:none;background-color:transparent;background-image:url("");background-position:20px 3px;background-repeat:no-repeat;border:none;cursor:pointer;height:30px;position:absolute;right:0;top:0;-webkit-user-select:none;-khtml-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;width:30px}div.vis-network button.vis-close:hover{opacity:.6}div.vis-network div.vis-edit-mode button.vis-button,div.vis-network div.vis-manipulation button.vis-button{-webkit-touch-callout:none;background-color:transparent;background-position:0 0;background-repeat:no-repeat;border:none;-moz-border-radius:15px;border-radius:15px;box-sizing:content-box;cursor:pointer;float:left;font-family:verdana;font-size:12px;height:24px;margin-left:10px;padding:0 8px;-webkit-user-select:none;-khtml-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none}div.vis-network div.vis-manipulation button.vis-button:hover{box-shadow:1px 1px 8px rgba(0,0,0,.2)}div.vis-network div.vis-manipulation button.vis-button:active{box-shadow:1px 1px 8px rgba(0,0,0,.5)}div.vis-network div.vis-manipulation 
button.vis-button.vis-back{background-image:url("")}div.vis-network div.vis-manipulation div.vis-none:hover{box-shadow:1px 1px 8px transparent;cursor:default}div.vis-network div.vis-manipulation div.vis-none:active{box-shadow:1px 1px 8px transparent}div.vis-network div.vis-manipulation div.vis-none{line-height:23px;padding:0}div.vis-network div.vis-manipulation div.notification{font-weight:700;margin:2px}div.vis-network div.vis-manipulation button.vis-button.vis-add{background-image:url("")}div.vis-network div.vis-edit-mode button.vis-button.vis-edit,div.vis-network div.vis-manipulation button.vis-button.vis-edit{background-image:url("")}div.vis-network div.vis-edit-mode button.vis-button.vis-edit.vis-edit-mode{background-color:#fcfcfc;border:1px solid #ccc}div.vis-network div.vis-manipulation button.vis-button.vis-connect{background-image:url("")}div.vis-network div.vis-manipulation button.vis-button.vis-delete{background-image:url("")}div.vis-network div.vis-edit-mode div.vis-label,div.vis-network div.vis-manipulation div.vis-label{line-height:25px;margin:0 0 0 23px}div.vis-network div.vis-manipulation div.vis-separator-line{background-color:#bdbdbd;display:inline-block;float:left;height:21px;margin:0 7px 0 15px;width:1px}.rPartvisNetwork{margin: 0.5em auto; } \ No newline at end of file diff --git a/manuscript/_manuscript/site_libs/vis-9.1.0/vis-network.min.js b/manuscript/_manuscript/site_libs/vis-9.1.0/vis-network.min.js new file mode 100644 index 0000000..b8ad04a --- /dev/null +++ b/manuscript/_manuscript/site_libs/vis-9.1.0/vis-network.min.js @@ -0,0 +1,27 @@ +/** + * vis-network + * https://visjs.github.io/vis-network/ + * + * A dynamic, browser-based visualization library. + * + * @version 0.0.0-no-version + * @date 2021-09-28T12:03:00.413Z + * + * @copyright (c) 2011-2017 Almende B.V, http://almende.com + * @copyright (c) 2017-2019 visjs contributors, https://github.com/visjs + * + * @license + * vis.js is dual licensed under both + * + * 1. 
The Apache 2.0 License + * http://www.apache.org/licenses/LICENSE-2.0 + * + * and + * + * 2. The MIT License + * http://opensource.org/licenses/MIT + * + * vis.js may be distributed under either license. + */ +!function(t,e){"object"==typeof exports&&"undefined"!=typeof module?e(exports):"function"==typeof define&&define.amd?define(["exports"],e):e((t="undefined"!=typeof globalThis?globalThis:t||self).vis=t.vis||{})}(this,(function(t){"use strict";var e="undefined"!=typeof globalThis?globalThis:"undefined"!=typeof window?window:"undefined"!=typeof global?global:"undefined"!=typeof self?self:{};function i(t){return t&&t.__esModule&&Object.prototype.hasOwnProperty.call(t,"default")?t.default:t}function n(t,e){return t(e={exports:{}},e.exports),e.exports}var o,r,s=function(t){return t&&t.Math==Math&&t},a=s("object"==typeof globalThis&&globalThis)||s("object"==typeof window&&window)||s("object"==typeof self&&self)||s("object"==typeof e&&e)||function(){return this}()||Function("return this")(),h=function(t){try{return!!t()}catch(t){return!0}},l=!h((function(){return 7!=Object.defineProperty({},1,{get:function(){return 7}})[1]})),d={}.propertyIsEnumerable,c=Object.getOwnPropertyDescriptor,u={f:c&&!d.call({1:2},1)?function(t){var e=c(this,t);return!!e&&e.enumerable}:d},f=function(t,e){return{enumerable:!(1&t),configurable:!(2&t),writable:!(4&t),value:e}},p={}.toString,v=function(t){return p.call(t).slice(8,-1)},g="".split,y=h((function(){return!Object("z").propertyIsEnumerable(0)}))?function(t){return"String"==v(t)?g.call(t,""):Object(t)}:Object,m=function(t){if(null==t)throw TypeError("Can't call method on "+t);return t},b=function(t){return y(m(t))},w=function(t){return"object"==typeof t?null!==t:"function"==typeof t},k={},_=function(t){return"function"==typeof t?t:void 0},x=function(t,e){return 
arguments.length<2?_(k[t])||_(a[t]):k[t]&&k[t][e]||a[t]&&a[t][e]},E=x("navigator","userAgent")||"",O=a.process,C=a.Deno,S=O&&O.versions||C&&C.version,T=S&&S.v8;T?r=(o=T.split("."))[0]<4?1:o[0]+o[1]:E&&(!(o=E.match(/Edge\/(\d+)/))||o[1]>=74)&&(o=E.match(/Chrome\/(\d+)/))&&(r=o[1]);var M=r&&+r,P=!!Object.getOwnPropertySymbols&&!h((function(){var t=Symbol();return!String(t)||!(Object(t)instanceof Symbol)||!Symbol.sham&&M&&M<41})),D=P&&!Symbol.sham&&"symbol"==typeof Symbol.iterator,I=D?function(t){return"symbol"==typeof t}:function(t){var e=x("Symbol");return"function"==typeof e&&Object(t)instanceof e},B="__core-js_shared__",z=a[B]||function(t,e){try{Object.defineProperty(a,t,{value:e,configurable:!0,writable:!0})}catch(i){a[t]=e}return e}(B,{}),N=n((function(t){(t.exports=function(t,e){return z[t]||(z[t]=void 0!==e?e:{})})("versions",[]).push({version:"3.16.1",mode:"pure",copyright:"© 2021 Denis Pushkarev (zloirock.ru)"})})),A=function(t){return Object(m(t))},F={}.hasOwnProperty,j=Object.hasOwn||function(t,e){return F.call(A(t),e)},R=0,L=Math.random(),H=function(t){return"Symbol("+String(void 0===t?"":t)+")_"+(++R+L).toString(36)},W=N("wks"),q=a.Symbol,V=D?q:q&&q.withoutSetter||H,U=function(t){return j(W,t)&&(P||"string"==typeof W[t])||(P&&j(q,t)?W[t]=q[t]:W[t]=V("Symbol."+t)),W[t]},Y=U("toPrimitive"),X=function(t,e){if(!w(t)||I(t))return t;var i,n=t[Y];if(void 0!==n){if(void 0===e&&(e="default"),i=n.call(t,e),!w(i)||I(i))return i;throw TypeError("Can't convert object to primitive value")}return void 0===e&&(e="number"),function(t,e){var i,n;if("string"===e&&"function"==typeof(i=t.toString)&&!w(n=i.call(t)))return n;if("function"==typeof(i=t.valueOf)&&!w(n=i.call(t)))return n;if("string"!==e&&"function"==typeof(i=t.toString)&&!w(n=i.call(t)))return n;throw TypeError("Can't convert object to primitive value")}(t,e)},G=function(t){var e=X(t,"string");return I(e)?e:String(e)},K=a.document,$=w(K)&&w(K.createElement),Z=function(t){return 
$?K.createElement(t):{}},Q=!l&&!h((function(){return 7!=Object.defineProperty(Z("div"),"a",{get:function(){return 7}}).a})),J=Object.getOwnPropertyDescriptor,tt={f:l?J:function(t,e){if(t=b(t),e=G(e),Q)try{return J(t,e)}catch(t){}if(j(t,e))return f(!u.f.call(t,e),t[e])}},et=/#|\.prototype\./,it=function(t,e){var i=ot[nt(t)];return i==st||i!=rt&&("function"==typeof e?h(e):!!e)},nt=it.normalize=function(t){return String(t).replace(et,".").toLowerCase()},ot=it.data={},rt=it.NATIVE="N",st=it.POLYFILL="P",at=it,ht=function(t){if("function"!=typeof t)throw TypeError(String(t)+" is not a function");return t},lt=function(t,e,i){if(ht(t),void 0===e)return t;switch(i){case 0:return function(){return t.call(e)};case 1:return function(i){return t.call(e,i)};case 2:return function(i,n){return t.call(e,i,n)};case 3:return function(i,n,o){return t.call(e,i,n,o)}}return function(){return t.apply(e,arguments)}},dt=function(t){if(!w(t))throw TypeError(String(t)+" is not an object");return t},ct=Object.defineProperty,ut={f:l?ct:function(t,e,i){if(dt(t),e=G(e),dt(i),Q)try{return ct(t,e,i)}catch(t){}if("get"in i||"set"in i)throw TypeError("Accessors not supported");return"value"in i&&(t[e]=i.value),t}},ft=l?function(t,e,i){return ut.f(t,e,f(1,i))}:function(t,e,i){return t[e]=i,t},pt=tt.f,vt=function(t){var e=function(e,i,n){if(this instanceof t){switch(arguments.length){case 0:return new t;case 1:return new t(e);case 2:return new t(e,i)}return new t(e,i,n)}return t.apply(this,arguments)};return e.prototype=t.prototype,e},gt=function(t,e){var i,n,o,r,s,h,l,d,c=t.target,u=t.global,f=t.stat,p=t.proto,v=u?a:f?a[c]:(a[c]||{}).prototype,g=u?k:k[c]||(k[c]={}),y=g.prototype;for(o in e)i=!at(u?o:c+(f?".":"#")+o,t.forced)&&v&&j(v,o),s=g[o],i&&(h=t.noTargetGet?(d=pt(v,o))&&d.value:v[o]),r=i&&h?h:e[o],i&&typeof s==typeof r||(l=t.bind&&i?lt(r,a):t.wrap&&i?vt(r):p&&"function"==typeof 
r?lt(Function.call,r):r,(t.sham||r&&r.sham||s&&s.sham)&&ft(l,"sham",!0),g[o]=l,p&&(j(k,n=c+"Prototype")||ft(k,n,{}),k[n][o]=r,t.real&&y&&!y[o]&&ft(y,o,r)))},yt=Math.ceil,mt=Math.floor,bt=function(t){return isNaN(t=+t)?0:(t>0?mt:yt)(t)},wt=Math.min,kt=function(t){return t>0?wt(bt(t),9007199254740991):0},_t=Math.max,xt=Math.min,Et=function(t,e){var i=bt(t);return i<0?_t(i+e,0):xt(i,e)},Ot=function(t){return function(e,i,n){var o,r=b(e),s=kt(r.length),a=Et(n,s);if(t&&i!=i){for(;s>a;)if((o=r[a++])!=o)return!0}else for(;s>a;a++)if((t||a in r)&&r[a]===i)return t||a||0;return!t&&-1}},Ct={includes:Ot(!0),indexOf:Ot(!1)},St={},Tt=Ct.indexOf,Mt=function(t,e){var i,n=b(t),o=0,r=[];for(i in n)!j(St,i)&&j(n,i)&&r.push(i);for(;e.length>o;)j(n,i=e[o++])&&(~Tt(r,i)||r.push(i));return r},Pt=["constructor","hasOwnProperty","isPrototypeOf","propertyIsEnumerable","toLocaleString","toString","valueOf"],Dt=Object.keys||function(t){return Mt(t,Pt)},It={f:Object.getOwnPropertySymbols},Bt=Object.assign,zt=Object.defineProperty,Nt=!Bt||h((function(){if(l&&1!==Bt({b:1},Bt(zt({},"a",{enumerable:!0,get:function(){zt(this,"b",{value:3,enumerable:!1})}}),{b:2})).b)return!0;var t={},e={},i=Symbol(),n="abcdefghijklmnopqrst";return t[i]=7,n.split("").forEach((function(t){e[t]=t})),7!=Bt({},t)[i]||Dt(Bt({},e)).join("")!=n}))?function(t,e){for(var i=A(t),n=arguments.length,o=1,r=It.f,s=u.f;n>o;)for(var a,h=y(arguments[o++]),d=r?Dt(h).concat(r(h)):Dt(h),c=d.length,f=0;c>f;)a=d[f++],l&&!s.call(h,a)||(i[a]=h[a]);return i}:Bt;gt({target:"Object",stat:!0,forced:Object.assign!==Nt},{assign:Nt});var At=k.Object.assign,Ft=[].slice,jt={},Rt=function(t,e,i){if(!(e in jt)){for(var n=[],o=0;o=.1;)(p=+r[c++%s])>d&&(p=d),f=Math.sqrt(p*p/(1+l*l)),e+=f=a<0?-f:f,i+=l*f,!0===u?t.lineTo(e,i):t.moveTo(e,i),d-=p,u=!u}var 
$t={circle:Ut,dashedLine:Kt,database:Gt,diamond:function(t,e,i,n){t.beginPath(),t.lineTo(e,i+n),t.lineTo(e+n,i),t.lineTo(e,i-n),t.lineTo(e-n,i),t.closePath()},ellipse:Xt,ellipse_vis:Xt,hexagon:function(t,e,i,n){t.beginPath();var o=2*Math.PI/6;t.moveTo(e+n,i);for(var r=1;r<6;r++)t.lineTo(e+n*Math.cos(o*r),i+n*Math.sin(o*r));t.closePath()},roundRect:Yt,square:function(t,e,i,n){t.beginPath(),t.rect(e-n,i-n,2*n,2*n),t.closePath()},star:function(t,e,i,n){t.beginPath(),i+=.1*(n*=.82);for(var o=0;o<10;o++){var r=o%2==0?1.3*n:.5*n;t.lineTo(e+r*Math.sin(2*o*Math.PI/10),i-r*Math.cos(2*o*Math.PI/10))}t.closePath()},triangle:function(t,e,i,n){t.beginPath(),i+=.275*(n*=1.15);var o=2*n,r=o/2,s=Math.sqrt(3)/6*o,a=Math.sqrt(o*o-r*r);t.moveTo(e,i-(a-s)),t.lineTo(e+r,i+s),t.lineTo(e-r,i+s),t.lineTo(e,i-(a-s)),t.closePath()},triangleDown:function(t,e,i,n){t.beginPath(),i-=.275*(n*=1.15);var o=2*n,r=o/2,s=Math.sqrt(3)/6*o,a=Math.sqrt(o*o-r*r);t.moveTo(e,i+(a-s)),t.lineTo(e+r,i-s),t.lineTo(e-r,i-s),t.lineTo(e,i+(a-s)),t.closePath()}};var Zt=n((function(t){function e(t){if(t)return function(t){for(var i in e.prototype)t[i]=e.prototype[i];return t}(t)}t.exports=e,e.prototype.on=e.prototype.addEventListener=function(t,e){return this._callbacks=this._callbacks||{},(this._callbacks["$"+t]=this._callbacks["$"+t]||[]).push(e),this},e.prototype.once=function(t,e){function i(){this.off(t,i),e.apply(this,arguments)}return i.fn=e,this.on(t,i),this},e.prototype.off=e.prototype.removeListener=e.prototype.removeAllListeners=e.prototype.removeEventListener=function(t,e){if(this._callbacks=this._callbacks||{},0==arguments.length)return this._callbacks={},this;var i,n=this._callbacks["$"+t];if(!n)return this;if(1==arguments.length)return delete this._callbacks["$"+t],this;for(var o=0;o=a?t?"":void 
0:(n=r.charCodeAt(s))<55296||n>56319||s+1===a||(o=r.charCodeAt(s+1))<56320||o>57343?t?r.charAt(s):n:t?r.slice(s,s+2):o-56320+(n-55296<<10)+65536}},te={codeAt:Jt(!1),charAt:Jt(!0)},ee=Function.toString;"function"!=typeof z.inspectSource&&(z.inspectSource=function(t){return ee.call(t)});var ie,ne,oe,re=z.inspectSource,se=a.WeakMap,ae="function"==typeof se&&/native code/.test(re(se)),he=N("keys"),le=function(t){return he[t]||(he[t]=H(t))},de="Object already initialized",ce=a.WeakMap;if(ae||z.state){var ue=z.state||(z.state=new ce),fe=ue.get,pe=ue.has,ve=ue.set;ie=function(t,e){if(pe.call(ue,t))throw new TypeError(de);return e.facade=t,ve.call(ue,t,e),e},ne=function(t){return fe.call(ue,t)||{}},oe=function(t){return pe.call(ue,t)}}else{var ge=le("state");St[ge]=!0,ie=function(t,e){if(j(t,ge))throw new TypeError(de);return e.facade=t,ft(t,ge,e),e},ne=function(t){return j(t,ge)?t[ge]:{}},oe=function(t){return j(t,ge)}}var ye,me,be,we={set:ie,get:ne,has:oe,enforce:function(t){return oe(t)?ne(t):ie(t,{})},getterFor:function(t){return function(e){var i;if(!w(e)||(i=ne(e)).type!==t)throw TypeError("Incompatible receiver, "+t+" required");return i}}},ke=!h((function(){function t(){}return t.prototype.constructor=null,Object.getPrototypeOf(new t)!==t.prototype})),_e=le("IE_PROTO"),xe=Object.prototype,Ee=ke?Object.getPrototypeOf:function(t){return t=A(t),j(t,_e)?t[_e]:"function"==typeof t.constructor&&t instanceof t.constructor?t.constructor.prototype:t instanceof Object?xe:null},Oe=U("iterator"),Ce=!1;[].keys&&("next"in(be=[].keys())?(me=Ee(Ee(be)))!==Object.prototype&&(ye=me):Ce=!0);var Se=null==ye||h((function(){var t={};return ye[Oe].call(t)!==t}));Se&&(ye={}),Se&&!j(ye,Oe)&&ft(ye,Oe,(function(){return this}));var Te,Me={IteratorPrototype:ye,BUGGY_SAFARI_ITERATORS:Ce},Pe=l?Object.defineProperties:function(t,e){dt(t);for(var i,n=Dt(e),o=n.length,r=0;o>r;)ut.f(t,i=n[r++],e[i]);return t},De=x("document","documentElement"),Ie=le("IE_PROTO"),Be=function(){},ze=function(t){return"