Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Moving karyotype to anoph #702

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions malariagen_data/af1.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def __init__(
taxon_colors=TAXON_COLORS,
virtual_contigs=None,
gene_names=None,
inversion_tag_path=None,
)

def __repr__(self):
Expand Down
94 changes: 2 additions & 92 deletions malariagen_data/ag3.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,11 @@

import dask
import pandas as pd # type: ignore
from pandas import CategoricalDtype
import numpy as np # type: ignore
import allel # type: ignore
import plotly.express as px # type: ignore

import malariagen_data
from .anopheles import AnophelesDataResource

from numpydoc_decorator import doc
from .util import check_types, _karyotype_tags_n_alt
from .anoph import base_params
from typing import Optional, Literal, Annotated, TypeAlias

# silence dask performance warnings
dask.config.set(**{"array.slicing.split_native_chunks": False}) # type: ignore
Expand All @@ -35,6 +28,7 @@
GENE_NAMES = {
"AGAP004707": "Vgsc/para",
}
INVERSION_TAG_PATH = "karyotype_tag_snps.csv"


def _setup_aim_palettes():
Expand Down Expand Up @@ -83,12 +77,6 @@ def _setup_aim_palettes():
}


inversion_param: TypeAlias = Annotated[
Literal["2La", "2Rb", "2Rc_gam", "2Rc_col", "2Rd", "2Rj"],
"Name of inversion to infer karyotype for.",
]


class Ag3(AnophelesDataResource):
"""Provides access to data from Ag3.x releases.

Expand Down Expand Up @@ -203,6 +191,7 @@ def __init__(
taxon_colors=TAXON_COLORS,
virtual_contigs=VIRTUAL_CONTIGS,
gene_names=GENE_NAMES,
inversion_tag_path=INVERSION_TAG_PATH,
)

# set up caches
Expand Down Expand Up @@ -355,82 +344,3 @@ def _results_cache_add_analysis_params(self, params):
super()._results_cache_add_analysis_params(params)
# override parent class to add AIM analysis
params["aim_analysis"] = self._aim_analysis

@check_types
@doc(
summary="Load tag SNPs for a given inversion in Ag.",
)
def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
# needs to be modified depending on where we are hosting
import importlib.resources
from . import resources

with importlib.resources.path(resources, "karyotype_tag_snps.csv") as path:
df_tag_snps = pd.read_csv(path, sep=",")
return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()

@check_types
@doc(
summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
)
def karyotype(
self,
inversion: inversion_param,
sample_sets: Optional[base_params.sample_sets] = None,
sample_query: Optional[base_params.sample_query] = None,
sample_query_options: Optional[base_params.sample_query_options] = None,
) -> pd.DataFrame:
# load tag snp data
df_tagsnps = self.load_inversion_tags(inversion=inversion)
inversion_pos = df_tagsnps["position"]
inversion_alts = df_tagsnps["alt_allele"]
contig = inversion[0:2]

# get snp calls for inversion region
start, end = np.min(inversion_pos), np.max(inversion_pos)
region = f"{contig}:{start}-{end}"

ds_snps = self.snp_calls(
region=region,
sample_sets=sample_sets,
sample_query=sample_query,
sample_query_options=sample_query_options,
)

with self._spinner("Inferring karyotype from tag SNPs"):
# access variables we need
geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
pos = allel.SortedIndex(ds_snps["variant_position"].values)
samples = ds_snps["sample_id"].values
alts = ds_snps["variant_allele"].values.astype(str)

# subset to position of inversion tags
mask = pos.locate_intersection(inversion_pos)[0]
alts = alts[mask]
geno = geno.compress(mask, axis=0).compute()

# infer karyotype
gn_alt = _karyotype_tags_n_alt(
gt=geno, alts=alts, inversion_alts=inversion_alts
)
is_called = geno.is_called()

# calculate mean genotype for each sample whilst masking missing calls
av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0)
total_sites = np.sum(is_called, axis=0)

df = pd.DataFrame(
{
"sample_id": samples,
"inversion": inversion,
f"karyotype_{inversion}_mean": av_gts,
# round the genotypes then convert to int
f"karyotype_{inversion}": av_gts.round().astype(int),
"total_tag_snps": total_sites,
},
)
# Allow filling missing values with "<NA>" visible placeholder.
kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True)
df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype)

return df
131 changes: 131 additions & 0 deletions malariagen_data/anoph/karyotype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import pandas as pd # type: ignore
from pandas import CategoricalDtype
import numpy as np # type: ignore
import allel # type: ignore

from numpydoc_decorator import doc
from ..util import check_types
from . import base_params
from typing import Optional

from .snp_data import AnophelesSnpData
from .karyotype_params import inversion_param


def _karyotype_tags_n_alt(gt, alts, inversion_alts):
# could be Numba'd for speed but was already quick (not many inversion tag snps)
n_sites = gt.shape[0]
n_samples = gt.shape[1]

# create empty array
inv_n_alt = np.empty((n_sites, n_samples), dtype=np.int8)

# for every site
for i in range(n_sites):
# find the index of the correct tag snp allele
tagsnp_index = np.where(alts[i] == inversion_alts[i])[0]

for j in range(n_samples):
# count alleles which == tag snp allele and store
n_tag_alleles = np.sum(gt[i, j] == tagsnp_index[0])
inv_n_alt[i, j] = n_tag_alleles

return inv_n_alt


class AnophelesKaryotypeAnalysis(AnophelesSnpData):
def __init__(
self,
inversion_tag_path: Optional[str] = None,
**kwargs,
):
# N.B., this class is designed to work cooperatively, and
# so it's important that any remaining parameters are passed
# to the superclass constructor.
super().__init__(**kwargs)

self._inversion_tag_path = inversion_tag_path

@check_types
@doc(
summary="Load tag SNPs for a given inversion.",
)
def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
# needs to be modified depending on where we are hosting
import importlib.resources
from .. import resources

if not self._inversion_tag_path:
raise FileNotFoundError(
"The file containing the inversion tags is missing."
)
Comment on lines +58 to +61
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FileNotFoundError isn't quite the right exception class here.

Suggested change
if not self._inversion_tag_path:
raise FileNotFoundError(
"The file containing the inversion tags is missing."
)
if self._inversion_tag_path is None:
raise NotImplementedError(
"No inversion tags are available for this data resource."
)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have been of two minds about this one: on one hand, it is true that we have not generated the tags for Af1 and one could argue that NotImplementedError is a more appropriate error for work that is still to be done; on the other hand, the code itself would (I think) work if the file actually existed and it is thus less an issue of missing code and more an issue of a missing input. Also, I can imagine a situation where someone created tags for an inversion and would want to try to use their local file (it is not currently possible, the path that is used is hard-coded in both Ag3 and Af1) in which case the error would come from the path being incorrect (though, an error message referring to the actual path inputed would be more helpful if we want to offer this option).

else:
with importlib.resources.path(resources, self._inversion_tag_path) as path:
df_tag_snps = pd.read_csv(path, sep=",")
return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()

@check_types
@doc(
summary="Infer karyotype from tag SNPs for a given inversion in Ag.",
jonbrenas marked this conversation as resolved.
Show resolved Hide resolved
)
def karyotype(
self,
inversion: inversion_param,
sample_sets: Optional[base_params.sample_sets] = None,
sample_query: Optional[base_params.sample_query] = None,
sample_query_options: Optional[base_params.sample_query_options] = None,
) -> pd.DataFrame:
# load tag snp data
df_tagsnps = self.load_inversion_tags(inversion=inversion)
inversion_pos = df_tagsnps["position"]
inversion_alts = df_tagsnps["alt_allele"]
contig = inversion[0:2]

# get snp calls for inversion region
start, end = np.min(inversion_pos), np.max(inversion_pos)
region = f"{contig}:{start}-{end}"

ds_snps = self.snp_calls(
region=region,
sample_sets=sample_sets,
sample_query=sample_query,
sample_query_options=sample_query_options,
)

with self._spinner("Inferring karyotype from tag SNPs"):
# access variables we need
geno = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
pos = allel.SortedIndex(ds_snps["variant_position"].values)
samples = ds_snps["sample_id"].values
alts = ds_snps["variant_allele"].values.astype(str)

# subset to position of inversion tags
mask = pos.locate_intersection(inversion_pos)[0]
alts = alts[mask]
geno = geno.compress(mask, axis=0).compute()

# infer karyotype
gn_alt = _karyotype_tags_n_alt(
gt=geno, alts=alts, inversion_alts=inversion_alts
)
is_called = geno.is_called()

# calculate mean genotype for each sample whilst masking missing calls
av_gts = np.mean(np.ma.MaskedArray(gn_alt, mask=~is_called), axis=0)
total_sites = np.sum(is_called, axis=0)

df = pd.DataFrame(
{
"sample_id": samples,
"inversion": inversion,
f"karyotype_{inversion}_mean": av_gts,
# round the genotypes then convert to int
f"karyotype_{inversion}": av_gts.round().astype(int),
"total_tag_snps": total_sites,
},
)
# Allow filling missing values with "<NA>" visible placeholder.
kt_dtype = CategoricalDtype(categories=[0, 1, 2, "<NA>"], ordered=True)
df[f"karyotype_{inversion}"] = df[f"karyotype_{inversion}"].astype(kt_dtype)

return df
9 changes: 9 additions & 0 deletions malariagen_data/anoph/karyotype_params.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Parameter definitions for karyotype analysis functions."""


from typing_extensions import Annotated, TypeAlias

inversion_param: TypeAlias = Annotated[
str,
"Name of inversion to infer karyotype for.",
]
4 changes: 4 additions & 0 deletions malariagen_data/anopheles.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
plotly_params,
xpehh_params,
)
from .anoph.karyotype import AnophelesKaryotypeAnalysis
from .anoph.aim_data import AnophelesAimData
from .anoph.base import AnophelesBase
from .anoph.cnv_data import AnophelesCnvData
Expand Down Expand Up @@ -94,6 +95,7 @@ class AnophelesDataResource(
AnophelesPca,
PlinkConverter,
AnophelesIgv,
AnophelesKaryotypeAnalysis,
AnophelesAimData,
AnophelesHapData,
AnophelesSnpData,
Expand Down Expand Up @@ -138,6 +140,7 @@ def __init__(
taxon_colors: Optional[Mapping[str, str]],
virtual_contigs: Optional[Mapping[str, Sequence[str]]],
gene_names: Optional[Mapping[str, str]],
inversion_tag_path: Optional[str],
):
super().__init__(
url=url,
Expand Down Expand Up @@ -171,6 +174,7 @@ def __init__(
taxon_colors=taxon_colors,
virtual_contigs=virtual_contigs,
gene_names=gene_names,
inversion_tag_path=inversion_tag_path,
)

@property
Expand Down
21 changes: 0 additions & 21 deletions malariagen_data/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -1591,27 +1591,6 @@ def distributed_client():
return client


def _karyotype_tags_n_alt(gt, alts, inversion_alts):
# could be Numba'd for speed but was already quick (not many inversion tag snps)
n_sites = gt.shape[0]
n_samples = gt.shape[1]

# create empty array
inv_n_alt = np.empty((n_sites, n_samples), dtype=np.int8)

# for every site
for i in range(n_sites):
# find the index of the correct tag snp allele
tagsnp_index = np.where(alts[i] == inversion_alts[i])[0]

for j in range(n_samples):
# count alleles which == tag snp allele and store
n_tag_alleles = np.sum(gt[i, j] == tagsnp_index[0])
inv_n_alt[i, j] = n_tag_alleles

return inv_n_alt


def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
# Take a copy, as we will modify the dataframe.
df_samples = df_samples.copy()
Expand Down
10 changes: 1 addition & 9 deletions notebooks/karyotype.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -359,14 +359,6 @@
"source": [
"ag3.plot_pca_coords(pca_df_2rc_col, color=\"karyotype_2Rc_col\", symbol=\"country\", width=600, height=500)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6fb7237-bb4e-490e-9ae0-33a52d4fa650",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -391,7 +383,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.10.11"
}
},
"nbformat": 4,
Expand Down
15 changes: 15 additions & 0 deletions tests/integration/test_af1.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,18 @@ def test_locate_region(region_raw):
assert region == Region("2RL", 48714463, 48715355)
if region_raw == "2RL:24,630,355-24,633,221":
assert region == Region("2RL", 24630355, 24633221)


@pytest.mark.parametrize(
"inversion",
["2La", "2Rb", "2Rc_col", "X_x"],
)
def test_karyotyping(inversion):
af1 = setup_af1()

with pytest.raises(FileNotFoundError):
af1.karyotype(
inversion=inversion,
sample_sets="1229-VO-GH-DADZIE-VMF00095",
sample_query=None,
)
Loading
Loading