From 72475336be834b2285e6b86265882f8419c919e8 Mon Sep 17 00:00:00 2001 From: Jonas Fuchs <78491186+jonas-fuchs@users.noreply.github.com> Date: Fri, 3 Nov 2023 14:06:03 +0100 Subject: [PATCH] Apobec signature (#8) * added def to check for apobec signature * updated readme --- README.md | 3 ++- bamdash/__init__.py | 2 +- bamdash/scripts/data.py | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3d56c58..df9521a 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,8 @@ BAMdash automatically computes serveral statistics: - for each track it computes recovery and mean coverage (set `-c` for the min coverage) for each element in the track - if a `*.vcf` is provided it annotates `TRANSITION`/`TRANSVERSION` and type of exchange (`SNP`, `DEL`, `INS`) -If a `*.gb`and `*.vcf` is provided BAMdash computes the aminoacid exchange and the effect in the CDS (inspired by but not as powerful as [snpeff](http://pcingola.github.io/SnpEff/snpeff)). SNP and INDEL vcf annotation supports: +If a `*.gb`and `*.vcf` is provided BAMdash computes if the mutations could have been caused by APOBEC deamination. +Moreover, it annotates the aminoacid exchange and the effect in the CDS (inspired by but not as powerful as [snpeff](http://pcingola.github.io/SnpEff/snpeff)). SNP and INDEL vcf annotation supports: - `START_LOST`: INDEL or SNP start at the CDS and result in a start loss - `STOP_LOST`: INDEL or SNP result in the loss of the stop codon diff --git a/bamdash/__init__.py b/bamdash/__init__.py index 7cea933..7feb1f3 100644 --- a/bamdash/__init__.py +++ b/bamdash/__init__.py @@ -1,3 +1,3 @@ """interactively visualize coverage and tracks""" _program = "bamdash" -__version__ = "0.2" +__version__ = "0.2.1" diff --git a/bamdash/scripts/data.py b/bamdash/scripts/data.py index 7493c0d..6d6d917 100644 --- a/bamdash/scripts/data.py +++ b/bamdash/scripts/data.py @@ -394,6 +394,40 @@ def get_mutations(start, stop, cds, variant, seq): return ac_exchange, ac_effect +def analyse_a3_signature(variant, seq): + """ + analyse if the mutation lies in a APOBEC3 recognition site + (AC) or (GT) and if it is a C>T or G>A (reverse complement) + exchange + + :param variant: slice of the variant df + :param seq: sequence of the ref + + :return: recognition site or NONE + """ + + if variant["type"] == "SNP": + pos = variant["position"] + # for C ref check the prior nt + if variant["reference"] == "C": + site = str(seq[pos-2:pos]) + # check if the site and mutation type is in line with A3A activity + if site == "TC" and variant["mutation"] == "T": + return "YES", f"{site[0]}>{site[1]}<" + else: + return "NO", f"{site[0]}>{site[1]}<" + # for G ref check the following nt + if variant["reference"] == "G": + site = str(seq[pos-1:pos+1]) + # check if the site and mutation type is in line with A3A activity + if site == "GA" and variant["mutation"] == "A": + return "YES", f">{site[0]}<{site[1]}" + else: + return "NO", f">{site[0]}<{site[1]}" + + return "NO", "-" + + def annotate_vcf_df(vcf_df, cds_dict, seq): """ annotate mutations for their aminoacid effect @@ -467,6 +501,13 @@ def annotate_vcfs_in_tracks(track_data): return track_data # annotate each vcf df for vcf_track in index_positions[1]: + # analyse for potential a3a activity + a3_signatures, sites = [], [] + for variant in track_data[vcf_track][0].iterrows(): + a3_result = analyse_a3_signature(variant[1], track_data[gb_indices[0]][2]) + a3_signatures.append(a3_result[0]), sites.append(a3_result[1]) + track_data[vcf_track][0]["potential APOBEC3 activity"] = a3_signatures + track_data[vcf_track][0]["checked site"] = sites # check if CDS is present if "CDS" not in track_data[gb_indices[0]][0]: continue