From 72475336be834b2285e6b86265882f8419c919e8 Mon Sep 17 00:00:00 2001
From: Jonas Fuchs <78491186+jonas-fuchs@users.noreply.github.com>
Date: Fri, 3 Nov 2023 14:06:03 +0100
Subject: [PATCH] Apobec signature (#8)

* added def to check for apobec signature

* updated readme
---
 README.md               |  3 ++-
 bamdash/__init__.py     |  2 +-
 bamdash/scripts/data.py | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 3d56c58..df9521a 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,8 @@ BAMdash automatically computes serveral statistics:
 - for each track it computes recovery and mean coverage (set `-c` for the min coverage) for each element in the track
 - if a `*.vcf` is provided it annotates `TRANSITION`/`TRANSVERSION` and type of exchange (`SNP`, `DEL`, `INS`)
 
-If a `*.gb`and `*.vcf` is provided BAMdash computes the aminoacid exchange and the effect in the CDS (inspired by but not as powerful as [snpeff](http://pcingola.github.io/SnpEff/snpeff)). SNP and INDEL vcf annotation supports:
+If a `*.gb`and `*.vcf` is provided BAMdash computes if the mutations could have been caused by APOBEC deamination. 
+Moreover, it annotates the aminoacid exchange and the effect in the CDS (inspired by but not as powerful as [snpeff](http://pcingola.github.io/SnpEff/snpeff)). SNP and INDEL vcf annotation supports:
 
 - `START_LOST`: INDEL or SNP start at the CDS and result in a start loss
 - `STOP_LOST`: INDEL or SNP result in the loss of the stop codon
diff --git a/bamdash/__init__.py b/bamdash/__init__.py
index 7cea933..7feb1f3 100644
--- a/bamdash/__init__.py
+++ b/bamdash/__init__.py
@@ -1,3 +1,3 @@
 """interactively visualize coverage and tracks"""
 _program = "bamdash"
-__version__ = "0.2"
+__version__ = "0.2.1"
diff --git a/bamdash/scripts/data.py b/bamdash/scripts/data.py
index 7493c0d..6d6d917 100644
--- a/bamdash/scripts/data.py
+++ b/bamdash/scripts/data.py
@@ -394,6 +394,40 @@ def get_mutations(start, stop, cds, variant, seq):
     return ac_exchange, ac_effect
 
 
+def analyse_a3_signature(variant, seq):
+    """
+    analyse if the mutation lies in a APOBEC3 recognition site
+    (AC) or (GT) and if it is a C>T or G>A (reverse complement)
+    exchange
+
+    :param variant: slice of the variant df
+    :param seq: sequence of the ref
+
+    :return: recognition site or NONE
+    """
+
+    if variant["type"] == "SNP":
+        pos = variant["position"]
+        # for C ref check the prior nt
+        if variant["reference"] == "C":
+            site = str(seq[pos-2:pos])
+            # check if the site and mutation type is in line with A3A activity
+            if site == "TC" and variant["mutation"] == "T":
+                return "YES", f"{site[0]}>{site[1]}<"
+            else:
+                return "NO", f"{site[0]}>{site[1]}<"
+        # for G ref check the following nt
+        if variant["reference"] == "G":
+            site = str(seq[pos-1:pos+1])
+            # check if the site and mutation type is in line with A3A activity
+            if site == "GA" and variant["mutation"] == "A":
+                return "YES", f">{site[0]}<{site[1]}"
+            else:
+                return "NO", f">{site[0]}<{site[1]}"
+
+    return "NO", "-"
+
+
 def annotate_vcf_df(vcf_df, cds_dict, seq):
     """
     annotate mutations for their aminoacid effect
@@ -467,6 +501,13 @@ def annotate_vcfs_in_tracks(track_data):
             return track_data
             # annotate each vcf df
         for vcf_track in index_positions[1]:
+            # analyse for potential a3a activity
+            a3_signatures, sites = [], []
+            for variant in track_data[vcf_track][0].iterrows():
+                a3_result = analyse_a3_signature(variant[1], track_data[gb_indices[0]][2])
+                a3_signatures.append(a3_result[0]), sites.append(a3_result[1])
+            track_data[vcf_track][0]["potential APOBEC3 activity"] = a3_signatures
+            track_data[vcf_track][0]["checked site"] = sites
             # check if CDS is present
             if "CDS" not in track_data[gb_indices[0]][0]:
                 continue