Further tweaks (#21)

* fixed alignment bug, tweaked config, fixed plot warnings * changed kmer list to tuple to speed up kmer digestion * checks now blast installation for qpcr mode * adjusted best primer selection * fixed unwanted sorting behaviour * updated docs * reworked output and dimer search * docstring update * adjusted cutoff for t=1
jonas-fuchs · Oct 17, 2023 · 3550def · 3550def
1 parent 6354c64
commit 3550def
Show file tree

Hide file tree

Showing 10 changed files with 128 additions and 112 deletions.
diff --git a/docs/how_varvamp_works.md b/docs/how_varvamp_works.md
@@ -23,7 +23,7 @@ varVAMP searches for potential primer regions as defined by a user-defined numbe
 varVAMP uses [`primer3-py`](https://pypi.org/project/primer3-py/) to search for potential primers. Some of the evaluation process, determining if primers match certain criteria, was adapted from [`primalscheme`](www.github.com/aresti/primalscheme). The primer search contains multiple steps:
 1. Digest the primer regions into kmers with the min and max length of primers. This is performed on a consensus sequence that does not contain ambiguous characters but is just the majority consensus of the alignment. Therefore, primer parameters will be later calculated for the best fitting primer.
 2. Evaluate if these kmers are potential primers independent of their orientation (temperature, GC, size, poly-x repeats and poly dinucleotide repeats) and dependent on their orientation (secondary structure, GC clamp, number of GCs in the last 5 bases of the 3' end and min 3' nucleotides without an ambiguous base). Filter for kmers that satisfy all constraints and calculate their penalties (explained in the last section).
-3. Sanger and tiled mode: Find lowest scoring primer. varVAMP sorts the primers by their score and always takes the best scoring if the primer positions have not been covered by a better primer. This greatly reduces the complexity of the later amplicon search while only retaining the best scoring primer of a set of overlapping primers.
+3. Sanger and tiled mode: Find lowest scoring primer. varVAMP sorts the primers by their score and always takes the best scoring if middle third of the primer has not been covered by a better primer. This greatly reduces the complexity of the later amplicon search while only retaining the best scoring primer of a set of overlapping primers.
 
 ### Amplicon search
 

diff --git a/varvamp/__init__.py b/varvamp/__init__.py
@@ -1,3 +1,3 @@
 """Tool to design amplicons for highly variable virusgenomes"""
 _program = "varvamp"
-__version__ = "0.9.3"
+__version__ = "0.9.4"
diff --git a/varvamp/command.py b/varvamp/command.py
@@ -285,7 +285,7 @@ def sanger_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_
     logging.varvamp_progress(
         log_file,
         progress=0.7,
-        job="Considering non-overlapping low scoring primers.",
+        job="Considering low scoring primers.",
         progress_text=f"{len(all_primers['+'])} fw and {len(all_primers['-'])} rw primers"
     )
 
@@ -345,7 +345,7 @@ def sanger_workflow(args, amplicons, all_primers, log_file):
     return amplicon_scheme
 
 
-def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candidates, all_primers, ambiguous_consensus, log_file):
+def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candidates, all_primers, ambiguous_consensus, log_file, results_dir):
     """
     part of the workflow specific for the tiled mode
     """
@@ -371,7 +371,7 @@ def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candida
             f"varVAMP found {len(dimers_not_solved)} primer dimers without replacements. Check the dimer file and perform the PCR for incomaptible amplicons in a sperate reaction.",
             log_file
         )
-        reporting.write_dimers(dir, dimers_not_solved)
+        reporting.write_dimers(results_dir, dimers_not_solved)
 
     # evaluate coverage
     percent_coverage = round(coverage/len(ambiguous_consensus)*100, 2)
@@ -491,9 +491,8 @@ def main(sysargs=sys.argv[1:]):
     start_time = datetime.datetime.now()
     results_dir, data_dir, log_file = logging.create_dir_structure(args.input[1])
     # check if blast is installed
-    if args.mode == "tiled" or args.mode == "sanger":
-        if args.database is not None:
-            blast.check_BLAST_installation(log_file)
+    if args.database is not None:
+        blast.check_BLAST_installation(log_file)
 
     # mode unspecific part of the workflow
     alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates = shared_workflow(args, log_file)
@@ -528,7 +527,8 @@ def main(sysargs=sys.argv[1:]):
                 right_primer_candidates,
                 all_primers,
                 ambiguous_consensus,
-                log_file
+                log_file,
+                results_dir
             )
         if args.database is not None:
             blast.write_BLAST_warning(off_target_amplicons, amplicon_scheme, log_file)

diff --git a/varvamp/scripts/alignment.py b/varvamp/scripts/alignment.py
@@ -171,7 +171,7 @@ def clean_gaps(alignment, gaps_to_mask):
             stop = region[0]
             masked_seq_temp = sequence[1][start:stop]
             # check if the deletion is at the start
-            if len(masked_seq_temp) == 0:
+            if start == 0 and len(masked_seq_temp) == 0:
                 masked_seq = mask
             # check if deletion is not at start
             elif start == 0 and len(masked_seq_temp) != 0:
@@ -181,7 +181,7 @@ def clean_gaps(alignment, gaps_to_mask):
                 masked_seq = masked_seq + mask + masked_seq_temp
             start = region[1]+1
         if max(gaps_to_mask)[1] < len(sequence[1])-1:
-            # append the last gaps if it is not
+            # append the last seq if no gap is at
             # the end of the sequence
             start = max(gaps_to_mask)[1]
             stop = len(sequence[1])-1

diff --git a/varvamp/scripts/config.py b/varvamp/scripts/config.py
@@ -5,7 +5,7 @@
 # CAN BE CHANGED, DO NOT DELETE
 
 # basic primer parameters
-PRIMER_TMP = (57, 63, 60)  # melting temperatur (min, max, opt)
+PRIMER_TMP = (56, 63, 60)  # melting temperatur (min, max, opt)
 PRIMER_GC_RANGE = (35, 65, 50)  # gc (min, max, opt)
 PRIMER_SIZES = (18, 24, 21)  # size (min, max, opt)
 PRIMER_MAX_POLYX = 3  # max number of polyx repeats
@@ -40,7 +40,7 @@
 PRIMER_TM_PENALTY = 2  # temperature penalty
 PRIMER_GC_PENALTY = 0.2  # gc penalty
 PRIMER_SIZE_PENALTY = 0.5  # size penalty
-PRIMER_MAX_BASE_PENALTY = 8  # max base penalty for a primer
+PRIMER_MAX_BASE_PENALTY = 10  # max base penalty for a primer
 PRIMER_3_PENALTY = (32, 16, 8, 4, 2)  # penalties for 3' mismatches
 PRIMER_PERMUTATION_PENALTY = 0.1  # penalty for the number of permutations
 

diff --git a/varvamp/scripts/primers.py b/varvamp/scripts/primers.py
@@ -5,6 +5,7 @@
 # LIBS
 from Bio.Seq import Seq
 import primer3 as p3
+import math
 
 # varVAMP
 from varvamp.scripts import config
@@ -129,7 +130,7 @@ def rev_complement(seq):
     """
     reverse complement a sequence
     """
-    return(str(Seq(seq).reverse_complement()))
+    return str(Seq(seq).reverse_complement())
 
 
 def calc_permutation_penalty(amb_seq):
@@ -367,21 +368,34 @@ def find_best_primers(left_primer_candidates, right_primer_candidates):
     Primer candidates are likely overlapping. Here, the list of primers
     is sorted for the best to worst scoring. Then, the next best scoring
     is retained if it does not have any nucleotides that have already
-    been covered by a better scoring primer candidate. This significantly
-    reduces the amount of primers while retaining the best scoring ones.
+    been covered by the middle third of a better scoring primer candidate.
+    This significantly reduces the amount of primers while retaining
+    the best scoring ones.
+
+    Example:
+    -------- (score 1) 1
+        ------------- (score 1) 2
+            ------------ (score 0.8) 3
+                      ----------(score 0.9) 4
+
+    --> primer 3 would be retained and primer 2 excluded, primer 4 and 1
+    will however be considered in the next set of overlapping primers.
+
     """
     all_primers = {}
 
     for direction, primer_candidates in [("+", left_primer_candidates), ("-", right_primer_candidates)]:
-        # sort the primers for the best scoring
-        primer_candidates.sort(key=lambda x: x[3])
+        # sort the primers for the best scoring, and if same score by start
+        primer_candidates.sort(key=lambda x: (x[3], x[1]))
         # ini everything with the top scoring primer
         to_retain = [primer_candidates[0]]
         primer_ranges = list(range(primer_candidates[0][1], primer_candidates[0][2]+1))
         primer_set = set(primer_ranges)
 
         for primer in primer_candidates:
-            primer_positions = list(range(primer[1], primer[2]+1))
+            # get the thirds of the primer, only consider the middle
+            thirds_len = int((primer[2] - primer[1])/3)
+            primer_positions = list(range(primer[1] + thirds_len, primer[2] - thirds_len + 1))
             # check if none of the nucleotides of the next primer
             # are already covered by a better primer
             if not any(x in primer_positions for x in primer_set):

diff --git a/varvamp/scripts/qpcr.py b/varvamp/scripts/qpcr.py
@@ -111,8 +111,8 @@ def flanking_primer_subset(primer_list, primer_direction, probe):
         if window_start < primer[1] and primer[2] < window_stop:
             subset.append(primer)
 
-    # sort by score
-    subset.sort(key=lambda x: x[3])
+    # sort by score and start if same score
+    subset.sort(key=lambda x: (x[3], x[1]))
 
     return subset
 

diff --git a/varvamp/scripts/regions.py b/varvamp/scripts/regions.py
@@ -69,14 +69,14 @@ def digest_seq(seq, kmer_size):
     """
     digest the sequence into kmers
     """
-    return[[seq[i:i+kmer_size], i, i+len(seq[i:i+kmer_size])] for i in range(len(seq)-kmer_size+1)]
+    return [[seq[i:i+kmer_size], i, i+len(seq[i:i+kmer_size])] for i in range(len(seq)-kmer_size+1)]
 
 
 def produce_kmers(primer_regions, consensus, sizes=config.PRIMER_SIZES):
     """
     produce kmers for all primer regions
     """
-    kmers = []
+    kmers = set()
 
     for region in primer_regions:
         sliced_seq = consensus[region[0]:region[1]]
@@ -87,8 +87,6 @@ def produce_kmers(primer_regions, consensus, sizes=config.PRIMER_SIZES):
                 kmer_temp[1] = kmer_temp[1]+region[0]
                 kmer_temp[2] = kmer_temp[2]+region[0]
                 # check if kmer is already in list (overlapping regions)
-                if kmer_temp in kmers:
-                    continue
-                kmers.append(kmer_temp)
+                kmers.add(tuple(kmer_temp))
 
     return kmers