Skip to content

Commit

Permalink
Further tweaks (#21)
Browse files Browse the repository at this point in the history
* fixed alignment bug, tweaked config, fixed plot warnings

* changed kmer list to tuple to speed up kmer digestion

* checks now blast installation for qpcr mode

* adjusted best primer selection

* fixed unwanted sorting behaviour

* updated docs

* reworked output and dimer search

* docstring update

* adjusted cutoff for t=1
  • Loading branch information
jonas-fuchs authored Oct 17, 2023
1 parent 6354c64 commit 3550def
Show file tree
Hide file tree
Showing 10 changed files with 128 additions and 112 deletions.
2 changes: 1 addition & 1 deletion docs/how_varvamp_works.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ varVAMP searches for potential primer regions as defined by a user-defined numbe
varVAMP uses [`primer3-py`](https://pypi.org/project/primer3-py/) to search for potential primers. Some of the evaluation process, determining if primers match certain criteria, was adapted from [`primalscheme`](www.github.com/aresti/primalscheme). The primer search contains multiple steps:
1. Digest the primer regions into kmers with the min and max length of primers. This is performed on a consensus sequence that does not contain ambiguous characters but is just the majority consensus of the alignment. Therefore, primer parameters will be later calculated for the best fitting primer.
2. Evaluate if these kmers are potential primers independent of their orientation (temperature, GC, size, poly-x repeats and poly dinucleotide repeats) and dependent on their orientation (secondary structure, GC clamp, number of GCs in the last 5 bases of the 3' end and min 3' nucleotides without an ambiguous base). Filter for kmers that satisfy all constraints and calculate their penalties (explained in the last section).
3. Sanger and tiled mode: Find lowest scoring primer. varVAMP sorts the primers by their score and always takes the best scoring if the primer positions have not been covered by a better primer. This greatly reduces the complexity of the later amplicon search while only retaining the best scoring primer of a set of overlapping primers.
3. Sanger and tiled mode: Find lowest scoring primer. varVAMP sorts the primers by their score and always takes the best scoring if middle third of the primer has not been covered by a better primer. This greatly reduces the complexity of the later amplicon search while only retaining the best scoring primer of a set of overlapping primers.

### Amplicon search

Expand Down
2 changes: 1 addition & 1 deletion varvamp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Tool to design amplicons for highly variable virusgenomes"""
_program = "varvamp"
__version__ = "0.9.3"
__version__ = "0.9.4"
14 changes: 7 additions & 7 deletions varvamp/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def sanger_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_
logging.varvamp_progress(
log_file,
progress=0.7,
job="Considering non-overlapping low scoring primers.",
job="Considering low scoring primers.",
progress_text=f"{len(all_primers['+'])} fw and {len(all_primers['-'])} rw primers"
)

Expand Down Expand Up @@ -345,7 +345,7 @@ def sanger_workflow(args, amplicons, all_primers, log_file):
return amplicon_scheme


def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candidates, all_primers, ambiguous_consensus, log_file):
def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candidates, all_primers, ambiguous_consensus, log_file, results_dir):
"""
part of the workflow specific for the tiled mode
"""
Expand All @@ -371,7 +371,7 @@ def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candida
f"varVAMP found {len(dimers_not_solved)} primer dimers without replacements. Check the dimer file and perform the PCR for incomaptible amplicons in a sperate reaction.",
log_file
)
reporting.write_dimers(dir, dimers_not_solved)
reporting.write_dimers(results_dir, dimers_not_solved)

# evaluate coverage
percent_coverage = round(coverage/len(ambiguous_consensus)*100, 2)
Expand Down Expand Up @@ -491,9 +491,8 @@ def main(sysargs=sys.argv[1:]):
start_time = datetime.datetime.now()
results_dir, data_dir, log_file = logging.create_dir_structure(args.input[1])
# check if blast is installed
if args.mode == "tiled" or args.mode == "sanger":
if args.database is not None:
blast.check_BLAST_installation(log_file)
if args.database is not None:
blast.check_BLAST_installation(log_file)

# mode unspecific part of the workflow
alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates = shared_workflow(args, log_file)
Expand Down Expand Up @@ -528,7 +527,8 @@ def main(sysargs=sys.argv[1:]):
right_primer_candidates,
all_primers,
ambiguous_consensus,
log_file
log_file,
results_dir
)
if args.database is not None:
blast.write_BLAST_warning(off_target_amplicons, amplicon_scheme, log_file)
Expand Down
4 changes: 2 additions & 2 deletions varvamp/scripts/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def clean_gaps(alignment, gaps_to_mask):
stop = region[0]
masked_seq_temp = sequence[1][start:stop]
# check if the deletion is at the start
if len(masked_seq_temp) == 0:
if start == 0 and len(masked_seq_temp) == 0:
masked_seq = mask
# check if deletion is not at start
elif start == 0 and len(masked_seq_temp) != 0:
Expand All @@ -181,7 +181,7 @@ def clean_gaps(alignment, gaps_to_mask):
masked_seq = masked_seq + mask + masked_seq_temp
start = region[1]+1
if max(gaps_to_mask)[1] < len(sequence[1])-1:
# append the last gaps if it is not
# append the last seq if no gap is at
# the end of the sequence
start = max(gaps_to_mask)[1]
stop = len(sequence[1])-1
Expand Down
4 changes: 2 additions & 2 deletions varvamp/scripts/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# CAN BE CHANGED, DO NOT DELETE

# basic primer parameters
PRIMER_TMP = (57, 63, 60) # melting temperatur (min, max, opt)
PRIMER_TMP = (56, 63, 60) # melting temperatur (min, max, opt)
PRIMER_GC_RANGE = (35, 65, 50) # gc (min, max, opt)
PRIMER_SIZES = (18, 24, 21) # size (min, max, opt)
PRIMER_MAX_POLYX = 3 # max number of polyx repeats
Expand Down Expand Up @@ -40,7 +40,7 @@
PRIMER_TM_PENALTY = 2 # temperature penalty
PRIMER_GC_PENALTY = 0.2 # gc penalty
PRIMER_SIZE_PENALTY = 0.5 # size penalty
PRIMER_MAX_BASE_PENALTY = 8 # max base penalty for a primer
PRIMER_MAX_BASE_PENALTY = 10 # max base penalty for a primer
PRIMER_3_PENALTY = (32, 16, 8, 4, 2) # penalties for 3' mismatches
PRIMER_PERMUTATION_PENALTY = 0.1 # penalty for the number of permutations

Expand Down
26 changes: 20 additions & 6 deletions varvamp/scripts/primers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# LIBS
from Bio.Seq import Seq
import primer3 as p3
import math

# varVAMP
from varvamp.scripts import config
Expand Down Expand Up @@ -129,7 +130,7 @@ def rev_complement(seq):
"""
reverse complement a sequence
"""
return(str(Seq(seq).reverse_complement()))
return str(Seq(seq).reverse_complement())


def calc_permutation_penalty(amb_seq):
Expand Down Expand Up @@ -367,21 +368,34 @@ def find_best_primers(left_primer_candidates, right_primer_candidates):
Primer candidates are likely overlapping. Here, the list of primers
is sorted for the best to worst scoring. Then, the next best scoring
is retained if it does not have any nucleotides that have already
been covered by a better scoring primer candidate. This significantly
reduces the amount of primers while retaining the best scoring ones.
been covered by the middle third of a better scoring primer candidate.
This significantly reduces the amount of primers while retaining
the best scoring ones.
Example:
-------- (score 1) 1
------------- (score 1) 2
------------ (score 0.8) 3
----------(score 0.9) 4
--> primer 3 would be retained and primer 2 excluded, primer 4 and 1
will however be considered in the next set of overlapping primers.
"""
all_primers = {}

for direction, primer_candidates in [("+", left_primer_candidates), ("-", right_primer_candidates)]:
# sort the primers for the best scoring
primer_candidates.sort(key=lambda x: x[3])
# sort the primers for the best scoring, and if same score by start
primer_candidates.sort(key=lambda x: (x[3], x[1]))
# ini everything with the top scoring primer
to_retain = [primer_candidates[0]]
primer_ranges = list(range(primer_candidates[0][1], primer_candidates[0][2]+1))
primer_set = set(primer_ranges)

for primer in primer_candidates:
primer_positions = list(range(primer[1], primer[2]+1))
# get the thirds of the primer, only consider the middle
thirds_len = int((primer[2] - primer[1])/3)
primer_positions = list(range(primer[1] + thirds_len, primer[2] - thirds_len + 1))
# check if none of the nucleotides of the next primer
# are already covered by a better primer
if not any(x in primer_positions for x in primer_set):
Expand Down
4 changes: 2 additions & 2 deletions varvamp/scripts/qpcr.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ def flanking_primer_subset(primer_list, primer_direction, probe):
if window_start < primer[1] and primer[2] < window_stop:
subset.append(primer)

# sort by score
subset.sort(key=lambda x: x[3])
# sort by score and start if same score
subset.sort(key=lambda x: (x[3], x[1]))

return subset

Expand Down
8 changes: 3 additions & 5 deletions varvamp/scripts/regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,14 @@ def digest_seq(seq, kmer_size):
"""
digest the sequence into kmers
"""
return[[seq[i:i+kmer_size], i, i+len(seq[i:i+kmer_size])] for i in range(len(seq)-kmer_size+1)]
return [[seq[i:i+kmer_size], i, i+len(seq[i:i+kmer_size])] for i in range(len(seq)-kmer_size+1)]


def produce_kmers(primer_regions, consensus, sizes=config.PRIMER_SIZES):
"""
produce kmers for all primer regions
"""
kmers = []
kmers = set()

for region in primer_regions:
sliced_seq = consensus[region[0]:region[1]]
Expand All @@ -87,8 +87,6 @@ def produce_kmers(primer_regions, consensus, sizes=config.PRIMER_SIZES):
kmer_temp[1] = kmer_temp[1]+region[0]
kmer_temp[2] = kmer_temp[2]+region[0]
# check if kmer is already in list (overlapping regions)
if kmer_temp in kmers:
continue
kmers.append(kmer_temp)
kmers.add(tuple(kmer_temp))

return kmers
Loading

0 comments on commit 3550def

Please sign in to comment.