Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow for left/right primer hits to overlap when building primer pair hits with OverlapDetector #102

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
4 changes: 2 additions & 2 deletions prymer/api/picking.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,13 @@ def build_primer_pairs( # noqa: C901

# If the right primer isn't "to the right" of the left primer, move on
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes were from linting.

if rp.span.start < lp.span.start or lp.span.end > rp.span.end:
first_right_primer_idx = max(first_right_primer_idx, j+1)
first_right_primer_idx = max(first_right_primer_idx, j + 1)
continue

amp_span = PrimerPair.calculate_amplicon_span(lp, rp)

if amp_span.length < amplicon_sizes.min:
first_right_primer_idx = max(first_right_primer_idx, j+1)
first_right_primer_idx = max(first_right_primer_idx, j + 1)
continue

if amp_span.length > amplicon_sizes.max:
Expand Down
66 changes: 54 additions & 12 deletions prymer/offtarget/offtarget_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,9 @@ def __init__( # noqa: C901
max_gap_opens: int = 0,
max_gap_extends: int = -1,
max_amplicon_size: int = 1000,
min_amplicon_size: int = 1,
min_primer_pair_hits: int = 1,
allow_overlapping_hits: bool = False,
cache_results: bool = True,
threads: Optional[int] = None,
keep_spans: bool = True,
Expand All @@ -190,7 +192,8 @@ def __init__( # noqa: C901
and `max_primer_hits`.
2. Filtering of individual primers: `max_primer_hits`.
3. Checking of primer pairs: `max_primer_hits`, `min_primer_pair_hits`,
`max_primer_pair_hits`, and `max_amplicon_size`.
`max_primer_pair_hits`, `allow_overlapping_hits`, `min_amplicon_size, and
`max_amplicon_size`.

The `three_prime_region_length` parameter is used as the seed length for `bwa aln`.

Expand Down Expand Up @@ -225,6 +228,13 @@ def __init__( # noqa: C901
"long gaps". Must be greater than or equal to -1.
max_amplicon_size: the maximum amplicon size to consider amplifiable. Must be greater
than 0.
min_amplicon_size: the minimum amplicon size to consider amplifiable. Must be between 1
and `max_amplicon_size` inclusive. If `allow_overlapping_hits` is False, primer
pair hits that overlap each other will not be considered, even if they would create
an amplicon of at least `min_amplicon_size`. Default 1bp.
allow_overlapping_hits: if True, allow hits to the individual primers in a primer pair
to overlap with each other. If False, no overlap will be allowed, even if
`min_amplicon_size` would otherwise allow it. Default False.
cache_results: if True, cache results for faster re-querying
threads: the number of threads to use when invoking bwa
keep_spans: if True, [[OffTargetResult]] objects will be reported with amplicon spans
Expand All @@ -235,6 +245,8 @@ def __init__( # noqa: C901

Raises:
ValueError: If `max_amplicon_size` is not greater than 0.
ValueError: If `min_amplicon_size` is outside the range 1 to `max_amplicon_size`,
inclusive.
ValueError: If any of `max_primer_hits`, `max_primer_pair_hits`, or
`min_primer_pair_hits` are not greater than or equal to 0.
ValueError: If `three_prime_region_length` is not greater than or equal to 8.
Expand All @@ -247,6 +259,11 @@ def __init__( # noqa: C901
errors: list[str] = []
if max_amplicon_size < 1:
errors.append(f"'max_amplicon_size' must be greater than 0. Saw {max_amplicon_size}")
if min_amplicon_size < 1 or min_amplicon_size > max_amplicon_size:
errors.append(
f"'min_amplicon_size' must be between 1 and 'max_amplicon_size'={max_amplicon_size}"
f" inclusive. Saw {min_amplicon_size}"
)
if max_primer_hits < 0:
errors.append(
f"'max_primer_hits' must be greater than or equal to 0. Saw {max_primer_hits}"
Expand Down Expand Up @@ -310,6 +327,8 @@ def __init__( # noqa: C901
self._max_primer_pair_hits: int = max_primer_pair_hits
self._min_primer_pair_hits: int = min_primer_pair_hits
self._max_amplicon_size: int = max_amplicon_size
self._min_amplicon_size: int = min_amplicon_size
self._allow_overlapping_hits: bool = allow_overlapping_hits
self._cache_results: bool = cache_results
self._keep_spans: bool = keep_spans
self._keep_primer_spans: bool = keep_primer_spans
Expand Down Expand Up @@ -357,7 +376,11 @@ def check_all(self, primer_pairs: list[PrimerPair]) -> dict[PrimerPair, OffTarge

This method maps each primer to the specified reference with `bwa aln` to search for
off-target hits. Possible amplicons are identified from the pairwise combinations of these
alignments, up to the specified `max_amplicon_size`.
alignments.

If `allow_overlapping_hits=True`, amplicons in the size range `min_amplicon_size` to
`max_amplicon_size` will be returned. If `allow_overlapping_hits=False`, the size range will
still apply, but will exclude amplicons with overlapping hits.

Primer pairs are marked as passing if both of the following are true:
1. Each primer has no more than `max_primer_hits` alignments to the genome.
Expand Down Expand Up @@ -461,15 +484,19 @@ def _build_off_target_result(
self._to_amplicons(
positive_hits=left_positive_hits[refname],
negative_hits=right_negative_hits[refname],
min_len=self._min_amplicon_size,
max_len=self._max_amplicon_size,
allow_overlapping_hits=self._allow_overlapping_hits,
strand=Strand.POSITIVE,
)
)
amplicons.extend(
self._to_amplicons(
positive_hits=right_positive_hits[refname],
negative_hits=left_negative_hits[refname],
min_len=self._min_amplicon_size,
max_len=self._max_amplicon_size,
allow_overlapping_hits=self._allow_overlapping_hits,
strand=Strand.NEGATIVE,
)
)
Expand Down Expand Up @@ -546,14 +573,21 @@ def mappings_of(self, primers: list[PrimerType]) -> dict[str, BwaResult]:

@staticmethod
def _to_amplicons(
positive_hits: list[BwaHit], negative_hits: list[BwaHit], max_len: int, strand: Strand
positive_hits: list[BwaHit],
negative_hits: list[BwaHit],
max_len: int,
strand: Strand,
min_len: int = 1,
allow_overlapping_hits: bool = False,
) -> list[Span]:
"""Takes lists of positive strand hits and negative strand hits and constructs amplicon
mappings anywhere a positive strand hit and a negative strand hit occur where the end of
the negative strand hit is no more than `max_len` from the start of the positive strand
hit.
the negative strand hit is at least `min_len` and no more than `max_len` from the start of
the positive strand hit.

Primers may not overlap.
Primer hits may overlap if `min_len` is shorter than the length of the left and right
primers together and `allow_overlapping_hits` is True. If `allow_overlapping_hits` is False,
these will not be returned.

Args:
positive_hits: List of hits on the positive strand for one of the primers in the pair.
Expand All @@ -563,6 +597,12 @@ def _to_amplicons(
`positive_hits` are for the left primer and `negative_hits` are for the right
primer. Set to Strand.NEGATIVE if `positive_hits` are for the right primer and
`negative_hits` are for the left primer.
min_len: Minimum length of amplicons to consider. If `allow_overlapping_hits` is False,
primer pair hits that overlap each other will not be considered, even if they would
create an amplicon of at least `min_len`. Default 1bp.
allow_overlapping_hits: if True, allow hits to the individual primers in a primer pair
to overlap with each other. If False, no overlap will be allowed, even if
`min_amplicon_size` would otherwise allow it. Default False.

Raises:
ValueError: If any of the positive hits are not on the positive strand, or any of the
Expand Down Expand Up @@ -600,13 +640,15 @@ def _to_amplicons(
negative_hits_sorted[prev_negative_hit_index:],
start=prev_negative_hit_index,
):
# TODO: Consider allowing overlapping positive and negative hits.
if (
negative_hit.start > positive_hit.end
and negative_hit.end - positive_hit.start + 1 <= max_len
# Hit coordinates are 1-based end-inclusive.
amplicon_length: int = negative_hit.end - positive_hit.start + 1
if min_len <= amplicon_length <= max_len and (
allow_overlapping_hits or negative_hit.start > positive_hit.end
):
# If the negative hit starts to the right of the positive hit, and the amplicon
# length is <= max_len, add it to the list of amplicon hits to be returned.
# If overlapping hits are allowed and the amplicon length is in the correct
# size range, -OR- if overlapping hits are not allowed, the negative hit starts
# after the positive hit ends, and the amplicon length is less than or equal
# to the allowed maximum, add it to the list of amplicon hits to be returned.
amplicons.append(
Span(
refname=positive_hit.refname,
Expand Down
106 changes: 93 additions & 13 deletions tests/offtarget/test_offtarget.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ def _build_detector(
three_prime_region_length: int = 20,
max_mismatches_in_three_prime_region: int = 0,
max_mismatches: int = 0,
min_amplicon_size: int = 1,
max_amplicon_size: int = 250,
allow_overlapping_hits: bool = False,
cache_results: bool = True,
) -> OffTargetDetector:
"""Builds an `OffTargetDetector` with strict defaults"""
Expand All @@ -35,7 +37,9 @@ def _build_detector(
three_prime_region_length=three_prime_region_length,
max_mismatches_in_three_prime_region=max_mismatches_in_three_prime_region,
max_mismatches=max_mismatches,
min_amplicon_size=min_amplicon_size,
max_amplicon_size=max_amplicon_size,
allow_overlapping_hits=allow_overlapping_hits,
cache_results=cache_results,
keep_spans=True,
keep_primer_spans=True,
Expand Down Expand Up @@ -281,7 +285,75 @@ def test_to_amplicons(
) -> None:
with _build_detector(ref_fasta=ref_fasta, cache_results=cache_results) as detector:
actual = detector._to_amplicons(
positive_hits=[positive], negative_hits=[negative], max_len=250, strand=strand
positive_hits=[positive],
negative_hits=[negative],
min_len=200,
max_len=250,
strand=strand,
)
assert actual == expected, test_id


# Test that using the cache (or not) does not affect the results
# NB: BwaHit and Span coordinates are 1-based end-inclusive
#
# One mapping - Overlapping primers (1bp overlap)
# >>>>>>>>>>
# <<<<<<<<<<
# 19bp amplicon [1,19]
#
# One mapping - Overlapping primers (1bp amplicon)
# >>>>>>>>>>
# <<<<<<<<<<
# 1bp amplicon [10,10]
#
# No mappings - amplicon length would be 0.
# >>>>>>>>>>
# <<<<<<<<<<
@pytest.mark.parametrize("cache_results", [True, False])
@pytest.mark.parametrize(
"test_id, positive, negative, strand, expected",
[
(
"One mapping - Overlapping primers (1bp overlap)",
BwaHit.build("chr1", 1, False, "10M", 0),
BwaHit.build("chr1", 10, True, "10M", 0),
Strand.POSITIVE,
[Span(refname="chr1", start=1, end=19, strand=Strand.POSITIVE)],
),
(
"One mapping - Overlapping primers (1bp amplicon)",
BwaHit.build("chr1", 10, False, "10M", 0),
BwaHit.build("chr1", 1, True, "10M", 0),
Strand.POSITIVE,
[Span(refname="chr1", start=10, end=10, strand=Strand.POSITIVE)],
),
(
"No mappings",
BwaHit.build("chr1", 11, False, "10M", 0),
BwaHit.build("chr1", 1, True, "10M", 0),
Strand.POSITIVE,
[],
),
],
)
def test_to_amplicons_overlapping(
ref_fasta: Path,
test_id: str,
positive: BwaHit,
negative: BwaHit,
strand: Strand,
expected: list[Span],
cache_results: bool,
) -> None:
with _build_detector(ref_fasta=ref_fasta, cache_results=cache_results) as detector:
actual = detector._to_amplicons(
positive_hits=[positive],
negative_hits=[negative],
min_len=1,
max_len=250,
allow_overlapping_hits=True,
strand=strand,
)
assert actual == expected, test_id

Expand Down Expand Up @@ -322,7 +394,11 @@ def test_to_amplicons_value_error(
pytest.raises(ValueError, match=expected_error),
):
detector._to_amplicons(
positive_hits=[positive], negative_hits=[negative], max_len=250, strand=Strand.POSITIVE
positive_hits=[positive],
negative_hits=[negative],
min_len=200,
max_len=250,
strand=Strand.POSITIVE,
)


Expand Down Expand Up @@ -356,20 +432,22 @@ class CustomPrimer(Oligo):
@pytest.mark.parametrize(
(
"max_primer_hits,max_primer_pair_hits,min_primer_pair_hits,three_prime_region_length,"
"max_mismatches_in_three_prime_region,max_mismatches,max_amplicon_size,"
"max_mismatches_in_three_prime_region,max_mismatches,max_amplicon_size,min_amplicon_size,"
"max_gap_opens,max_gap_extends,expected_error"
),
[
(-1, 1, 1, 20, 0, 0, 1, 0, 0, "'max_primer_hits' must be greater than or equal to 0. Saw -1"), # noqa: E501
(1, -1, 1, 20, 0, 0, 1, 0, 0, "'max_primer_pair_hits' must be greater than or equal to 0. Saw -1"), # noqa: E501
(1, 1, -1, 20, 0, 0, 1, 0, 0, "'min_primer_pair_hits' must be greater than or equal to 0. Saw -1"), # noqa: E501
(1, 1, 1, 5, 0, 0, 1, 0, 0, "'three_prime_region_length' must be greater than or equal to 8. Saw 5"), # noqa: E501
(1, 1, 1, 20, -1, 0, 1, 0, 0, "'max_mismatches_in_three_prime_region' must be between 0 and 'three_prime_region_length'=20 inclusive. Saw -1"), # noqa: E501
(1, 1, 1, 20, 21, 0, 1, 0, 0, "'max_mismatches_in_three_prime_region' must be between 0 and 'three_prime_region_length'=20 inclusive. Saw 21"), # noqa: E501
(1, 1, 1, 20, 0, -1, 1, 0, 0, "'max_mismatches' must be greater than or equal to 0. Saw -1"), # noqa: E501
(1, 1, 1, 20, 0, 0, 0, 0, 0, "'max_amplicon_size' must be greater than 0. Saw 0"),
(1, 1, 1, 20, 0, 0, 1, -1, 0, "'max_gap_opens' must be greater than or equal to 0. Saw -1"),
(1, 1, 1, 20, 0, 5, 1, 0, -2, re.escape("'max_gap_extends' must be -1 (for unlimited extensions up to 'max_mismatches'=5) or greater than or equal to 0. Saw -2")), #noqa: E501
(-1, 1, 1, 20, 0, 0, 1, 1, 0, 0, "'max_primer_hits' must be greater than or equal to 0. Saw -1"), # noqa: E501
(1, -1, 1, 20, 0, 0, 1, 1, 0, 0, "'max_primer_pair_hits' must be greater than or equal to 0. Saw -1"), # noqa: E501
(1, 1, -1, 20, 0, 0, 1, 1, 0, 0, "'min_primer_pair_hits' must be greater than or equal to 0. Saw -1"), # noqa: E501
(1, 1, 1, 5, 0, 0, 1, 1, 0, 0, "'three_prime_region_length' must be greater than or equal to 8. Saw 5"), # noqa: E501
(1, 1, 1, 20, -1, 0, 1, 1, 0, 0, "'max_mismatches_in_three_prime_region' must be between 0 and 'three_prime_region_length'=20 inclusive. Saw -1"), # noqa: E501
(1, 1, 1, 20, 21, 0, 1, 1, 0, 0, "'max_mismatches_in_three_prime_region' must be between 0 and 'three_prime_region_length'=20 inclusive. Saw 21"), # noqa: E501
(1, 1, 1, 20, 0, -1, 1, 1, 0, 0, "'max_mismatches' must be greater than or equal to 0. Saw -1"), # noqa: E501
(1, 1, 1, 20, 0, 0, 0, 1, 0, 0, "'max_amplicon_size' must be greater than 0. Saw 0"),
(1, 1, 1, 20, 0, 0, 1, 1, -1, 0, "'max_gap_opens' must be greater than or equal to 0. Saw -1"), # noqa: E501
(1, 1, 1, 20, 0, 5, 1, 1, 0, -2, re.escape("'max_gap_extends' must be -1 (for unlimited extensions up to 'max_mismatches'=5) or greater than or equal to 0. Saw -2")), #noqa: E501
(1, 1, 1, 20, 0, 0, 10, 0, 0, 0, "'min_amplicon_size' must be between 1 and 'max_amplicon_size'=10 inclusive. Saw 0"), # noqa: E501
(1, 1, 1, 20, 0, 0, 10, 11, 0, 0, "'min_amplicon_size' must be between 1 and 'max_amplicon_size'=10 inclusive. Saw 11"), # noqa: E501
],
)
# fmt: on
Expand All @@ -382,6 +460,7 @@ def test_init(
max_mismatches_in_three_prime_region: int,
max_mismatches: int,
max_amplicon_size: int,
min_amplicon_size: int,
max_gap_opens: int,
max_gap_extends: int,
expected_error: str,
Expand All @@ -396,6 +475,7 @@ def test_init(
max_mismatches_in_three_prime_region=max_mismatches_in_three_prime_region,
max_mismatches=max_mismatches,
max_amplicon_size=max_amplicon_size,
min_amplicon_size=min_amplicon_size,
max_gap_opens=max_gap_opens,
max_gap_extends=max_gap_extends,
)
Loading