Skip to content

Commit

Permalink
feat: added template.fixmate() [issue #83]
Browse files Browse the repository at this point in the history
- added fixmate() method
- added set_mate_info_on_supplementary() helper function
- moved builder._set_mate_info() to helper function set_mate_info()
- moved test_template_iterator.py to test_template.py (contained more than just iterator)
- added template_test_fixmate() tests
  • Loading branch information
TimD1 committed Sep 3, 2024
1 parent d574e5a commit dff494d
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 71 deletions.
106 changes: 105 additions & 1 deletion fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
This module contains utility classes for working with SAM/BAM files and the data contained
within them. This includes i) utilities for opening SAM/BAM files for reading and writing,
ii) functions for manipulating supplementary alignments, iii) classes and functions for
maniuplating CIGAR strings, and iv) a class for building sam records and files for testing.
manipulating CIGAR strings, and iv) a class for building sam records and files for testing.
## Motivation for Reader and Writer methods
Expand Down Expand Up @@ -691,6 +691,100 @@ def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = Tr
r2.template_length = -insert_size


def set_mate_info(r1: pysam.AlignedSegment, r2: pysam.AlignedSegment) -> None:
"""Sets the mate information on a pair of sam records.
Handles cases where both reads are mapped, one of the two reads is unmapped or both reads
are unmapped.
Args:
r1: the first read in the pair
r2: the second read in the pair
"""
assert (
r1.query_name == r2.query_name
), "Attempting to set mate info on reads with different qnames."

for rec in r1, r2:
rec.template_length = 0
rec.is_proper_pair = False

if r1.is_unmapped and r2.is_unmapped:
# If they're both unmapped just clean the records up
for rec, other in [(r1, r2), (r2, r1)]:
rec.reference_id = NO_REF_INDEX
rec.next_reference_id = NO_REF_INDEX
rec.reference_start = NO_REF_POS
rec.next_reference_start = NO_REF_POS
rec.is_unmapped = True
rec.is_proper_pair = False
rec.mate_is_reverse = other.is_reverse
rec.mate_is_unmapped = True
rec.set_tag("MC", None)

elif r1.is_unmapped or r2.is_unmapped:
# If only one is mapped/unmapped copy over the relevant stuff
(m, u) = (r1, r2) if r2.is_unmapped else (r2, r1)
u.reference_id = m.reference_id
u.reference_start = m.reference_start
u.next_reference_id = m.reference_id
u.next_reference_start = m.reference_start
u.mate_is_reverse = m.is_reverse
u.mate_is_unmapped = False
u.set_tag("MC", m.cigarstring)

m.next_reference_id = u.reference_id
m.next_reference_start = u.reference_start
m.mate_is_reverse = u.is_reverse
m.mate_is_unmapped = True
m.set_tag("MC", None)

else:
# Else they are both mapped
for rec, other in [(r1, r2), (r2, r1)]:
rec.next_reference_id = other.reference_id
rec.next_reference_start = other.reference_start
rec.mate_is_reverse = other.is_reverse
rec.mate_is_unmapped = False
rec.set_tag("MC", other.cigarstring)

if r1.reference_id == r2.reference_id:
r1p = r1.reference_end if r1.is_reverse else r1.reference_start
r2p = r2.reference_end if r2.is_reverse else r2.reference_start
r1.template_length = r2p - r1p
r2.template_length = r1p - r2p

# Arbitrarily set proper pair if the we have an FR pair with isize <= 1000
if r1.is_reverse != r2.is_reverse and abs(r1.template_length) <= 1000:
fpos, rpos = (r2p, r1p) if r1.is_reverse else (r1p, r2p)
if fpos < rpos:
r1.is_proper_pair = True
r2.is_proper_pair = True


def set_mate_info_on_supplementary(
primary: AlignedSegment, supp: AlignedSegment, set_cigar: bool = False
) -> None:
"""Sets mate pair information on a supplemental record (e.g. from a split alignment), using
the primary alignment of the read's mate.
Args:
primary: the primary alignment of the mate pair of the supplemental
supp: a supplemental alignment for the mate pair of the primary supplied
set_cigar: true if we should update/create the mate CIGAR (MC) optional tag,
false if we should remove any mate CIGAR tag that is present
"""
supp.next_reference_id = primary.reference_id
supp.next_reference_start = primary.reference_start
supp.mate_is_reverse = primary.is_reverse
supp.mate_is_unmapped = primary.is_unmapped
supp.template_length = -primary.template_length
if set_cigar and not primary.is_unmapped:
supp.set_tag("MC", primary.cigarstring)
else:
supp.set_tag("MC", None)

Check warning on line 785 in fgpyo/sam/__init__.py

View check run for this annotation

Codecov / codecov/patch

fgpyo/sam/__init__.py#L785

Added line #L785 was not covered by tests


@attr.s(frozen=True, auto_attribs=True)
class ReadEditInfo:
"""
Expand Down Expand Up @@ -954,6 +1048,16 @@ def set_tag(
for rec in self.all_recs():
rec.set_tag(tag, value)

def fixmate(self) -> None:
"""Fixes mate information and sets mate CIGAR on all primary and supplementary
(but not secondary) records.
"""
for r2_supplemental in self.r2_supplementals:
set_mate_info_on_supplementary(self.r1, r2_supplemental, True)
for r1_supplemental in self.r1_supplementals:
set_mate_info_on_supplementary(self.r2, r1_supplemental, True)
set_mate_info(self.r1, self.r2)


class TemplateIterator(Iterator[Template]):
"""
Expand Down
72 changes: 4 additions & 68 deletions fgpyo/sam/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def _set_length_dependent_fields(
If any of bases, quals or cigar are defined, they must all have the same length/query
length. If none are defined then the length parameter is used. Undefined values are
synthesize at the inferred length.
synthesized at the inferred length.
Args:
rec: a SAM record
Expand Down Expand Up @@ -264,70 +264,6 @@ def _set_length_dependent_fields(
if not rec.is_unmapped:
rec.cigarstring = cigar if cigar else f"{length}M"

def _set_mate_info(self, r1: pysam.AlignedSegment, r2: pysam.AlignedSegment) -> None:
"""Sets the mate information on a pair of sam records.
Handles cases where both reads are mapped, one of the two reads is unmapped or both reads
are unmapped.
Args:
r1: the first read in the pair
r2: the sceond read in the pair
"""
for rec in r1, r2:
rec.template_length = 0
rec.is_proper_pair = False

if r1.is_unmapped and r2.is_unmapped:
# If they're both unmapped just clean the records up
for rec, other in [(r1, r2), (r2, r1)]:
rec.reference_id = sam.NO_REF_INDEX
rec.next_reference_id = sam.NO_REF_INDEX
rec.reference_start = sam.NO_REF_POS
rec.next_reference_start = sam.NO_REF_POS
rec.is_unmapped = True
rec.mate_is_unmapped = True
rec.is_proper_pair = False
rec.mate_is_reverse = other.is_reverse

elif r1.is_unmapped or r2.is_unmapped:
# If only one is mapped/unmapped copy over the relevant stuff
(m, u) = (r1, r2) if r2.is_unmapped else (r2, r1)
u.reference_id = m.reference_id
u.reference_start = m.reference_start
u.next_reference_id = m.reference_id
u.next_reference_start = m.reference_start
u.mate_is_reverse = m.is_reverse
u.mate_is_unmapped = False
u.set_tag("MC", m.cigarstring)

m.next_reference_id = u.reference_id
m.next_reference_start = u.reference_start
m.mate_is_reverse = u.is_reverse
m.mate_is_unmapped = True

else:
# Else they are both mapped
for rec, other in [(r1, r2), (r2, r1)]:
rec.next_reference_id = other.reference_id
rec.next_reference_start = other.reference_start
rec.mate_is_reverse = other.is_reverse
rec.mate_is_unmapped = False
rec.set_tag("MC", other.cigarstring)

if r1.reference_id == r2.reference_id:
r1p = r1.reference_end if r1.is_reverse else r1.reference_start
r2p = r2.reference_end if r2.is_reverse else r2.reference_start
r1.template_length = r2p - r1p
r2.template_length = r1p - r2p

# Arbitrarily set proper pair if the we have an FR pair with isize <= 1000
if r1.is_reverse != r2.is_reverse and abs(r1.template_length) <= 1000:
fpos, rpos = (r2p, r1p) if r1.is_reverse else (r1p, r2p)
if fpos < rpos:
r1.is_proper_pair = True
r2.is_proper_pair = True

def rg(self) -> Dict[str, Any]:
"""Returns the single read group that is defined in the header."""
# The `RG` field contains a list of read group mappings
Expand Down Expand Up @@ -468,7 +404,7 @@ def add_pair(
)

# Sync up mate info and we're done!
self._set_mate_info(r1, r2)
sam.set_mate_info(r1, r2)
self._records.append(r1)
self._records.append(r2)
return r1, r2
Expand All @@ -489,12 +425,12 @@ def add_single(
supplementary: bool = False,
attrs: Optional[Dict[str, Any]] = None,
) -> AlignedSegment:
"""Generates a new single reads, adds them to the internal collection, and returns it.
"""Generates a new single read, adds it to the internal collection, and returns it.
Most fields are optional.
If `read_num` is None (the default) an unpaired read will be created. If `read_num` is
set to 1 or 2, the read will have it's paired flag set and read number flags set.
set to 1 or 2, the read will have its paired flag set and read number flags set.
An unmapped read can be created by calling the method with no parameters (specifically,
not setting chrom, start1 or start2). If cigar is provided, it will be ignored.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pytest

from fgpyo import sam
from fgpyo.sam import Template
from fgpyo.sam import TemplateIterator
from fgpyo.sam import reader
Expand Down Expand Up @@ -29,7 +30,7 @@ def test_template_init_function() -> None:
assert len([t for t in template.r1_secondaries]) == 0


def test_to_templates() -> None:
def test_template_iterator() -> None:
builder = SamBuilder()

# Series of alignments for one template
Expand Down Expand Up @@ -103,7 +104,7 @@ def test_write_template(
assert len([r for r in template.all_recs()]) == 2


def test_set_tag() -> None:
def test_template_set_tag() -> None:
builder = SamBuilder()
template = Template.build(builder.add_pair(chrom="chr1", start1=100, start2=200))

Expand All @@ -129,3 +130,65 @@ def test_set_tag() -> None:
for bad_tag in ["", "A", "ABC", "ABCD"]:
with pytest.raises(AssertionError, match="Tags must be 2 characters"):
template.set_tag(bad_tag, VALUE)


def test_template_fixmate() -> None:
builder = SamBuilder(r1_len=100, r2_len=100, base_quality=30)

# both reads are mapped
r1, r2 = builder.add_pair(name="q1", chrom="chr1", start1=100, start2=500)
r1.cigarstring = "80M20S"
r1.reference_start = 107
template = Template.build([r1, r2])
template.fixmate()
assert r1.get_tag("MC") == "100M"
assert r2.get_tag("MC") == "80M20S"
assert r2.next_reference_start == 107

# only read 1 is mapped
r1, r2 = builder.add_pair(name="q1", chrom="chr1", start1=100)
r1.cigarstring = "80M20S"
r1.reference_start = 107
template = Template.build([r1, r2])
template.fixmate()
with pytest.raises(KeyError):
r1.get_tag("MC")
assert r2.get_tag("MC") == "80M20S"
assert r2.next_reference_start == 107

# neither reads are mapped
r1, r2 = builder.add_pair(chrom=sam.NO_REF_NAME)
r1.cigarstring = "80M20S"
template = Template.build([r1, r2])
template.fixmate()
with pytest.raises(KeyError):
r1.get_tag("MC")
with pytest.raises(KeyError):
r2.get_tag("MC")

# all supplementary (and not secondard) records should be updated
r1, r2 = builder.add_pair(name="q1", chrom="chr1", start1=100, start2=500)
supp1a = builder.add_single(name="q1", read_num=1, chrom="chr1", start=101, supplementary=True)
supp1b = builder.add_single(name="q1", read_num=1, chrom="chr1", start=102, supplementary=True)
supp2a = builder.add_single(name="q1", read_num=2, chrom="chr1", start=501, supplementary=True)
supp2b = builder.add_single(name="q1", read_num=2, chrom="chr1", start=502, supplementary=True)
sec1 = builder.add_single(name="q1", read_num=1, chrom="chr1", start=1001, secondary=True)
sec2 = builder.add_single(name="q1", read_num=2, chrom="chr1", start=1002, secondary=True)
r1.cigarstring = "80M20S"
r1.reference_start = 107
template = Template.build([r1, r2, supp1a, supp1b, supp2a, supp2b, sec1, sec2])
template.fixmate()
assert r1.get_tag("MC") == "100M"
assert supp1a.get_tag("MC") == "100M"
assert supp1b.get_tag("MC") == "100M"
with pytest.raises(KeyError):
sec1.get_tag("MC")
assert r2.get_tag("MC") == "80M20S"
assert supp2a.get_tag("MC") == "80M20S"
assert supp2b.get_tag("MC") == "80M20S"
with pytest.raises(KeyError):
sec2.get_tag("MC")
assert r2.next_reference_start == 107
assert supp2a.next_reference_start == 107
assert supp2b.next_reference_start == 107
assert sec2.next_reference_start == -1

0 comments on commit dff494d

Please sign in to comment.