Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: added template.fixmate() [issue #83]
Browse files Browse the repository at this point in the history
- added fixmate() method
- added set_mate_info_on_supplementary() helper function
- moved builder._set_mate_info() to helper function set_mate_info()
- moved test_template_iterator.py to test_template.py (contained more than just iterator)
- added template_test_fixmate() tests
TimD1 committed Sep 3, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent d574e5a commit c0f71f5
Showing 3 changed files with 169 additions and 71 deletions.
106 changes: 105 additions & 1 deletion fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
This module contains utility classes for working with SAM/BAM files and the data contained
within them. This includes i) utilities for opening SAM/BAM files for reading and writing,
ii) functions for manipulating supplementary alignments, iii) classes and functions for
maniuplating CIGAR strings, and iv) a class for building sam records and files for testing.
manipulating CIGAR strings, and iv) a class for building sam records and files for testing.
## Motivation for Reader and Writer methods
@@ -691,6 +691,100 @@ def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = Tr
r2.template_length = -insert_size


def set_mate_info(r1: pysam.AlignedSegment, r2: pysam.AlignedSegment) -> None:
"""Sets the mate information on a pair of sam records.
Handles cases where both reads are mapped, one of the two reads is unmapped or both reads
are unmapped.
Args:
r1: the first read in the pair
r2: the second read in the pair
"""
assert (
r1.query_name == r2.query_name
), "Attempting to set mate info on reads with different qnames."

for rec in r1, r2:
rec.template_length = 0
rec.is_proper_pair = False

if r1.is_unmapped and r2.is_unmapped:
# If they're both unmapped just clean the records up
for rec, other in [(r1, r2), (r2, r1)]:
rec.reference_id = NO_REF_INDEX
rec.next_reference_id = NO_REF_INDEX
rec.reference_start = NO_REF_POS
rec.next_reference_start = NO_REF_POS
rec.is_unmapped = True
rec.is_proper_pair = False
rec.mate_is_reverse = other.is_reverse
rec.mate_is_unmapped = True
rec.set_tag("MC", None)

elif r1.is_unmapped or r2.is_unmapped:
# If only one is mapped/unmapped copy over the relevant stuff
(m, u) = (r1, r2) if r2.is_unmapped else (r2, r1)
u.reference_id = m.reference_id
u.reference_start = m.reference_start
u.next_reference_id = m.reference_id
u.next_reference_start = m.reference_start
u.mate_is_reverse = m.is_reverse
u.mate_is_unmapped = False
u.set_tag("MC", m.cigarstring)

m.next_reference_id = u.reference_id
m.next_reference_start = u.reference_start
m.mate_is_reverse = u.is_reverse
m.mate_is_unmapped = True
m.set_tag("MC", None)

else:
# Else they are both mapped
for rec, other in [(r1, r2), (r2, r1)]:
rec.next_reference_id = other.reference_id
rec.next_reference_start = other.reference_start
rec.mate_is_reverse = other.is_reverse
rec.mate_is_unmapped = False
rec.set_tag("MC", other.cigarstring)

if r1.reference_id == r2.reference_id:
r1p = r1.reference_end if r1.is_reverse else r1.reference_start
r2p = r2.reference_end if r2.is_reverse else r2.reference_start
r1.template_length = r2p - r1p
r2.template_length = r1p - r2p

# Arbitrarily set proper pair if the we have an FR pair with isize <= 1000
if r1.is_reverse != r2.is_reverse and abs(r1.template_length) <= 1000:
fpos, rpos = (r2p, r1p) if r1.is_reverse else (r1p, r2p)
if fpos < rpos:
r1.is_proper_pair = True
r2.is_proper_pair = True


def set_mate_info_on_supplementary(
primary: AlignedSegment, supp: AlignedSegment, set_cigar: bool = False
) -> None:
"""Sets mate pair information on a supplemental record (e.g. from a split alignment), using
the primary alignment of the read's mate.
Args:
primary: the primary alignment of the mate pair of the supplemental
supp: a supplemental alignment for the mate pair of the primary supplied
set_cigar: true if we should update/create the mate CIGAR (MC) optional tag,
false if we should remove any mate CIGAR tag that is present
"""
supp.next_reference_id = primary.reference_id
supp.next_reference_start = primary.reference_start
supp.mate_is_reverse = primary.is_reverse
supp.mate_is_unmapped = primary.is_unmapped
supp.template_length = -primary.template_length
if set_cigar and not primary.is_unmapped:
supp.set_tag("MC", primary.cigarstring)
else:
supp.set_tag("MC", None)


@attr.s(frozen=True, auto_attribs=True)
class ReadEditInfo:
"""
@@ -954,6 +1048,16 @@ def set_tag(
for rec in self.all_recs():
rec.set_tag(tag, value)

def fixmate(self) -> None:
"""Fixes mate information and sets mate CIGAR on all primary and supplementary
(but not secondary) records.
"""
for r2_supplemental in self.r2_supplementals:
set_mate_info_on_supplementary(self.r1, r2_supplemental, True)
for r1_supplemental in self.r1_supplementals:
set_mate_info_on_supplementary(self.r2, r1_supplemental, True)
set_mate_info(self.r1, self.r2)


class TemplateIterator(Iterator[Template]):
"""
72 changes: 4 additions & 68 deletions fgpyo/sam/builder.py
Original file line number Diff line number Diff line change
@@ -230,7 +230,7 @@ def _set_length_dependent_fields(
If any of bases, quals or cigar are defined, they must all have the same length/query
length. If none are defined then the length parameter is used. Undefined values are
synthesize at the inferred length.
synthesized at the inferred length.
Args:
rec: a SAM record
@@ -264,70 +264,6 @@ def _set_length_dependent_fields(
if not rec.is_unmapped:
rec.cigarstring = cigar if cigar else f"{length}M"

def _set_mate_info(self, r1: pysam.AlignedSegment, r2: pysam.AlignedSegment) -> None:
"""Sets the mate information on a pair of sam records.
Handles cases where both reads are mapped, one of the two reads is unmapped or both reads
are unmapped.
Args:
r1: the first read in the pair
r2: the sceond read in the pair
"""
for rec in r1, r2:
rec.template_length = 0
rec.is_proper_pair = False

if r1.is_unmapped and r2.is_unmapped:
# If they're both unmapped just clean the records up
for rec, other in [(r1, r2), (r2, r1)]:
rec.reference_id = sam.NO_REF_INDEX
rec.next_reference_id = sam.NO_REF_INDEX
rec.reference_start = sam.NO_REF_POS
rec.next_reference_start = sam.NO_REF_POS
rec.is_unmapped = True
rec.mate_is_unmapped = True
rec.is_proper_pair = False
rec.mate_is_reverse = other.is_reverse

elif r1.is_unmapped or r2.is_unmapped:
# If only one is mapped/unmapped copy over the relevant stuff
(m, u) = (r1, r2) if r2.is_unmapped else (r2, r1)
u.reference_id = m.reference_id
u.reference_start = m.reference_start
u.next_reference_id = m.reference_id
u.next_reference_start = m.reference_start
u.mate_is_reverse = m.is_reverse
u.mate_is_unmapped = False
u.set_tag("MC", m.cigarstring)

m.next_reference_id = u.reference_id
m.next_reference_start = u.reference_start
m.mate_is_reverse = u.is_reverse
m.mate_is_unmapped = True

else:
# Else they are both mapped
for rec, other in [(r1, r2), (r2, r1)]:
rec.next_reference_id = other.reference_id
rec.next_reference_start = other.reference_start
rec.mate_is_reverse = other.is_reverse
rec.mate_is_unmapped = False
rec.set_tag("MC", other.cigarstring)

if r1.reference_id == r2.reference_id:
r1p = r1.reference_end if r1.is_reverse else r1.reference_start
r2p = r2.reference_end if r2.is_reverse else r2.reference_start
r1.template_length = r2p - r1p
r2.template_length = r1p - r2p

# Arbitrarily set proper pair if the we have an FR pair with isize <= 1000
if r1.is_reverse != r2.is_reverse and abs(r1.template_length) <= 1000:
fpos, rpos = (r2p, r1p) if r1.is_reverse else (r1p, r2p)
if fpos < rpos:
r1.is_proper_pair = True
r2.is_proper_pair = True

def rg(self) -> Dict[str, Any]:
"""Returns the single read group that is defined in the header."""
# The `RG` field contains a list of read group mappings
@@ -468,7 +404,7 @@ def add_pair(
)

# Sync up mate info and we're done!
self._set_mate_info(r1, r2)
sam.set_mate_info(r1, r2)
self._records.append(r1)
self._records.append(r2)
return r1, r2
@@ -489,12 +425,12 @@ def add_single(
supplementary: bool = False,
attrs: Optional[Dict[str, Any]] = None,
) -> AlignedSegment:
"""Generates a new single reads, adds them to the internal collection, and returns it.
"""Generates a new single read, adds it to the internal collection, and returns it.
Most fields are optional.
If `read_num` is None (the default) an unpaired read will be created. If `read_num` is
set to 1 or 2, the read will have it's paired flag set and read number flags set.
set to 1 or 2, the read will have its paired flag set and read number flags set.
An unmapped read can be created by calling the method with no parameters (specifically,
not setting chrom, start1 or start2). If cigar is provided, it will be ignored.
Original file line number Diff line number Diff line change
@@ -2,6 +2,7 @@

import pytest

from fgpyo import sam
from fgpyo.sam import Template
from fgpyo.sam import TemplateIterator
from fgpyo.sam import reader
@@ -29,7 +30,7 @@ def test_template_init_function() -> None:
assert len([t for t in template.r1_secondaries]) == 0


def test_to_templates() -> None:
def test_template_iterator() -> None:
builder = SamBuilder()

# Series of alignments for one template
@@ -103,7 +104,7 @@ def test_write_template(
assert len([r for r in template.all_recs()]) == 2


def test_set_tag() -> None:
def test_template_set_tag() -> None:
builder = SamBuilder()
template = Template.build(builder.add_pair(chrom="chr1", start1=100, start2=200))

@@ -129,3 +130,60 @@ def test_set_tag() -> None:
for bad_tag in ["", "A", "ABC", "ABCD"]:
with pytest.raises(AssertionError, match="Tags must be 2 characters"):
template.set_tag(bad_tag, VALUE)


def test_template_fixmate() -> None:
builder = SamBuilder(r1_len=100, r2_len=100, base_quality=30)

# both reads are mapped
r1, r2 = builder.add_pair(name="q1", chrom="chr1", start1=100, start2=500)
r1.cigarstring = "80M20S"
r1.reference_start = 107
template = Template.build([r1, r2])
template.fixmate()
assert r1.get_tag("MC") == "100M"
assert r2.get_tag("MC") == "80M20S"
assert r2.next_reference_start == 107

# only read 1 is mapped
r1, r2 = builder.add_pair(name="q1", chrom="chr1", start1=100)
r1.cigarstring = "80M20S"
r1.reference_start = 107
template = Template.build([r1, r2])
template.fixmate()
with pytest.raises(KeyError): r1.get_tag("MC")
assert r2.get_tag("MC") == "80M20S"
assert r2.next_reference_start == 107

# neither reads are mapped
r1, r2 = builder.add_pair(chrom=sam.NO_REF_NAME)
r1.cigarstring = "80M20S"
template = Template.build([r1, r2])
template.fixmate()
with pytest.raises(KeyError): r1.get_tag("MC")
with pytest.raises(KeyError): r2.get_tag("MC")

# all supplementary (and not secondard) records should be updated
r1, r2 = builder.add_pair(name="q1", chrom="chr1", start1=100, start2=500)
supp1a = builder.add_single(name="q1", read_num=1, chrom="chr1", start=101, supplementary=True)
supp1b = builder.add_single(name="q1", read_num=1, chrom="chr1", start=102, supplementary=True)
supp2a = builder.add_single(name="q1", read_num=2, chrom="chr1", start=501, supplementary=True)
supp2b = builder.add_single(name="q1", read_num=2, chrom="chr1", start=502, supplementary=True)
sec1 = builder.add_single(name="q1", read_num=1, chrom="chr1", start=1001, secondary=True)
sec2 = builder.add_single(name="q1", read_num=2, chrom="chr1", start=1002, secondary=True)
r1.cigarstring = "80M20S"
r1.reference_start = 107
template = Template.build([r1, r2, supp1a, supp1b, supp2a, supp2b, sec1, sec2])
template.fixmate()
assert r1.get_tag("MC") == "100M"
assert supp1a.get_tag("MC") == "100M"
assert supp1b.get_tag("MC") == "100M"
with pytest.raises(KeyError): sec1.get_tag("MC")
assert r2.get_tag("MC") == "80M20S"
assert supp2a.get_tag("MC") == "80M20S"
assert supp2b.get_tag("MC") == "80M20S"
with pytest.raises(KeyError): sec2.get_tag("MC")
assert r2.next_reference_start == 107
assert supp2a.next_reference_start == 107
assert supp2b.next_reference_start == 107
assert sec2.next_reference_start == -1

0 comments on commit c0f71f5

Please sign in to comment.