Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: added template.fixmate() (#83) #182

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 105 additions & 1 deletion fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
This module contains utility classes for working with SAM/BAM files and the data contained
within them. This includes i) utilities for opening SAM/BAM files for reading and writing,
ii) functions for manipulating supplementary alignments, iii) classes and functions for
maniuplating CIGAR strings, and iv) a class for building sam records and files for testing.
manipulating CIGAR strings, and iv) a class for building sam records and files for testing.

## Motivation for Reader and Writer methods

Expand Down Expand Up @@ -691,6 +691,100 @@
r2.template_length = -insert_size


def set_mate_info(r1: pysam.AlignedSegment, r2: pysam.AlignedSegment) -> None:
"""Sets the mate information on a pair of sam records.

Handles cases where both reads are mapped, one of the two reads is unmapped or both reads
are unmapped.

Args:
r1: the first read in the pair
r2: the second read in the pair
"""
assert (
r1.query_name == r2.query_name
), "Attempting to set mate info on reads with different qnames."

for rec in r1, r2:
rec.template_length = 0
rec.is_proper_pair = False

if r1.is_unmapped and r2.is_unmapped:
# If they're both unmapped just clean the records up
for rec, other in [(r1, r2), (r2, r1)]:
rec.reference_id = NO_REF_INDEX
rec.next_reference_id = NO_REF_INDEX
rec.reference_start = NO_REF_POS
rec.next_reference_start = NO_REF_POS
rec.is_unmapped = True
rec.is_proper_pair = False
rec.mate_is_reverse = other.is_reverse
rec.mate_is_unmapped = True
rec.set_tag("MC", None)

elif r1.is_unmapped or r2.is_unmapped:
# If only one is mapped/unmapped copy over the relevant stuff
(m, u) = (r1, r2) if r2.is_unmapped else (r2, r1)
u.reference_id = m.reference_id
u.reference_start = m.reference_start
u.next_reference_id = m.reference_id
u.next_reference_start = m.reference_start
u.mate_is_reverse = m.is_reverse
u.mate_is_unmapped = False
u.set_tag("MC", m.cigarstring)

m.next_reference_id = u.reference_id
m.next_reference_start = u.reference_start
m.mate_is_reverse = u.is_reverse
m.mate_is_unmapped = True
m.set_tag("MC", None)

else:
# Else they are both mapped
for rec, other in [(r1, r2), (r2, r1)]:
rec.next_reference_id = other.reference_id
rec.next_reference_start = other.reference_start
rec.mate_is_reverse = other.is_reverse
rec.mate_is_unmapped = False
rec.set_tag("MC", other.cigarstring)

if r1.reference_id == r2.reference_id:
r1p = r1.reference_end if r1.is_reverse else r1.reference_start
r2p = r2.reference_end if r2.is_reverse else r2.reference_start
r1.template_length = r2p - r1p
r2.template_length = r1p - r2p

# Arbitrarily set proper pair if the we have an FR pair with isize <= 1000
if r1.is_reverse != r2.is_reverse and abs(r1.template_length) <= 1000:
Comment on lines +757 to +758
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue This will break many structural variant callers.

Is there a reason to overwrite the proper pair status reported by the aligner?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what initially motivated this; I transferred this _set_mate_info() helper function out of fgpyo/sam/builder.py. Are you aware of a reason for it to be present there?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@msto what's the alternative? Invoking this function is tantamount to saying "I've changed some of the values on one or both of these reads, please make sure everything is in sync". I understand the concern around proper_pair - and it's true that this implementation made sense when it was in SamBuilder, and less so here...

But if someone has changed alignment values .. the value from the aligner is also questionable here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@TimD1 - @msto and I chatted and we think the right solution here is a little complicated. We suggest doing something like:

def set_mate_info(
  r1: AlignedSegment,
  r2: AlignedSegment,
  is_proper_pair: Callable[[AlignedSegment, AlignedSegment], bool] = lambda a, b: a.is_proper_pair and b.is_proper_pair
) -> Unit
    ...

I.e. the default is to leave it True if it's already set on R1 and R1, but set it to false if already false, or there is disagreement. Then a user (e.g. SamBuilder) can provide it's own lambda.

fpos, rpos = (r2p, r1p) if r1.is_reverse else (r1p, r2p)
if fpos < rpos:
r1.is_proper_pair = True
r2.is_proper_pair = True


def set_mate_info_on_supplementary(
primary: AlignedSegment, supp: AlignedSegment, set_cigar: bool = False
) -> None:
"""Sets mate pair information on a supplemental record (e.g. from a split alignment), using
the primary alignment of the read's mate.

Args:
primary: the primary alignment of the mate pair of the supplemental
supp: a supplemental alignment for the mate pair of the primary supplied
set_cigar: true if we should update/create the mate CIGAR (MC) optional tag,
false if we should remove any mate CIGAR tag that is present
"""
supp.next_reference_id = primary.reference_id
supp.next_reference_start = primary.reference_start
supp.mate_is_reverse = primary.is_reverse
supp.mate_is_unmapped = primary.is_unmapped
supp.template_length = -primary.template_length
if set_cigar and not primary.is_unmapped:
supp.set_tag("MC", primary.cigarstring)
else:
supp.set_tag("MC", None)

Check warning on line 785 in fgpyo/sam/__init__.py

View check run for this annotation

Codecov / codecov/patch

fgpyo/sam/__init__.py#L785

Added line #L785 was not covered by tests


@attr.s(frozen=True, auto_attribs=True)
class ReadEditInfo:
"""
Expand Down Expand Up @@ -954,6 +1048,16 @@
for rec in self.all_recs():
rec.set_tag(tag, value)

def fixmate(self) -> None:
"""Fixes mate information and sets mate CIGAR on all primary and supplementary
(but not secondary) records.
"""
for r2_supplemental in self.r2_supplementals:
set_mate_info_on_supplementary(self.r1, r2_supplemental, True)
for r1_supplemental in self.r1_supplementals:
set_mate_info_on_supplementary(self.r2, r1_supplemental, True)
set_mate_info(self.r1, self.r2)


class TemplateIterator(Iterator[Template]):
"""
Expand Down
72 changes: 4 additions & 68 deletions fgpyo/sam/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def _set_length_dependent_fields(

If any of bases, quals or cigar are defined, they must all have the same length/query
length. If none are defined then the length parameter is used. Undefined values are
synthesize at the inferred length.
synthesized at the inferred length.

Args:
rec: a SAM record
Expand Down Expand Up @@ -264,70 +264,6 @@ def _set_length_dependent_fields(
if not rec.is_unmapped:
rec.cigarstring = cigar if cigar else f"{length}M"

def _set_mate_info(self, r1: pysam.AlignedSegment, r2: pysam.AlignedSegment) -> None:
"""Sets the mate information on a pair of sam records.

Handles cases where both reads are mapped, one of the two reads is unmapped or both reads
are unmapped.

Args:
r1: the first read in the pair
r2: the sceond read in the pair
"""
for rec in r1, r2:
rec.template_length = 0
rec.is_proper_pair = False

if r1.is_unmapped and r2.is_unmapped:
# If they're both unmapped just clean the records up
for rec, other in [(r1, r2), (r2, r1)]:
rec.reference_id = sam.NO_REF_INDEX
rec.next_reference_id = sam.NO_REF_INDEX
rec.reference_start = sam.NO_REF_POS
rec.next_reference_start = sam.NO_REF_POS
rec.is_unmapped = True
rec.mate_is_unmapped = True
rec.is_proper_pair = False
rec.mate_is_reverse = other.is_reverse

elif r1.is_unmapped or r2.is_unmapped:
# If only one is mapped/unmapped copy over the relevant stuff
(m, u) = (r1, r2) if r2.is_unmapped else (r2, r1)
u.reference_id = m.reference_id
u.reference_start = m.reference_start
u.next_reference_id = m.reference_id
u.next_reference_start = m.reference_start
u.mate_is_reverse = m.is_reverse
u.mate_is_unmapped = False
u.set_tag("MC", m.cigarstring)

m.next_reference_id = u.reference_id
m.next_reference_start = u.reference_start
m.mate_is_reverse = u.is_reverse
m.mate_is_unmapped = True

else:
# Else they are both mapped
for rec, other in [(r1, r2), (r2, r1)]:
rec.next_reference_id = other.reference_id
rec.next_reference_start = other.reference_start
rec.mate_is_reverse = other.is_reverse
rec.mate_is_unmapped = False
rec.set_tag("MC", other.cigarstring)

if r1.reference_id == r2.reference_id:
r1p = r1.reference_end if r1.is_reverse else r1.reference_start
r2p = r2.reference_end if r2.is_reverse else r2.reference_start
r1.template_length = r2p - r1p
r2.template_length = r1p - r2p

# Arbitrarily set proper pair if the we have an FR pair with isize <= 1000
if r1.is_reverse != r2.is_reverse and abs(r1.template_length) <= 1000:
fpos, rpos = (r2p, r1p) if r1.is_reverse else (r1p, r2p)
if fpos < rpos:
r1.is_proper_pair = True
r2.is_proper_pair = True

def rg(self) -> Dict[str, Any]:
"""Returns the single read group that is defined in the header."""
# The `RG` field contains a list of read group mappings
Expand Down Expand Up @@ -468,7 +404,7 @@ def add_pair(
)

# Sync up mate info and we're done!
self._set_mate_info(r1, r2)
sam.set_mate_info(r1, r2)
self._records.append(r1)
self._records.append(r2)
return r1, r2
Expand All @@ -489,12 +425,12 @@ def add_single(
supplementary: bool = False,
attrs: Optional[Dict[str, Any]] = None,
) -> AlignedSegment:
"""Generates a new single reads, adds them to the internal collection, and returns it.
"""Generates a new single read, adds it to the internal collection, and returns it.

Most fields are optional.

If `read_num` is None (the default) an unpaired read will be created. If `read_num` is
set to 1 or 2, the read will have it's paired flag set and read number flags set.
set to 1 or 2, the read will have its paired flag set and read number flags set.

An unmapped read can be created by calling the method with no parameters (specifically,
not setting chrom, start1 or start2). If cigar is provided, it will be ignored.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pytest

from fgpyo import sam
from fgpyo.sam import Template
from fgpyo.sam import TemplateIterator
from fgpyo.sam import reader
Expand Down Expand Up @@ -29,7 +30,7 @@ def test_template_init_function() -> None:
assert len([t for t in template.r1_secondaries]) == 0


def test_to_templates() -> None:
def test_template_iterator() -> None:
builder = SamBuilder()

# Series of alignments for one template
Expand Down Expand Up @@ -103,7 +104,7 @@ def test_write_template(
assert len([r for r in template.all_recs()]) == 2


def test_set_tag() -> None:
def test_template_set_tag() -> None:
builder = SamBuilder()
template = Template.build(builder.add_pair(chrom="chr1", start1=100, start2=200))

Expand All @@ -129,3 +130,65 @@ def test_set_tag() -> None:
for bad_tag in ["", "A", "ABC", "ABCD"]:
with pytest.raises(AssertionError, match="Tags must be 2 characters"):
template.set_tag(bad_tag, VALUE)


def test_template_fixmate() -> None:
builder = SamBuilder(r1_len=100, r2_len=100, base_quality=30)

# both reads are mapped
r1, r2 = builder.add_pair(name="q1", chrom="chr1", start1=100, start2=500)
r1.cigarstring = "80M20S"
r1.reference_start = 107
template = Template.build([r1, r2])
template.fixmate()
assert r1.get_tag("MC") == "100M"
assert r2.get_tag("MC") == "80M20S"
assert r2.next_reference_start == 107

# only read 1 is mapped
r1, r2 = builder.add_pair(name="q1", chrom="chr1", start1=100)
r1.cigarstring = "80M20S"
r1.reference_start = 107
template = Template.build([r1, r2])
template.fixmate()
with pytest.raises(KeyError):
r1.get_tag("MC")
assert r2.get_tag("MC") == "80M20S"
assert r2.next_reference_start == 107

# neither reads are mapped
r1, r2 = builder.add_pair(chrom=sam.NO_REF_NAME)
r1.cigarstring = "80M20S"
template = Template.build([r1, r2])
template.fixmate()
with pytest.raises(KeyError):
r1.get_tag("MC")
with pytest.raises(KeyError):
r2.get_tag("MC")

# all supplementary (and not secondard) records should be updated
r1, r2 = builder.add_pair(name="q1", chrom="chr1", start1=100, start2=500)
supp1a = builder.add_single(name="q1", read_num=1, chrom="chr1", start=101, supplementary=True)
supp1b = builder.add_single(name="q1", read_num=1, chrom="chr1", start=102, supplementary=True)
supp2a = builder.add_single(name="q1", read_num=2, chrom="chr1", start=501, supplementary=True)
supp2b = builder.add_single(name="q1", read_num=2, chrom="chr1", start=502, supplementary=True)
sec1 = builder.add_single(name="q1", read_num=1, chrom="chr1", start=1001, secondary=True)
sec2 = builder.add_single(name="q1", read_num=2, chrom="chr1", start=1002, secondary=True)
r1.cigarstring = "80M20S"
r1.reference_start = 107
template = Template.build([r1, r2, supp1a, supp1b, supp2a, supp2b, sec1, sec2])
template.fixmate()
assert r1.get_tag("MC") == "100M"
assert supp1a.get_tag("MC") == "100M"
assert supp1b.get_tag("MC") == "100M"
with pytest.raises(KeyError):
sec1.get_tag("MC")
assert r2.get_tag("MC") == "80M20S"
assert supp2a.get_tag("MC") == "80M20S"
assert supp2b.get_tag("MC") == "80M20S"
with pytest.raises(KeyError):
sec2.get_tag("MC")
assert r2.next_reference_start == 107
assert supp2a.next_reference_start == 107
assert supp2b.next_reference_start == 107
assert sec2.next_reference_start == -1
Loading