Skip to content

Commit

Permalink
WIP: fq stuff
Browse files Browse the repository at this point in the history
implements overlap detection and correction
  • Loading branch information
brentp committed Aug 16, 2023
1 parent 96c130a commit c0d9550
Show file tree
Hide file tree
Showing 2 changed files with 239 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/lib/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
pub mod barcode_matching;
pub mod pair_overlap;
pub mod samples;

/// Checks whether a given u8 byte is a "No-call"-ed base, signified by the bytes 'N', 'n' and '.'
Expand Down
238 changes: 238 additions & 0 deletions src/lib/pair_overlap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
use seq_io::fastq::OwnedRecord;

fn complement(base: u8) -> u8 {
match base {
b'A' => b'T',
b'C' => b'G',
b'G' => b'C',
b'T' => b'A',
b'a' => b't',
b'c' => b'g',
b'g' => b'c',
b't' => b'a',
_ => b'N',
}
}

/// Overlap describes the overlap between two reads.
/// if `adapter` is then `shift` is the amount of adapter sequence at ends of the reads.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct PairOverlap {
// shift required to get read1 to overlap with read2
pub shift: usize,
// length of the overlap
pub overlap: usize,
// number of single-base differences within the overlap
pub diffs: usize,
// true if the end of read1 is "aligned" right of end of read2.
pub adapter: bool,
}

impl PairOverlap {
pub fn correct(
&self,
r1: &mut OwnedRecord,
r2: &mut OwnedRecord,
bq_delta: u8,
drop_adapter: bool,
) {
// NOTE: copying everything for now. This is not ideal but simplifies code.
// Only applies to subset of reads that actually overlap.
let mut r1_seq = r1.seq.clone();
let mut r2_seq = reverse_complement(&r2.seq);
let mut r1_quals = r1.qual.clone();
let mut r2_quals = r2.qual.clone();
r2_quals.reverse();

if self.adapter {
std::mem::swap(&mut r1_seq, &mut r2_seq);
std::mem::swap(&mut r1_quals, &mut r2_quals);
}

for i in 0..self.overlap {
let agree = r1_seq[i + self.shift] == r2_seq[i];

// when bases agree, set both to the sum of the qualities.
if agree {
// avoid overflow and cap bq to 127 (last ascii character)
r2_quals[i] = (r1_quals[i + self.shift] as u16 + r2_quals[i] as u16).min(126) as u8;
r1_quals[i + self.shift] = r2_quals[i];
} else {
// when bases disagree, set the lower quality base to the higher quality base.
if r1_quals[i + self.shift] >= r2_quals[i] + bq_delta {
r2_seq[i] = r1_seq[i + self.shift];
r2_quals[i] = r1_quals[i + self.shift];
} else if r2_quals[i] >= r1_quals[i + self.shift] + bq_delta {
r1_seq[i + self.shift] = r2_seq[i];
r1_quals[i + self.shift] = r2_quals[i];
}
}
}

if self.adapter {
std::mem::swap(&mut r1_seq, &mut r2_seq);
std::mem::swap(&mut r1_quals, &mut r2_quals);
if drop_adapter {
// drop right end of r1
r1_seq = r1_seq[..r1_seq.len() - self.shift].to_vec();
r1_quals = r1_quals[..r1_quals.len() - self.shift].to_vec();
// we're still reversed here, so we also drop of left end of r2
r2_seq = r2_seq[self.shift..].to_vec();
r2_quals = r2_quals[self.shift..].to_vec();
}
}

r1.seq = r1_seq;
r1.qual = r1_quals;

r2.seq = reverse_complement(&r2_seq);
r2.qual = r2_quals.iter().rev().copied().collect::<Vec<u8>>();
}
}

fn reverse_complement(s: &[u8]) -> Vec<u8> {
s.iter().rev().map(|x| complement(*x)).collect::<Vec<u8>>()
}

/// find overlap given the sequence of a pair of reads.
pub fn find_overlap(
s1: &[u8],
s2: &[u8],
min_overlap: usize,
// TODO: add max_overlap_error_rate param.
max_diffs: usize,
) -> Option<PairOverlap> {
// note that it's possible to do this without allocating, but then we must (re)complement many times.
let s2 = s2.iter().rev().map(|x| complement(*x)).collect::<Vec<u8>>();

'shift_loop: for shift in 0..s1.len() - min_overlap {
let mut diffs = 0;
let overlap_len = (s1.len() - shift).min(s2.len());
for i in 0..overlap_len {
if s1[i + shift] != s2[i] {
diffs += 1;
if diffs > max_diffs {
continue 'shift_loop;
}
}
}
// since were here, we have a large enough overlap
return Some(PairOverlap { shift, overlap: overlap_len, diffs, adapter: false });
}

// now handle case where there is adapter sequenced.
// shift is in s2 this time.
'shift_loop: for shift in 0..s2.len() - min_overlap {
let mut diffs = 0;
let overlap_len = (s2.len() - shift).min(s1.len());
for i in 0..overlap_len {
if s1[i] != s2[i + shift] {
diffs += 1;
if diffs > max_diffs {
continue 'shift_loop;
}
}
}
// since were here, we have a large enough overlap
return Some(PairOverlap { shift, overlap: overlap_len, diffs, adapter: true });
}

None
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_simple_overlap() {
// shift: 0
// i: 0123456789
// s1: ACGAAAAA
// s2: AAAAAGGC
// shift: 3
// i+shift 34567
// s1: ACGAAAAA
// i: 01234567
// s2: AAAAAGGC
let r1 = b"ACGAAAAA";
let r2 = b"AAAAAGGC".iter().rev().map(|x| complement(*x)).collect::<Vec<u8>>();
let overlap = find_overlap(r1, &r2, 4, 0);
assert_eq!(overlap, Some(PairOverlap { shift: 3, overlap: 5, diffs: 0, adapter: false }));
}

#[test]
fn test_overlap_with_adapters() {
// sequenced adapter
// we know this is the case when end of read1 is beyond end of read2 (here beyond == to the right of)
// shift: 0
// i: 0123456789
// s1: AAAAAGGC
// s2: ACGAAAAA
// shift: 3
// s1: AAAAAGGC
// s2: ACGAAAAA
let r1 = b"AAAAAGGC";
let r2 = b"ACGAAAAA".iter().rev().map(|x| complement(*x)).collect::<Vec<u8>>();
let overlap = find_overlap(r1, &r2, 4, 0);
assert_eq!(overlap, Some(PairOverlap { shift: 3, overlap: 5, diffs: 0, adapter: true }));
}

#[test]
fn test_overlap_correction() {
let overlap = PairOverlap { shift: 3, overlap: 5, diffs: 0, adapter: false };
let mut r1 = OwnedRecord {
head: b"r1".to_vec(),
seq: b"ACGAAATA".to_vec(),
qual: b"IIIIIIII".to_vec(),
};
let mut r2 = OwnedRecord {
head: b"r2".to_vec(),
seq: reverse_complement(b"AAAAAGGC"),
qual: b"IIIIIIII".to_vec(),
};

overlap.correct(&mut r1, &mut r2, 5, true);

// the sequences are different, but not corrected becase BQs are the same.
assert_eq!(r1.seq, b"ACGAAATA".to_vec());
assert_eq!(r2.seq, reverse_complement(b"AAAAAGGC"));

// now adjust r2 to have higher BQs than r1 for that base.
// ACGAAATA
// AAAAAGGC
// IIIQIIII

r2.qual = b"IIIQIIII".iter().rev().map(|b| *b).collect::<Vec<u8>>();
overlap.correct(&mut r1, &mut r2, 5, true);
assert_eq!(r1.seq, b"ACGAAAAA".to_vec());
}

#[test]
fn test_overlap_correction_with_adapter() {
let overlap = PairOverlap { shift: 3, overlap: 5, diffs: 0, adapter: true };
// IIIQIIII
// r1: AAAAAGGC
// r2: ACGAAATA

let mut r1 = OwnedRecord {
head: b"r1".to_vec(),
seq: b"AAAAAGGC".to_vec(),
qual: b"IIIQIIII".to_vec(),
};
let mut r2 = OwnedRecord {
head: b"r2".to_vec(),
seq: reverse_complement(b"ACGAAATA").to_vec(),
qual: b"IIIIIIII".to_vec(),
};

overlap.correct(&mut r1, &mut r2, 5, true);

// sequence 2 is corrected.
assert_eq!(r2.seq, reverse_complement(b"AAAAA"));
assert_eq!(r1.seq, b"AAAAA");

assert_eq!(r2.qual, b"~Q~~~");
assert_eq!(r1.qual, b"~~~Q~");
}
}

0 comments on commit c0d9550

Please sign in to comment.