From 150c4be7db7a6108786d44c8e5f81fa040f66699 Mon Sep 17 00:00:00 2001 From: Brent Pedersen Date: Wed, 16 Aug 2023 12:23:53 +0200 Subject: [PATCH] WIP: fq stuff implements overlap detection and correction --- src/lib/mod.rs | 1 + src/lib/pair_overlap.rs | 242 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 243 insertions(+) create mode 100644 src/lib/pair_overlap.rs diff --git a/src/lib/mod.rs b/src/lib/mod.rs index bd50949..022a1e7 100644 --- a/src/lib/mod.rs +++ b/src/lib/mod.rs @@ -1,4 +1,5 @@ pub mod barcode_matching; +pub mod pair_overlap; pub mod samples; /// Checks whether a given u8 byte is a "No-call"-ed base, signified by the bytes 'N', 'n' and '.' diff --git a/src/lib/pair_overlap.rs b/src/lib/pair_overlap.rs new file mode 100644 index 0000000..e48316e --- /dev/null +++ b/src/lib/pair_overlap.rs @@ -0,0 +1,242 @@ +use seq_io::fastq::OwnedRecord; + +fn complement(base: u8) -> u8 { + match base { + b'A' => b'T', + b'C' => b'G', + b'G' => b'C', + b'T' => b'A', + b'a' => b't', + b'c' => b'g', + b'g' => b'c', + b't' => b'a', + _ => b'N', + } +} + +/// Overlap describes the overlap between two reads. +/// if `adapter` is then `shift` is the amount of adapter sequence at ends of the reads. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct PairOverlap { + // shift required to get read1 to overlap with read2 + pub shift: usize, + // length of the overlap + pub overlap: usize, + // number of single-base differences within the overlap + pub diffs: usize, + // true if the end of read1 is "aligned" right of end of read2. + pub adapter: bool, +} + +impl PairOverlap { + /// Corrects bases and adjusts base-qualities in the pair given the + /// previously calculated overlap. + /// Only bases where the base-quality difference between the reads is higher than `bq_delta` are corrected. + /// (currently) Base-qualities are summed for bases that agree. + pub fn correct( + &self, + r1: &mut OwnedRecord, + r2: &mut OwnedRecord, + bq_delta: u8, + drop_adapter: bool, + ) { + // NOTE: copying everything for now. This is not ideal but simplifies code. + // Only applies to subset of reads that actually overlap. + let mut r1_seq = r1.seq.clone(); + let mut r2_seq = reverse_complement(&r2.seq); + let mut r1_quals = r1.qual.clone(); + let mut r2_quals = r2.qual.clone(); + r2_quals.reverse(); + + if self.adapter { + std::mem::swap(&mut r1_seq, &mut r2_seq); + std::mem::swap(&mut r1_quals, &mut r2_quals); + } + + for i in 0..self.overlap { + let agree = r1_seq[i + self.shift] == r2_seq[i]; + + // when bases agree, set both to the sum of the qualities. + if agree { + // avoid overflow and cap bq to 127 (last ascii character) + r2_quals[i] = (r1_quals[i + self.shift] as u16 + r2_quals[i] as u16).min(126) as u8; + r1_quals[i + self.shift] = r2_quals[i]; + } else { + // when bases disagree, set the lower quality base to the higher quality base. + if r1_quals[i + self.shift] >= r2_quals[i] + bq_delta { + r2_seq[i] = r1_seq[i + self.shift]; + r2_quals[i] = r1_quals[i + self.shift]; + } else if r2_quals[i] >= r1_quals[i + self.shift] + bq_delta { + r1_seq[i + self.shift] = r2_seq[i]; + r1_quals[i + self.shift] = r2_quals[i]; + } + } + } + + if self.adapter { + std::mem::swap(&mut r1_seq, &mut r2_seq); + std::mem::swap(&mut r1_quals, &mut r2_quals); + if drop_adapter { + // drop right end of r1 + r1_seq = r1_seq[..r1_seq.len() - self.shift].to_vec(); + r1_quals = r1_quals[..r1_quals.len() - self.shift].to_vec(); + // we're still reversed here, so we also drop of left end of r2 + r2_seq = r2_seq[self.shift..].to_vec(); + r2_quals = r2_quals[self.shift..].to_vec(); + } + } + + r1.seq = r1_seq; + r1.qual = r1_quals; + + r2.seq = reverse_complement(&r2_seq); + r2.qual = r2_quals.iter().rev().copied().collect::>(); + } +} + +fn reverse_complement(s: &[u8]) -> Vec { + s.iter().rev().map(|x| complement(*x)).collect::>() +} + +/// find overlap given the sequence of a pair of reads. +pub fn find_overlap( + s1: &[u8], + s2: &[u8], + min_overlap: usize, + // TODO: add max_overlap_error_rate param. + max_diffs: usize, +) -> Option { + // note that it's possible to do this without allocating, but then we must (re)complement many times. + let s2 = s2.iter().rev().map(|x| complement(*x)).collect::>(); + + 'shift_loop: for shift in 0..s1.len() - min_overlap { + let mut diffs = 0; + let overlap_len = (s1.len() - shift).min(s2.len()); + for i in 0..overlap_len { + if s1[i + shift] != s2[i] { + diffs += 1; + if diffs > max_diffs { + continue 'shift_loop; + } + } + } + // since were here, we have a large enough overlap + return Some(PairOverlap { shift, overlap: overlap_len, diffs, adapter: false }); + } + + // now handle case where there is adapter sequenced. + // shift is in s2 this time. + 'shift_loop: for shift in 0..s2.len() - min_overlap { + let mut diffs = 0; + let overlap_len = (s2.len() - shift).min(s1.len()); + for i in 0..overlap_len { + if s1[i] != s2[i + shift] { + diffs += 1; + if diffs > max_diffs { + continue 'shift_loop; + } + } + } + // since were here, we have a large enough overlap + return Some(PairOverlap { shift, overlap: overlap_len, diffs, adapter: true }); + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_overlap() { + // shift: 0 + // i: 0123456789 + // s1: ACGAAAAA + // s2: AAAAAGGC + // shift: 3 + // i+shift 34567 + // s1: ACGAAAAA + // i: 01234567 + // s2: AAAAAGGC + let r1 = b"ACGAAAAA"; + let r2 = b"AAAAAGGC".iter().rev().map(|x| complement(*x)).collect::>(); + let overlap = find_overlap(r1, &r2, 4, 0); + assert_eq!(overlap, Some(PairOverlap { shift: 3, overlap: 5, diffs: 0, adapter: false })); + } + + #[test] + fn test_overlap_with_adapters() { + // sequenced adapter + // we know this is the case when end of read1 is beyond end of read2 (here beyond == to the right of) + // shift: 0 + // i: 0123456789 + // s1: AAAAAGGC + // s2: ACGAAAAA + // shift: 3 + // s1: AAAAAGGC + // s2: ACGAAAAA + let r1 = b"AAAAAGGC"; + let r2 = b"ACGAAAAA".iter().rev().map(|x| complement(*x)).collect::>(); + let overlap = find_overlap(r1, &r2, 4, 0); + assert_eq!(overlap, Some(PairOverlap { shift: 3, overlap: 5, diffs: 0, adapter: true })); + } + + #[test] + fn test_overlap_correction() { + let overlap = PairOverlap { shift: 3, overlap: 5, diffs: 0, adapter: false }; + let mut r1 = OwnedRecord { + head: b"r1".to_vec(), + seq: b"ACGAAATA".to_vec(), + qual: b"IIIIIIII".to_vec(), + }; + let mut r2 = OwnedRecord { + head: b"r2".to_vec(), + seq: reverse_complement(b"AAAAAGGC"), + qual: b"IIIIIIII".to_vec(), + }; + + overlap.correct(&mut r1, &mut r2, 5, true); + + // the sequences are different, but not corrected becase BQs are the same. + assert_eq!(r1.seq, b"ACGAAATA".to_vec()); + assert_eq!(r2.seq, reverse_complement(b"AAAAAGGC")); + + // now adjust r2 to have higher BQs than r1 for that base. + // ACGAAATA + // AAAAAGGC + // IIIQIIII + + r2.qual = b"IIIQIIII".iter().rev().map(|b| *b).collect::>(); + overlap.correct(&mut r1, &mut r2, 5, true); + assert_eq!(r1.seq, b"ACGAAAAA".to_vec()); + } + + #[test] + fn test_overlap_correction_with_adapter() { + let overlap = PairOverlap { shift: 3, overlap: 5, diffs: 0, adapter: true }; + // IIIQIIII + // r1: AAAAAGGC + // r2: ACGAAATA + + let mut r1 = OwnedRecord { + head: b"r1".to_vec(), + seq: b"AAAAAGGC".to_vec(), + qual: b"IIIQIIII".to_vec(), + }; + let mut r2 = OwnedRecord { + head: b"r2".to_vec(), + seq: reverse_complement(b"ACGAAATA").to_vec(), + qual: b"IIIIIIII".to_vec(), + }; + + overlap.correct(&mut r1, &mut r2, 5, true); + + // sequence 2 is corrected. + assert_eq!(r2.seq, reverse_complement(b"AAAAA")); + assert_eq!(r1.seq, b"AAAAA"); + + assert_eq!(r2.qual, b"~Q~~~"); + assert_eq!(r1.qual, b"~~~Q~"); + } +}