diff --git a/src/bin/commands/trimmer.rs b/src/bin/commands/trimmer.rs index 03befa6..ccc3693 100644 --- a/src/bin/commands/trimmer.rs +++ b/src/bin/commands/trimmer.rs @@ -50,6 +50,18 @@ pub(crate) struct TrimmerOpts { #[clap(long, short = 'L', default_value = "15")] filter_longer: usize, + /// Size of window to look for oscillations. + #[clap(long, default_value = "15")] + osc_window: usize, + + /// Number of oscillations in a window to trigger trimming/masking. + #[clap(long, default_value = "4")] + osc_max_oscillations: usize, + + /// Difference between adjacent bases to be considered on oscilation. + #[clap(long, default_value = "10")] + osc_delta: usize, + /// Minimum difference in base-quality for one read to correct an overlapping /// base from the other read. #[clap(long, short = 'd', default_value = "15")] @@ -149,7 +161,24 @@ impl Command for TrimmerOpts { } } Operation::Osc => { - unimplemented!("OSC not implemented") + if let Some(i) = base_quality::identify_trim_point( + r1.qual(), + self.osc_delta as i32, + self.osc_window, + self.osc_max_oscillations, + ) { + // TODO [Brent]: allow clip or mask with flag + base_quality::clip_read(&mut r1, 0usize..i); + } + if let Some(i) = base_quality::identify_trim_point( + r2.qual(), + self.osc_delta as i32, + self.osc_window, + self.osc_max_oscillations, + ) { + // TODO [Brent]: allow clip or mask with flag + base_quality::clip_read(&mut r2, 0usize..i); + } } Operation::LenFilter => { if r1.seq().len().min(r2.seq().len()) < self.filter_shorter diff --git a/src/lib/base_quality.rs b/src/lib/base_quality.rs index 8397a61..6afd974 100644 --- a/src/lib/base_quality.rs +++ b/src/lib/base_quality.rs @@ -3,8 +3,48 @@ use seq_io::fastq::OwnedRecord; use std::ops::Range; -pub fn find_oscillating_quals(_bqs: &[u8]) -> Range { - unimplemented!("find_oscillating_quals"); +/// Returns an optional point at which to trim the read due to quality oscillations. +/// +/// # Arguments +/// +/// * `quals` - The quality scores for the read. +/// * `osc_delta` - The minimum difference in quality scores to be considered an oscillation. +/// * `window_size` - The number of bases to consider in a window. +/// * `max_oscillations` - The maximum number of oscillations allowed in a window. +pub fn identify_trim_point( + quals: &[u8], + osc_delta: i32, + window_size: usize, + max_oscillations: usize, +) -> Option { + // Compute a vector of booleans to represent whether each position is immediately + // after an oscillation + let mut is_osc = vec![false; quals.len()]; + + for i in 1..quals.len() { + is_osc[i] = (quals[i] as i32 - quals[i - 1] as i32).abs() >= osc_delta; + } + + // TODO [brent]: this is ~O(n^2/2) and could be ~O(n) + // instead, convert is_osc into a vector of indices of oscillations. then can use a + // window of `max_oscillations` and check if there are at least `window_size` oscillations in the window. + for i in 1..is_osc.len() { + if !is_osc[i] { + continue; + } + let mut n = 1; + #[allow(clippy::needless_range_loop)] // will refactor this later anyway + for j in (i + 1)..std::cmp::min(i + window_size, quals.len()) { + if is_osc[j] { + n += 1; + if n > max_oscillations { + return Some(i); + } + } + } + } + + None } // Indicates which tail(s) to clip. @@ -82,4 +122,41 @@ mod tests { let range = find_high_quality_bases(bqs, 'I' as u8, 1, Tail::Right); assert_eq!(range, 0..bqs.len() - 1); } + + #[test] + fn test_identify_trim_point() { + let quals = vec![33; 100]; + let oscillation = 10; + let window_size = 20; + let max_oscillations = 3; + + let trim_point = identify_trim_point(&quals, oscillation, window_size, max_oscillations); + + assert_eq!(trim_point, None); + } + + #[test] + fn test_identify_trim_point_not_quite_big_enough_changes() { + let base = [9u8, 18, 27, 36, 27, 18]; + let quals: Vec = base.into_iter().cycle().take(base.len() * 20).collect(); + + let oscillation = 10; + let window_size = 20; + let max_oscillations = 3; + + let trim_point = identify_trim_point(&quals, oscillation, window_size, max_oscillations); + assert_eq!(trim_point, None); + } + + #[test] + fn test_identify_trim_point_finds_point() { + let left = vec![30u8; 50]; + let right = [15u8, 22, 35, 20, 32]; + let quals = + [&left[..], &right.into_iter().cycle().take(right.len() * 5).collect::>()] + .concat(); + + let trim_point = identify_trim_point(&quals, 10, 20, 3); + assert_eq!(trim_point, Some(50)); + } }