From 3caccf0a065bb01e32d718b9c0f9128bc871b515 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Tue, 16 Aug 2022 16:10:35 -0700 Subject: [PATCH 1/2] Add an option to skip the mate cigar in EstimateRnaSeqInsertSize --- .../rnaseq/EstimateRnaSeqInsertSize.scala | 43 ++++++------------ .../rnaseq/EstimateRnaSeqInsertSizeTest.scala | 45 +++++++++++++++++++ 2 files changed, 58 insertions(+), 30 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSize.scala b/src/main/scala/com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSize.scala index f8fdb36ff..c37724c5b 100644 --- a/src/main/scala/com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSize.scala +++ b/src/main/scala/com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSize.scala @@ -37,11 +37,8 @@ import com.fulcrumgenomics.util.GeneAnnotations._ import com.fulcrumgenomics.util._ import htsjdk.samtools.SamPairUtil.PairOrientation import htsjdk.samtools._ -import htsjdk.samtools.filter._ import htsjdk.samtools.util.{CoordMath, Interval, OverlapDetector} -import scala.jdk.CollectionConverters._ - @clp(description = """Computes the insert size for RNA-Seq experiments. | @@ -54,6 +51,7 @@ import scala.jdk.CollectionConverters._ |chromosomes. Finally, skips transcripts where too few mapped read bases overlap exonic sequence. | |This tool requires each mapped pair to have the mate cigar (`MC`) tag. Use `SetMateInformation` to add the mate cigar. + |Use the `--skip-missing-mate-cigar` to skip these. | |The output metric file will have the extension `.rna_seq_insert_size.txt` and the output histogram file will have |the extension `.rna_seq_insert_size_histogram.txt`. The histogram file gives for each orientation (`FR`, `RF`, `tandem`), @@ -74,7 +72,8 @@ class EstimateRnaSeqInsertSize """" ) val deviations: Double = 10.0, @arg(flag='q', doc="Ignore reads with mapping quality less than this value.") val minimumMappingQuality: Int = 30, - @arg(flag='m', doc="The minimum fraction of read bases that must overlap exonic sequence in a transcript") val minimumOverlap: Double = 0.95 + @arg(flag='m', doc="The minimum fraction of read bases that must overlap exonic sequence in a transcript") val minimumOverlap: Double = 0.95, + @arg(doc="Skip reads who are missing their mate cigar") val skipMissingMateCigar: Boolean = false ) extends FgBioTool with LazyLogging { import EstimateRnaSeqInsertSize._ @@ -90,12 +89,20 @@ class EstimateRnaSeqInsertSize val in = SamSource(input) val refFlatSource = RefFlatSource(refFlat, Some(in.dict)) val counters = pairOrientations.map { pairOrientation => (pairOrientation, new NumericCounter[Long]()) }.toMap - val filter = new AggregateFilter(EstimateRnaSeqInsertSize.filters(minimumMappingQuality=minimumMappingQuality, includeDuplicates=includeDuplicates).asJava) var numReadPairs = 0L val recordIterator = in.iterator.filter { rec => progress.record(rec) if (rec.paired && rec.firstOfPair) numReadPairs += 1 - !filter.filterOut(rec.asSam) + + rec.firstOfPair && // so templates are considered only once + rec.pf && // don't want low quality reads + rec.paired && // only want paired reads + !rec.secondary && !rec.supplementary && // no secondary or supplementary + rec.mapq >= minimumMappingQuality && // decent mapping quality + (includeDuplicates || !rec.duplicate) && // no duplicates, unless specified + rec.mapped && rec.mateMapped && // both ends of a pair are mapped + rec.refIndex == rec.mateRefIndex && // both ends of a pair map to the same contig + (!skipMissingMateCigar || rec.mateCigar.isDefined) // skip records with missing mate cigars if specified } for (gene <- refFlatSource; locus <- gene.loci) { @@ -181,30 +188,6 @@ object EstimateRnaSeqInsertSize { val RnaSeqInsertSizeMetricExtension: String = ".rnaseq_insert_size.txt" val RnaSeqInsertSizeMetricHistogramExtension: String = ".rnaseq_insert_size_histogram.txt" - private trait SingleEndSamRecordFilter extends SamRecordFilter { - override def filterOut(r1: SAMRecord, r2: SAMRecord): Boolean = filterOut(r1) || filterOut(r2) - } - - def filter(f: SamRecord => Boolean): SamRecordFilter = new SingleEndSamRecordFilter { - override def filterOut(r: SAMRecord): Boolean = f(r.asInstanceOf[SamRecord]) - } - - private def filters(minimumMappingQuality: Int, includeDuplicates: Boolean) = List( - MateMappedFilter, // also filters out unpaired reads - new FailsVendorReadQualityFilter, - new AlignedFilter(true), - new SecondaryOrSupplementaryFilter, - new MappingQualityFilter(minimumMappingQuality), - DuplicatesFilter(includeDuplicates=includeDuplicates), - FirstOfPairOnlyFilter, - DifferentReferenceIndexFilter - ) - - private def MateMappedFilter = filter(r => !r.paired || r.mateUnmapped) - private def DuplicatesFilter(includeDuplicates: Boolean) = filter(r => !includeDuplicates && r.duplicate) - private def FirstOfPairOnlyFilter = filter(_.firstOfPair) - private def DifferentReferenceIndexFilter = filter(r => r.refIndex != r.mateRefIndex) - private[rnaseq] def getAndRequireMateCigar(rec: SamRecord): Cigar = { rec.mateCigar.getOrElse { throw new IllegalStateException(s"Mate CIGAR (Tag 'MC') not found for $rec, consider using SetMateInformation.") diff --git a/src/test/scala/com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSizeTest.scala b/src/test/scala/com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSizeTest.scala index 291acb69e..fc6668936 100644 --- a/src/test/scala/com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSizeTest.scala +++ b/src/test/scala/com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSizeTest.scala @@ -347,4 +347,49 @@ class EstimateRnaSeqInsertSizeTest extends UnitSpec with OptionValues { } } } + + it should "run end-to-end and ignore reads missing a mate cigar" in { + val builder = new SamBuilder() + // FR = (133804075 + 100) - 133801671 = 2504 + builder.addPair(contig=2, start1=133801671, start2=133804075, strand1=Plus, strand2=Minus) // overlaps ACKR4 by 100% + builder.addPair(contig=2, start1=133801671, start2=133804075, strand1=Plus, strand2=Minus).foreach { rec => + rec.remove("MC") // remove the mate cigar + } + + // fails, since the default is to require the mate cigar + assertThrows[Exception] { + val bam = builder.toTempFile() + new EstimateRnaSeqInsertSize(input=bam, refFlat=RefFlatFile).execute() + } + + val bam = builder.toTempFile() + val out = PathUtil.pathTo(PathUtil.removeExtension(bam).toString + EstimateRnaSeqInsertSize.RnaSeqInsertSizeMetricExtension) + new EstimateRnaSeqInsertSize(input=bam, refFlat=RefFlatFile, skipMissingMateCigar=true).execute() + val metrics = Metric.read[InsertSizeMetric](path=out) + metrics.length shouldBe PairOrientation.values().length + + val expectedMetrics = Seq( + InsertSizeMetric( + pair_orientation = PairOrientation.FR, + read_pairs = 1, + standard_deviation = 0, + mean = 2504, + min = 1, + max = 1, + median = 2504, + median_absolute_deviation = 0 + ), + ) + + metrics.zip(expectedMetrics).foreach { case (actual, expected) => + actual shouldBe expected + } + + val histogramPath = PathUtil.pathTo(PathUtil.removeExtension(bam).toString + EstimateRnaSeqInsertSize.RnaSeqInsertSizeMetricHistogramExtension) + Io.readLines(path=histogramPath).mkString("\n") shouldBe + """ + |insert_size fr rf tandem + |2504 1 0 0 + """.stripMargin.trim + } } From 24eb2a96c13984b7261459b830b004ea42e96785 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Tue, 16 Aug 2022 19:29:38 -0700 Subject: [PATCH 2/2] Update src/main/scala/com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSize.scala Co-authored-by: Nathan Roach --- .../com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSize.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSize.scala b/src/main/scala/com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSize.scala index c37724c5b..b564e65ba 100644 --- a/src/main/scala/com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSize.scala +++ b/src/main/scala/com/fulcrumgenomics/rnaseq/EstimateRnaSeqInsertSize.scala @@ -51,7 +51,7 @@ import htsjdk.samtools.util.{CoordMath, Interval, OverlapDetector} |chromosomes. Finally, skips transcripts where too few mapped read bases overlap exonic sequence. | |This tool requires each mapped pair to have the mate cigar (`MC`) tag. Use `SetMateInformation` to add the mate cigar. - |Use the `--skip-missing-mate-cigar` to skip these. + |Use the `--skip-missing-mate-cigar` to skip reads missing the mate cigar tag. | |The output metric file will have the extension `.rna_seq_insert_size.txt` and the output histogram file will have |the extension `.rna_seq_insert_size_histogram.txt`. The histogram file gives for each orientation (`FR`, `RF`, `tandem`),