From 27b140d54226ef73dec3084015bc6f73dfddb79c Mon Sep 17 00:00:00 2001 From: znorgaard Date: Mon, 16 Dec 2024 10:52:47 -0800 Subject: [PATCH] feat: output umi grouping metrics --- .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 29 +++++++++++++++++++ .../umi/GroupReadsByUmiTest.scala | 5 +++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index aece16860..ddbcc9c71 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -466,6 +466,20 @@ case class TagFamilySizeMetric(family_size: Int, var fraction: Proportion = 0, var fraction_gt_or_eq_family_size: Proportion = 0) extends Metric +/** + * Metrics produced by `GroupReadsByUmi` to describe reads passed through UMI grouping + * @param reads The number of reads accepted for grouping + * @param filteredNonPf The number of non-PF reads + * @param filteredPoorAlignment The number of templates discarded for poor + * @param filteredNsInUmi The number of templates discarded due to one or more Ns in the UMI + * @param filterUmisTooShort The number of templates discarded due to a shorter than expected UMI + */ +case class UmiGroupingMetric(reads: Long, + filteredNonPf: Long, + filteredPoorAlignment: Long, + filteredNsInUmi: Long, + filterUmisTooShort: Long) extends Metric + /** The strategies implemented by [[GroupReadsByUmi]] to identify reads from the same source molecule.*/ sealed trait Strategy extends EnumEntry { def newStrategy(edits: Int, threads: Int): UmiAssigner @@ -568,6 +582,7 @@ class GroupReadsByUmi (@arg(flag='i', doc="The input BAM file.") val input: PathToBam = Io.StdIn, @arg(flag='o', doc="The output BAM file.") val output: PathToBam = Io.StdOut, @arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None, + @arg(flag='g', doc="Optional output of UMI grouping metrics.") val groupingMetrics: Option[FilePath] = None, @arg(flag='t', doc="The tag containing the raw UMI.") val rawTag: String = ConsensusTags.UmiBases, @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = ConsensusTags.MolecularId, @arg(flag='d', doc="Turn on duplicate marking mode.") val markDuplicates: Boolean = false, @@ -742,6 +757,20 @@ class GroupReadsByUmi ms.tails.foreach { tail => tail.headOption.foreach(m => m.fraction_gt_or_eq_family_size = tail.map(_.fraction).sum) } Metric.write(p, ms) } + + // Write out UMI grouping metrics + this.groupingMetrics match { + case None => () + case Some(p) => + val groupingMetrics = UmiGroupingMetric( + reads = kept, + filteredNonPf = filteredNonPf, + filteredPoorAlignment = filteredPoorAlignment, + filteredNsInUmi = filteredNsInUmi, + filterUmisTooShort = filterUmisTooShort, + ) + Metric.write(p, groupingMetrics) + } } private def logStats(): Unit = { diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index 09c8ce0b6..4d38217ed 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -247,7 +247,8 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT val in = builder.toTempFile() val out = Files.createTempFile("umi_grouped.", ".sam") val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") - val tool = new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1, minMapQ=Some(30)) + val metrics = Files.createTempFile("umi_grouped.", ".metrics.txt") + val tool = new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), groupingMetrics=Some(metrics), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1, minMapQ=Some(30)) val logs = executeFgbioTool(tool) val groups = readBamRecs(out).groupBy(_.name.charAt(0)) @@ -267,6 +268,8 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT hist.toFile.exists() shouldBe true + metrics.toFile.exists() shouldBe true + // Make sure that we skip sorting for TemplateCoordinate val sortMessage = "Sorting the input to TemplateCoordinate order" logs.exists(_.contains(sortMessage)) shouldBe (sortOrder != TemplateCoordinate)