Skip to content

Commit

Permalink
feat: FastqToBam can extract UMI(s) from the comment in the read name
Browse files Browse the repository at this point in the history
  • Loading branch information
nh13 committed May 20, 2024
1 parent ba0788e commit a316211
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 9 deletions.
19 changes: 15 additions & 4 deletions src/main/scala/com/fulcrumgenomics/fastq/FastqToBam.scala
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,15 @@ import htsjdk.samtools.{ReservedTagConstants, SAMFileHeader, SAMReadGroupRecord}
|For more information on read structures see the
|[Read Structure Wiki Page](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures)
|
|UMIs may be extracted from the read sequences, the read names, or both. If `--extract-umis-from-read-names` is
|UMIs may be extracted from the read sequences, the read names (or comment), or both. If `--extract-umis-from-read-names` is
|specified, any UMIs present in the read names are extracted; read names are expected to be `:`-separated with
|any UMIs present in the 8th field. If this option is specified, the `--umi-qual-tag` option may not be used as
|qualities are not available for UMIs in the read name. If UMI segments are present in the read structures those
|will also be extracted. If UMIs are present in both, the final UMIs are constructed by first taking the UMIs
|from the read names, then adding a hyphen, then the UMIs extracted from the reads.
|from the read names, then adding a hyphen, then the UMIs extracted from the reads. If `--extract-umis-from-read-comment` is
|specified, any UMIs present in the read name comments are extracted; the read name comment is the text _after_
|the first white space in the read name (like a FASTA). If the comment is `:`-separated, then the UMI will be
|extracted from the last field, otherwise the full comment will be used.
|
|The same number of input files and read structures must be provided, with one exception: if supplying exactly
|1 or 2 fastq files, both of which are solely template reads, no read structures need be provided.
Expand All @@ -93,7 +96,10 @@ class FastqToBam
@arg(flag='u', doc="Tag in which to store molecular barcodes/UMIs.") val umiTag: String = ConsensusTags.UmiBases,
@arg(flag='q', doc="Tag in which to store molecular barcode/UMI qualities.") val umiQualTag: Option[String] = None,
@arg(flag='Q', doc="Store the sample barcode qualities in the QT Tag.") val storeSampleBarcodeQualities: Boolean = false,
@arg(flag='n', doc="Extract UMI(s) from read names and prepend to UMIs from reads.") val extractUmisFromReadNames: Boolean = false,
@arg(flag='n', doc="Extract UMI(s) from read names and prepend to UMIs from reads.", mutex=Array("extractUmisFromReadComment"))
val extractUmisFromReadNames: Boolean = false,
@arg(flag='c', doc="Extract UMI(s) from read name comment and prepend to UMIs from reads.", mutex=Array("extractUmisFromReadNames"))
val extractUmisFromReadComment: Boolean = false,
@arg( doc="Read group ID to use in the file header.") val readGroupId: String = "A",
@arg( doc="The name of the sequenced sample.") val sample: String,
@arg( doc="The name/ID of the sequenced library.") val library: String,
Expand All @@ -117,6 +123,7 @@ class FastqToBam
validate(input.length == actualReadStructures.length, "input and read-structure must be supplied the same number of times.")
validate(1 to 2 contains actualReadStructures.flatMap(_.templateSegments).size, "read structures must contain 1-2 template reads total.")
validate(!extractUmisFromReadNames || umiQualTag.isEmpty, "Cannot extract UMI qualities when also extracting UMI from read names.")
validate(!extractUmisFromReadComment || umiQualTag.isEmpty, "Cannot extract UMI qualities when also extracting UMI from read description.")

override def execute(): Unit = {
val encoding = qualityEncoding
Expand Down Expand Up @@ -166,7 +173,11 @@ class FastqToBam
val templates = subs.iterator.filter(_.kind == Template).toList

// If requested, pull out the UMI(s) from the read name
val umiFromReadName = if (extractUmisFromReadNames) Umis.extractUmisFromReadName(fqs.head.name, strict=true) else None
val umiFromReadName = {
if (extractUmisFromReadNames) Umis.extractUmisFromReadName(fqs.head.name, strict=true)
else if (extractUmisFromReadComment) fqs.head.comment.flatMap(comment => Umis.extractUmisFromReadComment(comment, strict=true))
else None
}

templates.zipWithIndex.map { case (read, index) =>
// If the template read had no bases, we'll substitute in a single N @ Q2 below to keep htsjdk happy
Expand Down
40 changes: 38 additions & 2 deletions src/main/scala/com/fulcrumgenomics/umi/Umis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,15 @@ object Umis {

/**
* Extracts the UMI from an Illumina fastq style read name. Illumina documents their FASTQ read names as:
* @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>
* `@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>``
*
* See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
* The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI
* field may contain multiple UMIs, in which case they will delimit them with `+` characters. Pluses will be
* translated to hyphens before returning.
*
* If `strict` is true the name _must_ contain either 7 or 8 colon-separated segments,
with the UMI being the last in the case of 8 and `None` in the case of 7.
* with the UMI being the last in the case of 8 and `None` in the case of 7.
*
* If `strict` is false the last segment is returned so long as it appears to be a valid UMI.
*/
Expand All @@ -88,6 +88,42 @@ object Umis {
else umi
}

/**
* Extracts the UMI from an Illumina fastq style read name. Illumina documents their FASTQ read names as:
* `@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>``
*
* The index in the description may be replaced with a UMI field.
*
* See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
* The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI
* field may contain multiple UMIs, in which case they will delimit them with `+` characters. Pluses will be
* translated to hyphens before returning.
*
* If `strict` is true the comment _must_ contain either 4 colon-separated segments,
* with the UMI being the last in the case of 4.
*
* If `strict` is false the last segment is returned so long as it appears to be a valid UMI.
*/
def extractUmisFromReadComment(comment: String, delimiter: Char = ':', strict: Boolean): Option[String] = {
// If strict, check that the read name actually has eight parts, which is expected
val rawUmi = if (strict) {
val colons = comment.count(_ == delimiter)
if (colons == 3) Some(comment.substring(comment.lastIndexOf(delimiter) + 1, comment.length))
else throw new IllegalArgumentException(s"Trying to extract UMI from read comment with ${colons + 1} parts (4 expected): ${comment}")
} else {
val idx = comment.lastIndexOf(delimiter)
if (idx == -1) Some(comment)
else Some(comment.substring(idx + 1, comment.length))
}

val umi = rawUmi.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase)
val valid = umi.forall(u => u.forall(isValidUmiCharacter))

if (strict && !valid) throw new IllegalArgumentException(s"Invalid UMI '${umi.get}' extracted from name '${comment}")
else if (!valid) None
else umi
}

@inline private def isValidUmiCharacter(ch: Char): Boolean = {
ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '-'
}
Expand Down
31 changes: 31 additions & 0 deletions src/test/scala/com/fulcrumgenomics/fastq/FastqToBamTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,37 @@ class FastqToBamTest extends UnitSpec {
recs(1).apply[String]("RX") shouldBe "TTGA-TAAT-TA-AA"
}


it should "extract UMIs from the comment in read names when requested" in {
val r1 = fq(
FastqRecord("q1:2:3:4:5:6:7:8", "AAAAAAAAAA", "==========", comment=Some("0:1:2:ACGT")),
FastqRecord("q2:2:3:4:5:6:7:8", "TAAAAAAAAA", "==========", comment=Some("0:1:2:TTGA")),
FastqRecord("q3:2:3:4:5:6:7", "TAAAAAAAAA", "=========="),
)
val bam = makeTempFile("fastqToBamTest.", ".bam")
new FastqToBam(input=Seq(r1), output=bam, sample="s", library="l", extractUmisFromReadComment=true).execute()
val recs = readBamRecs(bam)
recs should have size 3
recs(0).apply[String]("RX") shouldBe "ACGT"
recs(1).apply[String]("RX") shouldBe "TTGA"
recs(2).get[String]("RX") shouldBe None
}

it should "extract UMIs from the comment in read names and read sequences" in {
val r1 = fq(
FastqRecord("q1:2:3:4:5:6:7:8", "GGNCCGAAAAAAA", "=============", comment=Some("0:1:2:ACGT+CGTA")),
FastqRecord("q2:2:3:4:5:6:7:8", "TANAACAAAAAAA", "=============", comment=Some("0:1:2:TTGA+TAAT")),
)
val rs = ReadStructure("2M1S2M+T")
val bam = makeTempFile("fastqToBamTest.", ".bam")
new FastqToBam(input=Seq(r1), readStructures=Seq(rs), output=bam, sample="s", library="l", extractUmisFromReadComment=true).execute()
val recs = readBamRecs(bam)
recs should have size 2
recs(0).apply[String]("RX") shouldBe "ACGT-CGTA-GG-CC"
recs(1).apply[String]("RX") shouldBe "TTGA-TAAT-TA-AA"
}


it should "fail when read names don't match up" in {
val r1 = fq(FastqRecord("q1", "AAAAAAAAAA", "=========="))
val r2 = fq(FastqRecord("x1", "CCCCCCCCCC", "##########"))
Expand Down
41 changes: 38 additions & 3 deletions src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class UmisTest extends UnitSpec with OptionValues {
Umis.extractUmisFromReadName("1:2:3:4:5:6:7:ACGTACGT", strict=true).value shouldBe "ACGTACGT"
}

it should "return None if the read name has only 7 parts in strict mode" in {
it should "return None if the read has only 7 parts in strict mode" in {
Umis.extractUmisFromReadName("1:2:3:4:5:6:7", strict=true) shouldBe None
}

Expand Down Expand Up @@ -78,6 +78,41 @@ class UmisTest extends UnitSpec with OptionValues {
Umis.extractUmisFromReadName("1:2:3:4:5:6:7:8", strict=false) shouldBe None
}

"Umis.extractUmisFromReadComment" should "extract a UMI from a well-formed read name" in {
Umis.extractUmisFromReadComment("0:1:2:ACGTACGT", strict=true).value shouldBe "ACGTACGT"
}

it should "throw an exception in strict mode if the read has too many or too few segments" in {
an[Exception] shouldBe thrownBy { Umis.extractUmisFromReadComment("0:1:ACGTACGT", strict=true) }
an[Exception] shouldBe thrownBy { Umis.extractUmisFromReadComment("0:1:2:3:ACGTACGT", strict=true) }
}

it should "translate pluses to hyphens when multiple UMIs are present" in {
Umis.extractUmisFromReadComment("0:1:2:ACGTACGT+TTGCGGCT", strict=true).value shouldBe "ACGTACGT-TTGCGGCT"
}

it should "extract a UMI from the last segment in non-strict mode, if it looks like a UMI" in {
Umis.extractUmisFromReadComment("ACGT", strict=false).value shouldBe "ACGT"
Umis.extractUmisFromReadComment("1:ACGT", strict=false).value shouldBe "ACGT"
Umis.extractUmisFromReadComment("1:2:ACGT", strict=false).value shouldBe "ACGT"
Umis.extractUmisFromReadComment("1:2:3:ACGT", strict=false).value shouldBe "ACGT"
Umis.extractUmisFromReadComment("1:2:3:4:ACGT", strict=false).value shouldBe "ACGT"
Umis.extractUmisFromReadComment("1:2:3:4:5:ACGT", strict=false).value shouldBe "ACGT"
Umis.extractUmisFromReadComment("1:2:3:4:5:6:ACGT", strict=false).value shouldBe "ACGT"
Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:ACGT", strict=false).value shouldBe "ACGT"
Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:8:ACGT", strict=false).value shouldBe "ACGT"
Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:8:9:ACGT", strict=false).value shouldBe "ACGT"
}

it should "return None in non-strict mode if the last segment doesn't look like a UMI" in {
Umis.extractUmisFromReadComment("1:2", strict=false) shouldBe None
Umis.extractUmisFromReadComment("1:2:3", strict=false) shouldBe None
Umis.extractUmisFromReadComment("1:2:3:4", strict=false) shouldBe None
Umis.extractUmisFromReadComment("1:2:3:4:5", strict=false) shouldBe None
Umis.extractUmisFromReadComment("1:2:3:4:5:6", strict=false) shouldBe None
Umis.extractUmisFromReadComment("1:2:3:4:5:6:7", strict=false) shouldBe None
Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:8", strict=false) shouldBe None
}

"Umis.copyUmiFromReadName" should "copy the UMI from the read name" in {
copyUmiFromReadName(rec=rec("UMI:A")).nameAndUmi shouldBe ("UMI:A", "A")
Expand All @@ -91,7 +126,7 @@ class UmisTest extends UnitSpec with OptionValues {
copyUmiFromReadName(rec=rec("UMI:C:ACC+GGT"), removeUmi=true).nameAndUmi shouldBe ("UMI:C", "ACC-GGT")
}

it should "split on a different name delimiter if specified" in {
it should "split on a different delimiter if specified" in {
copyUmiFromReadName(rec=rec("UMI-A"), delimiter='-').nameAndUmi shouldBe ("UMI-A", "A")
copyUmiFromReadName(rec=rec("UMI-C-A"), delimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A")
copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), delimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT")
Expand All @@ -101,7 +136,7 @@ class UmisTest extends UnitSpec with OptionValues {
copyUmiFromReadName(rec=rec("UMI:C:ACC+GGT")).nameAndUmi shouldBe ("UMI:C:ACC+GGT", "ACC-GGT")
}

it should "fail if the read name has only one field" in {
it should "fail if the read has only one field" in {
an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("NAME"))
an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("1-2"))
}
Expand Down

0 comments on commit a316211

Please sign in to comment.