From e52f2c1aabdb88b565c132e5587d05b91c4e6c0f Mon Sep 17 00:00:00 2001 From: ekiernan <55763654+ekiernan@users.noreply.github.com> Date: Thu, 11 Jul 2024 09:30:14 -0400 Subject: [PATCH 1/2] added nhash_id to multiome, optimus, paired-tag, and atac (#1316) * added nhash_id to multiome, optimus, paired-tag, and atac * changed to null input * Fix testing wdls for nhash_id * fixing nhash id errors * fixed nhash_id examples and verification wdls * made gex_nash_id optional in STARAlign mergestaroutputs task * fixing h5ad utils * fixing nash id python variable in atac wdl * fixed h5ad variable name in atac fragment file creation * fixed echo in STARAlign for nhash id * passing nhash id through pipeline * updated changelogs * updated documentation for nhash_id --- .../skylab/multiome/Multiome.changelog.md | 4 +- pipelines/skylab/multiome/Multiome.wdl | 7 +++- pipelines/skylab/multiome/atac.changelog.md | 3 +- pipelines/skylab/multiome/atac.wdl | 14 +++++-- .../Plumbing/10k_pbmc_downsampled.json | 3 +- .../test_inputs/Scientific/10k_pbmc.json | 3 +- pipelines/skylab/optimus/Optimus.changelog.md | 5 +++ pipelines/skylab/optimus/Optimus.wdl | 12 ++++-- .../Plumbing/human_v3_example.json | 3 +- .../Plumbing/mouse_v2_example.json | 1 + .../Plumbing/mouse_v2_snRNA_example.json | 1 + .../Scientific/inputs_8k_pbmc.json | 1 + .../Scientific/inputs_8k_pbmc_stranded.json | 1 + .../skylab/paired_tag/PairedTag.changelog.md | 4 +- pipelines/skylab/paired_tag/PairedTag.wdl | 10 +++-- .../Plumbing/10k_pbmc_downsampled.json | 3 +- .../Plumbing/BC011_BC015_downsampled.json | 3 +- .../Plumbing/BI015_downsampled.json | 3 +- .../test_inputs/Scientific/10k_pbmc.json | 3 +- .../test_inputs/Scientific/BC011_10kPBMC.json | 3 +- .../skylab/slideseq/SlideSeq.changelog.md | 5 +++ pipelines/skylab/slideseq/SlideSeq.wdl | 2 +- ...iSampleSmartSeq2SingleNucleus.changelog.md | 4 ++ .../MultiSampleSmartSeq2SingleNucleus.wdl | 2 +- tasks/skylab/H5adUtils.wdl | 37 +++++++++++++++++++ 
tasks/skylab/StarAlign.wdl | 8 +++- verification/test-wdls/TestMultiome.wdl | 5 ++- verification/test-wdls/TestOptimus.wdl | 4 +- verification/test-wdls/TestPairedTag.wdl | 4 +- website/docs/Pipelines/ATAC/README.md | 1 + .../Pipelines/ATAC/count-matrix-overview.md | 1 + .../Pipelines/Multiome_Pipeline/README.md | 3 +- .../Optimus_Pipeline/Library-metrics.md | 1 + .../Pipelines/Optimus_Pipeline/Loom_schema.md | 1 + .../docs/Pipelines/Optimus_Pipeline/README.md | 3 +- 35 files changed, 138 insertions(+), 30 deletions(-) diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md index 3063a9389f..94a1c07022 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -1,8 +1,10 @@ -# 5.1.2 +# 5.2.0 2024-07-09 (Date of Last Commit) +* Added new optional input parameter of nhash_id, an optional identifier for a library aliquot that is echoed in the ATAC fragment h5ad, the gene expression h5ad (in the data.uns), and the gene expression library metrics CSV output; default is set to null * Added test statements again for GH action (to release from develop). Will probably revert + # 5.1.0 2024-06-28 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index f57d25ec0a..607d78d8f9 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -6,10 +6,13 @@ import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils import "https://raw.githubusercontent.com/broadinstitute/CellBender/v0.3.0/wdl/cellbender_remove_background.wdl" as CellBender workflow Multiome { - String pipeline_version = "5.1.2" + + String pipeline_version = "5.2.0" input { String input_id + # Additional library aliquot ID + String? 
nhash_id # Optimus Inputs String counting_mode = "sn_rna" @@ -57,6 +60,7 @@ workflow Multiome { i1_fastq = gex_i1_fastq, input_id = input_id + "_gex", output_bam_basename = input_id + "_gex", + gex_nhash_id = nhash_id, tar_star_reference = tar_star_reference, annotations_gtf = annotations_gtf, mt_genes = mt_genes, @@ -82,6 +86,7 @@ workflow Multiome { whitelist = atac_whitelist, adapter_seq_read1 = adapter_seq_read1, annotations_gtf = annotations_gtf, + atac_nhash_id = nhash_id, adapter_seq_read3 = adapter_seq_read3 } call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes { diff --git a/pipelines/skylab/multiome/atac.changelog.md b/pipelines/skylab/multiome/atac.changelog.md index 89ac275882..34587f7f6f 100644 --- a/pipelines/skylab/multiome/atac.changelog.md +++ b/pipelines/skylab/multiome/atac.changelog.md @@ -1,6 +1,7 @@ -# 2.0.2 +# 2.1.0 2024-07-09 (Date of Last Commit) +* Added new optional input parameter of atac_nhash_id, an identifier for a library aliquot that is echoed in the atac fragment metrics h5ad (in the data.uns); default is set to null * Added test statements again for GH action (to release from develop). Will probably revert # 2.0.0 diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index 08ad6649de..2a43694bca 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -18,6 +18,8 @@ workflow ATAC { # Output prefix/base name for all intermediate files and pipeline outputs String input_id + # Additional library aliquot ID + String? 
atac_nhash_id # Option for running files with preindex Boolean preindex = false @@ -41,7 +43,7 @@ workflow ATAC { String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" } - String pipeline_version = "2.0.2" + String pipeline_version = "2.1.0" parameter_meta { read1_fastq_gzipped: "read 1 FASTQ file as input for the pipeline, contains read 1 of paired reads" @@ -105,7 +107,8 @@ workflow ATAC { bam = BBTag.bb_bam, chrom_sizes = chrom_sizes, annotations_gtf = annotations_gtf, - preindex = preindex + preindex = preindex, + atac_nhash_id = atac_nhash_id } } if (!preindex) { @@ -114,7 +117,8 @@ workflow ATAC { bam = BWAPairedEndAlignment.bam_aligned_output, chrom_sizes = chrom_sizes, annotations_gtf = annotations_gtf, - preindex = preindex + preindex = preindex, + atac_nhash_id = atac_nhash_id } } @@ -447,6 +451,7 @@ task CreateFragmentFile { Int mem_size = 16 Int nthreads = 4 String cpuPlatform = "Intel Cascade Lake" + String atac_nhash_id = "" } String bam_base_name = basename(bam, ".bam") @@ -470,6 +475,7 @@ task CreateFragmentFile { chrom_sizes = "~{chrom_sizes}" atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" + atac_nhash_id = "~{atac_nhash_id}" # calculate chrom size dictionary based on text file chrom_size_dict={} @@ -494,6 +500,8 @@ task CreateFragmentFile { # those settings allow us to retain all barcodes pp.import_data("~{bam_base_name}.fragments.tsv", file="temp_metrics.h5ad", chrom_sizes=chrom_size_dict, min_num_fragments=0) atac_data = ad.read_h5ad("temp_metrics.h5ad") + # Add nhash_id to h5ad file as unstructured metadata + atac_data.uns['NHashID'] = atac_nhash_id # calculate tsse metrics snap.metrics.tsse(atac_data, atac_gtf) # Write new atac file diff --git a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json index 7d15111f38..297bdfa2dc 100644 --- a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json +++ 
b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json @@ -23,5 +23,6 @@ "Multiome.Atac.cpu_platform_bwa":"Intel Cascade Lake", "Multiome.Atac.num_threads_bwa":"16", "Multiome.Atac.mem_size_bwa":"64", - "Multiome.soloMultiMappers":"Uniform" + "Multiome.soloMultiMappers":"Uniform", + "Multiome.nhash_id":"example_1234" } diff --git a/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json b/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json index a5ddf2c947..c4965dd9a4 100644 --- a/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json +++ b/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json @@ -30,5 +30,6 @@ "Multiome.chrom_sizes":"gs://broad-gotc-test-storage/Multiome/input/hg38.chrom.sizes", "Multiome.Atac.cpu_platform_bwa":"Intel Cascade Lake", "Multiome.Atac.num_threads_bwa":"24", - "Multiome.Atac.mem_size_bwa":"175" + "Multiome.Atac.mem_size_bwa":"175", + "Multiome.nhash_id":"example_1234" } diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md index 67813f5920..4860fcb13a 100644 --- a/pipelines/skylab/optimus/Optimus.changelog.md +++ b/pipelines/skylab/optimus/Optimus.changelog.md @@ -1,3 +1,8 @@ +# 7.3.0 +2024-07-09 (Date of Last Commit) + +* Added new optional input parameter of gex_nhash_id, a string identifier for a library aliquot that is echoed in the h5ad cell by gene matrix (in the data.uns) and the library metrics CSV output; default is set to null + # 7.2.0 2024-06-28 (Date of Last Commit) diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 1cdb128f17..fd79a6d50a 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -22,6 +22,8 @@ workflow Optimus { Array[File] r2_fastq Array[File]? i1_fastq String input_id + # String for additional library aliquot ID + String? gex_nhash_id = "" String output_bam_basename = input_id String? input_name String? 
input_id_metadata_field @@ -66,7 +68,7 @@ workflow Optimus { # version of this pipeline - String pipeline_version = "7.2.0" + String pipeline_version = "7.3.0" # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays @@ -170,7 +172,8 @@ workflow Optimus { umipercell = STARsoloFastq.umipercell, input_id = input_id, counting_mode = counting_mode, - expected_cells = expected_cells + expected_cells = expected_cells, + gex_nhash_id = gex_nhash_id } if (counting_mode == "sc_rna"){ call RunEmptyDrops.RunEmptyDrops { @@ -186,6 +189,7 @@ workflow Optimus { call H5adUtils.OptimusH5adGeneration{ input: input_id = input_id, + gex_nhash_id = gex_nhash_id, input_name = input_name, input_id_metadata_field = input_id_metadata_field, input_name_metadata_field = input_name_metadata_field, @@ -212,11 +216,13 @@ workflow Optimus { summary = STARsoloFastq.summary_sn_rna, align_features = STARsoloFastq.align_features_sn_rna, umipercell = STARsoloFastq.umipercell_sn_rna, - input_id = input_id + input_id = input_id, + gex_nhash_id = gex_nhash_id } call H5adUtils.SingleNucleusOptimusH5adOutput as OptimusH5adGenerationWithExons{ input: input_id = input_id, + gex_nhash_id = gex_nhash_id, input_name = input_name, input_id_metadata_field = input_id_metadata_field, input_name_metadata_field = input_name_metadata_field, diff --git a/pipelines/skylab/optimus/test_inputs/Plumbing/human_v3_example.json b/pipelines/skylab/optimus/test_inputs/Plumbing/human_v3_example.json index 612659d25c..087a8667d5 100644 --- a/pipelines/skylab/optimus/test_inputs/Plumbing/human_v3_example.json +++ b/pipelines/skylab/optimus/test_inputs/Plumbing/human_v3_example.json @@ -15,5 +15,6 @@ "Optimus.input_id": "pbmc_human_v3", "Optimus.tenx_chemistry_version": "3", "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_v43.annotation.gtf", - "Optimus.star_strand_mode": "Forward" + "Optimus.star_strand_mode": "Forward", + "Optimus.gex_nhash_id":"example_1234" } 
diff --git a/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_example.json b/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_example.json index 0dc26af9fd..39e5cf9b83 100644 --- a/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_example.json +++ b/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_example.json @@ -27,5 +27,6 @@ "Optimus.input_id": "neurons2k_mouse", "Optimus.tenx_chemistry_version": "2", "Optimus.star_strand_mode": "Unstranded", + "Optimus.gex_nhash_id":"example_1234", "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/GRCm39/star/v2_7_10a/modified_vM32.annotation.gtf" } diff --git a/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_snRNA_example.json b/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_snRNA_example.json index 787a1a8347..c4f712a56b 100644 --- a/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_snRNA_example.json +++ b/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_snRNA_example.json @@ -25,5 +25,6 @@ "Optimus.star_strand_mode": "Unstranded", "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/GRCm39/star/v2_7_10a/modified_vM32.annotation.gtf", "Optimus.counting_mode": "sn_rna", + "Optimus.gex_nhash_id":"example_1234", "Optimus.count_exons": true } diff --git a/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc.json b/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc.json index 773af4f2f4..10d30103b3 100644 --- a/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc.json +++ b/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc.json @@ -15,6 +15,7 @@ "Optimus.input_id": "8k_pbmc", "Optimus.tenx_chemistry_version": "2", "Optimus.star_strand_mode": "Unstranded", + "Optimus.gex_nhash_id":"example_1234", "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_v43.annotation.gtf" } diff --git a/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc_stranded.json 
b/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc_stranded.json index 98c9c9912d..bcdf703f58 100644 --- a/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc_stranded.json +++ b/pipelines/skylab/optimus/test_inputs/Scientific/inputs_8k_pbmc_stranded.json @@ -15,6 +15,7 @@ "Optimus.input_id": "8k_pbmc", "Optimus.tenx_chemistry_version": "2", "Optimus.star_strand_mode": "Forward", + "Optimus.gex_nhash_id":"example_1234", "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.primary_assembly.annotation.gtf" } diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index be80a53604..747762b380 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,8 +1,10 @@ -# 1.1.2 +# 1.2.0 2024-07-09 (Date of Last Commit) +* Added new optional input parameter of nhash_id, an optional identifier for a library aliquot that is echoed in the workflow fragment h5ad, the Optimus workflow gene expression h5ad (in the data.uns), and the Optimus gene expression library metrics CSV output; default is set to null * Added test statements again for GH action (to release from develop). Will probably revert + # 1.1.0 2024-06-28 (Date of Last Commit) diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index 8417bfbe50..b647ade474 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -5,10 +5,12 @@ import "../../../pipelines/skylab/optimus/Optimus.wdl" as optimus import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils import "../../../tasks/skylab/PairedTagUtils.wdl" as Demultiplexing workflow PairedTag { - String pipeline_version = "1.1.2" + String pipeline_version = "1.2.0" input { String input_id + # Additional library aliquot id + String? 
nhash_id # Optimus Inputs String counting_mode = "sn_rna" @@ -63,7 +65,8 @@ workflow PairedTag { ignore_r1_read_length = ignore_r1_read_length, star_strand_mode = star_strand_mode, count_exons = count_exons, - soloMultiMappers = soloMultiMappers + soloMultiMappers = soloMultiMappers, + gex_nhash_id = nhash_id } # Call the ATAC workflow @@ -91,7 +94,8 @@ workflow PairedTag { adapter_seq_read1 = adapter_seq_read1, adapter_seq_read3 = adapter_seq_read3, annotations_gtf = annotations_gtf, - preindex = preindex + preindex = preindex, + atac_nhash_id = nhash_id } if (preindex) { diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json index 44e7247682..c2ad3acc9c 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json +++ b/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json @@ -23,5 +23,6 @@ "PairedTag.Atac_preindex.cpu_platform_bwa":"Intel Cascade Lake", "PairedTag.Atac_preindex.num_threads_bwa":"16", "PairedTag.Atac_preindex.mem_size_bwa":"64", - "PairedTag.soloMultiMappers":"Uniform" + "PairedTag.soloMultiMappers":"Uniform", + "PairedTag.nhash_id":"example_1234" } diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json index 9e7b18b679..6401549cae 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json +++ b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json @@ -23,5 +23,6 @@ "PairedTag.Atac_preindex.cpu_platform_bwa":"Intel Cascade Lake", "PairedTag.Atac_preindex.num_threads_bwa":"16", "PairedTag.Atac_preindex.mem_size_bwa":"64", - "PairedTag.soloMultiMappers":"Uniform" + "PairedTag.soloMultiMappers":"Uniform", + "PairedTag.nhash_id":"example_1234" } diff --git 
a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json index 2bdd7a8fe2..2c2f9e5afa 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json +++ b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json @@ -23,5 +23,6 @@ "PairedTag.Atac_preindex.cpu_platform_bwa":"Intel Cascade Lake", "PairedTag.Atac_preindex.num_threads_bwa":"16", "PairedTag.Atac_preindex.mem_size_bwa":"64", - "PairedTag.soloMultiMappers":"Uniform" + "PairedTag.soloMultiMappers":"Uniform", + "PairedTag.nhash_id":"example_1234" } diff --git a/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json b/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json index d4265b3c2a..2e600d192f 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json +++ b/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json @@ -32,5 +32,6 @@ "PairedTag.Atac_preindex.cpu_platform_bwa":"Intel Cascade Lake", "PairedTag.Atac_preindex.num_threads_bwa":"24", "PairedTag.Atac_preindex.mem_size_bwa":"175", - "PairedTag.soloMultiMappers":"Uniform" + "PairedTag.soloMultiMappers":"Uniform", + "PairedTag.nhash_id":"example_1234" } \ No newline at end of file diff --git a/pipelines/skylab/paired_tag/test_inputs/Scientific/BC011_10kPBMC.json b/pipelines/skylab/paired_tag/test_inputs/Scientific/BC011_10kPBMC.json index 7fd31930c9..d1f5030d5a 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Scientific/BC011_10kPBMC.json +++ b/pipelines/skylab/paired_tag/test_inputs/Scientific/BC011_10kPBMC.json @@ -29,5 +29,6 @@ "PairedTag.Atac_preindex.cpu_platform_bwa":"Intel Cascade Lake", "PairedTag.Atac_preindex.num_threads_bwa":"16", "PairedTag.Atac_preindex.mem_size_bwa":"64", - "PairedTag.soloMultiMappers":"Uniform" + "PairedTag.soloMultiMappers":"Uniform", + "PairedTag.nhash_id":"example_1234" } \ No newline at end of file diff --git 
a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md index dd4d9f9248..cb7c2cebbf 100644 --- a/pipelines/skylab/slideseq/SlideSeq.changelog.md +++ b/pipelines/skylab/slideseq/SlideSeq.changelog.md @@ -1,3 +1,8 @@ +# 3.1.8 +2024-07-09 (Date of Last Commit) + +* Added new optional input parameter of gex_nhash_id to the STARAlign task; this does not impact the SlideSeq workflow + # 3.1.7 2024-06-28 (Date of Last Commit) diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index 09dac4dff8..3f8ddc3548 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -23,7 +23,7 @@ import "../../../tasks/skylab/MergeSortBam.wdl" as Merge workflow SlideSeq { - String pipeline_version = "3.1.7" + String pipeline_version = "3.1.8" input { Array[File] r1_fastq diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md index 4813b84c11..cfe0955206 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md @@ -1,3 +1,7 @@ +# 1.3.6 +2024-07-09 (Date of Last Commit) +* Added new optional input parameter of gex_nhash_id to the STARAlign task; this does not impact the MultiSampleSmartSeq2SingleNucleus workflow + # 1.3.5 2024-06-28 (Date of Last Commit) diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl index 9de5a0f92b..f31a3cf253 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl +++ 
b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl @@ -40,7 +40,7 @@ workflow MultiSampleSmartSeq2SingleNucleus { String? input_id_metadata_field } # Version of this pipeline - String pipeline_version = "1.3.5" + String pipeline_version = "1.3.6" if (false) { String? none = "None" diff --git a/tasks/skylab/H5adUtils.wdl b/tasks/skylab/H5adUtils.wdl index 924b19a770..3c5e60b585 100644 --- a/tasks/skylab/H5adUtils.wdl +++ b/tasks/skylab/H5adUtils.wdl @@ -9,6 +9,7 @@ task OptimusH5adGeneration { String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" # name of the sample String input_id + String gex_nhash_id = "" # user provided id String? input_name String? input_id_metadata_field @@ -85,6 +86,23 @@ task OptimusH5adGeneration { --expression_data_type "whole_transcript"\ --pipeline_version ~{pipeline_version} fi + + # modify h5ad + python3 <>> runtime { @@ -108,6 +126,8 @@ task SingleNucleusOptimusH5adOutput { String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" # name of the sample String input_id + # additional aliquot id + String gex_nhash_id = "" # user provided id String? input_name String? input_id_metadata_field @@ -167,6 +187,23 @@ task SingleNucleusOptimusH5adOutput { ~{"--input_name_metadata_field " + input_name_metadata_field} \ --expression_data_type "whole_transcript" \ --pipeline_version ~{pipeline_version} + + # modify h5ad + python3 < ~{input_id}_~{gex_nhash_id}_library_metrics.csv + echo "tarring STAR txt files" tar -zcvf ~{input_id}.star_metrics.tar *.txt else echo "No text files found in the folder." @@ -599,7 +605,7 @@ task MergeStarOutput { File col_index = "~{input_id}_sparse_counts_col_index.npy" File sparse_counts = "~{input_id}_sparse_counts.npz" File? cell_reads_out = "~{input_id}.star_metrics.tar" - File? library_metrics="~{input_id}_library_metrics.csv" + File? library_metrics="~{input_id}_~{gex_nhash_id}_library_metrics.csv" File? 
mtx_files ="~{input_id}.mtx_files.tar" } } diff --git a/verification/test-wdls/TestMultiome.wdl b/verification/test-wdls/TestMultiome.wdl index 9a4a0ec83a..1f8b71ba0b 100644 --- a/verification/test-wdls/TestMultiome.wdl +++ b/verification/test-wdls/TestMultiome.wdl @@ -10,6 +10,7 @@ workflow TestMultiome { input { String input_id + String nhash_id # Optimus Inputs String counting_mode = "sn_rna" @@ -85,8 +86,8 @@ workflow TestMultiome { chrom_sizes = chrom_sizes, atac_whitelist = atac_whitelist, run_cellbender = run_cellbender, - soloMultiMappers = soloMultiMappers - + soloMultiMappers = soloMultiMappers, + nhash_id = nhash_id } diff --git a/verification/test-wdls/TestOptimus.wdl b/verification/test-wdls/TestOptimus.wdl index 82bdf03adc..cd9097f70c 100644 --- a/verification/test-wdls/TestOptimus.wdl +++ b/verification/test-wdls/TestOptimus.wdl @@ -17,6 +17,7 @@ workflow TestOptimus { Array[File] r2_fastq Array[File]? i1_fastq String input_id + String gex_nhash_id String output_bam_basename = input_id String? input_name String? 
input_id_metadata_field @@ -84,7 +85,8 @@ workflow TestOptimus { star_strand_mode = star_strand_mode, count_exons = count_exons, ignore_r1_read_length = ignore_r1_read_length, - soloMultiMappers = soloMultiMappers + soloMultiMappers = soloMultiMappers, + gex_nhash_id = gex_nhash_id } # Collect all of the pipeling output into single Array diff --git a/verification/test-wdls/TestPairedTag.wdl b/verification/test-wdls/TestPairedTag.wdl index 9695fb98e6..2cfd56cebd 100644 --- a/verification/test-wdls/TestPairedTag.wdl +++ b/verification/test-wdls/TestPairedTag.wdl @@ -10,6 +10,7 @@ workflow TestPairedTag { input { String input_id + String nhash_id # Optimus Inputs String counting_mode = "sn_rna" @@ -86,7 +87,8 @@ workflow TestPairedTag { adapter_seq_read3 = adapter_seq_read3, chrom_sizes = chrom_sizes, atac_whitelist = atac_whitelist, - soloMultiMappers = soloMultiMappers + soloMultiMappers = soloMultiMappers, + nhash_id = nhash_id } diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md index ac4c411e6e..3f922e86a7 100644 --- a/website/docs/Pipelines/ATAC/README.md +++ b/website/docs/Pipelines/ATAC/README.md @@ -50,6 +50,7 @@ The following describes the inputs of the ATAC workflow. For more details on how | read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). | | read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). | | input_id | Output prefix/base name for all intermediate files and pipeline outputs. | +| atac_nhash_id | String that represents an optional library aliquot identifier. When used, it is echoed in the h5ad unstructured data. | | preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. | | tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files). 
| | num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128). | diff --git a/website/docs/Pipelines/ATAC/count-matrix-overview.md b/website/docs/Pipelines/ATAC/count-matrix-overview.md index d18c1eb41d..ea03e788c1 100644 --- a/website/docs/Pipelines/ATAC/count-matrix-overview.md +++ b/website/docs/Pipelines/ATAC/count-matrix-overview.md @@ -18,6 +18,7 @@ The global attributes (unstuctured metadata) in the h5ad apply to the whole file | Attribute | Program | Details | | --- | --- | --- | | `reference_sequences` | [SnapATAC2](https://github.com/kaizhang/SnapATAC2) | Data frame containing the chromosome sizes for the genome build (i.e., hg38); created using the [`chrom_sizes` pipeline input](README.md). | +| `NHashID` | N/A | A string that represents the NHashID if specified in the workflow | ## Table 2. Cell metrics diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md index dfa8d4bd91..308a7e8bab 100644 --- a/website/docs/Pipelines/Multiome_Pipeline/README.md +++ b/website/docs/Pipelines/Multiome_Pipeline/README.md @@ -55,6 +55,7 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta | Input name | Description | Type | | --- | --- | --- | | input_id | Unique identifier describing the biological sample or replicate that corresponds with the FASTQ files; can be a human-readable name or UUID. | String | +| nhash_id | Optional identifier for the library aliquot; when specified, the workflow will echo the ID in the ATAC and gene expression output h5ads (in the adata.uns section) and in the library-level metrics CSV. | String | | annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner. | File | | gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library.
| Array[File] | | gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library.| Array[File] | @@ -120,7 +121,7 @@ The Multiome workflow calls two WARP subworkflows, one external subworkflow (opt | multimappers_PropUnique_matrix | `UniqueAndMult-PropUnique.mtx` | Optional output produced when `soloMultiMappers` is "PropUnique"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| | gex_aligner_metrics | `.star_metrics.tar` | Text file containing per barcode metrics (`CellReads.stats`) produced by the GEX pipeline STARsolo aligner. | | mtx_files | `.mtx_files.tar` | TAR file with STARsolo matrix market files (barcodes.tsv, features.tsv, and matrix.mtx) | TAR | -| library_metrics | `_library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | +| library_metrics | `__library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | | cell_barcodes_csv | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information.| | checkpoint_file | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. | | h5_array | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. 
| diff --git a/website/docs/Pipelines/Optimus_Pipeline/Library-metrics.md b/website/docs/Pipelines/Optimus_Pipeline/Library-metrics.md index 38cd48398e..46f0811b7d 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/Library-metrics.md +++ b/website/docs/Pipelines/Optimus_Pipeline/Library-metrics.md @@ -11,6 +11,7 @@ To produce the library-level metrics here, the [combined_mtx.py script](https:// | Metric | Description | | ---| --- | +| nhash_id | The first line of the metrics CSV echoes the NHash ID if specified in the workflow run | | number_of_reads | Total number of reads.| | sequencing_saturation | Proportion of unique molecular identifiers (UMIs) observed relative to the total number of possible UMIs. | | fraction_of_unique_reads_mapped_to_genome | Fraction of unique reads that map to the genome. | diff --git a/website/docs/Pipelines/Optimus_Pipeline/Loom_schema.md b/website/docs/Pipelines/Optimus_Pipeline/Loom_schema.md index 8bf61109e8..ce811e1621 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/Loom_schema.md +++ b/website/docs/Pipelines/Optimus_Pipeline/Loom_schema.md @@ -32,6 +32,7 @@ The global attributes (unstuctured metadata) in the h5ad apply to the whole file | `input_id_metadata_field` | Optional string that describes, when applicable, the metadata field containing the `input_id`. | | `input_name_metadata_field` | Optional string that describes, when applicable, the metadata field containing the `input_name`. | | `pipeline_version` | String describing the version of the Optimus pipeline run on the data. | +| `NHashID` | String that represents NHashID (an optional library aliquot identifier) if specified during the workflow run. | ## Table 2.
Cell metrics diff --git a/website/docs/Pipelines/Optimus_Pipeline/README.md b/website/docs/Pipelines/Optimus_Pipeline/README.md index e6ca0b8187..2c5979d658 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/README.md +++ b/website/docs/Pipelines/Optimus_Pipeline/README.md @@ -89,6 +89,7 @@ The example configuration files also contain metadata for the reference files, d | read_struct | String describing the structure of reads; the workflow automatically selects the [10x Genomics](https://www.10xgenomics.com/) read structure that corresponds to the v2 or v3 chemistry based on the input `tenx_chemistry_version`. A custom read structure can also be provided if the input data was generated with a chemistry different from 10x Genomics v2 or v3. To use a custom read structure, set the input `force_no_check` to "true". | N/A | | tar_star_reference | TAR file containing a species-specific reference genome and GTF; it is generated using the [BuildIndices workflow](https://github.com/broadinstitute/warp/tree/master/pipelines/skylab/build_indices/BuildIndices.wdl). | N/A | | input_id | Unique identifier describing the biological sample or replicate that corresponds with the FASTQ files; can be a human-readable name or UUID. | N/A | +| gex_nhash_id | Optional string to identify the library aliquot; will be echoed in the output h5ad file in the adata.uns and the library-level metrics CSV; default is null (`""`) | N/A | | input_name | Optional string that can be used to further identify the original biological sample. | N/A | | input_id_metadata_field | Optional string describing, when applicable, the metadata field containing the input_id. | N/A | | input_name_metadata_field | Optional string describing, when applicable, the metadata field containing the input_name. | N/A | @@ -256,7 +257,7 @@ The following table lists the output files produced from the pipeline. For sampl | cell_metrics | `.cell-metrics.csv.gz` | Matrix of metrics by cells. 
| Compressed CSV | | gene_metrics | `.gene-metrics.csv.gz` | Matrix of metrics by genes. | Compressed CSV | | aligner_metrics | `.star_metrics.tar` | Tarred metrics files produced by the STARsolo aligner; contains align features, cell reads, summary, and UMI per cell metrics files. | TXT | -| library_metrics | `_library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. See the [Library-level metrics](./Library-metrics.md) for how metrics are calculated. | CSV | +| library_metrics | `__library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. See the [Library-level metrics](./Library-metrics.md) for how metrics are calculated. | CSV | | multimappers_EM_matrix | `UniqueAndMult-EM.mtx` | Optional output produced when `soloMultiMappers` is "EM"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | MTX | | multimappers_Uniform_matrix | `UniqueAndMult-Uniform.mtx` | Optional output produced when `soloMultiMappers` is "Uniform"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | MTX | | multimappers_Rescue_matrix | `UniqueAndMult-Rescue.mtx` | Optional output produced when `soloMultiMappers` is "Rescue"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | MTX | From b8a753edd8e761a835164cc7b84f136c7fbe51d1 Mon Sep 17 00:00:00 2001 From: Kevin Palis Date: Thu, 11 Jul 2024 13:11:54 -0400 Subject: [PATCH 2/2] Fixing branch detection. 
Adding error handling if branch provided is neither develop or master (#1329) --- .github/workflows/warp_release.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/warp_release.yml b/.github/workflows/warp_release.yml index 495c047f7b..37d1c0e3b7 100644 --- a/.github/workflows/warp_release.yml +++ b/.github/workflows/warp_release.yml @@ -1,4 +1,3 @@ - # This action releases from develop/master for all changed pipelines name: WARP Release @@ -40,16 +39,20 @@ jobs: run: | source scripts/common.sh set -e - if [[ "${GIT_BRANCH}" == "develop" ]]; then + BRANCH_NAME=$(echo "${GITHUB_REF#refs/heads/}") + if [[ "${BRANCH_NAME}" == "develop" ]]; then ENV=dev - elif [[ "${GIT_BRANCH}" == "master" ]]; then + elif [[ "${BRANCH_NAME}" == "master" ]]; then ENV=prod + else + echo "Error: Branch ${BRANCH_NAME} is not a valid release branch." + exit 1 fi echo $ENV echo "Getting all changed pipelines since last commit before releasing from develop" previous_commit_hash=$(git rev-parse HEAD^1) changed_pipelines=$(get_modified_pipelines ${previous_commit_hash}) - echo branch: ${GIT_BRANCH} previous_commit_hash: ${previous_commit_hash} env: ${ENV} + echo branch: ${BRANCH_NAME} previous_commit_hash: ${previous_commit_hash} env: ${ENV} if [[ -n ${ENV} ]]; then if [[ -n ${changed_pipelines[@]} ]]; then for pipeline in ${changed_pipelines[@]}; do @@ -62,6 +65,4 @@ jobs: echo "Releases are only made on merge to develop and master" fi env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file