diff --git a/pipeline_versions.txt b/pipeline_versions.txt index f7dfdd8c4a..fe5b88cb04 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -28,15 +28,15 @@ BroadInternalUltimaGenomics 1.1.0 2024-09-06 BroadInternalImputation 1.1.13 2024-09-06 BroadInternalArrays 1.1.12 2024-09-06 MultiSampleSmartSeq2 2.2.22 2024-09-11 -MultiSampleSmartSeq2SingleNucleus 2.0.0 2024-09-11 -PairedTag 1.6.1 2024-09-11 +MultiSampleSmartSeq2SingleNucleus 2.0.1 2024-09-24 +PairedTag 1.7.0 2024-09-24 SmartSeq2SingleSample 5.1.21 2024-09-11 scATAC 1.3.2 2023-08-03 -Optimus 7.6.1 2024-09-11 -Multiome 5.6.1 2024-09-11 +Optimus 7.7.0 2024-09-24 +Multiome 5.7.0 2024-09-24 snm3C 4.0.4 2024-08-06 BuildIndices 3.0.0 2023-12-06 atac 2.3.1 2024-09-11 -SlideSeq 3.4.1 2024-09-11 +SlideSeq 3.4.2 2024-09-24 BuildCembaReferences 1.0.0 2020-11-15 CEMBA 1.1.7 2024-09-06 diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md index 843e4baced..40e399d729 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -1,3 +1,8 @@ +# 5.7.0 +2024-09-24 (Date of Last Commit) +* Added a python implementation of DoubletFinder to calculate doublet scores in gene expression data; percent doublets are now available as a library-level metric and individual doublet scores for cell barcodes are in the h5ad +* Updated gene_names in the final h5ad to be unique + # 5.6.1 2024-09-11 (Date of Last Commit) * Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. 
This change does not affect the Multiome pipeline diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 3979a4fa7d..ce14e6c476 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -9,14 +9,15 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow Multiome { - String pipeline_version = "5.6.1" + String pipeline_version = "5.7.0" input { String cloud_provider String input_id # Additional library aliquot ID - String? nhash_id + String? gex_nhash_id + String? atac_nhash_id # Optimus Inputs String counting_mode = "sn_rna" @@ -89,7 +90,7 @@ workflow Multiome { i1_fastq = gex_i1_fastq, input_id = input_id + "_gex", output_bam_basename = input_id + "_gex", - gex_nhash_id = nhash_id, + gex_nhash_id = gex_nhash_id, tar_star_reference = tar_star_reference, annotations_gtf = annotations_gtf, mt_genes = mt_genes, @@ -118,7 +119,7 @@ workflow Multiome { adapter_seq_read1 = adapter_seq_read1, vm_size = vm_size, annotations_gtf = annotations_gtf, - atac_nhash_id = nhash_id, + atac_nhash_id = atac_nhash_id, adapter_seq_read3 = adapter_seq_read3 } call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes { diff --git a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json index 108c300744..41e68c3307 100644 --- a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json +++ b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json @@ -25,5 +25,6 @@ "Multiome.Atac.num_threads_bwa":"16", "Multiome.Atac.mem_size_bwa":"64", "Multiome.soloMultiMappers":"Uniform", - "Multiome.nhash_id":"example_1234" + "Multiome.gex_nhash_id":"example_1234", + "Multiome.atac_nhash_id":"example_1234" } diff --git a/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json b/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json index 2937b6f4b9..7e35f1a86e 100644 --- 
a/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json +++ b/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json @@ -32,5 +32,6 @@ "Multiome.Atac.cpu_platform_bwa":"Intel Cascade Lake", "Multiome.Atac.num_threads_bwa":"24", "Multiome.Atac.mem_size_bwa":"175", - "Multiome.nhash_id":"example_1234" + "Multiome.gex_nhash_id":"example_1234", + "Multiome.atac_nhash_id":"example_1234" } diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md index 9a51ef009d..f8418bce8d 100644 --- a/pipelines/skylab/optimus/Optimus.changelog.md +++ b/pipelines/skylab/optimus/Optimus.changelog.md @@ -1,3 +1,9 @@ +# 7.7.0 +2024-09-24 (Date of Last Commit) + +* Added a python implementation of DoubletFinder to calculate doublet scores in gene expression data; percent doublets are now available as a library-level metric and individual doublet scores for cell barcodes are in the h5ad +* Updated gene_names in the final h5ad to be unique + # 7.6.1 2024-09-11 (Date of Last Commit) * Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the Optimus pipeline diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index eb03dfb30a..70402c6ced 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -26,7 +26,7 @@ workflow Optimus { Array[File]? i1_fastq String input_id # String for additional library aliquot ID - String? gex_nhash_id = "" + String? gex_nhash_id String output_bam_basename = input_id String? input_name String?
input_id_metadata_field @@ -71,7 +71,7 @@ workflow Optimus { # version of this pipeline - String pipeline_version = "7.6.1" + String pipeline_version = "7.7.0" # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays @@ -91,8 +91,9 @@ workflow Optimus { String pytools_docker = "pytools:1.0.0-1661263730" String empty_drops_docker = "empty-drops:1.0.1-4.2" String star_docker = "star:1.0.1-2.7.11a-1692706072" - String warp_tools_docker_2_2_0 = "warp-tools:2.3.0" - String star_merge_docker = "star-merge-npz:1.2" + String warp_tools_docker_2_2_0 = "warp-tools:2.4.0" + String star_merge_docker = "star-merge-npz:1.3.0" + #TODO how do we handle these? String alpine_docker = "alpine-bash@sha256:965a718a07c700a5204c77e391961edee37477634ce2f9cf652a8e4c2db858ff" @@ -241,11 +242,13 @@ workflow Optimus { input: input_id = input_id, gex_nhash_id = gex_nhash_id, + expected_cells = expected_cells, input_name = input_name, input_id_metadata_field = input_id_metadata_field, input_name_metadata_field = input_name_metadata_field, annotation_file = annotations_gtf, library_metrics = MergeStarOutputs.library_metrics, + cellbarcodes = MergeStarOutputs.outputbarcodes, cell_metrics = CellMetrics.cell_metrics, gene_metrics = GeneMetrics.gene_metrics, sparse_count_matrix = MergeStarOutputs.sparse_counts, @@ -276,11 +279,14 @@ workflow Optimus { input: input_id = input_id, gex_nhash_id = gex_nhash_id, + expected_cells = expected_cells, input_name = input_name, + counting_mode = counting_mode, input_id_metadata_field = input_id_metadata_field, input_name_metadata_field = input_name_metadata_field, annotation_file = annotations_gtf, library_metrics = MergeStarOutputs.library_metrics, + cellbarcodes = MergeStarOutputs.outputbarcodes, cell_metrics = CellMetrics.cell_metrics, gene_metrics = GeneMetrics.gene_metrics, sparse_count_matrix = MergeStarOutputs.sparse_counts, diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md 
index 7ad1571702..ab5104b3b8 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,3 +1,8 @@ +# 1.7.0 +2024-09-24 (Date of Last Commit) +* Added a python implementation of DoubletFinder to calculate doublet scores in gene expression data; percent doublets are now available as a library-level metric and individual doublet scores for cell barcodes are in the h5ad +* Updated gene_names in the final h5ad to be unique + # 1.6.1 2024-09-11 (Date of Last Commit) * Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the PairedTag pipeline diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index c401d25928..938acb24b9 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -8,13 +8,14 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow PairedTag { - String pipeline_version = "1.6.1" + String pipeline_version = "1.7.0" input { String input_id # Additional library aliquot id - String? nhash_id + String? gex_nhash_id + String? 
atac_nhash_id # Optimus Inputs String counting_mode = "sn_rna" @@ -97,7 +98,7 @@ workflow PairedTag { count_exons = count_exons, cloud_provider = cloud_provider, soloMultiMappers = soloMultiMappers, - gex_nhash_id = nhash_id + gex_nhash_id = gex_nhash_id } # Call the ATAC workflow @@ -131,7 +132,7 @@ workflow PairedTag { preindex = preindex, cloud_provider = cloud_provider, vm_size = vm_size, - atac_nhash_id = nhash_id + atac_nhash_id = atac_nhash_id } if (preindex) { diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json index d5cfaf7181..a1df3f587c 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json +++ b/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json @@ -25,5 +25,6 @@ "PairedTag.Atac_preindex.mem_size_bwa":"64", "PairedTag.soloMultiMappers":"Uniform", "PairedTag.cloud_provider": "gcp", - "PairedTag.nhash_id":"example_1234" + "PairedTag.gex_nhash_id":"example_1234", + "PairedTag.atac_nhash_id":"example_1234" } diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json index 1a22504c14..fd2ffd1510 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json +++ b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json @@ -25,5 +25,6 @@ "PairedTag.Atac_preindex.mem_size_bwa":"64", "PairedTag.soloMultiMappers":"Uniform", "PairedTag.cloud_provider": "gcp", - "PairedTag.nhash_id":"example_1234" + "PairedTag.gex_nhash_id":"example_1234", + "PairedTag.atac_nhash_id":"example_1234" } diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json index 27e1b1b124..1b185c8d47 100644 --- 
a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json +++ b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json @@ -25,5 +25,6 @@ "PairedTag.Atac_preindex.mem_size_bwa":"64", "PairedTag.soloMultiMappers":"Uniform", "PairedTag.cloud_provider": "gcp", - "PairedTag.nhash_id":"example_1234" + "PairedTag.gex_nhash_id":"example_1234", + "PairedTag.atac_nhash_id":"example_1234" } diff --git a/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json b/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json index b34c3986aa..47c8ab54bc 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json +++ b/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json @@ -33,6 +33,7 @@ "PairedTag.Atac_preindex.num_threads_bwa":"24", "PairedTag.Atac_preindex.mem_size_bwa":"175", "PairedTag.soloMultiMappers":"Uniform", - "PairedTag.nhash_id":"example_1234", + "PairedTag.gex_nhash_id":"example_1234", + "PairedTag.atac_nhash_id":"example_1234", "PairedTag.cloud_provider": "gcp" } \ No newline at end of file diff --git a/pipelines/skylab/paired_tag/test_inputs/Scientific/BC011_10kPBMC.json b/pipelines/skylab/paired_tag/test_inputs/Scientific/BC011_10kPBMC.json index f68801ccfc..b4ffd4d14c 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Scientific/BC011_10kPBMC.json +++ b/pipelines/skylab/paired_tag/test_inputs/Scientific/BC011_10kPBMC.json @@ -30,6 +30,7 @@ "PairedTag.Atac_preindex.num_threads_bwa":"16", "PairedTag.Atac_preindex.mem_size_bwa":"64", "PairedTag.soloMultiMappers":"Uniform", - "PairedTag.nhash_id":"example_1234", + "PairedTag.gex_nhash_id":"example_1234", + "PairedTag.atac_nhash_id":"example_1234", "PairedTag.cloud_provider": "gcp" } \ No newline at end of file diff --git a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md index dbbe866338..b9cb1f7a56 100644 --- a/pipelines/skylab/slideseq/SlideSeq.changelog.md +++ 
b/pipelines/skylab/slideseq/SlideSeq.changelog.md @@ -1,3 +1,8 @@ +# 3.4.2 +2024-09-24 (Date of Last Commit) + +* Added a python implementation of DoubletFinder to calculate doublet scores in gene expression data; this does not impact the slideseq workflow + # 3.4.1 2024-09-11 (Date of Last Commit) diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index 0ca5d4edc7..0cd1f29e4c 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -25,7 +25,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow SlideSeq { - String pipeline_version = "3.4.1" + String pipeline_version = "3.4.2" input { Array[File] r1_fastq @@ -48,8 +48,8 @@ workflow SlideSeq { # docker images String pytools_docker = "pytools:1.0.0-1661263730" String picard_cloud_docker = "picard-cloud:2.26.10" - String warp_tools_docker_2_2_0 = "warp-tools:2.3.0" - String star_merge_docker = "star-merge-npz:1.2" + String warp_tools_docker_2_2_0 = "warp-tools:2.4.0" + String star_merge_docker = "star-merge-npz:1.3.0" String ubuntu_docker = "ubuntu_16_0_4@sha256:025124e2f1cf4d29149958f17270596bffe13fc6acca6252977c572dd5ba01bf" String gcp_ubuntu_docker_prefix = "gcr.io/gcp-runtimes/" diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md index f6556b3bbb..16ed6cb5c8 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md @@ -1,3 +1,7 @@ +# 2.0.1 +2024-09-24 (Date of Last Commit) +* Added a python implementation of DoubletFinder to calculate doublet scores in gene expression data; this does not affect the snSS2 workflow + # 2.0.0 2024-09-11 (Dat of Last Commit) diff --git 
a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl index 38ae12ff23..debce094b0 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl @@ -57,7 +57,7 @@ workflow MultiSampleSmartSeq2SingleNucleus { } # Version of this pipeline - String pipeline_version = "2.0.0" + String pipeline_version = "2.0.1" if (false) { String? none = "None" diff --git a/tasks/skylab/H5adUtils.wdl b/tasks/skylab/H5adUtils.wdl index 890b044680..0ac5a3dd66 100644 --- a/tasks/skylab/H5adUtils.wdl +++ b/tasks/skylab/H5adUtils.wdl @@ -9,13 +9,16 @@ task OptimusH5adGeneration { String warp_tools_docker_path # name of the sample String input_id - String gex_nhash_id = "" + String? gex_nhash_id # user provided id + String? counting_mode + Int expected_cells = 3000 String? input_name String? input_id_metadata_field String? input_name_metadata_field # gene annotation file in GTF format File annotation_file + File? cellbarcodes File? library_metrics # the file "merged-cell-metrics.csv.gz" that contains the cellwise metrics File cell_metrics @@ -88,40 +91,18 @@ task OptimusH5adGeneration { --pipeline_version ~{pipeline_version} fi - # modify h5ad - python3 <>> runtime { @@ -135,10 +116,11 @@ task OptimusH5adGeneration { output { File h5ad_output = "~{input_id}.h5ad" - File library_metrics = "~{input_id}_~{gex_nhash_id}_library_metrics.csv" + File library_metrics = "~{input_id}_~{gex_nhash_id}_gex_library_metrics.csv" } } + task SingleNucleusOptimusH5adOutput { input { @@ -147,8 +129,10 @@ task SingleNucleusOptimusH5adOutput { # name of the sample String input_id # additional aliquot id - String gex_nhash_id = "" + String? gex_nhash_id # user provided id + String? counting_mode + Int expected_cells = 3000 String? 
input_name String? input_id_metadata_field String? input_name_metadata_field @@ -172,6 +156,8 @@ task SingleNucleusOptimusH5adOutput { File gene_id_exon # library-level metrics File? library_metrics + # Cell calls from starsolo in TSV format + File? cellbarcodes String pipeline_version @@ -210,43 +196,20 @@ task SingleNucleusOptimusH5adOutput { --expression_data_type "whole_transcript" \ --pipeline_version ~{pipeline_version} - # modify h5ad - python3 <>> + # modify h5ad to include doublets, NHASHID, and build library metrics + python3 /warptools/scripts/add_library_tso_doublets.py \ + --gex_h5ad "~{input_id}.h5ad" \ + --cellbarcodes ~{cellbarcodes} \ + ~{"--gex_nhash_id " + gex_nhash_id} \ + --library_csv ~{library_metrics} \ + --input_id ~{input_id} \ + --counting_mode ~{counting_mode} \ + --expected_cells ~{expected_cells} + + mv library_metrics.csv ~{input_id}_~{gex_nhash_id}_gex_library_metrics.csv + + >>> runtime { docker: warp_tools_docker_path cpu: cpu # note that only 1 thread is supported by pseudobam @@ -258,7 +221,7 @@ task SingleNucleusOptimusH5adOutput { output { File h5ad_output = "~{input_id}.h5ad" - File library_metrics = "~{input_id}_~{gex_nhash_id}_library_metrics.csv" + File library_metrics = "~{input_id}_~{gex_nhash_id}_gex_library_metrics.csv" } } diff --git a/tasks/skylab/StarAlign.wdl b/tasks/skylab/StarAlign.wdl index 6cfc286746..ffceb7ce17 100644 --- a/tasks/skylab/StarAlign.wdl +++ b/tasks/skylab/StarAlign.wdl @@ -619,6 +619,7 @@ task MergeStarOutput { File? cell_reads_out = "~{input_id}.star_metrics.tar" File? library_metrics="~{input_id}_library_metrics.csv" File? mtx_files ="~{input_id}.mtx_files.tar" + File? 
outputbarcodes = "outputbarcodes.tsv" } } diff --git a/verification/VerifyTasks.wdl b/verification/VerifyTasks.wdl index 3fdbfef910..d21435c039 100644 --- a/verification/VerifyTasks.wdl +++ b/verification/VerifyTasks.wdl @@ -508,12 +508,27 @@ task CompareH5adFilesGEX { print("Now running equivalence check") - if truth_obs.equals(test_obs)==True and truth_var.equals(test_var)==True and truth_sum==test_sum: + # Check if obs, var, and sum match + if truth_obs.equals(test_obs) and truth_var.equals(test_var) and truth_sum == test_sum: print("pass") else: - exit("Files are not identical") - - print("Done running matrix equivalence check") + # If obs does not match, check if the only difference is in the 'doublet_score' column + if not truth_obs.equals(test_obs): + # Create a boolean DataFrame where True indicates differences + differences = truth_obs.ne(test_obs) # .ne() is the 'not equal' comparison for pandas + + # Identify columns with any differences + differing_columns = differences.any(axis=0) # Check if any value in a column is True + differing_columns = differing_columns[differing_columns].index.tolist() # Get column names with differences + + # Check if the only differing column is 'doublet_score' + if len(differing_columns) == 1 and 'doublet_score' in differing_columns: + print("Files differ in the doublet score") + else: + print(differing_columns) + exit("Multiple columns different") + + print("Done running matrix equivalence check") CODE >>> diff --git a/verification/test-wdls/TestMultiome.wdl b/verification/test-wdls/TestMultiome.wdl index eb1497d6e4..3b214d9ea8 100644 --- a/verification/test-wdls/TestMultiome.wdl +++ b/verification/test-wdls/TestMultiome.wdl @@ -11,7 +11,8 @@ workflow TestMultiome { input { String input_id String cloud_provider - String nhash_id + String gex_nhash_id + String atac_nhash_id # Optimus Inputs String counting_mode = "sn_rna" @@ -84,7 +85,8 @@ workflow TestMultiome { run_cellbender = run_cellbender, soloMultiMappers = 
soloMultiMappers, cloud_provider = cloud_provider, - nhash_id = nhash_id + gex_nhash_id = gex_nhash_id, + atac_nhash_id = atac_nhash_id } diff --git a/verification/test-wdls/TestPairedTag.wdl b/verification/test-wdls/TestPairedTag.wdl index 912db6f99e..1838f9d9f3 100644 --- a/verification/test-wdls/TestPairedTag.wdl +++ b/verification/test-wdls/TestPairedTag.wdl @@ -10,7 +10,8 @@ workflow TestPairedTag { input { String input_id - String nhash_id + String gex_nhash_id + String atac_nhash_id # Optimus Inputs String counting_mode = "sn_rna" @@ -90,7 +91,8 @@ workflow TestPairedTag { atac_whitelist = atac_whitelist, soloMultiMappers = soloMultiMappers, cloud_provider = cloud_provider, - nhash_id = nhash_id + gex_nhash_id = gex_nhash_id, + atac_nhash_id = atac_nhash_id } diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md index 0df6fbcfa8..1062b121a4 100644 --- a/website/docs/Pipelines/Multiome_Pipeline/README.md +++ b/website/docs/Pipelines/Multiome_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Multiome_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [Multiome v5.6.0](https://github.com/broadinstitute/warp/releases) | July, 2024 | Kaylee Mathews | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | +| [Multiome v5.7.0](https://github.com/broadinstitute/warp/releases) | September, 2024 | Kaylee Mathews | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | ![Multiome_diagram](./multiome_diagram.png) @@ -56,8 +56,9 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta | Input name | Description | Type | | --- | --- | --- | | input_id | Unique identifier describing the biological sample or replicate that corresponds with the FASTQ files; can be a human-readable name or UUID. 
| String | -| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | -| nhash_id | Optional identifier for the library aliquot; when specified, the workflow will echo the ID in the ATAC and gene expression output h5ads (in the adata.uns section) and in the library-level metrics CSV. | +| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | +| gex_nhash_id | Optional identifier for the library aliquot; when specified, the gene expression workflow will echo the ID in the gene expression output h5ads (in the adata.uns section) and in the library-level metrics CSV. | +| atac_nhash_id | Optional identifier for the library aliquot; when specified, the workflow will echo the ID in the ATAC output h5ads (in the adata.uns section) and in the library-level metrics CSV. | annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner. | File | | gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library. | Array[File] | | gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library.| Array[File] | diff --git a/website/docs/Pipelines/Optimus_Pipeline/Library-metrics.md b/website/docs/Pipelines/Optimus_Pipeline/Library-metrics.md index 143b8f0730..8336e5e878 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/Library-metrics.md +++ b/website/docs/Pipelines/Optimus_Pipeline/Library-metrics.md @@ -6,7 +6,7 @@ sidebar_position: 5 The following table describes the library level metrics of the produced by the Optimus workflow. These are calcuated using custom python scripts available in the warp-tools repository. The Optimus workflow aligns files in shards to parallelize computationally intensive steps. 
This results in multiple matrix market files and shard-level library metrics. -To produce the library-level metrics here, the [combined_mtx.py script](https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/star-merge-npz/scripts/combined_mtx.py) combines all the shard-level matrix market files into one raw mtx file. Then, STARsolo is run to filter this matrix to only those barcodes that meet STARsolo's criteria of cells (using the Emptydrops_CR parameter). Lastly, the [combine_shard_metrics.py script](https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/star-merge-npz/scripts/combine_shard_metrics.py) uses the filtered matrix and the all of the shard-level metrics files produced by STARsolo to calculate the metrics below. Each of the scripts are called from [MergeStarOutput task](https://github.com/broadinstitute/warp/blob/develop/tasks/skylab/StarAlign.wdl) of the Optimus workflow. +To produce the library-level metrics here, the [combined_mtx.py script](https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/star-merge-npz/scripts/combined_mtx.py) combines all the shard-level matrix market files into one raw mtx file. Then, STARsolo is run to filter this matrix to only those barcodes that meet STARsolo's criteria of cells (using the Emptydrops_CR parameter). This matrix is then used as input during h5ad generation, and metrics are calculated from the final h5ad using the custom [add_library_tso_doublets.py](https://github.com/broadinstitute/warp-tools/tree/develop/tools/scripts) script. | Metric | Description | @@ -37,9 +37,10 @@ To produce the library-level metrics here, the [combined_mtx.py script](https:// | total_genes_unique_detected | Total number of unique genes detected. | | percent_target | Percentage of target cells. Calculated as: estimated_number_of_cells / barcoded_cell_sample_number_of_expected_cells | | percent_intronic_reads | Percentage of intronic reads. 
Calculated as: reads_mapped_confidently_to_intronic_regions / number_of_reads | -| keeper_mean_reads_per_cell | Mean reads per cell for cells with >1500 genes or nuclei with >1000 genes. | -| keeper_median_genes | Median genes per cell for cells with >1500 genes or nuclei with >1000 genes. | -| keeper_cells | Number of cells with >1500 genes or nuclei with >1000 genes.| +| percent_doublets | Percentage of cells flagged as doublets based on doublet scores calculated from a modified [DoubletFinder](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6853612/) algorithm. | +| keeper_mean_reads_per_cell | Mean reads per cell for cells with >1500 genes or nuclei with >1000 genes, and doublet_score < 0.3. | +| keeper_median_genes | Median genes per cell for cells with >1500 genes or nuclei with >1000 genes, and doublet_score < 0.3. | +| keeper_cells | Number of cells with >1500 genes or nuclei with >1000 genes, and doublet_score < 0.3.| | percent_keeper | Percentage of keeper cells. Calculated as: keeper_cells / estimated_cells | | percent_usable | Percentage of usable cells. Calculated as: keeper_cells / expected_cells | | frac_tso | Fraction of reads containing TSO sequence. Calculated as the number of reads that have 20 bp or more of TSO Sequence clipped from 5' end/ total number of reads. | \ No newline at end of file diff --git a/website/docs/Pipelines/Optimus_Pipeline/Loom_schema.md b/website/docs/Pipelines/Optimus_Pipeline/Loom_schema.md index 83e07ba73a..cf861eb91e 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/Loom_schema.md +++ b/website/docs/Pipelines/Optimus_Pipeline/Loom_schema.md @@ -41,6 +41,7 @@ The global attributes (unstuctured metadata) in the h5ad apply to the whole file |`CellID` | [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The unique identifier for each cell based on cell barcodes (sequences used to identify unique cells); identical to `cell_names`.
Learn more about cell barcodes in the [Definitions](#definitions) section below. | |`cell_names` | [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The unique identifier for each cell based on cell barcodes; identical to `CellID`. | | `input_id` | Provided as pipeline input | The sample or cell ID listed in the pipeline configuration file. This can be any string, but we recommend it be consistent with any sample metadata. | +| `star_IsCell` | STARsolo | A true/false flag demarcating if the STARsolo aligner called a cell barcode as a cell. | |`n_reads`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The number of reads associated with the cell. Like all metrics, `n_reads` is calculated from the Optimus output BAM file. Prior to alignment, reads are checked against the whitelist and any within one edit distance (Hamming distance) are corrected. These CB-corrected reads are aligned using STARsolo, where they get further CB correction. For this reason, most reads in the aligned BAM file have both `CB` and `UB` tags. Therefore, `n_reads` represents CB-corrected reads, rather than all reads in the input FASTQ files. | | `tso_reads` | [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of reads that have 20 or more bp of TSO sequence clipped from the 5' end. Calculated using the first number of cN tag in the BAM, which is specific to the number of TSO nucleotides clipped. | |`noise_reads`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| Number of reads that are categorized by 10x Genomics Cell Ranger as "noise". Refers to long polymers, or reads with high numbers of N (ambiguous) nucleotides. 
| @@ -85,6 +86,7 @@ The global attributes (unstuctured metadata) in the h5ad apply to the whole file | `reads_mapped_intergenic` | STARsolo and [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The number of reads counted as intergenic; counted when the BAM file's `sF` tag is assigned to a `7` and the `NH:i` tag is `1`. | | `reads_unmapped` | [TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort) | The total number of reads that are unmapped; counted when the BAM file's `sF` tag is `0`. | |`reads_per_molecule`|[TagSort](https://github.com/broadinstitute/warp-tools/tree/develop/tools/TagSort)| The average number of reads associated with each molecule in the cell. | +| `doublet_score` | Modified version of [DoubletFinder](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6853612/) | A score produced by a modified version of the DoubletFinder software that normalizes data using scanpy and then uses the k-nearest neighbors algorithm to determine cells. This program is non-deterministic, so results will vary across runs of the workflow. The metrics are used to determine overall library quality. | ## Table 3. 
Gene metrics diff --git a/website/docs/Pipelines/Optimus_Pipeline/README.md b/website/docs/Pipelines/Optimus_Pipeline/README.md index f2753a92dc..6bbc24f6b7 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/README.md +++ b/website/docs/Pipelines/Optimus_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Optimus_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [optimus_v7.6.0](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | September, 2024 | Elizabeth Kiernan | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues) | +| [optimus_v7.7.0](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | September, 2024 | Elizabeth Kiernan | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues) | ![Optimus_diagram](Optimus_diagram.png) diff --git a/website/docs/Pipelines/PairedTag_Pipeline/README.md b/website/docs/Pipelines/PairedTag_Pipeline/README.md index 0b56dd3847..64d0b956f8 100644 --- a/website/docs/Pipelines/PairedTag_Pipeline/README.md +++ b/website/docs/Pipelines/PairedTag_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/PairedTag_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | |:---:| :---: | :---: | :---: | -| [PairedTag_v1.1.0](https://github.com/broadinstitute/warp/releases) | July, 2024 | Kaylee Mathews | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | +| [PairedTag_v1.7.0](https://github.com/broadinstitute/warp/releases) | September, 2024 | Kaylee Mathews | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | ## Introduction to the Paired-Tag workflow @@ -60,7 +60,8 @@ The Paired-Tag workflow inputs are specified in JSON configuration files. 
Exampl | Parameter name | Description | Type | | --- | --- | --- | | input_id | Unique identifier describing the biological sample or replicate that corresponds with the FASTQ files; can be a human-readable name or UUID. | String | -| nhash_id | Optional identifier that can be used to demarcate the library aliquot or sample. | +| gex_nhash_id | Optional identifier that can be used to demarcate the gene expression library aliquot or sample. | +| atac_nhash_id | Optional identifier that can be used to demarcate the ATAC library aliquot or sample. | | counting_mode | Optional string that determines whether the Optimus (GEX) pipeline should be run in single-cell mode (sc_rna) or single-nucleus mode (sn_rna); default is "sn_rna". | String | | gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library. | Array[File] | | gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library.| Array[File] | diff --git a/website/docs/Pipelines/SlideSeq_Pipeline/README.md b/website/docs/Pipelines/SlideSeq_Pipeline/README.md index b9cd1d3def..6e7900a217 100644 --- a/website/docs/Pipelines/SlideSeq_Pipeline/README.md +++ b/website/docs/Pipelines/SlideSeq_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/SlideSeq_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [SlideSeq v3.1.6](https://github.com/broadinstitute/warp/releases) | May, 2024 | Elizabeth Kiernan & Kaylee Mathews | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | +| [SlideSeq v3.4.2](https://github.com/broadinstitute/warp/releases) | September, 2024 | Elizabeth Kiernan & Kaylee Mathews | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). 
| ![SlideSeq_diagram](./slide-seq_diagram.png) diff --git a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md index 0838117702..512f69fd98 100644 --- a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md +++ b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [MultiSampleSmartSeq2SingleNuclei_v1.3.4](https://github.com/broadinstitute/warp/releases) | May, 2024 | Elizabeth Kiernan | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | +| [MultiSampleSmartSeq2SingleNuclei_v2.0.1](https://github.com/broadinstitute/warp/releases) | September, 2024 | Elizabeth Kiernan | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | ![](./snSS2.png)