diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index 721ce3b..60a415d 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -8,21 +8,21 @@ jobs: # Only run if comment is on a PR with the main repo, and if it contains the magic keywords if: > contains(github.event.comment.html_url, '/pull/') && - contains(github.event.comment.body, '@nf-core-bot fix linting') && + contains(github.event.comment.body, '@sanger-tolsoft fix linting') && github.repository == 'sanger-tol/readmapping' runs-on: ubuntu-latest steps: - # Use the @nf-core-bot token to check out so we can push later + # Use the @sanger-tolsoft token to check out so we can push later - uses: actions/checkout@v3 with: - token: ${{ secrets.nf_core_bot_auth_token }} + token: ${{ secrets.sangertolsoft_access_token }} # Action runs on the issue comment, so we don't get the PR by default # Use the gh cli to check out the PR - name: Checkout Pull Request run: gh pr checkout ${{ github.event.issue.number }} env: - GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} + GITHUB_TOKEN: ${{ secrets.sangertolsoft_access_token }} - uses: actions/setup-node@v3 @@ -34,9 +34,9 @@ jobs: id: prettier_status run: | if prettier --check ${GITHUB_WORKSPACE}; then - echo "name=result::pass" >> $GITHUB_OUTPUT + echo "result=pass" >> $GITHUB_OUTPUT else - echo "name=result::fail" >> $GITHUB_OUTPUT + echo "result=fail" >> $GITHUB_OUTPUT fi - name: Run 'prettier --write' @@ -46,8 +46,8 @@ jobs: - name: Commit & push changes if: steps.prettier_status.outputs.result == 'fail' run: | - git config user.email "core@nf-co.re" - git config user.name "nf-core-bot" + git config user.email "105875386+sanger-tolsoft@users.noreply.github.com" + git config user.name "sanger-tolsoft" git config push.default upstream git add . 
git status diff --git a/.github/workflows/sangertest.yml b/.github/workflows/sanger_test.yml similarity index 71% rename from .github/workflows/sangertest.yml rename to .github/workflows/sanger_test.yml index 9423879..e69af1e 100644 --- a/.github/workflows/sangertest.yml +++ b/.github/workflows/sanger_test.yml @@ -8,16 +8,22 @@ jobs: runs-on: ubuntu-latest steps: - name: Launch workflow via tower - uses: nf-core/tower-action@v2 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - pipeline: ${{ github.repository }} revision: ${{ github.sha }} workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ github.sha }} parameters: | { "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ github.sha }}", } - profiles: test,sanger,singularity + profiles: test,sanger,singularity,cleanup + + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/sangerfulltest.yml b/.github/workflows/sanger_test_full.yml similarity index 79% rename from .github/workflows/sangerfulltest.yml rename to .github/workflows/sanger_test_full.yml index 9d266be..e028c6b 100644 --- a/.github/workflows/sangerfulltest.yml +++ b/.github/workflows/sanger_test_full.yml @@ -22,16 +22,22 @@ jobs: if: github.event_name == 'workflow_dispatch' - name: Launch workflow via tower - uses: nf-core/tower-action@v2 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - pipeline: ${{ github.repository }} revision: ${{ env.REVISION }} workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }} parameters: | { "outdir": "${{ 
secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}", } - profiles: test_full,sanger,singularity + profiles: test_full,sanger,singularity,cleanup + + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: | + tower_action_*.log + tower_action_*.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 10e09a8..384497c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,44 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[1.2.0](https://github.com/sanger-tol/readmapping/releases/tag/1.2.0)] – Norwegian Ridgeback - [2023-12-19] + +### Enhancements & fixes + +- Restored recording read-groups (`@RG`) in the BAM/CRAM files. +- Updated the CI procedure to use "sanger-tol" rather than "nf-core" names. +- [crumble](https://github.com/jkbonfield/crumble) now used to compress the + PacBio HiFi alignments. +- Execution statistics now under `pipeline_info/readmapping/` (to be consistent + with the other sanger-tol pipelines). +- All resource requirements (memory, time, CPUs) now fit the actual usage. This + is achieved by automatically adjusting to the size of the input whenever + possible. +- Added the `--use_work_dir_as_temp` parameter to make SAMTOOLS_COLLATE use its + work directory for temporary files instead of `$TMPDIR`. It can be used to avoid + leaving unwanted temporary files on an HPC. + +### Parameters + +| Old parameter | New parameter | +| ------------- | ------------------------ | +| | `--use_work_dir_as_temp` | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present.
**NB:** Parameter has been **added** if just the new parameter information is present.
**NB:** Parameter has been **removed** if new parameter information isn't present. + +### Software dependencies + +Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. + +| Dependency | Old version | New version | +| ---------- | --------------- | ------------- | +| `blast` | 2.12.0 | 2.13.0 | +| `crumble` | | 0.9.1 | +| `samtools` | 1.14 and 1.16.1 | 1.14 and 1.17 | +| `multiqc` | 1.13 | 1.14 | + +> **NB:** Dependency has been **updated** if both old and new version information is present.
**NB:** Dependency has been **added** if just the new version information is present.
**NB:** Dependency has been **removed** if version information isn't present. + ## [[1.1.0](https://github.com/sanger-tol/readmapping/releases/tag/1.1.0)] – Hebridean Black - [2023-03-16] ### Enhancements & fixes diff --git a/conf/base.config b/conf/base.config index 284ac0b..cdffac4 100644 --- a/conf/base.config +++ b/conf/base.config @@ -2,64 +2,135 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sanger-tol/readmapping Nextflow base config file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - A 'blank slate' config file, appropriate for general use on most high performance - compute environments. Assumes that all software is installed and available on - the PATH. Runs in `local` mode - all jobs will be run on the logged in environment. ----------------------------------------------------------------------------------------- */ -process { +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Increasing the number of CPUs often gives diminishing returns, so we increase it + following a logarithm curve. Example: + - 0 < value <= 1: start + step + - 1 < value <= 2: start + 2*step + - 2 < value <= 4: start + 3*step + - 4 < value <= 8: start + 4*step + In order to support re-runs, the step increase may be multiplied by the attempt + number prior to calling this function. 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ - cpus = { check_max( 1 * task.attempt, 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } +// Modified logarithm function that doesn't return negative numbers +def positive_log(value, base) { + if (value <= 1) { + return 0 + } else { + return Math.log(value)/Math.log(base) + } +} + +def log_increase_cpus(start, step, value, base) { + return check_max(start + step * (1 + Math.ceil(positive_log(value, base))), 'cpus') +} - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } - maxRetries = 1 + +process { + + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + maxRetries = 5 maxErrors = '-1' - // Process-specific resource requirements - // NOTE - Please try and re-use the labels below as much as possible. - // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. - // If possible, it would be nice to keep the same label naming convention when - // adding in your local modules too. - // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors - withLabel:process_single { - cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + // In this configuration file, we give little resources by default and + // explicitly bump them up for some processes. + // All rules should still increase resources every attempt to allow the + // pipeline to self-heal from MEMLIMIT/RUNLIMIT. 
+ + // Default + cpus = 1 + memory = { check_max( 50.MB * task.attempt, 'memory' ) } + time = { check_max( 30.min * task.attempt, 'time' ) } + + withName: 'SAMTOOLS_(CONVERT|FILTER)' { + time = { check_max( 1.hour * task.attempt, 'time' ) } + } + + withName: 'SAMTOOLS_(FASTA)' { + time = { check_max( 2.hour * task.attempt, 'time' ) } + } + + withName: 'SAMTOOLS_(STATS)' { + // Actually less than 1 hour for PacBio HiFi data, but confirmed 3 hours for Hi-C + time = { check_max( 4.hour * task.attempt, 'time' ) } } - withLabel:process_low { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 12.GB * task.attempt, 'memory' ) } - time = { check_max( 4.h * task.attempt, 'time' ) } + + withName: 'SAMTOOLS_(COLLATE|FASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|SORT|VIEW)' { + time = { check_max( 8.hour * task.attempt, 'time' ) } } - withLabel:process_medium { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 36.GB * task.attempt, 'memory' ) } - time = { check_max( 8.h * task.attempt, 'time' ) } + + withName: 'SAMTOOLS_(FLAGSTAT|IDXSTATS)' { + memory = { check_max( 250.MB * task.attempt, 'memory' ) } } - withLabel:process_high { - cpus = { check_max( 12 * task.attempt, 'cpus' ) } - memory = { check_max( 72.GB * task.attempt, 'memory' ) } - time = { check_max( 16.h * task.attempt, 'time' ) } + + withName: '.*:ALIGN_(HIFI|HIC|ILLUMINA):.*:SAMTOOLS_(STATS|VIEW)' { + memory = { check_max( 1.GB * task.attempt, 'memory' ) } } - withLabel:process_long { - time = { check_max( 20.h * task.attempt, 'time' ) } + withName: '.*:ALIGN_(CLR|ONT):.*:SAMTOOLS_(STATS|VIEW)' { + memory = { check_max( 2.GB * task.attempt, 'memory' ) } } - withLabel:process_high_memory { - memory = { check_max( 200.GB * task.attempt, 'memory' ) } + + withName: '.*:FILTER_PACBIO:SAMTOOLS_COLLATE' { + cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) } + memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } } - 
withLabel:error_ignore { - errorStrategy = 'ignore' + + withName: 'SAMTOOLS_SORMADUP' { + cpus = { log_increase_cpus(2, 6*task.attempt, 1, 2) } + memory = { check_max( 10.GB + 0.6.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) } + time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) } } - withLabel:error_retry { - errorStrategy = 'retry' - maxRetries = 2 + + withName: SAMTOOLS_SORT { + cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) } + // Memory increases by 768M for each thread + memory = { check_max( 1.GB + 800.MB * log_increase_cpus(4, 2*task.attempt, 1, 2), 'memory' ) } + time = { check_max( 8.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) } } - withName:BWAMEM2_INDEX { - memory = { check_max( 1.GB * Math.ceil( 28 * fasta.size() / 1000000000 ) * task.attempt, 'memory' ) } + + withName: BLAST_BLASTN { + time = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } + memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } + // The tool never seems to use more than 1 core even when given multiple. 
Sticking to 1 (the default) } + + withName: BWAMEM2_INDEX { + memory = { check_max( 24.GB * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'memory' ) } + time = { check_max( 30.min * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'time' ) } + // Not multithreaded + } + + withName: BWAMEM2_MEM { + // Corresponds to 12 threads as the minimum, 24 threads if 3 billion reads + cpus = { log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2) } + // Runtime for 1 billion reads on 12 threads is a function of the logarithm of the genome size + // Runtime is considered proportional to the number of reads and inversely to number of threads + time = { check_max( 3.h * task.attempt * Math.ceil(positive_log(meta2.genome_size/100000, 10)) * Math.ceil(meta.read_count/1000000000) * 12 / log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'time' ) } + // Base RAM usage is about 6 times the genome size. Each thread takes an additional 800 MB RAM + // Memory usage of SAMTOOLS_VIEW is negligible. + memory = { check_max( 6.GB * Math.ceil(meta2.genome_size / 1000000000) + 800.MB * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) } + } + + withName: MINIMAP2_ALIGN { + cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) } + memory = { check_max( (6.GB * Math.ceil( reference.size() / 1000000000 ) + 4.GB * Math.ceil( meta.read_count / 1000000 )) * task.attempt, 'memory' ) } + time = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } + } + + withName: CRUMBLE { + // No correlation between memory usage and the number of reads or the genome size. + // Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB. + // The formula below tries to mimic that growth and relies on job retries being allowed. 
+ memory = { check_max( task.attempt * (task.attempt + 1) * 512.MB, 'memory' ) } + // Slightly better correlation between runtime and the number of reads. + time = { check_max( 1.5.h + 1.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } + } + withName:CUSTOM_DUMPSOFTWAREVERSIONS { cache = false } diff --git a/conf/modules.config b/conf/modules.config index 7984dca..31f4c6b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -16,7 +16,11 @@ process { } withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' { - ext.args = '-5SPCp' + ext.args = { "-5SPCp -R ${meta.read_group}" } + } + + withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' { + ext.args = { "-R ${meta.read_group}" } } withName: SAMTOOLS_SORT { @@ -24,22 +28,15 @@ process { } withName: SAMTOOLS_MERGE { + ext.args = { "-c -p" } ext.prefix = { "${meta.id}.merge" } } withName: SAMTOOLS_COLLATE { + ext.args = { (params.use_work_dir_as_temp ? "-T." : "") } ext.prefix = { "${meta.id}.collate" } } - withName: SAMTOOLS_FIXMATE { - ext.args = '-m' - ext.prefix = { "${meta.id}.fixmate" } - } - - withName: SAMTOOLS_MARKDUP { - ext.prefix = { "${meta.id}.markdup" } - } - withName: BLAST_BLASTN { ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6' } @@ -53,15 +50,22 @@ process { } withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' { - ext.args = '-ax map-hifi --cs=short' + // minimap2 2.24 can only work with genomes up to 4 Gbp. For larger genomes, add the -I option with the genome size in Gbp. + // In fact, we can also use -I to *decrease* the memory requirements for smaller genomes + // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp + // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating point values. 
+ // NOTE: minimap2 2.25 raises the default to 8G, which means higher memory savings on smaller genomes + // NOTE: Use `reference.size()` for now, and switch to `meta2.genome_size` once we update the modules. + // ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta.genome_size/1e9) + 'G' } + ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(reference.size()/1e9) + 'G' } } withName: '.*:.*:ALIGN_CLR:MINIMAP2_ALIGN' { - ext.args = '-ax map-pb' + ext.args = { "-ax map-pb -R ${meta.read_group}" } } withName: '.*:.*:ALIGN_ONT:MINIMAP2_ALIGN' { - ext.args = '-ax map-ont' + ext.args = { "-ax map-ont -R ${meta.read_group}" } } withName: '.*:CONVERT_STATS:SAMTOOLS_VIEW' { @@ -81,9 +85,19 @@ process { ext.prefix = { "${input.baseName}" } } + withName: CRUMBLE { + ext.prefix = { "${input.baseName}.crumble" } + ext.args = '-y pbccs -O cram' + publishDir = [ + path: { "${params.outdir}/read_mapping/pacbio" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: SAMPLESHEET_CHECK { publishDir = [ - path: { "${params.outdir}/readmapping_info" }, + path: { "${params.outdir}/pipeline_info/readmapping" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] @@ -131,7 +145,7 @@ process { withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ - path: { "${params.outdir}/readmapping_info" }, + path: { "${params.outdir}/pipeline_info/readmapping" }, mode: params.publish_dir_mode, pattern: '*_versions.yml' ] diff --git a/docs/images/readmapping_v2.pptx b/docs/images/readmapping_v2.pptx new file mode 100644 index 0000000..c4c44c6 Binary files /dev/null and b/docs/images/readmapping_v2.pptx differ diff --git a/docs/images/readmapping_v3.pptx b/docs/images/readmapping_v3.pptx new file mode 100644 index 0000000..3a73498 Binary files /dev/null and b/docs/images/readmapping_v3.pptx differ diff --git a/docs/output.md b/docs/output.md index f1fdd7c..9722d11 100644 --- a/docs/output.md +++ b/docs/output.md @@ -34,7 +34,7 @@ PacBio reads generated using both CLR and CCS technology are filtered using `BLA ### Short reads -Short read data from HiC and Illumina technologies is aligned with `BWAMEM2_MEM`. The sorted and merged alignment files are processed using the `SAMTOOLS` markduplicate workflow. The mark duplicate alignments is output in the CRAM format, along with the index. +Short read data from HiC and Illumina technologies is aligned with `BWAMEM2_MEM`. The sorted and merged alignment files are processed using the `SAMTOOLS` [mark-duplicate workflow](https://www.htslib.org/algorithms/duplicate.html#workflow). The duplicate-marked alignments are output in the CRAM format, along with the index.
Output files @@ -113,7 +113,7 @@ A number of genome-specific files are generated by the pipeline because they are
Output files -- `readmapping_info/` +- `pipeline_info/readmapping/` - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index a9ff24b..4088387 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -132,7 +132,7 @@ class NfcoreTemplate { } // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/readmapping_info/") + def output_d = new File("${params.outdir}/pipeline_info/readmapping/") if (!output_d.exists()) { output_d.mkdirs() } diff --git a/modules.json b/modules.json index af09a39..a9a06be 100644 --- a/modules.json +++ b/modules.json @@ -7,98 +7,94 @@ "nf-core": { "blast/blastn": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "bwamem2/index": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "bwamem2/mem": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", "installed_by": ["modules"] }, + "crumble": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"], + "patch": "modules/nf-core/crumble/crumble.diff" + }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": 
"911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "gunzip": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", "installed_by": ["modules"] }, "minimap2/align": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", "installed_by": ["modules"] }, "samtools/collate": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "samtools/faidx": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", "installed_by": ["modules"] }, "samtools/fasta": { "branch": "master", - "git_sha": "5a31b92f8298f361ed232e8a6657560ac135f9af", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "samtools/fastq": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] - }, - "samtools/fixmate": { - "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "samtools/flagstat": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "samtools/idxstats": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", - "installed_by": ["modules"] - }, - "samtools/markdup": { - "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "samtools/merge": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "0460d316170f75f323111b4a2c0a2989f0c32013", 
"installed_by": ["modules"] }, "samtools/sort": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9", "installed_by": ["modules"] }, "samtools/stats": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", "installed_by": ["modules"] }, "samtools/view": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f", "installed_by": ["modules"], "patch": "modules/nf-core/samtools/view/samtools-view.diff" }, "untar": { "branch": "master", - "git_sha": "cc1f997fab6d8fde5dc0e6e2a310814df5b53ce7", + "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", "installed_by": ["modules"] } } diff --git a/modules/local/pacbio_filter.nf b/modules/local/pacbio_filter.nf index 18dd11c..e4deaa4 100644 --- a/modules/local/pacbio_filter.nf +++ b/modules/local/pacbio_filter.nf @@ -5,7 +5,7 @@ process PACBIO_FILTER { conda "conda-forge::gawk=5.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : - 'quay.io/biocontainers/gawk:5.1.0' }" + 'biocontainers/gawk:5.1.0' }" input: tuple val(meta), path(txt) diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 9c44c61..f0a3073 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -5,7 +5,7 @@ process SAMPLESHEET_CHECK { conda "conda-forge::python=3.8.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'quay.io/biocontainers/python:3.8.3' }" + 'biocontainers/python:3.8.3' }" input: path samplesheet diff --git a/modules/local/samtools_sormadup.nf b/modules/local/samtools_sormadup.nf new file mode 100644 index 0000000..5aadab5 --- /dev/null +++ b/modules/local/samtools_sormadup.nf @@ -0,0 +1,77 @@ +// Copied from https://github.com/nf-core/modules/pull/3310 +// Author: Matthias De Smet, https://github.com/matthdsm +process SAMTOOLS_SORMADUP { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.{bam,cram}") , emit: bam + tuple val(meta), path("*.{bai,crai}") , optional:true, emit: bam_index + tuple val(meta), path("*.metrics") , emit: metrics + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def args4 = task.ext.args4 ?: '' + + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + "bam" + def reference = fasta ? 
"--reference ${fasta}" : "" + + """ + samtools collate \\ + $args \\ + -O \\ + -u \\ + -T ${prefix}.collate \\ + --threads $task.cpus \\ + ${reference} \\ + ${input} \\ + - \\ + | \\ + samtools fixmate \\ + $args2 \\ + -m \\ + -u \\ + --threads $task.cpus \\ + - \\ + - \\ + | \\ + samtools sort \\ + $args3 \\ + -u \\ + -T ${prefix}.sort \\ + --threads $task.cpus \\ + - \\ + | \\ + samtools markdup \\ + -T ${prefix}.markdup \\ + -f ${prefix}.metrics \\ + --threads $task.cpus \\ + $args4 \\ + - \\ + ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/unmask.nf b/modules/local/unmask.nf index 482eefc..72d1a07 100644 --- a/modules/local/unmask.nf +++ b/modules/local/unmask.nf @@ -5,7 +5,7 @@ process UNMASK { conda "conda-forge::gawk=5.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : - 'quay.io/biocontainers/gawk:5.1.0' }" + 'biocontainers/gawk:5.1.0' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf index d459e5f..9a1f3a5 100644 --- a/modules/nf-core/blast/blastn/main.nf +++ b/modules/nf-core/blast/blastn/main.nf @@ -2,10 +2,10 @@ process BLAST_BLASTN { tag "$meta.id" label 'process_medium' - conda "bioconda::blast=2.12.0" + conda "bioconda::blast=2.13.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/blast:2.12.0--pl5262h3289130_0' : - 'quay.io/biocontainers/blast:2.12.0--pl5262h3289130_0' }" + 'https://depot.galaxyproject.org/singularity/blast:2.13.0--hf3cf87c_0' : + 'biocontainers/blast:2.13.0--hf3cf87c_0' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/bwamem2/index/main.nf b/modules/nf-core/bwamem2/index/main.nf index a236121..3094085 100644 --- a/modules/nf-core/bwamem2/index/main.nf +++ b/modules/nf-core/bwamem2/index/main.nf @@ -5,7 +5,7 @@ process BWAMEM2_INDEX { conda "bioconda::bwa-mem2=2.2.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bwa-mem2:2.2.1--he513fc3_0' : - 'quay.io/biocontainers/bwa-mem2:2.2.1--he513fc3_0' }" + 'biocontainers/bwa-mem2:2.2.1--he513fc3_0' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/bwamem2/mem/main.nf b/modules/nf-core/bwamem2/mem/main.nf index 489b170..d427dea 100644 --- a/modules/nf-core/bwamem2/mem/main.nf +++ b/modules/nf-core/bwamem2/mem/main.nf @@ -5,7 +5,7 @@ process BWAMEM2_MEM { conda "bioconda::bwa-mem2=2.2.1 bioconda::samtools=1.16.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:2cdf6bf1e92acbeb9b2834b1c58754167173a410-0' : - 'quay.io/biocontainers/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:2cdf6bf1e92acbeb9b2834b1c58754167173a410-0' }" + 'biocontainers/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:2cdf6bf1e92acbeb9b2834b1c58754167173a410-0' }" input: tuple val(meta), path(reads) diff --git a/modules/nf-core/bwamem2/mem/meta.yml b/modules/nf-core/bwamem2/mem/meta.yml index a465551..bc3dfcd 100644 --- a/modules/nf-core/bwamem2/mem/meta.yml +++ b/modules/nf-core/bwamem2/mem/meta.yml @@ -28,6 +28,11 @@ input: description: | List of input FastQ files of size 1 and 2 for single-end and paired-end data, respectively. + - meta2: + type: map + description: | + Groovy Map containing reference/index information + e.g. [ id:'test' ] - index: type: file description: BWA genome index files diff --git a/modules/nf-core/crumble/crumble.diff b/modules/nf-core/crumble/crumble.diff new file mode 100644 index 0000000..2c4cb1e --- /dev/null +++ b/modules/nf-core/crumble/crumble.diff @@ -0,0 +1,21 @@ +Changes in module 'nf-core/crumble' +--- modules/nf-core/crumble/main.nf ++++ modules/nf-core/crumble/main.nf +@@ -30,11 +30,14 @@ + args.contains("-O cram") ? "cram" : + "sam" + def bedin = keepbed ? "-R ${keepbed}" : "" +- def bedout = bedout ? "-b ${prefix}.out.bed" : "" ++ def bedout = bedout ? "-b ${prefix}.suspicious_regions.bed" : "" + if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + + def CRUMBLE_VERSION = '0.9.1' //WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ ++ # Need to fake REF_PATH to force crumble to use the Fasta file defined in ++ # the UR field of the @SQ headers. (bug reported to the samtools team). 
++ env REF_PATH=/missing \\ + crumble \\ + $args \\ + $bedin \\ + +************************************************************ diff --git a/modules/nf-core/crumble/main.nf b/modules/nf-core/crumble/main.nf new file mode 100644 index 0000000..44c0c59 --- /dev/null +++ b/modules/nf-core/crumble/main.nf @@ -0,0 +1,53 @@ +process CRUMBLE { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::crumble=0.9.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/crumble:0.9.1--hb0d9459_0': + 'biocontainers/crumble:0.9.1--hb0d9459_0' }" + + input: + tuple val(meta), path(input) + path keepbed + val bedout + + output: + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + tuple val(meta), path("*.sam"), emit: sam, optional: true + tuple val(meta), path("*.bed"), emit: bed, optional: true + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("-O sam") ? "sam" : + args.contains("-O bam") ? "bam" : + args.contains("-O cram") ? "cram" : + "sam" + def bedin = keepbed ? "-R ${keepbed}" : "" + def bedout = bedout ? "-b ${prefix}.suspicious_regions.bed" : "" + if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + + def CRUMBLE_VERSION = '0.9.1' //WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + # Need to fake REF_PATH to force crumble to use the Fasta file defined in + # the UR field of the @SQ headers. (bug reported to the samtools team). 
+ env REF_PATH=/missing \\ + crumble \\ + $args \\ + $bedin \\ + $bedout \\ + $input \\ + ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + crumble: $CRUMBLE_VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/crumble/meta.yml b/modules/nf-core/crumble/meta.yml new file mode 100644 index 0000000..cbc0045 --- /dev/null +++ b/modules/nf-core/crumble/meta.yml @@ -0,0 +1,62 @@ +name: "crumble" +description: Controllable lossy compression of BAM/CRAM files +keywords: + - compress + - bam + - sam + - cram +tools: + - "crumble": + description: "Controllable lossy compression of BAM/CRAM files" + homepage: "https://github.com/jkbonfield/crumble" + documentation: "https://github.com/jkbonfield/crumble" + tool_dev_url: "https://github.com/jkbonfield/crumble" + doi: "10.1093/bioinformatics/bty608" + licence: "['multiple BSD style licenses']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - keepbed: + type: file + description: BED file defining regions to keep quality + - bedout: + type: boolean + description: set to true to ouput suspicious regions to a BED file + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: optional filtered/compressed BAM file + pattern: "*.{bam}" + - cram: + type: file + description: optional filtered/compressed CRAM file + pattern: "*.{cram}" + - sam: + type: file + description: optional filtered/compressed SAM file + pattern: "*.{sam}" + - bed: + type: file + description: optional suspicious regions BED file + pattern: "*{bed}" + +authors: + - "@priyanka-surana" diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 3df2176..ebc8727 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.13" + conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : + 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index 60b546a..c32657d 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,7 +1,9 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: - custom + - dump - version tools: - custom: diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf index d906034..e7189d2 100644 --- a/modules/nf-core/gunzip/main.nf +++ b/modules/nf-core/gunzip/main.nf @@ -5,7 +5,7 @@ process GUNZIP { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: tuple val(meta), path(archive) diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml index 2e0e405..4cdcdf4 100644 --- a/modules/nf-core/gunzip/meta.yml +++ b/modules/nf-core/gunzip/meta.yml @@ -3,6 +3,7 @@ description: Compresses and decompresses files. 
keywords: - gunzip - compression + - decompression tools: - gunzip: description: | diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf index 430dbab..4da47c1 100644 --- a/modules/nf-core/minimap2/align/main.nf +++ b/modules/nf-core/minimap2/align/main.nf @@ -6,7 +6,7 @@ process MINIMAP2_ALIGN { conda "bioconda::minimap2=2.24 bioconda::samtools=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : - 'quay.io/biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" + 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" input: tuple val(meta), path(reads) diff --git a/modules/nf-core/samtools/collate/main.nf b/modules/nf-core/samtools/collate/main.nf index 0e4ba5d..b23246b 100644 --- a/modules/nf-core/samtools/collate/main.nf +++ b/modules/nf-core/samtools/collate/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_COLLATE { tag "$meta.id" label 'process_medium' - conda "bioconda::samtools=1.16.1" + conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1': - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0': + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(input) diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf index ce6580d..59ed308 100644 --- a/modules/nf-core/samtools/faidx/main.nf +++ b/modules/nf-core/samtools/faidx/main.nf @@ -2,18 +2,20 @@ process SAMTOOLS_FAIDX { tag "$fasta" label 'process_single' - conda "bioconda::samtools=1.16.1" + conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(fasta) + tuple val(meta2), path(fai) output: - tuple val(meta), path ("*.fai"), emit: fai - tuple val(meta), path ("*.gzi"), emit: gzi, optional: true - path "versions.yml" , emit: versions + tuple val(meta), path ("*.{fa,fasta}") , emit: fa , optional: true + tuple val(meta), path ("*.fai") , emit: fai, optional: true + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -23,8 +25,8 @@ process SAMTOOLS_FAIDX { """ samtools \\ faidx \\ - $args \\ - $fasta + $fasta \\ + $args cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -33,8 +35,12 @@ process SAMTOOLS_FAIDX { """ stub: + def match = (task.ext.args =~ /-o(?:utput)?\s(.*)\s?/).findAll() + def fastacmd = match[0] ? 
"touch ${match[0][1]}" : '' """ + ${fastacmd} touch ${fasta}.fai + cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml index fe2fe9a..957b25e 100644 --- a/modules/nf-core/samtools/faidx/meta.yml +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -3,6 +3,7 @@ description: Index FASTA file keywords: - index - fasta + - faidx tools: - samtools: description: | @@ -17,12 +18,21 @@ input: - meta: type: map description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] + Groovy Map containing reference information + e.g. [ id:'test' ] - fasta: type: file description: FASTA file pattern: "*.{fa,fasta}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" output: - meta: type: map diff --git a/modules/nf-core/samtools/fasta/main.nf b/modules/nf-core/samtools/fasta/main.nf index 3b287ee..3145965 100644 --- a/modules/nf-core/samtools/fasta/main.nf +++ b/modules/nf-core/samtools/fasta/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_FASTA { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.16.1" + conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(input) diff --git a/modules/nf-core/samtools/fastq/main.nf b/modules/nf-core/samtools/fastq/main.nf index c0b36f6..15d8976 100644 --- a/modules/nf-core/samtools/fastq/main.nf +++ b/modules/nf-core/samtools/fastq/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_FASTQ { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.16.1" + conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(input) diff --git a/modules/nf-core/samtools/fixmate/main.nf b/modules/nf-core/samtools/fixmate/main.nf deleted file mode 100644 index d53d8ab..0000000 --- a/modules/nf-core/samtools/fixmate/main.nf +++ /dev/null @@ -1,37 +0,0 @@ -process SAMTOOLS_FIXMATE { - tag "$meta.id" - label 'process_low' - - conda "bioconda::samtools=1.16.1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" - - input: - tuple val(meta), path(bam) - - output: - tuple val(meta), path("*.bam"), emit: bam - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" - """ - samtools \\ - fixmate \\ - $args \\ - --threads ${task.cpus-1} \\ - $bam \\ - ${prefix}.bam \\ - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/fixmate/meta.yml b/modules/nf-core/samtools/fixmate/meta.yml deleted file mode 100644 index a72c5ca..0000000 --- a/modules/nf-core/samtools/fixmate/meta.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: samtools_fixmate -description: Samtools fixmate is a tool that can fill in information (insert size, cigar, mapq) about paired end reads onto the corresponding other read. Also has options to remove secondary/unmapped alignments and recalculate whether reads are proper pairs. -keywords: - - fixmate - - samtools - - insert size - - repair - - bam - - paired - - read pairs -tools: - - samtools: - description: | - SAMtools is a set of utilities for interacting with and post-processing - short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. - These files are generated as output by short read aligners like BWA. - homepage: http://www.htslib.org/ - documentation: http://www.htslib.org/doc/samtools.html - tool_dev_url: https://github.com/samtools/samtools - doi: 10.1093/bioinformatics/btp352 - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - bam: - type: file - description: BAM/CRAM/SAM file, must be sorted by name, not coordinate - pattern: "*.{bam,cram,sam}" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - bam: - type: file - description: A BAM/CRAM/SAM file with mate information added and/or proper pairs recalled - pattern: "*.{bam,cram,sam}" - -authors: - - "@sppearce" diff --git a/modules/nf-core/samtools/flagstat/main.nf b/modules/nf-core/samtools/flagstat/main.nf index 2120cd7..eb7e72f 100644 --- a/modules/nf-core/samtools/flagstat/main.nf +++ b/modules/nf-core/samtools/flagstat/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_FLAGSTAT { tag "$meta.id" label 'process_single' - conda "bioconda::samtools=1.16.1" + conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(bam), path(bai) diff --git a/modules/nf-core/samtools/idxstats/main.nf b/modules/nf-core/samtools/idxstats/main.nf index a7b87d8..a257d70 100644 --- a/modules/nf-core/samtools/idxstats/main.nf +++ b/modules/nf-core/samtools/idxstats/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_IDXSTATS { tag "$meta.id" label 'process_single' - conda "bioconda::samtools=1.16.1" + conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(bam), path(bai) diff --git a/modules/nf-core/samtools/markdup/main.nf b/modules/nf-core/samtools/markdup/main.nf deleted file mode 100644 index dfed201..0000000 --- a/modules/nf-core/samtools/markdup/main.nf +++ /dev/null @@ -1,47 +0,0 @@ -process SAMTOOLS_MARKDUP { - tag "$meta.id" - label 'process_medium' - - conda "bioconda::samtools=1.16.1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" - - input: - tuple val(meta), path(input) - path fasta - - output: - tuple val(meta), path("*.bam"), emit: bam, optional: true - tuple val(meta), path("*.cram"), emit: cram, optional: true - tuple val(meta), path("*.sam"), emit: sam, optional: true - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def reference = fasta ? "--reference ${fasta}" : "" - def extension = args.contains("--output-fmt sam") ? "sam" : - args.contains("--output-fmt bam") ? "bam" : - args.contains("--output-fmt cram") ? "cram" : - "bam" - if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
- """ - samtools \\ - markdup \\ - $args \\ - ${reference} \\ - -@ $task.cpus \\ - -T $prefix \\ - $input \\ - ${prefix}.${extension} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )) - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/markdup/meta.yml b/modules/nf-core/samtools/markdup/meta.yml deleted file mode 100644 index 9ced7a0..0000000 --- a/modules/nf-core/samtools/markdup/meta.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: "samtools_markdup" -description: mark duplicate alignments in a coordinate sorted file -keywords: - - markdup -tools: - - "samtools": - description: "Tools for dealing with SAM, BAM and CRAM files" - homepage: "http://www.htslib.org" - documentation: "https://www.htslib.org/doc/samtools-markdup.html" - tool_dev_url: "https://github.com/samtools/samtools" - doi: "10.1093/bioinformatics/btp352" - licence: "['MIT']" - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - input: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - output: - type: file - description: Sorted BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - -authors: - - "@priyanka-surana" diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf index a80ff3a..b73b7cb 100644 --- a/modules/nf-core/samtools/merge/main.nf +++ b/modules/nf-core/samtools/merge/main.nf @@ -2,15 +2,15 @@ process SAMTOOLS_MERGE { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.16.1" + conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(input_files, stageAs: "?/*") - path fasta - path fai + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) output: tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml index 644b768..3a815f7 100644 --- a/modules/nf-core/samtools/merge/meta.yml +++ b/modules/nf-core/samtools/merge/meta.yml @@ -25,13 +25,23 @@ input: type: file description: BAM/CRAM file pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] - fasta: - type: optional file - description: Reference file the CRAM was created with + type: file + description: Reference file the CRAM was created with (optional) pattern: "*.{fasta,fa}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'genome' ] - fai: - type: optional file - description: Index of the reference file the CRAM was created with + type: file + description: Index of the reference file the CRAM was created with (optional) pattern: "*.fai" output: - meta: @@ -60,3 +70,4 @@ authors: - "@yuukiiwa " - "@maxulysse" - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf index 84c167c..2b7753f 100644 --- a/modules/nf-core/samtools/sort/main.nf +++ b/modules/nf-core/samtools/sort/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_SORT { tag "$meta.id" label 'process_medium' - conda "bioconda::samtools=1.16.1" + conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(bam) @@ -23,7 +23,13 @@ process SAMTOOLS_SORT { def prefix = task.ext.prefix ?: "${meta.id}" if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
""" - samtools sort $args -@ $task.cpus -o ${prefix}.bam -T $prefix $bam + samtools sort \\ + $args \\ + -@ $task.cpus \\ + -o ${prefix}.bam \\ + -T $prefix \\ + $bam + cat <<-END_VERSIONS > versions.yml "${task.process}": samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') diff --git a/modules/nf-core/samtools/stats/main.nf b/modules/nf-core/samtools/stats/main.nf index 0a2a364..4a2607d 100644 --- a/modules/nf-core/samtools/stats/main.nf +++ b/modules/nf-core/samtools/stats/main.nf @@ -2,14 +2,14 @@ process SAMTOOLS_STATS { tag "$meta.id" label 'process_single' - conda "bioconda::samtools=1.16.1" + conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(input), path(input_index) - path fasta + tuple val(meta2), path(fasta) output: tuple val(meta), path("*.stats"), emit: stats diff --git a/modules/nf-core/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml index 1d68a5d..90e6345 100644 --- a/modules/nf-core/samtools/stats/meta.yml +++ b/modules/nf-core/samtools/stats/meta.yml @@ -30,9 +30,14 @@ input: type: file description: BAI/CRAI file from alignment pattern: "*.{bai,crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'genome' ] - fasta: - type: optional file - description: Reference file the CRAM was created with + type: file + description: Reference file the CRAM was created with (optional) pattern: "*.{fasta,fa}" output: - meta: @@ -51,3 +56,4 @@ output: authors: - "@drpatelh" - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf index 9aba5ee..aea5a42 100644 --- a/modules/nf-core/samtools/view/main.nf +++ b/modules/nf-core/samtools/view/main.nf @@ -2,14 +2,14 @@ process SAMTOOLS_VIEW { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.16.1" + conda "bioconda::samtools=1.17" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.16.1--h6899075_1' : - 'quay.io/biocontainers/samtools:1.16.1--h6899075_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: tuple val(meta), path(input), path(index) - path fasta + tuple val(meta2), path(fasta) path qname output: diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml index 2e597d3..3b05450 100644 --- a/modules/nf-core/samtools/view/meta.yml +++ b/modules/nf-core/samtools/view/meta.yml @@ -26,12 +26,17 @@ input: description: BAM/CRAM/SAM file pattern: "*.{bam,cram,sam}" - index: - type: optional file - description: BAM.BAI/CRAM.CRAI file - pattern: "*.{.bai,.crai}" + type: file + description: BAM.BAI/BAM.CSI/CRAM.CRAI file (optional) + pattern: "*.{.bai,.csi,.crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] - fasta: - type: optional file - description: Reference file the CRAM was created with + type: file + description: Reference file the CRAM was created with (optional) pattern: "*.{fasta,fa}" - qname: type: file diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf index 3384847..8cd1856 100644 --- a/modules/nf-core/untar/main.nf +++ b/modules/nf-core/untar/main.nf @@ -5,7 +5,7 @@ process UNTAR { conda "conda-forge::sed=4.7 bioconda::grep=3.4 conda-forge::tar=1.34" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: tuple val(meta), path(archive) diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml index ea7a3f3..db241a6 100644 --- a/modules/nf-core/untar/meta.yml +++ b/modules/nf-core/untar/meta.yml @@ -3,6 +3,7 @@ description: Extract files. keywords: - untar - uncompress + - extract tools: - untar: description: | diff --git a/nextflow.config b/nextflow.config index ad35da2..f9d5027 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,10 +15,13 @@ params { bwamem2_index = null fasta = null + // Execution options + use_work_dir_as_temp = false + // Boilerplate options outdir = "./results" - tracedir = "${params.outdir}/readmapping_info" + tracedir = "${params.outdir}/pipeline_info/readmapping" publish_dir_mode = 'copy' email = null email_on_fail = null @@ -69,6 +72,7 @@ try { profiles { + cleanup { cleanup = true } debug { process.beforeScript = 'echo $HOSTNAME' } conda { conda.enabled = true @@ -89,6 +93,7 @@ profiles { } docker { docker.enabled = true + docker.registry = 'quay.io' docker.userEmulation = true singularity.enabled = false podman.enabled = false @@ -164,6 +169,7 @@ report { trace { enabled = true file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" + fields = 
'task_id,hash,native_id,process,tag,status,exit,cpus,memory,time,attempt,submit,start,complete,duration,%cpu,%mem,peak_rss,rchar,wchar' } dag { enabled = true @@ -177,7 +183,7 @@ manifest { description = 'Pipeline to map reads generated using different sequencing technologies against a genome assembly.' mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.1.0' + version = '1.2.0' doi = '10.5281/zenodo.6563577' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 54f50ba..737b3c2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -67,6 +67,21 @@ } } }, + "execution": { + "title": "Execution", + "type": "object", + "description": "Control the execution of the pipeline.", + "default": "", + "properties": { + "use_work_dir_as_temp": { + "type": "boolean", + "description": "Set to true to make tools (e.g. sort, FastK, MerquryFK) use the work directory for their temporary files, rather than the system default.", + "fa_icon": "fas fa-arrow-circle-down", + "hidden": true + } + }, + "fa_icon": "fas fa-running" + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -208,7 +223,7 @@ "tracedir": { "type": "string", "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/readmapping_info", + "default": "${params.outdir}/pipeline_info/readmapping", "fa_icon": "fas fa-cogs", "hidden": true }, @@ -236,6 +251,9 @@ { "$ref": "#/definitions/reference_genome_options" }, + { + "$ref": "#/definitions/execution" + }, { "$ref": "#/definitions/institutional_config_options" }, diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index 4b22f41..c1d2263 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -23,33 +23,36 @@ workflow ALIGN_ONT { | map { meta, file -> file } | set { ch_fasta } + // Align with minimap2. 
bam_format is set to true, making the output a *sorted* BAM MINIMAP2_ALIGN ( reads, ch_fasta, true, false, false ) ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) // Collect all alignment output by sample name MINIMAP2_ALIGN.out.bam - | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], bam] } + | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } | groupTuple ( by: [0] ) + | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } + | branch { + meta, bams -> + single_bam: bams.size() == 1 + multi_bams: true + } | set { ch_bams } - // Merge - SAMTOOLS_MERGE ( ch_bams, [], [] ) + // Merge, but only if there is more than 1 file + SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - // Position sort BAM file - SAMTOOLS_SORT ( SAMTOOLS_MERGE.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions.first() ) - - // Convert merged BAM to CRAM and calculate indices and statistics - SAMTOOLS_SORT.out.bam + SAMTOOLS_MERGE.out.bam + | mix ( ch_bams.single_bam ) | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } - CONVERT_STATS ( ch_sort, ch_fasta ) + CONVERT_STATS ( ch_sort, fasta ) ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index 767563b..01cd1ac 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -5,7 +5,6 @@ include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio' include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' @@ 
-30,33 +29,36 @@ workflow ALIGN_PACBIO { | map { meta, file -> file } | set { ch_fasta } + // Align with minimap2. bam_format is set to true, making the output a *sorted* BAM MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, ch_fasta, true, false, false ) ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) // Collect all alignment output by sample name MINIMAP2_ALIGN.out.bam - | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], bam] } + | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } | groupTuple ( by: [0] ) + | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } + | branch { + meta, bams -> + single_bam: bams.size() == 1 + multi_bams: true + } | set { ch_bams } - // Merge - SAMTOOLS_MERGE ( ch_bams, [], [] ) + // Merge, but only if there is more than 1 file + SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - // Position sort BAM file - SAMTOOLS_SORT ( SAMTOOLS_MERGE.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions.first() ) - - // Convert merged BAM to CRAM and calculate indices and statistics - SAMTOOLS_SORT.out.bam + SAMTOOLS_MERGE.out.bam + | mix ( ch_bams.single_bam ) | map { meta, bam -> [ meta, bam, [] ] } | set { ch_sort } - CONVERT_STATS ( ch_sort, ch_fasta ) + CONVERT_STATS ( ch_sort, fasta ) ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) diff --git a/subworkflows/local/convert_stats.nf b/subworkflows/local/convert_stats.nf index 7c381fd..2ea16b9 100644 --- a/subworkflows/local/convert_stats.nf +++ b/subworkflows/local/convert_stats.nf @@ -11,7 +11,7 @@ include { SAMTOOLS_IDXSTATS } from '../../modules/nf-core/samtools/idxstats/main workflow CONVERT_STATS { take: bam // channel: [ val(meta), /path/to/bam, /path/to/bai] - fasta // channel: /path/to/fasta + fasta // channel: [ val(meta), 
/path/to/fasta ] main: diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf index 6078d4d..3d55282 100644 --- a/subworkflows/local/filter_pacbio.nf +++ b/subworkflows/local/filter_pacbio.nf @@ -28,7 +28,7 @@ workflow FILTER_PACBIO { | map { meta, bam -> [ meta, bam, [] ] } | set { ch_pacbio } - SAMTOOLS_CONVERT ( ch_pacbio, [], [] ) + SAMTOOLS_CONVERT ( ch_pacbio, [ [], [] ], [] ) ch_versions = ch_versions.mix ( SAMTOOLS_CONVERT.out.versions.first() ) @@ -62,7 +62,7 @@ workflow FILTER_PACBIO { | join ( SAMTOOLS_CONVERT.out.csi ) | set { ch_reads } - SAMTOOLS_FILTER ( ch_reads, [], PACBIO_FILTER.out.list ) + SAMTOOLS_FILTER ( ch_reads, [ [], [] ], PACBIO_FILTER.out.list ) ch_versions = ch_versions.mix ( SAMTOOLS_FILTER.out.versions.first() ) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 379d2fa..d8a8a53 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -3,6 +3,7 @@ // include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' +include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/samtools/flagstat/main' workflow INPUT_CHECK { @@ -11,20 +12,35 @@ workflow INPUT_CHECK { main: + ch_versions = Channel.empty() + + // Read the samplesheet SAMPLESHEET_CHECK ( samplesheet ).csv | splitCsv ( header:true, sep:',' ) - | map { create_data_channel( it ) } + // Prepare the channel for SAMTOOLS_FLAGSTAT + | map { row -> [row + [id: file(row.datafile).baseName], file(row.datafile, checkIfExists: true), []] } + | set { samplesheet_rows } + ch_versions = ch_versions.mix ( SAMPLESHEET_CHECK.out.versions.first() ) + + // Get stats from each input file + SAMTOOLS_FLAGSTAT ( samplesheet_rows ) + ch_versions = ch_versions.mix ( SAMTOOLS_FLAGSTAT.out.versions.first() ) + + // Create the read channel for the rest of the pipeline + samplesheet_rows + | join( SAMTOOLS_FLAGSTAT.out.flagstat ) + | map { meta, datafile, meta2, stats -> create_data_channel( 
meta, datafile, stats ) } | set { reads } emit: reads // channel: [ val(meta), /path/to/datafile ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } // Function to get list of [ meta, reads ] -def create_data_channel ( LinkedHashMap row ) { +def create_data_channel ( LinkedHashMap row, datafile, stats ) { // create meta map def meta = [:] meta.id = row.sample @@ -39,13 +55,15 @@ def create_data_channel ( LinkedHashMap row ) { } meta.read_group = "\'@RG\\tID:" + row.datafile.split('/')[-1].split('\\.')[0] + "\\tPL:" + platform + "\\tSM:" + meta.id.split('_')[0..-2].join('_') + "\'" - - // add path(s) of the read file(s) to the meta map - def data_meta = [] - if ( !file(row.datafile).exists() ) { - exit 1, "ERROR: Please check input samplesheet -> Data file does not exist!\n${row.datafile}" - } else { - data_meta = [ meta, file(row.datafile) ] + // Read the first line of the flagstat file + // 3127898040 + 0 in total (QC-passed reads + QC-failed reads) + // and make the sum of both integers + stats.withReader { + line = it.readLine() + def lspl = line.split() + def read_count = lspl[0].toLong() + lspl[2].toLong() + meta.read_count = read_count } - return data_meta + + return [meta, datafile] } diff --git a/subworkflows/local/markdup_stats.nf b/subworkflows/local/markdup_stats.nf index 2dc8312..9b271f8 100644 --- a/subworkflows/local/markdup_stats.nf +++ b/subworkflows/local/markdup_stats.nf @@ -3,9 +3,10 @@ // Convert to CRAM and calculate statistics // -include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' -include { MARKDUPLICATE } from '../../subworkflows/local/markduplicate' -include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' +include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_SORMADUP } from 
'../../modules/local/samtools_sormadup' +include { CONVERT_STATS } from '../../subworkflows/local/convert_stats' workflow MARKDUP_STATS { @@ -25,26 +26,38 @@ workflow MARKDUP_STATS { // Collect all BWAMEM2 output by sample name SAMTOOLS_SORT.out.bam - | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], bam] } + | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } | groupTuple( by: [0] ) + | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } + | branch { + meta, bams -> + single_bam: bams.size() == 1 + multi_bams: true + } | set { ch_bams } + // Merge, but only if there is more than 1 file + SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + + + SAMTOOLS_MERGE.out.bam + | mix ( ch_bams.single_bam ) + | set { ch_bam } + + // Mark duplicates - MARKDUPLICATE ( ch_bams ) - ch_versions = ch_versions.mix ( MARKDUPLICATE.out.versions ) + SAMTOOLS_SORMADUP ( ch_bam, fasta ) + ch_versions = ch_versions.mix ( SAMTOOLS_SORMADUP.out.versions ) // Convert merged BAM to CRAM and calculate indices and statistics - MARKDUPLICATE.out.bam + SAMTOOLS_SORMADUP.out.bam | map { meta, bam -> [ meta, bam, [] ] } | set { ch_stat } - fasta - | map { meta, file -> file } - | set { ch_fasta } - - CONVERT_STATS ( ch_stat, ch_fasta ) + CONVERT_STATS ( ch_stat, fasta ) ch_versions = ch_versions.mix ( CONVERT_STATS.out.versions ) diff --git a/subworkflows/local/markduplicate.nf b/subworkflows/local/markduplicate.nf deleted file mode 100644 index 04ceb96..0000000 --- a/subworkflows/local/markduplicate.nf +++ /dev/null @@ -1,49 +0,0 @@ -// -// Merge BAM files and mark duplicates -// - -include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_COLLATE } from '../../modules/nf-core/samtools/collate/main' -include { SAMTOOLS_FIXMATE } from 
'../../modules/nf-core/samtools/fixmate/main' -include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' -include { SAMTOOLS_MARKDUP } from '../../modules/nf-core/samtools/markdup/main' - - -workflow MARKDUPLICATE { - take: - bams // channel: [ val(meta), [ /path/to/bams ] ] - - - main: - ch_versions = Channel.empty() - - - // Merge position sorted bam files - SAMTOOLS_MERGE ( bams, [], [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) - - - // Collate merged BAM file - SAMTOOLS_COLLATE ( SAMTOOLS_MERGE.out.bam, [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_COLLATE.out.versions.first() ) - - - // Fill in mate coordinates and insert size fields - SAMTOOLS_FIXMATE ( SAMTOOLS_COLLATE.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_FIXMATE.out.versions.first() ) - - - // Position sort BAM file - SAMTOOLS_SORT ( SAMTOOLS_FIXMATE.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_SORT.out.versions.first() ) - - - // Mark duplicates - SAMTOOLS_MARKDUP ( SAMTOOLS_SORT.out.bam, [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MARKDUP.out.versions.first() ) - - - emit: - bam = SAMTOOLS_MARKDUP.out.bam // channel: [ val(meta), /path/to/bam ] - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index b5f23fd..5264569 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -19,12 +19,15 @@ workflow PREPARE_GENOME { // Uncompress genome fasta file if required if ( params.fasta.endsWith('.gz') ) { - ch_fasta = GUNZIP ( fasta ).gunzip + ch_unzipped = GUNZIP ( fasta ).gunzip ch_versions = ch_versions.mix ( GUNZIP.out.versions ) } else { - ch_fasta = fasta + ch_unzipped = fasta } + ch_unzipped + | map { meta, fa -> [ meta + [id: fa.baseName, genome_size: fa.size()], fa] } + | set { ch_fasta } // Unmask genome fasta UNMASK ( ch_fasta ) diff --git a/workflows/readmapping.nf b/workflows/readmapping.nf 
index 19641a1..9d12b0b 100644 --- a/workflows/readmapping.nf +++ b/workflows/readmapping.nf @@ -45,6 +45,7 @@ include { ALIGN_ONT } from '../subworkflows/local/align_ont' // include { UNTAR } from '../modules/nf-core/untar/main' +include { CRUMBLE } from '../modules/nf-core/crumble/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' @@ -80,7 +81,7 @@ workflow READMAPPING { // SUBWORKFLOW: Uncompress and prepare reference genome files // ch_fasta - | map { [ [ id: it.baseName.tokenize(".")[0..1].join(".") ], it ] } + | map { [ [ id: it.baseName ], it ] } | set { ch_genome } PREPARE_GENOME ( ch_genome ) @@ -125,6 +126,13 @@ workflow READMAPPING { ch_versions = ch_versions.mix ( ALIGN_ONT.out.versions ) + // + // MODULE: To compress PacBio HiFi aligned CRAM files + // + CRUMBLE ( ALIGN_HIFI.out.cram, [], true ) + ch_versions = ch_versions.mix ( CRUMBLE.out.versions ) + + // // MODULE: Combine different versions.yml //