diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index bda92351..ead7feb3 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,8 +1,11 @@
-Many thanks to contributing to nf-core/kmer-similarity!
+Many thanks for contributing to nf-core/kmermaid!
+
+To ensure that your build passes, please make sure your pull request is to the `dev` branch rather than to `master`. Thank you!

 Please fill in the appropriate checklist below (delete whatever is not relevant). These are the most common things requested on pull requests (PRs).

 ## PR checklist

+ - [ ] PR is to `dev` rather than `master`
 - [ ] This comment contains a description of changes (with reason)
 - [ ] If you've fixed a bug or added code that should be tested, add tests!
 - [ ] If necessary, also make a PR on the [nf-core/kmer-similarity branch on the nf-core/test-datasets repo]( https://github.com/nf-core/test-datasets/pull/new/nf-core/kmer-similarity)
diff --git a/.gitignore b/.gitignore
index 6b611500..75b11857 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 work/
 results
 my-results
+__pycache__

 # Nextflow outputs
 timeline.html*
@@ -89,4 +90,4 @@
 atlassian-ide-plugin.xml
 com_crashlytics_export_strings.xml
 crashlytics.properties
 crashlytics-build.properties
-fabric.properties
\ No newline at end of file
+fabric.properties
diff --git a/.travis.yml b/.travis.yml
index 64e50193..c538905d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,32 +6,51 @@
 python: '3.6'
 cache: pip
 matrix:
   fast_finish: true
+  include:
+    - name: "Minimum Nextflow version, regular test suite"
+      env: NXF_VER='0.32.0' SUITE=test FLAGS=
+      language: java
+      jdk: openjdk8
+    - name: "Latest Nextflow version, regular test suite"
+      env: NXF_VER='' SUITE=test FLAGS=
+    - name: "Latest Nextflow version, regular test suite with splitKmer, ensure that `protein` can't be specified"
+      # Check exit code to make sure it is nonzero for --splitKmer + --molecules protein
+      script:
+        - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --splitKmer ; if [ $? -eq 0 ]; then echo "--splitKmer + --molecules protein should fail but did not" && exit 1 ; else echo "Correctly failed --splitKmer + --molecules protein" ; fi
+    - name: "Latest Nextflow version, split k-mer test suite"
+      env: NXF_VER='' SUITE=test_ska FLAGS=
+    - name: "Latest Nextflow version, split k-mer test suite, test subsampling"
+      env: NXF_VER='' SUITE=test_ska FLAGS=--subsample 10
+    - name: "Lint the pipeline code"
+      install:
+        # Install nf-core/tools
+        - pip install --upgrade pip
+        - pip install nf-core
+      script: nf-core lint ${TRAVIS_BUILD_DIR}
+      python: '3.6'
+      jdk: openjdk8
+    - name: "Lint the documentation"
+      script: markdownlint ${TRAVIS_BUILD_DIR} -c ${TRAVIS_BUILD_DIR}/.github/markdownlint.yml
+      python: '3.6'

 before_install:
   # PRs to master are only ok if coming from dev branch
   - '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])'
   # Pull the docker image first so the test doesn't wait for this
-  - docker pull czbiohub/nf-kmer-similarity:dev
+  - docker pull nfcore/kmermaid:dev
   # Fake the tag locally so that the pipeline runs properly
-  - docker tag czbiohub/nf-kmer-similarity:dev czbiohub/nf-kmer-similarity:dev
+  - docker tag nfcore/kmermaid:dev nfcore/kmermaid:dev

 install:
   # Install Nextflow
   - mkdir /tmp/nextflow && cd /tmp/nextflow
   - wget -qO- get.nextflow.io | bash
   - sudo ln -s /tmp/nextflow/nextflow /usr/local/bin/nextflow
-  # Install nf-core/tools
-  - pip install nf-core
   # Reset
   - mkdir ${TRAVIS_BUILD_DIR}/tests && cd ${TRAVIS_BUILD_DIR}/tests
+  # Install markdownlint-cli
+  - sudo apt-get install npm && npm install -g markdownlint-cli

 script:
-  # Lint the pipeline code
-  # Skip linting for now since container is built by czbiohub
-  # - nf-core lint ${TRAVIS_BUILD_DIR}
   # Run the pipeline with the test profile
-  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile ${SUITE},docker ${FLAGS}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2229927f..5c03aedd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,20 @@
 # nf-core/nf-kmer-similarity: Changelog

-## v1.0dev - 6 March 2019
+## v1.1dev - 9 September 2019
+
+#### Pipeline Updates
+
+* Add option to use Dayhoff encoding for sourmash
+* Add fastq subsampling/truncating optional parameter using [seqtk](https://github.com/lh3/seqtk)
+* Add support for k-mer comparisons using Split Kmer Analysis [SKA](https://github.com/simonrharris/SKA)
+
+#### Dependency Updates
+
+* Add `ska` 1.0 and `seqtk` 1.3 to container dependencies
+
+## v1.0 - 6 March 2019
+
 Initial release of nf-core/nf-kmer-similarity, created with the [nf-core](http://nf-co.re/) template.
diff --git a/Dockerfile b/Dockerfile
index 57b7dd94..f2910d9c 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,20 +1,17 @@
-FROM continuumio/anaconda3
-MAINTAINER olga.botvinnik@czbiohub.org
+FROM nfcore/base
+LABEL description="Docker image containing all requirements for nf-core/kmermaid pipeline"
+
+COPY environment.yml /
+RUN conda env create -f /environment.yml && conda clean -a
+ENV PATH /opt/conda/envs/nfcore-kmermaid-0.1dev/bin:$PATH

 # Suggested tags from https://microbadger.com/labels
 ARG VCS_REF
 LABEL org.label-schema.vcs-ref=$VCS_REF \
-org.label-schema.vcs-url="e.g. https://github.com/czbiohub/nf-kmer-similarity"
-
+org.label-schema.vcs-url="https://github.com/nf-core/kmermaid"

 WORKDIR /home
-USER root
-
-# Add user "main" because that's what is expected by this image
-RUN useradd -ms /bin/bash main
-

 ENV PACKAGES zlib1g git g++ make ca-certificates gcc zlib1g-dev libc6-dev procps

 ### don't modify things below here for version updates etc.

@@ -25,28 +22,16 @@
 RUN apt-get update && \
     apt-get install -y --no-install-recommends ${PACKAGES} && \
     apt-get clean

-RUN conda install --yes Cython bz2file pytest numpy matplotlib scipy sphinx alabaster
+RUN which -a pip
+RUN which -a python

+ENV SOURMASH_VERSION 'olgabot/dayhoff'
 RUN cd /home && \
-    git clone https://github.com/dib-lab/khmer.git -b master && \
-    cd khmer && \
-    python3 setup.py install
-
-# Check that khmer was installed properly
-RUN trim-low-abund.py --help
-RUN trim-low-abund.py --version
-
-
-# Required for multiprocessing of 10x bam file
-# RUN pip install pathos bamnostic
-
-# ENV SOURMASH_VERSION master
-RUN cd /home && \
-    git clone https://github.com/dib-lab/sourmash.git && \
+    git clone --branch $SOURMASH_VERSION https://github.com/czbiohub/sourmash.git && \
     cd sourmash && \
-    python3 setup.py install
+    python setup.py install
+
+RUN which -a sourmash

-RUN which -a python3
-RUN python3 --version
 RUN sourmash info

 COPY docker/sysctl.conf /etc/sysctl.conf
diff --git a/Dockerfile.ska b/Dockerfile.ska
new file mode 100644
index 00000000..8e10672e
--- /dev/null
+++ b/Dockerfile.ska
@@ -0,0 +1,7 @@
+FROM nfcore/base
+LABEL authors="Phoenix Logan" \
+      description="Docker image containing all requirements for ska portion of the nfcore/kmermaid pipeline"
+
+COPY environment.ska.yml /
+RUN conda env create -f environment.ska.yml && conda clean -a
+ENV PATH /opt/conda/envs/nf-core-splitkmeranalysis-1.0.0/bin:$PATH
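For local debugging, the two images can be built and tagged the same way the Travis config fakes the `:dev` tags. This is only a sketch: the SKA image name `nfcore/kmermaid-ska:dev` is an assumption, since the diff does not pin down how the second image is tagged.

```bash
# Build the main pipeline image and the SKA-only image from the repo root.
# The main tag mirrors what .travis.yml pulls; the SKA tag is hypothetical.
docker build -t nfcore/kmermaid:dev .
docker build -t nfcore/kmermaid-ska:dev -f Dockerfile.ska .
```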
diff --git a/README.md b/README.md
index d87ea0da..c789396a 100644
--- a/README.md
+++ b/README.md
@@ -1,33 +1,58 @@
 # nf-kmer-similarity

-This is a [Nextflow](nextflow.io) workflow for running k-mer similarity
+This is a [Nextflow](nextflow.io) workflow for computing k-mer similarity.

 [![Docker Cloud Build Status](https://img.shields.io/docker/cloud/build/czbiohub/nf-kmer-similarity.svg)](https://cloud.docker.com/u/czbiohub/repository/docker/czbiohub/nf-kmer-similarity)

 ## Usage

-### With a samples.csv file:
+By default, this pipeline creates a [MinHash](https://en.wikipedia.org/wiki/MinHash) sketch of sequencing reads using [sourmash](https://sourmash.readthedocs.io), then compares them all using a [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index). Here are the default parameters:

+- log2 sketch sizes of 10, 12, 14, 16 (as if `--log2_sketch_sizes 10,12,14,16` was specified on the command line), so 2^10, 2^12, 2^14, 2^16 = 1024, 4096, 16 384, 65 536 hashed k-mers for each sample
+- Compute both DNA and protein signatures (as if `--molecules dna,protein` was specified on the command line). The protein k-mers are obtained by doing [six-frame translation](https://en.wikipedia.org/wiki/Reading_frame#/media/File:Open_reading_frame.jpg) on the DNA k-mers
+- K-mer sizes of 21, 27, 33, 51 (as if `--ksizes 21,27,33,51` was specified on the command line).
+  - If using the `--splitKmer` option, keep in mind that the k-mer size in this case refers to each half of the split k-mer, which you can visualize as `[---ksize---]N[---ksize---]`. So the default k-mer sizes for `--splitKmer` are 9 and 15, for total sequence unit sizes of `2*9+1 = 19` and `2*15+1 = 31`, as if you had specified `--splitKmer --ksizes 9,15` on the command line. Additionally, k-mer sizes with `--splitKmer` must be divisible by 3 (yes, this is inconvenient)
+
+### With a samples.csv file
+
+This is where you'd have a csv file with a `sample_id,read1,read2` header containing the sample id and paths to each of your R1 and R2 read files.

 ```bash
 nextflow run czbiohub/nf-kmer-similarity --outdir s3://olgabot-maca/nf-kmer-similarity/ --samples samples.csv
 ```
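As a concrete illustration of that layout, a minimal `samples.csv` could be written like the following. The sample names and paths are hypothetical; only the `sample_id,read1,read2` header is prescribed above.

```bash
# Write a two-sample samples.csv (header must be sample_id,read1,read2).
cat > samples.csv <<'EOF'
sample_id,read1,read2
liver_1,s3://my-bucket/liver_1_R1.fastq.gz,s3://my-bucket/liver_1_R2.fastq.gz
kidney_1,s3://my-bucket/kidney_1_R1.fastq.gz,s3://my-bucket/kidney_1_R2.fastq.gz
EOF
```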
-### With R1, R2 read pairs:
+### With R1, R2 read pairs

-```
+```bash
 nextflow run czbiohub/nf-kmer-similarity --outdir s3://olgabot-maca/nf-kmer-similarity/ \
   --read_pairs 's3://olgabot-maca/sra/homo_sapiens/smartseq2_quartzseq/*{R1,R2}*.fastq.gz,s3://olgabot-maca/sra/danio_rerio/smart-seq/whole_kidney_marrow_prjna393431/*{1,2}.fastq.gz'
 ```

-### With SRA ids:
+### With SRA ids

-```
+```bash
 nextflow run czbiohub/nf-kmer-similarity --outdir s3://olgabot-maca/nf-kmer-similarity/ --sra SRP016501
 ```

-### With fasta files:
+### With fasta files

-```
+```bash
 nextflow run czbiohub/nf-kmer-similarity --outdir s3://olgabot-maca/nf-kmer-similarity/ \
   --fastas '*.fasta'
 ```
+
+### With Split Kmer Analysis [SKA](https://github.com/simonrharris/SKA)
+
+Note: the meaning of `ksize` is different with split k-mers. The value specified by `--ksizes` is just under half of the total sampled sequence size, where the middle base can be any base (`N`): `[---ksize---]N[---ksize---]`. Note that `--splitKmer` only works with DNA sequence and does not work with `protein` specified in `--molecules`.
+
+```bash
+nextflow run czbiohub/nf-kmer-similarity --outdir s3://olgabot-maca/nf-kmer-similarity/ --samples samples.csv --splitKmer
+```
+
+### With Split Kmer Analysis [SKA](https://github.com/simonrharris/SKA) and fastq subsampling with [seqtk](https://github.com/lh3/seqtk)
+
+The `--subsample` option is often necessary because the `ska` tool uses ALL the reads rather than a MinHash subsampling of them. If your input files are large, the `ska` sketching command (`ska fastq`) can run out of memory or take untenably long. `--subsample` specifies the number of reads to use.
+
+```bash
+nextflow run czbiohub/nf-kmer-similarity --outdir s3://olgabot-maca/nf-kmer-similarity/ --samples samples.csv --splitKmer --subsample 1000
+```
diff --git a/conf/base.config b/conf/base.config
index 70a3a1a1..b3f191d3 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -13,7 +13,7 @@ process {

   cpus = { check_max( 2, 'cpus' ) }
   memory = { check_max( 8.GB * task.attempt, 'memory' ) }
-  time = { check_max( 2.h * task.attempt, 'time' ) }
+  time = { check_max( 16.h * task.attempt, 'time' ) }

   errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'terminate' }
   maxRetries = 1

@@ -21,7 +21,7 @@ process {
   // Process-specific resource requirements
   withLabel: low_memory {
-    memory = { check_max( 16.GB * task.attempt, 'memory' ) }
+    memory = { check_max( 8.GB * task.attempt, 'memory' ) }
   }
   withLabel: mid_memory {
     memory = { check_max( 32.GB * task.attempt, 'memory' ) }
diff --git a/conf/test.config b/conf/test.config
index f42adbeb..5179860d 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -19,7 +19,7 @@ params {
   // fastas = 'testing/fastas/*.fasta'
   ksizes = '3,9'
   log2_sketch_sizes = '2,4'
-  molecules = 'dna,protein'
+  molecules = 'dna,protein,dayhoff'
   // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz'
   // sra = "SRP016501"
   read_paths = [
diff --git a/conf/test_ska.config b/conf/test_ska.config
new file mode 100644
index 00000000..dd401209
--- /dev/null
+++ b/conf/test_ska.config
@@ -0,0 +1,27 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/kmermaid -profile test_ska
+ */
+
+params {
+  config_profile_name = 'Test profile'
+  config_profile_description = 'Minimal test dataset to check pipeline function'
+  // Limit resources so that this can run on Travis
+  max_cpus = 2
+  max_memory = 6.GB
+  max_time = 48.h
+  // Input data
+  ksizes = '3,6'
+  molecules = 'dna'
+  splitKmer = true
+  read_paths = [
+    ['SRR4050379', ['https://github.com/czbiohub/test-datasets/raw/kmer-similarity/testdata/SRR4050379_pass_1.fastq.gz',
+                    'https://github.com/czbiohub/test-datasets/raw/kmer-similarity/testdata/SRR4050379_pass_2.fastq.gz']],
+    ['SRR4050380', ['https://github.com/czbiohub/test-datasets/raw/kmer-similarity/testdata/SRR4050380_pass_1.fastq.gz',
+                    'https://github.com/czbiohub/test-datasets/raw/kmer-similarity/testdata/SRR4050380_pass_2.fastq.gz']],
+  ]
+}
diff --git a/docs/installation.md b/docs/installation.md
index a72a62a2..903fe48c 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -1,6 +1,6 @@
-# nf-core/nf-kmer-similarity: Installation
+# nf-core/kmermaid: Installation

-To start using the nf-core/nf-kmer-similarity pipeline, follow the steps below:
+To start using the nf-core/kmermaid pipeline, follow the steps below:

 1. [Install Nextflow](#1-install-nextflow)
 2. [Install the pipeline](#2-install-the-pipeline)
@@ -72,7 +72,7 @@ Be warned of two important points about this default configuration:

 #### 3.1) Software deps: Docker
 First, install docker on your system: [Docker Installation Instructions](https://docs.docker.com/engine/installation/)

-Then, running the pipeline with the option `-profile standard,docker` tells Nextflow to enable Docker for this run. An image containing all of the software requirements will be automatically fetched and used from dockerhub (https://hub.docker.com/r/nfcore/nf-kmer-similarity).
+Then, running the pipeline with the option `-profile standard,docker` tells Nextflow to enable Docker for this run. An image containing all of the software requirements will be automatically fetched and used from [dockerhub](https://hub.docker.com/r/nfcore/nf-kmer-similarity).

 #### 3.2) Software deps: Singularity
 If you're not able to use Docker then [Singularity](http://singularity.lbl.gov/) is a great alternative.
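Putting the Docker section together with the README examples above, a typical Docker-enabled invocation might look like the sketch below. The pipeline name `nf-core/kmermaid` and the input glob are assumptions; `--read_pairs` is the parameter documented in the README.

```bash
# Enable Docker via the profile; Nextflow fetches the image from Docker Hub.
nextflow run nf-core/kmermaid -profile standard,docker \
    --read_pairs 'data/*{R1,R2}*.fastq.gz' \
    --outdir results
```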
diff --git a/docs/output.md b/docs/output.md
index 72ecfe76..c5606fa8 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -8,28 +8,21 @@

 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

 * [FastQC](#fastqc) - read quality control
-* [Sourmash sketch](#sourmash-sketch) - Compute a k-mer sketch of each sample
-* [Sourmash compare](#sourmash-compare) - Compare all samples on k-mer sketches
+* [Sourmash](#sourmash) - MinHash to subset the reads before comparing samples
+  * [Sourmash sketch](#sourmash-sketch) - Compute a k-mer sketch of each sample
+  * [Sourmash compare](#sourmash-compare) - Compare all samples on k-mer sketches
+* [Split K-mer Analysis (SKA)](#split-k-mer-analysis-ska)
+  * [SKA sketch](#ska-sketch) - Compute a k-mer sketch of each sample
+  * [SKA compare](#ska-compare) - Compare all samples on k-mer sketches
 * [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline

-## FastQC
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads. It provides information about the quality score distribution across your reads, the per base sequence content (%T/A/G/C). You get information about adapter contamination and other overrepresented sequences.
-
-For further reading and documentation see the [FastQC help](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
-
-> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the `trim_galore` directory.
-
-**Output directory: `results/fastqc`**
-
-* `sample_fastqc.html`
-  * FastQC report, containing quality metrics for your untrimmed raw fastq files
-* `zips/sample_fastqc.zip`
-  * zip file containing the FastQC report, tab-delimited data file and plot images
-
-## Sourmash Sketch
+## Sourmash

 [Sourmash](https://sourmash.readthedocs.io/en/latest/) is a tool to compute MinHash sketches on nucleotide (DNA/RNA) and protein sequences. It allows for fast comparisons of sequences based on their nucleotide content.
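For orientation, the sketch step described below boils down to a command like the following. This mirrors the `sourmash compute` call in `main.nf` later in this diff; the sample name and read files are hypothetical.

```bash
# One signature per sample: 2^12 hashes, k=21, DNA only, with both reads
# merged under the sample name so they form a single signature.
sourmash compute \
    --num-hashes $((2**12)) \
    --ksizes 21 \
    --dna \
    --output sample1_molecule-dna_ksize-21_log2sketchsize-12.sig \
    --merge 'sample1' sample1_R1.fastq.gz sample1_R2.fastq.gz
```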
+### Sourmash sketch
+
 **Output directory: `results/sourmash/sketches`**

 For each sample and provided `molecule`, `ksize` and `log2_sketch_size`, a file is created:

@@ -38,7 +31,7 @@

 For example:

-```
+```bash
 SRR4050379_molecule-dayhoff_ksize-3_log2sketchsize-2.sig
 SRR4050379_molecule-dayhoff_ksize-3_log2sketchsize-4.sig
 SRR4050379_molecule-dayhoff_ksize-9_log2sketchsize-2.sig
@@ -53,9 +46,9 @@
 SRR4050379_molecule-protein_ksize-9_log2sketchsize-2.sig
 SRR4050379_molecule-protein_ksize-9_log2sketchsize-4.sig
 ```

-## Sourmash Compare
+### Sourmash compare

-**Output directory: `results/sourmash`**
+**Output directory: `results/sourmash/compare`**

 For each provided `molecule`, `ksize` and `log2_sketch_size`, a file is created containing a symmetric matrix of the similarity between all samples, written as a comma-separated value file:

@@ -63,7 +56,7 @@

 For example,

-```
+```bash
 similarities_molecule-dna_ksize-3_log2sketchsize-2.csv
 similarities_molecule-dna_ksize-3_log2sketchsize-4.csv
 similarities_molecule-dna_ksize-9_log2sketchsize-2.csv
 similarities_molecule-protein_ksize-9_log2sketchsize-2.csv
 similarities_molecule-protein_ksize-9_log2sketchsize-4.csv
 ```

@@ -74,6 +67,22 @@

+## Split K-mer Analysis (SKA)
+
+[Split K-mer analysis (SKA)](https://github.com/simonrharris/SKA) is a program that takes ALL the reads from a sample and finds split k-mers.
+
+### SKA sketch
+
+**Output directory: `results/ska/sketches`**
+
+
+
+### SKA compare
+
+**Output directory: `results/ska/compare`**
+
+
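Concretely, the two SKA steps correspond to commands like these, mirroring the `ska fastq` and `ska distance` calls in `main.nf` below; the sample names are hypothetical.

```bash
# Sketch each sample's reads into a split k-mer file (.skf) at k=9,
# then compute pairwise distances and clusters across all sketches.
ska fastq -k 9 -o sample1_ksize_9 sample1_R1.fastq.gz sample1_R2.fastq.gz
ska fastq -k 9 -o sample2_ksize_9 sample2_R1.fastq.gz sample2_R2.fastq.gz
ska distance -o ksize_9 -s 25 -i 0.95 sample1_ksize_9.skf sample2_ksize_9.skf
```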
 ## MultiQC

 [MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available within the report data directory.

@@ -86,4 +95,4 @@
 The pipeline has special steps which allow the software versions used to be reported
 * `Project_multiqc_data/`
   * Directory containing parsed statistics from the different tools used in the pipeline

-For more information about how to use MultiQC reports, see http://multiqc.info
+For more information about how to use MultiQC reports, see [here](http://multiqc.info).
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index bfbbdbd5..c8063eeb 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -1,4 +1,4 @@
-# nf-core/nf-kmer-similarity: Troubleshooting
+# nf-core/kmermaid: Troubleshooting

 ## Input files not found

@@ -7,11 +7,11 @@
 If no files, only one input file, or only read one and not read two are picked up, check the following:

 1. The path must be enclosed in quotes (`'` or `"`)
 2. The path must have at least one `*` wildcard character. This applies even if you are only running one paired end sample.
 3. When using the pipeline with paired end data, the path must use `{1,2}` or `{R1,R2}` notation to specify read pairs.
-4. If you are running Single end data make sure to specify `--singleEnd`
+4. If you are running single-end data, make sure to specify `--singleEnd`

 If the pipeline can't find your files then you will get the following error

-```
+```bash
 ERROR ~ Cannot find any reads matching: *{1,2}.fastq.gz
 ```
diff --git a/docs/usage.md b/docs/usage.md
index 3b57c903..16f4c014 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -1,4 +1,4 @@
-# nf-core/nf-kmer-similarity: Usage
+# nf-core/kmermaid: Usage

 ## Table of contents

@@ -7,41 +7,43 @@
 * [Updating the pipeline](#updating-the-pipeline)
 * [Reproducibility](#reproducibility)
 * [Main arguments](#main-arguments)
-  * [`-profile`](#-profile-single-dash)
-    * [`docker`](#docker)
-    * [`awsbatch`](#awsbatch)
-    * [`standard`](#standard)
-    * [`none`](#none)
-  * [Read inputs](#read-inputs)
-    * [`--read_pairs`](#--read_pairs)
-    * [`--read_singles`](#--read_singles)
-    * [`--csv_pairs`](#--csv_pairs)
-    * [`--csv_singles`](#--csv_singles)
-    * [`--fastas`](#--fastas)
-    * [`--sra`](#--sra)
-  * [Sketch parameters](#sketch-parameters)
-    * [`--molecule`](#--molecule)
-    * [`--ksize`](#--ksize)
-    * [`--log2_sketch_size`](#--log2_sketch_size)
+    * [`-profile`](#-profile-single-dash)
+        * [`docker`](#docker)
+        * [`awsbatch`](#awsbatch)
+        * [`standard`](#standard)
+        * [`none`](#none)
+    * [Read inputs](#read-inputs)
+        * [`--read_pairs`](#--read_pairs)
+        * [`--read_singles`](#--read_singles)
+        * [`--csv_pairs`](#--csv_pairs)
+        * [`--csv_singles`](#--csv_singles)
+        * [`--fastas`](#--fastas)
+        * [`--sra`](#--sra)
+    * [K-merization/Sketching program options](#k-merization-sketching-program-options)
+        * [`--splitKmer`](#--splitKmer)
+    * [Sketch parameters](#sketch-parameters)
+        * [`--molecule`](#--molecule)
+        * [`--ksize`](#--ksize)
+        * [`--log2_sketch_size`](#--log2_sketch_size)
 * [Job Resources](#job-resources)
 * [Automatic resubmission](#automatic-resubmission)
 * [Custom resource requests](#custom-resource-requests)
 * [AWS batch specific parameters](#aws-batch-specific-parameters)
-  * [`-awsbatch`](#-awsbatch)
-  * [`--awsqueue`](#--awsqueue)
-  * [`--awsregion`](#--awsregion)
+    * [`-awsbatch`](#-awsbatch)
+    * [`--awsqueue`](#--awsqueue)
+    * [`--awsregion`](#--awsregion)
 * [Other command line parameters](#other-command-line-parameters)
-  * [`--outdir`](#--outdir)
-  * [`--email`](#--email)
-  * [`-name`](#-name-single-dash)
-  * [`-resume`](#-resume-single-dash)
-  * [`-c`](#-c-single-dash)
-  * [`--max_memory`](#--max_memory)
-  * [`--max_time`](#--max_time)
-  * [`--max_cpus`](#--max_cpus)
-  * [`--plaintext_emails`](#--plaintext_emails)
-  * [`--sampleLevel`](#--sampleLevel)
-  * [`--multiqc_config`](#--multiqc_config)
+    * [`--outdir`](#--outdir)
+    * [`--email`](#--email)
+    * [`-name`](#-name-single-dash)
+    * [`-resume`](#-resume-single-dash)
+    * [`-c`](#-c-single-dash)
+    * [`--max_memory`](#--max_memory)
+    * [`--max_time`](#--max_time)
+    * [`--max_cpus`](#--max_cpus)
+    * [`--plaintext_emails`](#--plaintext_emails)
+    * [`--sampleLevel`](#--sampleLevel)
+    * [`--multiqc_config`](#--multiqc_config)

 ## General Nextflow info

@@ -55,6 +57,7 @@
 NXF_OPTS='-Xms1g -Xmx4g'
 ```

 ## Running the pipeline

 The typical command for running the pipeline is as follows:
+
 ```bash
 nextflow run nf-core/nf-kmer-similarity --reads '*_R{1,2}.fastq.gz' -profile standard,docker
 ```

@@ -91,24 +94,24 @@
 This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future.

 ## Main arguments

 ### `-profile`

 Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. Note that multiple profiles can be loaded, for example: `-profile standard,docker` - the order of arguments is important!

 * `standard`
-  * The default profile, used if `-profile` is not specified at all.
-  * Runs locally and expects all software to be installed and available on the `PATH`.
+    * The default profile, used if `-profile` is not specified at all.
+    * Runs locally and expects all software to be installed and available on the `PATH`.
 * `docker`
-  * A generic configuration profile to be used with [Docker](http://docker.com/)
-  * Pulls software from dockerhub: [`nfcore/nf-kmer-similarity`](http://hub.docker.com/r/nfcore/nf-kmer-similarity/)
+    * A generic configuration profile to be used with [Docker](http://docker.com/)
+    * Pulls software from dockerhub: [`nfcore/nf-kmer-similarity`](http://hub.docker.com/r/nfcore/nf-kmer-similarity/)
 * `singularity`
-  * A generic configuration profile to be used with [Singularity](http://singularity.lbl.gov/)
-  * Pulls software from singularity-hub
+    * A generic configuration profile to be used with [Singularity](http://singularity.lbl.gov/)
+    * Pulls software from singularity-hub
 * `conda`
-  * A generic configuration profile to be used with [conda](https://conda.io/docs/)
-  * Pulls most software from [Bioconda](https://bioconda.github.io/)
+    * A generic configuration profile to be used with [conda](https://conda.io/docs/)
+    * Pulls most software from [Bioconda](https://bioconda.github.io/)
 * `awsbatch`
-  * A generic configuration profile to be used with AWS Batch.
+    * A generic configuration profile to be used with AWS Batch.
 * `test`
-  * A profile with a complete configuration for automated testing
-  * Includes links to test data so needs no other parameters
+    * A profile with a complete configuration for automated testing
+    * Includes links to test data so needs no other parameters
 * `none`
-  * No configuration at all. Useful if you want to build your own config from scratch and want to avoid loading in the default `base` config profile (not recommended).
+    * No configuration at all. Useful if you want to build your own config from scratch and want to avoid loading in the default `base` config profile (not recommended).

 ## Read inputs

@@ -205,6 +208,35 @@

 If left unspecified, no samples are used.

+## K-merization/Sketching program options
+
+By default, the k-merization and sketch creation program is [sourmash](https://sourmash.readthedocs.io).
+
+### `--splitKmer`
+
+If `--splitKmer` is specified, then the [Split K-mer Analysis (SKA)](https://github.com/simonrharris/SKA) program ([publication](https://www.biorxiv.org/content/10.1101/453142v1)) is used to obtain k-mers from the data. This allows for a SNP to be present in the middle of a k-mer, which can be advantageous for metagenomic analyses or with single-cell ATAC-seq data.
+
+#### What does `--ksize` mean when `--splitKmer` is set
+
+The meaning of `ksize` is different with split k-mers: the value specified by `--ksize` is just under half of the total sampled sequence size, where the middle base can be any base (`N`): `[---ksize---]N[---ksize---]`. When `--splitKmer` is set, the default k-mer sizes are 9 and 15, for total sequence unit sizes of `2*9+1 = 19` and `2*15+1 = 31`, as if you had specified `--splitKmer --ksize 9,15` on the command line. Additionally, k-mer sizes with `--splitKmer` must be divisible by 3 (yes, this is inconvenient) and between 3 and 60 (inclusive). So the "total" `2*k+1` sizes can be:
+
+* k = 3 --> 2*3 + 1 = 7 total length
+* k = 6 --> 2*6 + 1 = 13 total length
+* k = 9 --> 2*9 + 1 = 19 total length
+* k = 12 --> 2*12 + 1 = 25 total length
+* k = 15 --> 2*15 + 1 = 31 total length
+* ...
+* k = 60 --> 2*60 + 1 = 121 total length
+
+#### `--subsample` reads when `--splitKmer` is set
+
+The `--subsample` option is often necessary because the `ska` tool uses ALL the reads rather than a MinHash subsampling of them. If your input files are large, the `ska` sketching command (`ska fastq`) can run out of memory or take untenably long. `--subsample` specifies the number of reads to use: when e.g. `--subsample 1000` is set, 1000 reads (or read pairs) are randomly subsampled from the data using [seqtk](https://github.com/lh3/seqtk).
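Under the hood this is one seqtk call per read file. A standalone sketch of the same idea (the filenames are hypothetical): note that the same fixed seed keeps R1 and R2 in sync, and the output is recompressed because seqtk writes plain FASTQ to stdout.

```bash
# Subsample 1000 read pairs reproducibly: the identical -s seed must be
# used for both mates so the read pairing is preserved.
seqtk sample -s100 sample1_R1.fastq.gz 1000 | gzip > sample1_R1_1000.fastq.gz
seqtk sample -s100 sample1_R2.fastq.gz 1000 | gzip > sample1_R2_1000.fastq.gz
```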
+
+#### Which `--molecules` are valid when `--splitKmer` is set
+
+Currently, `--splitKmer` only works with DNA sequence and not protein sequence, and thus will fail if `protein` or `dayhoff` is specified in `--molecules`.

 ## Sketch parameters

 [K-mer](https://en.wikipedia.org/wiki/K-mer) [MinHash](https://en.wikipedia.org/wiki/MinHash) sketches are defined by three parameters:

@@ -230,7 +262,7 @@

 The molecule can be either `dna`, `protein`, or `dayhoff`, and if all of them are specified, sketches are computed for each.

 | I, L, M, V | Hydrophobic | e |
 | F, W, Y    | Aromatic    | f |

-**Example parameters**
+#### Example parameters

 * Default:
   * `--molecule dna,protein,dayhoff`

@@ -244,7 +276,7 @@

 The fundamental unit of the sketch is a [hashed](https://en.wikipedia.org/wiki/Hash_function) k-mer.

 *NB: if either `protein` or `dayhoff` is specified, the k-mer size must be divisible by 3*

-**Example parameters**
+#### Example parameters

 * Default:
   * `--ksize 21,27,33,51`

@@ -255,7 +287,7 @@

 The log2 sketch size specifies the number of k-mers to use for the sketch. We use the log2 of the sketch size instead of the raw number of k-mers so the sketch sizes are directly comparable with [`dashing`](https://github.com/dnbaker/dashing), which uses HyperLogLog instead of MinHash.

-**Example parameters**
+#### Example parameters

 * Default:
   * `--log2_sketch_size 10,12,14,16`
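To make the log2 convention concrete, the pipeline expands each value into a raw hash count via `2**log2_sketch_size` when calling sourmash (see `main.nf` below). For example:

```bash
# log2 sketch sizes 10,12,14,16 correspond to these raw MinHash sizes:
for log2 in 10 12 14 16; do echo "log2=$log2 -> num-hashes=$((2**log2))"; done
# prints: 1024, 4096, 16384, 65536
```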
diff --git a/environment.ska.yml b/environment.ska.yml
new file mode 100644
index 00000000..45c326bd
--- /dev/null
+++ b/environment.ska.yml
@@ -0,0 +1,8 @@
+name: nf-core-splitkmeranalysis-1.0.0
+channels:
+  - bioconda
+  - defaults
+  - conda-forge
+dependencies:
+  - ska=1.0
+  - seqtk=1.3
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 00000000..ae7ef7cc
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,23 @@
+# You can use this file to create a conda environment for this pipeline:
+#   conda env create -f environment.yml
+name: nfcore-kmermaid-0.1dev
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+dependencies:
+  - conda-forge::r-data.table=1.11.4
+  - conda-forge::r-gplots=3.0.1
+  - bioconda::bioconductor-edger=3.22.5
+  - conda-forge::r-markdown=0.8
+  - bioconda::khmer=2.1.2
+  - bioconda::sourmash=2.0.1
+  - pytest
+  - sphinx
+  - matplotlib
+  - alabaster
+  - conda-forge::gxx_linux-64
+  - ska=1.0
+  - seqtk=1.3
+  # - conda-forge::gcc_linux-64
+  # - gcc
So the "total" `2*k+1` sizes can be: + +* k = 3 --> 2*3 + 1 = 7 total length +* k = 6 --> 2*6 + 1 = 13 total length +* k = 9 --> 2*9 + 1 = 18 total length +* k = 12 --> 2*12 + 1 = 25 total length +* k = 15 --> 2*15 + 1 = 31 total length +* ... +* k = 60 --> 2*60 + 1 = 121 total length + +#### `--subsample` reads when `--splitKmer` is set + +The `subsample` command is often necessary because the `ska` tool uses ALL the reads rather than a MinHash subsampling of them. If your input files are rather big, then the `ska` sketching command (`ska fastq`) runs out of memory, or it takes so long that it's untenable. The `--subsample` command specifies the number of reads to be used. When e.g. `--subsample 1000` is set, then 1000 reads (or read pairs) are randomly subsampled from the data using [seqtk](https://github.com/lh3/seqtk). + + +#### Which `--molecules` are valid when `--splitKmer` is set + +Currently, `--splitKmer` only works with DNA sequence and not protein sequence, and thus will fail if `protein` or `dayhoff` is specified in `--molecules`. + ## Sketch parameters [K-mer](https://en.wikipedia.org/wiki/K-mer) [MinHash](https://en.wikipedia.org/wiki/MinHash) sketches are defined by three parameters: @@ -230,7 +262,7 @@ The molecule can be either `dna`, `protein`, or `dayhoff`, and if all of them ar | I, L, M, V | Hydrophobic | e | | F, W, Y | Aromatic | f | -**Example parameters** +#### Example parameters * Default: * `--molecule dna,protein,dayhoff` @@ -244,7 +276,7 @@ The fundamental unit of the sketch is a [hashed](https://en.wikipedia.org/wiki/H *NB: if either `protein` or `dayhoff` is specified, the k-mer size must be divisible by 3* -**Example parameters** +#### Example parameters * Default: * `--ksize 21,27,33,51` @@ -255,7 +287,7 @@ The fundamental unit of the sketch is a [hashed](https://en.wikipedia.org/wiki/H The log2 sketch size specifies the number of k-mers to use for the sketch. We use the log2 of the sketch size instead of the raw number of k-mers to be compatible for comparison with [`dashing`](https://github.com/dnbaker/dashing) that uses HyperLogLog instead of MinHash. -**Example parameters** +#### Example parameters * Default: * `--log2_sketch_size 10,12,14,16` diff --git a/environment.ska.yml b/environment.ska.yml new file mode 100644 index 00000000..45c326bd --- /dev/null +++ b/environment.ska.yml @@ -0,0 +1,8 @@ +name: nf-core-splitkmeranalysis-1.0.0 +channels: + - bioconda + - defaults + - conda-forge +dependencies: + - ska=1.0 + - seqtk=1.3 diff --git a/environment.yml b/environment.yml new file mode 100644 index 00000000..ae7ef7cc --- /dev/null +++ b/environment.yml @@ -0,0 +1,23 @@ +# You can use this file to create a conda environment for this pipeline: +# conda env create -f environment.yml +name: nfcore-kmermaid-0.1dev +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - conda-forge::r-data.table=1.11.4 + - conda-forge::r-gplots=3.0.1 + - bioconda::bioconductor-edger=3.22.5 + - conda-forge::r-markdown=0.8 + - bioconda::khmer=2.1.2 + - bioconda::sourmash=2.0.1 + - pytest + - sphinx + - matplotlib + - alabaster + - conda-forge::gxx_linux-64 + - ska=1.0 + - seqtk=1.3 + # - conda-forge::gcc_linux-64 + # - gcc diff --git a/main.nf b/main.nf index 533a740d..ec72de5e 100644 --- a/main.nf +++ b/main.nf @@ -57,13 +57,16 @@ def helpMessage() { --ksizes Which nucleotide k-mer sizes to use. Multiple are separated by commas. Default is '21,27,33,51' --molecules Which molecule to compare on. Default is both DNA - and protein, i.e. 
'dna,protein' + and protein, i.e. 'dna,protein,dayhoff' --log2_sketch_sizes Which log2 sketch sizes to use. Multiple are separated by commas. Default is '10,12,14,16' --one_signature_per_record Make a k-mer signature for each record in the FASTQ/FASTA files. Useful for comparing e.g. assembled transcriptomes or metagenomes. (Not typically used for raw sequencing data as this would create a k-mer signature for each read!) + --splitKmer If provided, use SKA to compute split k-mer sketches instead of + sourmash to compute k-mer sketches + --subsample Integer value to subsample reads from input fastq files """.stripIndent() } @@ -105,13 +108,14 @@ if (params.read_paths) { read_paths_ch = Channel .from(params.read_paths) .map { row -> [ row[0], [file(row[1][0]), file(row[1][1])]] } - .ifEmpty { exit 1, "params.read_paths was empty - no input files supplied" } + .ifEmpty { exit 1, "params.read_paths (${params.read_paths}) was empty - no input files supplied" } + } else { // Provided SRA ids if (params.sra){ sra_ch = Channel .fromSRA( params.sra?.toString()?.tokenize(';') ) - .ifEmpty { exit 1, "params.sra ${params.sra} was not found - no input files supplied" } + .ifEmpty { exit 1, "params.sra (${params.sra}) was not found - no input files supplied" } } // Provided a samples.csv file of read pairs if (params.csv_pairs){ @@ -119,7 +123,7 @@ if (params.read_paths) { .fromPath(params.csv_pairs) .splitCsv(header:true) .map{ row -> tuple(row[0], tuple(file(row[1]), file(row[2])))} - .ifEmpty { exit 1, "params.csv_pairs was empty - no input files supplied" } + .ifEmpty { exit 1, "params.csv_pairs (${params.csv_pairs}) was empty - no input files supplied" } } // Provided a samples.csv file of single-ended reads @@ -128,35 +132,45 @@ if (params.read_paths) { .fromPath(params.csv_singles) .splitCsv(header:true) .map{ row -> tuple(row[0], tuple(file(row[1])))} - .ifEmpty { exit 1, "params.csv_singles was empty - no input files supplied" } + .ifEmpty { exit 1, "params.csv_singles (${params.csv_singles}) was empty - no input files supplied" } } // Provided fastq gz read pairs if (params.read_pairs){ read_pairs_ch = Channel .fromFilePairs(params.read_pairs?.toString()?.tokenize(';')) - .ifEmpty { exit 1, "params.read_pairs was empty - no input files supplied" } + .ifEmpty { exit 1, "params.read_pairs (${params.read_pairs}) was empty - no input files supplied" } } - // Provided fastq gz read pairs + + // Provided fastq gz single-end reads if (params.read_singles){ read_singles_ch = Channel .fromFilePairs(params.read_singles?.toString()?.tokenize(';'), size: 1) - .ifEmpty { exit 1, "params.read_singles was empty - no input files supplied" } + .ifEmpty { exit 1, "params.read_singles (${params.read_singles}) was empty - no input files supplied" } } // Provided vanilla fastas if (params.fastas){ fastas_ch = Channel .fromPath(params.fastas?.toString()?.tokenize(';')) .map{ f -> tuple(f.baseName, tuple(file(f))) } - .ifEmpty { exit 1, "params.fastas was empty - no input files supplied" } + .ifEmpty { exit 1, "params.fastas (${params.fastas}) was empty - no input files supplied" } } - } +} + +if (params.subsample) { + sra_ch.concat(samples_ch, csv_singles_ch, read_pairs_ch, + read_singles_ch, fastas_ch, read_paths_ch) + .ifEmpty{ exit 1, "No reads provided! Check read input files"} + .set{ subsample_reads_ch } +} else { sra_ch.concat(samples_ch, csv_singles_ch, read_pairs_ch, read_singles_ch, fastas_ch, read_paths_ch) .ifEmpty{ exit 1, "No reads provided! 
Check read input files"} .set{ reads_ch } +} + // Has the run name been specified by the user? @@ -172,16 +186,24 @@ if(workflow.profile == 'awsbatch'){ if (!workflow.workDir.startsWith('s3') || !params.outdir.startsWith('s3')) exit 1, "Specify S3 URLs for workDir and outdir parameters on AWSBatch!" } +if (params.splitKmer){ + params.ksizes = '15,9' + params.molecules = 'dna' +} else { + params.ksizes = '21,27,33,51' +} -params.ksizes = '21,27,33,51' -params.molecules = 'dna,protein' -params.log2_sketch_sizes = '10,12,14,16' // Parse the parameters + ksizes = params.ksizes?.toString().tokenize(',') molecules = params.molecules?.toString().tokenize(',') log2_sketch_sizes = params.log2_sketch_sizes?.toString().tokenize(',') +if (params.splitKmer && 'protein' in molecules){ + exit 1, "Cannot specify 'protein' in `--molecules` if --splitKmer is set" +} + // Header log info log.info nfcoreHeader() @@ -268,82 +290,156 @@ process get_software_versions { """ } - -process sourmash_compute_sketch { - tag "${sample_id}_${sketch_id}" - publishDir "${params.outdir}/sketches", mode: 'copy' - container 'czbiohub/nf-kmer-similarity' - - // If job fails, try again with more memory - // memory { 8.GB * task.attempt } - errorStrategy 'retry' - maxRetries 3 +if (params.subsample) { + process subsample_input { + tag "${id}_subsample" + publishDir "${params.outdir}/seqtk/", mode: 'copy' input: - each ksize from ksizes - each molecule from molecules - each log2_sketch_size from log2_sketch_sizes - set sample_id, file(reads) from reads_ch + set id, file(reads) from subsample_reads_ch output: - set val(sketch_id), val(molecule), val(ksize), val(log2_sketch_size), file("${sample_id}_${sketch_id}.sig") into sourmash_sketches + + set val(id), file("*_${params.subsample}.fastq.gz") into reads_ch script: - sketch_id = "molecule-${molecule}_ksize-${ksize}_log2sketchsize-${log2_sketch_size}" - molecule = molecule - not_dna = molecule == 'dna' ? '' : '--no-dna' - ksize = ksize - if ( params.one_signature_per_record ){ - """ - sourmash compute \\ - --num-hashes \$((2**$log2_sketch_size)) \\ - --ksizes $ksize \\ - --$molecule \\ - $not_dna \\ - --output ${sample_id}_${sketch_id}.sig \\ - $reads - """ - } else { + read1 = reads[0] + read2 = reads[1] + read1_prefix = read1.name.minus(".fastq.gz") // TODO: change to RE to match fasta as well? 
+ read2_prefix = read2.name.minus(".fastq.gz") + """ - sourmash compute \\ - --num-hashes \$((2**$log2_sketch_size)) \\ - --ksizes $ksize \\ - --$molecule \\ - $not_dna \\ - --output ${sample_id}_${sketch_id}.sig \\ - --merge '$sample_id' $reads + seqtk sample -s100 ${read1} ${params.subsample} > ${read1_prefix}_${params.subsample}.fastq.gz + seqtk sample -s100 ${read2} ${params.subsample} > ${read2_prefix}_${params.subsample}.fastq.gz + """ - } + } +} + +if (params.splitKmer){ +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/* -- -- */ +/* -- CREATE SKA SKETCH -- */ +/* -- -- */ +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + + process ska_compute_sketch { + tag "${sketch_id}" + publishDir "${params.outdir}/ska/sketches/", mode: 'copy' + errorStrategy 'retry' + maxRetries 3 + + + input: + each ksize from ksizes + set id, file(reads) from reads_ch + + output: + set val(ksize), file("${sketch_id}.skf") into ska_sketches + + script: + sketch_id = "${id}_ksize_${ksize}" + + """ + ska fastq \\ + -k $ksize \\ + -o ${sketch_id} \\ + ${reads} + """ + + } +} else { + process sourmash_compute_sketch { + tag "${sample_id}_${sketch_id}" + publishDir "${params.outdir}/sourmash/sketches/", mode: 'copy' + + errorStrategy 'retry' + maxRetries 3 + + input: + each ksize from ksizes + each molecule from molecules + each log2_sketch_size from log2_sketch_sizes + set sample_id, file(reads) from reads_ch + + output: + set val(sketch_id), val(molecule), val(ksize), val(log2_sketch_size), file("${sample_id}_${sketch_id}.sig") into sourmash_sketches + + script: + sketch_id = "molecule-${molecule}_ksize-${ksize}_log2sketchsize-${log2_sketch_size}" + molecule = molecule + not_dna = molecule == 'dna' ? '' : '--no-dna' + ksize = ksize + if ( params.one_signature_per_record ){ + """ + sourmash compute \\ + --num-hashes \$((2**$log2_sketch_size)) \\ + --ksizes $ksize \\ + --$molecule \\ + $not_dna \\ + --output ${sample_id}_${sketch_id}.sig \\ + $reads + """ + } else { + """ + sourmash compute \\ + --num-hashes \$((2**$log2_sketch_size)) \\ + --ksizes $ksize \\ + --$molecule \\ + $not_dna \\ + --output ${sample_id}_${sketch_id}.sig \\ + --merge '$sample_id' $reads + """ + } + } } -// sourmash_sketches.println() -// sourmash_sketches.groupTuple(by: [0,3]).println() -process sourmash_compare_sketches { - tag "${sketch_id}" +if (params.splitKmer){ + process ska_compare_sketches { + tag "${sketch_id}" + publishDir "${params.outdir}/ska/compare/", mode: 'copy' - container 'czbiohub/nf-kmer-similarity' - publishDir "${params.outdir}/", mode: 'copy' - errorStrategy 'retry' - maxRetries 3 + input: + set val(ksize), file (sketches) from ska_sketches.groupTuple() - input: - set val(sketch_id), val(molecule), val(ksize), val(log2_sketch_size), file ("sketches/*.sig") \ - from sourmash_sketches.groupTuple(by: [0, 3]) + output: + // uploaded distances, clusters, and graph connecting (dot) file + file "ksize_${ksize}*" - output: - file "similarities_${sketch_id}.csv" + script: + """ + ska distance -o ksize_${ksize} -s 25 -i 0.95 ${sketches} + """ - script: - """ - sourmash compare \\ - --ksize ${ksize[0]} \\ - --${molecule[0]} \\ - --csv similarities_${sketch_id}.csv \\ - --traverse-directory . 
- """ + } + +} else { + process sourmash_compare_sketches { + tag "${sketch_id}" + publishDir "${params.outdir}/sourmash/compare", mode: 'copy' + + input: + set val(sketch_id), val(molecule), val(ksize), val(log2_sketch_size), file ("sketches/*.sig") \ + from sourmash_sketches.groupTuple(by: [0, 3]) + + output: + file "similarities_${sketch_id}.csv" + + script: + """ + sourmash compare \\ + --ksize ${ksize[0]} \\ + --${molecule[0]} \\ + --csv similarities_${sketch_id}.csv \\ + --traverse-directory . + """ + + } } @@ -375,7 +471,7 @@ workflow.onComplete { email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId if(workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository if(workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId - if(workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision + if(workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision // if(workflow.container) email_fields['summary']['Docker image'] = workflow.container email_fields['summary']['Nextflow Version'] = workflow.nextflow.version email_fields['summary']['Nextflow Build'] = workflow.nextflow.build diff --git a/nextflow.config b/nextflow.config index 59ebbdaf..2cdef31d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,8 +1,6 @@ -docker.enabled = true - params { // Pipeline Options - read_pairs = "data/*{1,2}.fastq.gz" + read_pairs = false read_singles = false samples = false samples_singles = false @@ -12,6 +10,44 @@ params { fastas = false sra = false molecules ='dna,protein' + log2_sketch_sizes = '10,12,14,16' + one_signature_per_record = false + splitKmer = false + subsample = false + + // Boilerplate options + outdir = './results' + name = false + multiqc_config = "$baseDir/assets/multiqc_config.yaml" + email = false + plaintext_email = false + monochrome_logs = false + help = false + maxMultiqcEmailFileSize = 25.MB + igenomes_base = "./iGenomes" + tracedir = "${params.outdir}/pipeline_info" + awsqueue = false + awsregion = 'eu-west-1' + igenomesIgnore = false + custom_config_version = 'master' + custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" + hostnames = false + config_profile_description = false + config_profile_contact = false + config_profile_url = false +} + + +params { + // Pipeline Options + read_pairs = "data/*{1,2}.fastq.gz" + read_singles = false + read_paths = false + csv_pairs = false + csv_singles = false + fastas = false + sra = false + molecules ='dna,protein,dayhoff' ksizes = '21,27,33,51' log2_sketch_sizes = '10,12,14,16' one_signature_per_record = false @@ -58,7 +94,7 @@ dag { // Container slug. Stable releases should specify release tag! 
// Developmental code should specify :dev -process.container = 'czbiohub/nf-kmer-similarity:dev' +process.container = 'nfcore/kmermaid:dev' // Load base.config by default for all pipelines includeConfig 'conf/base.config' @@ -70,6 +106,14 @@ try { System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") } +// Load nf-core custom profiles from different Institutions +try { + includeConfig "${params.custom_config_base}/nfcore_custom.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") +} + + profiles { awsbatch { includeConfig 'conf/awsbatch.config' } conda { process.conda = "$baseDir/environment.yml" } @@ -77,15 +121,22 @@ profiles { docker { docker.enabled = true } singularity { singularity.enabled = true } test { includeConfig 'conf/test.config' } + test_ska { includeConfig 'conf/test_ska.config' } } +// Avoid this error: +// WARNING: Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap. +// Thanks to: https://github.com/alesssia/YAMP/wiki/How-to-use-Docker +docker.runOptions = '-u $(id -u):$(id -g)' + // Load igenomes.config if required if(!params.igenomesIgnore){ includeConfig 'conf/igenomes.config' } + // Capture exit codes from upstream processes when piping -process.shell = ['/bin/bash', '-euo', 'pipefail'] +//process.shell = ['/bin/bash', '-euo' 'pipefail] manifest { @@ -150,9 +201,17 @@ process { memory = { check_max( 512.GB * task.attempt, 'memory') } cpus = 1 } + withName: ska_compare_sketches { + memory = { check_max( 512.GB * task.attempt, 'memory') } + cpus = 1 + } withName: sourmash_compute_sketch { memory = { check_max( 4.GB * task.attempt, 'memory') } cpus = 1 } + withName: ska_compute_sketch { + memory = { check_max( 4.GB * task.attempt, 'memory') } + cpus = 1 + } } diff --git a/test.nf b/test.nf deleted file mode 100644 index fb58dd1d..00000000 --- a/test.nf +++ /dev/null @@ -1,34 +0,0 @@ -params.samples = "testing/samples.csv" -params.sra = "SRP016501" -params.directories = "s3://olgabot-maca/sra/danio_rerio/smart-seq/whole_kidney_marrow_prjna393431/*{R1,R2}*.fastq.gz" - -// Samples from SRA -sra_ch = Channel.empty() -// R1, R2 pairs from a samples.csv file -samples_ch = Channel.empty() -// Extract R1, R2 pairs from a directory -directories_ch = Channel.empty() - - -// Provided SRA ids -if (params.sra){ - sra_ch = Channel - .fromSRA( params.sra.toString()?.tokenize(',') ) -} -// Provided a samples.csv file -if (params.samples){ - samples_ch = Channel - .fromPath(params.samples) - .splitCsv(header:true) - .map{ row -> tuple(row.sample_id, tuple(row.read1, row.read2))} -} -// Provided s3 or local directories -if (params.directories){ - directories_ch = Channel - .fromFilePairs(params.directories.toString()?.tokenize(';')) -} - -sra_ch.concat(samples_ch, directories_ch) - .set{ reads_ch } - -println reads_ch diff --git a/test_question_mark.nf b/test_question_mark.nf deleted file mode 100644 index f6fa3990..00000000 --- a/test_question_mark.nf +++ /dev/null @@ -1,5 +0,0 @@ -ksizes = params.ksizes.splitCsv() ?: [21, 27, 33, 51] - -Channel - .from(ksizes) - .println() diff --git a/test_recursive_ls_s3.nf b/test_recursive_ls_s3.nf deleted file mode 100644 index 4e0f2a42..00000000 --- a/test_recursive_ls_s3.nf +++ /dev/null @@ -1,3 +0,0 @@ -Channel - .fromFilePairs("s3://olgabot-maca/sra/homo_sapiens/smartseq2_quartzseq/**_{1,2}.fastq.gz") - .println() \ No 
newline at end of file