olgabot · phoenixAja · Apr 17, 2019 · Apr 17, 2019 · Apr 17, 2019 · Apr 17, 2019
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,8 +1,11 @@
-Many thanks to contributing to nf-core/kmer-similarity!
+Many thanks for contributing to nf-core/kmermaid!
+
+To ensure that your build passes, please make sure your pull request is to the `dev` branch rather than to `master`. Thank you!
 
 Please fill in the appropriate checklist below (delete whatever is not relevant). These are the most common things requested on pull requests (PRs).
 
 ## PR checklist
+ - [ ] PR is to `dev` rather than `master`
  - [ ] This comment contains a description of changes (with reason)
  - [ ] If you've fixed a bug or added code that should be tested, add tests!
  - [ ] If necessary, also make a PR on the [nf-core/kmer-similarity branch on the nf-core/test-datasets repo]( https://github.com/nf-core/test-datasets/pull/new/nf-core/kmer-similarity)

diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@
 work/
 results
 my-results
+__pycache__
 
 # Nextflow outputs
 timeline.html*
@@ -89,4 +90,4 @@ atlassian-ide-plugin.xml
 com_crashlytics_export_strings.xml
 crashlytics.properties
 crashlytics-build.properties
-fabric.properties
+fabric.properties
diff --git a/.travis.yml b/.travis.yml
@@ -6,32 +6,51 @@ python: '3.6'
 cache: pip
 matrix:
   fast_finish: true
+  include:
+  - name: "Minimum Nextflow version, regular test suite"
+    env: NXF_VER='0.32.0' SUITE=test FLAGS=
+    language: java
+    jdk: openjdk8
+  - name: "Latest Nextflow version, regular test suite"
+    env: NXF_VER='' SUITE=test FLAGS=
+  - name: "Latest Nextflow version, regular test suite with splitKmers, ensure that `protein` can't be specified"
+    # Check exit code to make sure it is nonzero for --splitKmer + --molecules protein
+    script:
+      - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --splitKmer ; if [ $? -eq 0 ]; then echo "--splitKmer + --molecules protein should fail but did not" && exit 1 ; else echo "Correctly failed --splitKmer + --molecules protein" ; fi
+  - name: "Latest Nextflow version, split k-mer test suite"
+    env: NXF_VER='' SUITE=test_ska FLAGS=
+  - name: "Latest Nextflow version, split k-mer test suite, test subsampling"
+    env: NXF_VER='' SUITE=test_ska FLAGS='--subsample 10'  # quoted so the space stays inside FLAGS
+  - name: "Lint the pipeline code"
+    install:
+      # Install nf-core/tools
+      - pip install --upgrade pip
+      - pip install nf-core
+    script: nf-core lint ${TRAVIS_BUILD_DIR}
+    python: '3.6'
+    jdk: openjdk8
+  - name: "Lint the documentation"
+    script: markdownlint ${TRAVIS_BUILD_DIR} -c ${TRAVIS_BUILD_DIR}/.github/markdownlint.yml
+    python: '3.6'
 
 before_install:
   # PRs to master are only ok if coming from dev branch
   - '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])'
   # Pull the docker image first so the test doesn't wait for this
-  - docker pull czbiohub/nf-kmer-similarity:dev
+  - docker pull nfcore/kmermaid:dev
   # Fake the tag locally so that the pipeline runs properly
-  - docker tag czbiohub/nf-kmer-similarity:dev czbiohub/nf-kmer-similarity:dev
+  - docker tag nfcore/kmermaid:dev nfcore/kmermaid:dev
 
 install:
   # Install Nextflow
   - mkdir /tmp/nextflow && cd /tmp/nextflow
   - wget -qO- get.nextflow.io | bash
   - sudo ln -s /tmp/nextflow/nextflow /usr/local/bin/nextflow
-  # Install nf-core/tools
-  - pip install nf-core
   # Reset
   - mkdir ${TRAVIS_BUILD_DIR}/tests && cd ${TRAVIS_BUILD_DIR}/tests
-
-env:
-  - NXF_VER='19.03.0-edge' # Specify a minimum NF version that should be tested and work
-  - NXF_VER='' # Plus: get the latest NF version and check that it works
+  # Install markdownlint-cli
+  - sudo apt-get install npm && npm install -g markdownlint-cli
 
 script:
-  # Lint the pipeline code
-  # Skip linting for now since container is built by czbiohub
-  # - nf-core lint ${TRAVIS_BUILD_DIR}
   # Run the pipeline with the test profile
-  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile ${SUITE},docker ${FLAGS}
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,20 @@
 # nf-core/nf-kmer-similarity: Changelog
 
-## v1.0dev - 6 March 2019
+## v1.1dev
+
+* Add option to use Dayhoff encoding for sourmash
+* Add `ska` and `seqtk` to container dependencies
+
+## v1.0 - 6 March 2019
+
 Initial release of nf-core/nf-kmer-similarity, created with the [nf-core](http://nf-co.re/) template.
+
+## v1.1dev - 9 September 2019
+
+#### Pipeline Updates
+* Added fastq subsampling/truncating optional parameter using [seqtk](https://github.com/lh3/seqtk)
+* Added support for kmer comparisons using Split Kmer Analysis [SKA](https://github.com/simonrharris/SKA)
+
+#### Dependency Updates
+* seqtk -> 1.3
+* ska -> 1.0
diff --git a/Dockerfile b/Dockerfile
@@ -1,20 +1,17 @@
-FROM continuumio/anaconda3
-MAINTAINER [email protected]
+FROM nfcore/base
+LABEL description="Docker image containing all requirements for nf-core/kmermaid pipeline"
+
+COPY environment.yml /
+RUN conda env create -f /environment.yml && conda clean -a
+ENV PATH /opt/conda/envs/nfcore-kmermaid-0.1dev/bin:$PATH
 
 # Suggested tags from https://microbadger.com/labels
 ARG VCS_REF
 LABEL org.label-schema.vcs-ref=$VCS_REF \
-org.label-schema.vcs-url="e.g. https://github.com/czbiohub/nf-kmer-similarity"
-
+org.label-schema.vcs-url="e.g. https://github.com/nf-core/kmermaid"
 
 WORKDIR /home
 
-USER root
-
-# Add user "main" because that's what is expected by this image
-RUN useradd -ms /bin/bash main
-
-
 ENV PACKAGES zlib1g git g++ make ca-certificates gcc zlib1g-dev libc6-dev procps
 
 ### don't modify things below here for version updates etc.
@@ -25,28 +22,16 @@ RUN apt-get update && \
     apt-get install -y --no-install-recommends ${PACKAGES} && \
     apt-get clean
 
-RUN conda install --yes Cython bz2file pytest numpy matplotlib scipy sphinx alabaster
 
+RUN which -a pip
+RUN which -a python
+ENV SOURMASH_VERSION 'olgabot/dayhoff'
 RUN cd /home && \
-    git clone https://github.com/dib-lab/khmer.git -b master && \
-    cd khmer && \
-    python3 setup.py install
-
-# Check that khmer was installed properly
-RUN trim-low-abund.py --help
-RUN trim-low-abund.py --version
-
-
-# Required for multiprocessing of 10x bam file
-# RUN pip install pathos bamnostic
-
-# ENV SOURMASH_VERSION master
-RUN cd /home && \
-    git clone https://github.com/dib-lab/sourmash.git && \
+    git clone --branch $SOURMASH_VERSION https://github.com/czbiohub/sourmash.git && \
     cd sourmash && \
-    python3 setup.py install
+    python setup.py install
+
+RUN which -a sourmash
 
-RUN which -a python3
-RUN python3 --version
 RUN sourmash info
 COPY docker/sysctl.conf /etc/sysctl.conf
diff --git a/Dockerfile.ska b/Dockerfile.ska
@@ -0,0 +1,7 @@
+FROM nfcore/base
+LABEL authors="Phoenix Logan" \
+      description="Docker image containing all requirements for ska portion of the nfcore/kmermaid pipeline"
+
+COPY environment.ska.yml /
+RUN conda env create -f environment.ska.yml && conda clean -a
+ENV PATH /opt/conda/envs/nf-core-splitkmeranalysis-1.0.0/bin:$PATH
diff --git a/README.md b/README.md
@@ -1,33 +1,58 @@
 # nf-kmer-similarity
 
-This is a [Nextflow](nextflow.io) workflow for running k-mer similarity
+This is a [Nextflow](nextflow.io) workflow for running k-mer similarity.
 
 [![Docker Cloud Build Status](https://img.shields.io/docker/cloud/build/czbiohub/nf-kmer-similarity.svg)](https://cloud.docker.com/u/czbiohub/repository/docker/czbiohub/nf-kmer-similarity)
 
 ## Usage
 
-### With a samples.csv file:
+By default, this pipeline creates a [MinHash](https://en.wikipedia.org/wiki/MinHash) sketch of sequencing reads using [sourmash](https://sourmash.readthedocs.io), then compares them all using a [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index). Here are the default parameters:
 
-```
+- log2 sketch sizes of 10, 12, 14, 16  (as if `--log2_sketch_sizes 10,12,14,16` was specified on the command line), so 2^10, 2^12, 2^14, 2^16 = 1024, 4096, 16 384, 65 536 hashed k-mers for each sample
+- Compute both DNA and protein signatures (as if `--molecules dna,protein` was specified on the command line). The protein k-mers are obtained by doing [six-frame translation](https://en.wikipedia.org/wiki/Reading_frame#/media/File:Open_reading_frame.jpg) on the DNA k-mers
+- K-mer sizes of 21, 27, 33, 51 (as if `--ksizes 21,27,33,51` was specified on the command line).
+  - If using the `--splitKmer` option, keep in mind that the k-mer size in this case is the length of each of the two halves of the split k-mer, which you can visualize as `[---ksize---]N[---ksize---]`. So the default k-mer sizes for `--splitKmer` are 9 and 15, for total sequence unit sizes of `2*9+1 = 19` and `2*15+1 = 31`, which is as if you specified `--splitKmer --ksizes 9,15` on the command line. Additionally, k-mer sizes with `--splitKmer` must be divisible by 3 (yes, this is inconvenient).
+
+### With a samples.csv file
+
+This is where you'd have a csv file with a `sample_id,read1,read2` header containing the sample id and paths to each of your R1 and R2 read files.
+
+```bash
 nextflow run czbiohub/nf-kmer-similarity --outdir s3://olgabot-maca/nf-kmer-similarity/ --samples samples.csv
 ```
 
-### With R1, R2 read pairs:
+### With R1, R2 read pairs
 
-```
+```bash
 nextflow run czbiohub/nf-kmer-similarity --outdir s3://olgabot-maca/nf-kmer-similarity/ \
   --read_pairs 's3://olgabot-maca/sra/homo_sapiens/smartseq2_quartzseq/*{R1,R2}*.fastq.gz,s3://olgabot-maca/sra/danio_rerio/smart-seq/whole_kidney_marrow_prjna393431/*{1,2}.fastq.gz'
 ```
 
-### With SRA ids:
+### With SRA ids
 
-```
+```bash
 nextflow run czbiohub/nf-kmer-similarity --outdir s3://olgabot-maca/nf-kmer-similarity/ --sra SRP016501
 ```
 
-### With fasta files:
+### With fasta files
 
-```
+```bash
 nextflow run czbiohub/nf-kmer-similarity --outdir s3://olgabot-maca/nf-kmer-similarity/ \
   --fastas '*.fasta'
 ```
+
+### With Split Kmer Analysis [SKA](https://github.com/simonrharris/SKA)
+
+Note: the meaning of `ksize` is different with split k-mers, so now the value specified by `--ksizes` is just under half of the total sampled sequence size, where the middle base can be any base (`N`): `[---ksize---]N[---ksize---]`. Note that `--splitKmer` only works with DNA sequences and does not work with `protein` specified in `--molecules`.
+
+```bash
+nextflow run czbiohub/nf-kmer-similarity --outdir s3://olgabot-maca/nf-kmer-similarity/ --samples samples.csv --splitKmer
+```
+
+### With Split Kmer Analysis [SKA](https://github.com/simonrharris/SKA) and fastq subsampling with [seqtk](https://github.com/lh3/seqtk)
+
+The `--subsample` option is often necessary because the `ska` tool uses ALL the reads rather than a MinHash subsampling of them. If your input files are rather big, then the `ska` sketching command (`ska fastq`) runs out of memory, or it takes so long that it's untenable. The `--subsample` option specifies the number of reads to be used.
+
+```bash
+nextflow run czbiohub/nf-kmer-similarity --outdir s3://olgabot-maca/nf-kmer-similarity/ --samples samples.csv --splitKmer --subsample 1000
+```
diff --git a/conf/base.config b/conf/base.config
@@ -13,15 +13,15 @@ process {
 
   cpus = { check_max( 2, 'cpus' ) }
   memory = { check_max( 8.GB * task.attempt, 'memory' ) }
-  time = { check_max( 2.h * task.attempt, 'time' ) }
+  time = { check_max( 16.h * task.attempt, 'time' ) }
 
   errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'terminate' }
   maxRetries = 1
   maxErrors = '-1'
 
   // Process-specific resource requirements
   withLabel: low_memory {
-    memory = { check_max( 16.GB * task.attempt, 'memory' ) }
+    memory = { check_max( 8.GB * task.attempt, 'memory' ) }
   }
   withLabel: mid_memory {
     memory = { check_max( 32.GB * task.attempt, 'memory' ) }

diff --git a/conf/test.config b/conf/test.config
@@ -19,7 +19,7 @@ params {
   // fastas = 'testing/fastas/*.fasta'
   ksizes = '3,9'
   log2_sketch_sizes = '2,4'
-  molecules = 'dna,protein'
+  molecules = 'dna,protein,dayhoff'
   // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz'
   // sra = "SRP016501"
   read_paths = [

diff --git a/conf/test_ska.config b/conf/test_ska.config
@@ -0,0 +1,27 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/kmermaid -profile test_ska
+ */
+
+params {
+  config_profile_name = 'Test profile'
+  config_profile_description = 'Minimal test dataset to check pipeline function'
+  // Limit resources so that this can run on Travis
+  max_cpus = 2
+  max_memory = 6.GB
+  max_time = 48.h
+  // Input data
+  ksizes = '3,6'
+  molecules = 'dna'
+  splitKmer = true
+  read_paths = [
+    ['SRR4050379', ['https://github.com/czbiohub/test-datasets/raw/kmer-similarity/testdata/SRR4050379_pass_1.fastq.gz',
+                    'https://github.com/czbiohub/test-datasets/raw/kmer-similarity/testdata/SRR4050379_pass_2.fastq.gz']],
+    ['SRR4050380', ['https://github.com/czbiohub/test-datasets/raw/kmer-similarity/testdata/SRR4050380_pass_1.fastq.gz',
+                    'https://github.com/czbiohub/test-datasets/raw/kmer-similarity/testdata/SRR4050380_pass_2.fastq.gz']],
+  ]
+}
diff --git a/docs/installation.md b/docs/installation.md
@@ -1,6 +1,6 @@
-# nf-core/nf-kmer-similarity: Installation
+# nf-core/kmermaid: Installation
 
-To start using the nf-core/nf-kmer-similarity pipeline, follow the steps below:
+To start using the nf-core/kmermaid pipeline, follow the steps below:
 
 1. [Install Nextflow](#1-install-nextflow)
 2. [Install the pipeline](#2-install-the-pipeline)
@@ -72,7 +72,7 @@ Be warned of two important points about this default configuration:
 #### 3.1) Software deps: Docker
 First, install docker on your system: [Docker Installation Instructions](https://docs.docker.com/engine/installation/)
 
-Then, running the pipeline with the option `-profile standard,docker` tells Nextflow to enable Docker for this run. An image containing all of the software requirements will be automatically fetched and used from dockerhub (https://hub.docker.com/r/nfcore/nf-kmer-similarity).
+Then, running the pipeline with the option `-profile standard,docker` tells Nextflow to enable Docker for this run. An image containing all of the software requirements will be automatically fetched and used from [dockerhub](https://hub.docker.com/r/nfcore/nf-kmer-similarity).
 
 #### 3.1) Software deps: Singularity
 If you're not able to use Docker then [Singularity](http://singularity.lbl.gov/) is a great alternative.