Skip to content

Commit

Permalink
Add a WDL for GangSTR (#218)
Browse files Browse the repository at this point in the history
* Draft a WDL for GangSTR.

* Simplify joining array items by `,`.

* Install GangSTR via Bioconda.

* Add a short doc about the workflow & a comment.

* Update output, remove a comment & unused struct.

* Remove unused sets, pin the TRTools version & clean up the image.

* Use the current latest version of samtools.

* Draft splitting the WDL in two: GangSTR & GangSTRScatter.

* Bug fixes in GangSTR.wdl

* Bug fixes in GangSTRScatter.wdl.

* add str_ prefix to docker variables in GangSTR WDLs.

* Refactor for clarity & add args documentation.

* Add user-defined prefix for GangSTR output.
  • Loading branch information
VJalili authored Oct 19, 2021
1 parent f7a5544 commit 5cc9b1d
Show file tree
Hide file tree
Showing 3 changed files with 216 additions and 25 deletions.
69 changes: 44 additions & 25 deletions dockerfiles/str/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,16 @@
# - ExpansionHunter

FROM ubuntu:20.04
ARG DEBIAN_FRONTEND=noninteractive
ENV PATH="/root/miniconda3/bin:${PATH}"
ARG CONDA_RELEASE="4.6.14"
ARG CONDA_INSTALL_DIR="/opt/conda"
ARG CONDA_BIN=${CONDA_INSTALL_DIR}/bin
ARG CONDA_CMD=${CONDA_BIN}/conda
ENV PATH=${CONDA_BIN}:$PATH
ENV EH_VERSION=v4.0.2

RUN apt-get update && DEBIAN_FRONTEND="noninteractive" apt-get install --no-install-recommends -qqy \
RUN apt-get update && apt-get install --no-install-recommends -qqy \
python3-dev \
python3-pip \
python \
Expand All @@ -20,16 +28,24 @@ RUN apt-get update && DEBIAN_FRONTEND="noninteractive" apt-get install --no-inst
pkg-config \
wget \
unzip \
zlib1g-dev
zlib1g-dev \
libcurl4-openssl-dev \
libssl-dev

# Install and configure conda
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_RELEASE}-Linux-x86_64.sh -O /tmp/miniconda.sh && \
bash /tmp/miniconda.sh -b -p ${CONDA_INSTALL_DIR} && \
${CONDA_CMD} init bash && \
bash -c "source ${HOME}/.bashrc"

RUN pip3 install pybedtools==0.8.2 pyvcf==0.6.8 scipy==1.7.1 numpy==1.21.1
RUN pip install Cython==0.29.24 numpy==1.21.2 pybedtools==0.8.2 scipy==1.7.1

# Install samtools (needed to index reference fasta files)
RUN wget -O samtools-1.9.tar.bz2 https://github.com/samtools/samtools/releases/download/1.9/samtools-1.9.tar.bz2 \
&& tar -xjf samtools-1.9.tar.bz2 \
&& cd samtools-1.9 \
&& ./configure --without-curses && make && make install \
&& cd ..
RUN wget -O samtools-1.13.tar.bz2 https://github.com/samtools/samtools/releases/download/1.13/samtools-1.13.tar.bz2 && \
tar -xjf samtools-1.13.tar.bz2 && \
cd samtools-1.13 && \
./configure --without-curses && make && make install && \
cd ..

# Install bedtools (needed for DumpSTR)
## Option 1: install from source
Expand All @@ -41,23 +57,26 @@ WORKDIR ..
## Option 2: install from apt
#RUN apt-get install bedtools

# Download, compile, and install GangSTR
RUN wget -O GangSTR-2.4.tar.gz https://github.com/gymreklab/GangSTR/releases/download/v2.4/GangSTR-2.4.tar.gz \
&& tar -xzvf GangSTR-2.4.tar.gz \
&& cd GangSTR-2.4 \
&& ./install-gangstr.sh \
&& ldconfig \
&& cd ..

# Download and install TRTools
RUN git clone https://github.com/gymreklab/TRTools \
&& cd TRTools \
&& python3 setup.py install \
&& cd ..
RUN git clone https://github.com/gymreklab/TRTools && \
cd TRTools && \
git checkout tags/v4.0.0 -b v4.0.0-branch && \
pip install -r requirements.txt && \
python3 setup.py install && \
cd ..

ENV EH_VERSION=v4.0.2
RUN wget https://github.com/Illumina/ExpansionHunter/releases/download/${EH_VERSION}/ExpansionHunter-${EH_VERSION}-linux_x86_64.tar.gz \
&& tar xzf ExpansionHunter-${EH_VERSION}-linux_x86_64.tar.gz \
&& rm ExpansionHunter-${EH_VERSION}-linux_x86_64.tar.gz \
&& mv /ExpansionHunter-${EH_VERSION}-linux_x86_64 /ExpansionHunter
# Install ExpansionHunter
RUN wget https://github.com/Illumina/ExpansionHunter/releases/download/${EH_VERSION}/ExpansionHunter-${EH_VERSION}-linux_x86_64.tar.gz && \
tar xzf ExpansionHunter-${EH_VERSION}-linux_x86_64.tar.gz && \
rm ExpansionHunter-${EH_VERSION}-linux_x86_64.tar.gz && \
mv /ExpansionHunter-${EH_VERSION}-linux_x86_64 /ExpansionHunter
ENV PATH="/ExpansionHunter/bin/:$PATH"

# Install GangSTR from Bioconda (conda-forge supplies its dependencies).
# `docker build` has no terminal to answer conda's confirmation prompt, so the
# install is confirmed up front with --yes.
RUN conda install --yes -c bioconda -c conda-forge gangstr

# Clean up
RUN rm -rf /tmp/* \
/var/tmp/* \
/var/cache/apt/* \
/var/lib/apt/lists/* \
125 changes: 125 additions & 0 deletions wdl/GangSTR.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
## Workflow to run GangSTR (https://github.com/gymreklab/GangSTR), a tool
## for computing genome-wide profile of short tandem repeats (STR) from
## short reads.
version 1.0

import "Structs.wdl"

# Genotypes the loci listed in `target_tr_loci_regions_bed` for a single BAM or
# CRAM file: resolves every optional input to a concrete value, then delegates
# the actual run to the CallGangSTR task and re-exports its three outputs.
workflow GangSTR {

  input {
    File bam_or_cram
    File? bam_or_cram_index
    File reference_fasta
    File? reference_fasta_index
    File target_tr_loci_regions_bed
    String? output_prefix
    String str_docker
    RuntimeAttr? runtime_attr
  }

  parameter_meta {
    bam_or_cram: "Set the path to a sorted and indexed bam or cram file generated by an indel-sensitive aligner to be used as input for GangSTR."
    bam_or_cram_index: "[Optional] Set the path to the index file of the `bam_or_cram` input."
    reference_fasta: "Sets the path to the reference in fasta format."
    reference_fasta_index: "[Optional] Sets the path to the index of reference file."
    target_tr_loci_regions_bed: "Reference set of regions to genotype represented in bed-like format; see GangSTR documentation for the file structure at: https://github.com/gymreklab/GangSTR#tr-regions---regions"
    output_prefix: "[Optional] Set a string to be used as a prefix to the output files. Defaults to the bam_or_cram file name without its .bam/.cram extension."
    str_docker: "Sets the STR docker image."
    runtime_attr: "[Optional] Override the default runtime attributes for the GangSTR workflow."
  }

  # basename() strips the suffix only when it is present, so re-appending
  # ".bam" reproduces the file name only for BAM inputs. Any other input is
  # treated as CRAM.
  Boolean is_bam =
    basename(bam_or_cram, ".bam") + ".bam" == basename(bam_or_cram)

  # Extensions that depend on the detected input type.
  String index_extension = if is_bam then ".bai" else ".crai"
  String alignment_extension = if is_bam then ".bam" else ".cram"

  # Use the supplied index, otherwise assume it sits next to the alignment
  # file as "<bam_or_cram>.bai" / "<bam_or_cram>.crai".
  File bam_or_cram_index_ = select_first([
    bam_or_cram_index, bam_or_cram + index_extension])

  # Use the supplied index, otherwise assume "<reference_fasta>.fai".
  File reference_fasta_index_ = select_first([
    reference_fasta_index, reference_fasta + ".fai"])

  # Use the supplied prefix, otherwise the input file name without extension.
  String output_prefix_ = select_first([
    output_prefix, basename(bam_or_cram, alignment_extension)])

  call CallGangSTR {
    input:
      bam_or_cram = bam_or_cram,
      bam_or_cram_index = bam_or_cram_index_,
      reference_fasta = reference_fasta,
      reference_fasta_index = reference_fasta_index_,
      target_tr_loci_regions_bed = target_tr_loci_regions_bed,
      output_prefix = output_prefix_,
      str_docker = str_docker,
      runtime_attr_override = runtime_attr
  }

  output {
    File output_vcf = CallGangSTR.output_vcf
    File sample_stats = CallGangSTR.sample_stats
    File insdata = CallGangSTR.insdata
  }
}

# Runs GangSTR on one alignment file and collects the three files it writes
# under `output_prefix`: the VCF, the sample statistics table and the insert
# size data table.
#
# `bam_or_cram_index` and `reference_fasta_index` are not referenced in the
# command; declaring them as File inputs makes the execution engine localize
# them with the other inputs.
# NOTE(review): presumably GangSTR discovers the indexes next to the BAM/CRAM
# and the FASTA -- confirm that user-supplied indexes from a different
# directory end up alongside their primary files after localization.
task CallGangSTR {
  input {
    File bam_or_cram
    File bam_or_cram_index
    File reference_fasta
    File reference_fasta_index
    File target_tr_loci_regions_bed
    String output_prefix
    String str_docker
    RuntimeAttr? runtime_attr_override
  }

  output {
    File output_vcf = "${output_prefix}.vcf"
    File sample_stats = "${output_prefix}.samplestats.tab"
    File insdata = "${output_prefix}.insdata.tab"
  }

  command <<<
    set -euxo pipefail

    # Paths and the prefix are quoted so values containing spaces or shell
    # metacharacters are passed to GangSTR as single arguments.
    GangSTR \
      --bam "~{bam_or_cram}" \
      --ref "~{reference_fasta}" \
      --regions "~{target_tr_loci_regions_bed}" \
      --out "~{output_prefix}"
  >>>

  # Default resources. The disk estimate is 10 GiB of headroom plus the size of
  # every input that gets localized for this task.
  RuntimeAttr runtime_attr_str_profile_default = object {
    cpu_cores: 1,
    mem_gb: 4,
    boot_disk_gb: 10,
    preemptible_tries: 3,
    max_retries: 1,
    disk_gb: 10 + ceil(size([
      bam_or_cram,
      bam_or_cram_index,
      reference_fasta,
      reference_fasta_index,
      target_tr_loci_regions_bed], "GiB"))
  }
  RuntimeAttr runtime_attr = select_first([
    runtime_attr_override,
    runtime_attr_str_profile_default])

  # Each attribute falls back to its default individually, so an override that
  # sets only some fields does not leave the remaining ones undefined.
  # NOTE(review): assumes the RuntimeAttr fields in Structs.wdl are optional --
  # confirm against the struct definition.
  runtime {
    docker: str_docker
    cpu: select_first([
      runtime_attr.cpu_cores,
      runtime_attr_str_profile_default.cpu_cores])
    memory: select_first([
      runtime_attr.mem_gb,
      runtime_attr_str_profile_default.mem_gb]) + " GiB"
    disks: "local-disk " + select_first([
      runtime_attr.disk_gb,
      runtime_attr_str_profile_default.disk_gb]) + " HDD"
    bootDiskSizeGb: select_first([
      runtime_attr.boot_disk_gb,
      runtime_attr_str_profile_default.boot_disk_gb])
    preemptible: select_first([
      runtime_attr.preemptible_tries,
      runtime_attr_str_profile_default.preemptible_tries])
    maxRetries: select_first([
      runtime_attr.max_retries,
      runtime_attr_str_profile_default.max_retries])
  }
}
47 changes: 47 additions & 0 deletions wdl/GangSTRScatter.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
version 1.0

import "Structs.wdl"
import "GangSTR.wdl" as GangSTR

# Runs the GangSTR workflow once per input alignment file and gathers the
# per-input results into arrays, in the same order as `bams_or_crams`.
workflow GangSTRScatter {

  input {
    Array[File] bams_or_crams
    Array[File]? bams_or_crams_indexes
    File reference_fasta
    File? reference_fasta_index
    File target_tr_loci_regions_bed
    String str_docker
    RuntimeAttr? runtime_attr
  }

  parameter_meta {
    bams_or_crams: "BAM or CRAM files to genotype; the GangSTR workflow is called once per file."
    bams_or_crams_indexes: "[Optional] Index files for `bams_or_crams`, in the same order (element i is used for input i). When omitted, each index path is derived by appending .bai or .crai to the input path."
    reference_fasta: "Path to the reference in fasta format, shared by all inputs."
    reference_fasta_index: "[Optional] Path to the index of the reference; defaults to the reference path with .fai appended."
    target_tr_loci_regions_bed: "Regions to genotype, passed unchanged to every GangSTR call."
    str_docker: "The STR docker image, passed unchanged to every GangSTR call."
    runtime_attr: "[Optional] Runtime attribute override, passed unchanged to every GangSTR call."
  }

  # The reference index is the same for every shard, so it is resolved once
  # here instead of once per scatter iteration.
  File reference_fasta_index_ = select_first([
    reference_fasta_index, reference_fasta + ".fai"])

  scatter (i in range(length(bams_or_crams))) {
    File bam_or_cram_ = bams_or_crams[i]

    # basename() strips ".bam" only when present, so the comparison holds
    # only for BAM inputs; everything else is treated as CRAM.
    Boolean is_bam =
      basename(bam_or_cram_, ".bam") + ".bam" == basename(bam_or_cram_)

    # Indexes are matched to inputs by position.
    # NOTE(review): nothing here checks that `bams_or_crams_indexes` has as
    # many elements as `bams_or_crams` -- callers must keep them aligned.
    File bam_or_cram_index_ =
      if defined(bams_or_crams_indexes) then
        select_first([bams_or_crams_indexes])[i]
      else
        bam_or_cram_ + (if is_bam then ".bai" else ".crai")

    call GangSTR.GangSTR as gangSTR {
      input:
        bam_or_cram=bam_or_cram_,
        bam_or_cram_index=bam_or_cram_index_,
        reference_fasta=reference_fasta,
        reference_fasta_index=reference_fasta_index_,
        target_tr_loci_regions_bed=target_tr_loci_regions_bed,
        str_docker=str_docker,
        runtime_attr=runtime_attr
    }
  }

  output {
    Array[File] output_vcfs = gangSTR.output_vcf
    Array[File] samples_stats = gangSTR.sample_stats
    Array[File] insdatas = gangSTR.insdata
  }
}

0 comments on commit 5cc9b1d

Please sign in to comment.