MetaSUB · dcdanko · Sep 28, 2021 · Sep 28, 2021
diff --git a/cap2/api.py b/cap2/api.py
@@ -1,8 +1,11 @@
 
 import luigi
 
+from .sample import Sample
 from .pipeline.databases import MODULES as DB_MODULES
-
+from .pipeline.full_pipeline import FullPipeline
+from .pipeline.utils.cap_task import BaseCapTask
+from .pipeline.utils.conda import CondaPackage, PyPiPackage
 from .constants import (
     STAGES,
     STAGES_GROUP,
@@ -22,6 +25,37 @@ def run_db_stage(config_path='', cores=1, **kwargs):
     luigi.build(instances, local_scheduler=True, **kwargs)
 
 
+def install_software(config_path='', cores=1, workers=1, **kwargs):
+    """Install all conda and PyPI packages required by the full pipeline.
+
+    Builds a `FullPipeline` for a placeholder sample, walks its `requires()`
+    graph without running any pipeline task, collects every `CondaPackage`
+    and `PyPiPackage` found, and passes them to `luigi.build`.
+
+    `config_path` and `cores` are passed to `FullPipeline.from_sample`,
+    `workers` is the number of luigi workers, and any extra keyword
+    arguments are forwarded to `luigi.build`.
+    """
+    # The sample only seeds the task graph; its read path is a placeholder.
+    sample = Sample('not_a_real_sample', '/this/will/never/be/run')
+    modules, seen = [], set()
+    queue = [FullPipeline.from_sample(sample, config_path, cores=cores)]
+    while queue:
+        obj = queue.pop()
+        # Shared dependencies are reachable along many paths; visit each once
+        # so the walk stays linear and no package is queued twice.
+        if obj in seen:
+            continue
+        seen.add(obj)
+        if isinstance(obj, BaseCapTask):
+            # flatten() normalises a single task, a list or a dict of tasks
+            # (dict values, not keys) into one flat list.
+            queue.extend(luigi.task.flatten(obj.requires()))
+        elif isinstance(obj, (CondaPackage, PyPiPackage)):
+            modules.append(obj)
+    luigi.build(modules, local_scheduler=True, workers=workers, **kwargs)
+
+
 def run_stage(samples, stage_name, config_path='', cores=1, workers=1, **kwargs):
     """Run a subpipeline on a list of samples. stage_name can be one of `qc`, `pre`, `reads`."""
     modules = STAGES[stage_name]

diff --git a/cap2/cli.py b/cap2/cli.py
@@ -6,6 +6,7 @@
 from .api import (
     run_db_stage,
     run_stage,
+    install_software,
 )
 from .sample import Sample
 from .constants import (
@@ -73,5 +74,18 @@ def cap_pipeline(workers, threads, config, stage, manifest):
     run_stage(samples, stage, config_path=config, cores=threads, workers=workers)
 
 
+# Named distinctly from `cap_pipeline` above so it does not rebind that name.
+@run.command('install')
+@click.option('-w', '--workers', default=1)
+@click.option('-c', '--config', type=click.Path(), default='')
+def cap_install(workers, config):
+    """Install the software needed by the MCAP.
+
+    This command will make a series of calls to conda in
+    order to install the requisite software for the MCAP.
+    """
+    install_software(config_path=config, workers=workers)
+
+
 if __name__ == '__main__':
     main()
diff --git a/cap2/pipeline/config.py b/cap2/pipeline/config.py
@@ -10,6 +10,8 @@ class PipelineConfig:
     def __init__(self, filename):
         if filename:
             self.blob = load(open(filename).read())
+        elif 'CAP2_CONFIG' in environ:
+            self.blob = load(open(environ['CAP2_CONFIG']).read())
         else:
             self.blob = {}
         self.out_dir = self.blob.get('out_dir', environ.get('CAP2_OUT_DIR', 'results'))

diff --git a/cap2/pipeline/utils/conda.py b/cap2/pipeline/utils/conda.py
@@ -59,7 +59,7 @@ def get_path(self, tool):
 
     def save_spec(self):
         proc = subprocess.Popen(
-            ' '.join(['conda', 'env', 'export', '--name', self.name]),
+            ' '.join(['conda', 'env', 'export']),
             stdout=subprocess.PIPE,
             shell=True
         )
@@ -111,7 +111,7 @@ def install(self, package, channel="anaconda"):
         except:
             print(f'Subprocess failed from {os.getcwd()}: {cmd}', file=sys.stderr)
             raise
-        self.save_spec()
+        # self.save_spec()
         self.add_to_path()
 
     def pypi_install(self, package):

diff --git a/containers/README.md b/containers/README.md
@@ -0,0 +1,41 @@
+# Containerization
+
+
+## Running the MCAP with Docker
+
+
+To run the MCAP with Docker you will need to create two directories: 1) one where databases should be stored on your machine's filesystem and 2) one where the output from the MCAP should go. For simplicity we will call these 1) `/my/databases` and 2) `/my/outputs`.
+
+To run the MCAP use this command:
+
+```
+docker run \
+	--mount source=/my/databases,target=/mcap/dbs,type=bind,readonly \
+	--mount source=/my/outputs,target=/mcap/out,type=bind \
+	-it mcap \
+	cap2 --help
+```
+
+This command will print a help message and then exit. To run the MCAP pipeline, replace `cap2 --help` with a more complete command.
+
+
+### Using external databases
+
+Typically you won't want to download new databases every time you use the MCAP. Unfortunately, the databases used by the MCAP are too large to fit comfortably into a Docker image. As a workaround, the MCAP Docker image can be set to look for already existing databases in your machine's filesystem. To do this you need to 1) load databases onto your local filesystem and 2) instruct Docker to connect to these databases when it runs.
+
+#### Loading databases to your local filesystem
+
+
+#### Letting docker connect to your databases
+
+Suppose you have the MCAP databases downloaded into a directory on your local filesystem called `/path/to/mcap/databases`. You can instruct docker to connect to this folder at runtime using the following command:
+
+```
+docker run \
+	--mount source=/path/to/mcap/databases,target=/mcap/dbs,type=bind,readonly \
+	-it mcap \
+	/bin/bash
+
+```
+
+This command will make it possible for commands run in this Docker container to read data from `/path/to/mcap/databases`. The mount is read-only, meaning the container will not be able to edit files in that directory.
diff --git a/containers/docker/latest/Dockerfile b/containers/docker/latest/Dockerfile
@@ -0,0 +1,100 @@
+FROM --platform=linux/amd64 ubuntu:20.04
+
+WORKDIR /mcap
+RUN mkdir -p /mcap/dbs /mcap/demo
+
+# Necessary for R
+ENV TZ=America/New_York
+ENV DEBIAN_FRONTEND=noninteractive
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
+
+
+RUN apt-get update -y && apt-get install -y \
+	bzip2 \
+	build-essential \
+	zlib1g-dev \
+	libc6 \
+	libncurses5-dev \
+	libncursesw5-dev \
+	libnss-sss \
+	libbz2-dev \
+	liblzma-dev \
+	less \
+	libcurl4-openssl-dev \
+	wget \
+	unzip \
+	zip \
+	r-base \
+	r-base-core \
+	r-recommended \
+	default-jre \
+	default-jdk \
+	python \
+	python3 \
+	python3-pip \
+	mafft \
+	curl \
+	rsync \
+	git && \
+	rm -rf /var/lib/apt/lists/*
+
+# Install Miniconda
+ENV PATH="/root/miniconda3/bin:${PATH}"
+ARG PATH="/root/miniconda3/bin:${PATH}"
+RUN wget \
+    https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh \
+    && mkdir /root/.conda \
+    && bash Miniconda3-py38_4.10.3-Linux-x86_64.sh -b \
+    && rm -f Miniconda3-py38_4.10.3-Linux-x86_64.sh
+RUN conda --version
+
+# Install software with conda
+# Based on https://jcristharif.com/conda-docker-tips.html
+RUN conda install --yes -c bioconda \
+	numpy \
+	pandas \
+	scipy
+
+RUN pip install \
+	pangea_api \
+	Jinja2==3.0.0a1 \
+	biopython==1.76 \
+	click==6.7 \
+	pysam \
+	python-louvain
+
+RUN pip install \
+	gimmebio.seqs \
+	bloom_filter \
+    luigi==3.0.0b2 \
+    PyYaml==5.3.1
+
+RUN conda install --yes -c conda-forge matplotlib
+RUN conda clean -afy
+
+WORKDIR /mcap
+RUN mkdir -p /mcap/out /mcap/conda/config/envs /mcap/conda/vendor/conda
+COPY mcap_config.yaml /mcap/config.yaml
+ENV CAP2_CONFIG=/mcap/config.yaml
+
+
+# Install cap2 and the software it manages in this early layer so that
+# later rebuilds can reuse the cached layer and finish faster.
+RUN pip install cap2==0.5.3
+RUN cap2 --help
+RUN cap2 run install
+
+RUN pip install cap2==0.5.3
+
+RUN cap2 --help
+
+ADD demo /mcap/demo/
+WORKDIR /mcap/demo
+# RUN cap2 run pipeline --stage qc manifest.txt
+
+WORKDIR /mcap
+RUN cap2 run install
+
+
+
+CMD ["bash"]
diff --git a/containers/docker/latest/Makefile b/containers/docker/latest/Makefile
@@ -0,0 +1,17 @@
+
+build:
+	docker buildx build --platform linux/amd64  -t mcap .
+
+shell: build
+	docker run \
+		--mount source=`pwd`/dbs,target=/mcap/dbs,type=bind,readonly \
+		--mount source=`pwd`/out,target=/mcap/out,type=bind \
+		-it mcap \
+		/bin/bash
+
+mcap: build
+	docker run \
+		--mount source=`pwd`/dbs,target=/mcap/dbs,type=bind,readonly \
+		--mount source=`pwd`/out,target=/mcap/out,type=bind \
+		-it mcap \
+		cap2 --help
diff --git a/containers/docker/latest/dbs/README.txt b/containers/docker/latest/dbs/README.txt
@@ -0,0 +1 @@
+this file should only be visible in the docker container if dbs was mounted properly
diff --git a/containers/docker/latest/demo/Makefile b/containers/docker/latest/demo/Makefile
@@ -0,0 +1,10 @@
+
+qc:
+	cap2 run pipeline -s qc -c config.yaml manifest.txt
+
+
+clean:
+	-rm -r config
+	-rm -r demo_db
+	-rm -r demo_out
+	-rm -r vendor
diff --git a/containers/docker/latest/demo/README.md b/containers/docker/latest/demo/README.md
@@ -0,0 +1,14 @@
+# CAP2 Demo
+
+This directory is a demo for running the CAP from the command line.
+
+## Installation
+
+See `README.md` in main directory.
+
+## Running
+
+Run the demo by entering the command `cap2 run pipeline -s qc -c config.yaml manifest.txt` or `make qc`.
+
+
+## Notes
diff --git a/containers/docker/latest/demo/config.yaml b/containers/docker/latest/demo/config.yaml
@@ -0,0 +1,2 @@
+out_dir: demo_out
+db_dir: demo_db
diff --git a/containers/docker/latest/demo/manifest.txt b/containers/docker/latest/demo/manifest.txt
@@ -0,0 +1 @@
+zymo_pos_cntrl zymo_pos_cntrl.r1.fq.gz zymo_pos_cntrl.r2.fq.gz
diff --git a/containers/docker/latest/demo/zymo_pos_cntrl.r1.fq.gz b/containers/docker/latest/demo/zymo_pos_cntrl.r1.fq.gz
diff --git a/containers/docker/latest/demo/zymo_pos_cntrl.r2.fq.gz b/containers/docker/latest/demo/zymo_pos_cntrl.r2.fq.gz
diff --git a/containers/docker/latest/mcap_config.yaml b/containers/docker/latest/mcap_config.yaml
@@ -0,0 +1,6 @@
+# This is the config file that is used in the MCAP docker container
+
+out_dir: out
+db_dir: dbs
+conda_spec_dir: /mcap/conda/config/envs
+conda_base_path: /mcap/conda/vendor/conda
diff --git a/containers/docker/latest/out/README.txt b/containers/docker/latest/out/README.txt
@@ -0,0 +1 @@
+this file should only be visible in the docker container if out was mounted properly
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 
 setuptools.setup(
     name='cap2',
-    version='0.4.1',
+    version='0.5.3',
     description="CAP2",
     author="David C. Danko",
     author_email='[email protected]',
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		this file should only be visible in the docker container if dbs was mounted properly
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		zymo_pos_cntrl zymo_pos_cntrl.r1.fq.gz zymo_pos_cntrl.r2.fq.gz