Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reordering code for non-sudo runs and preparation for singularity. #43

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 58 additions & 55 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ RUN apt-get update && apt-get install -y \
software-properties-common

# install R from command line; get >= R-3.5
RUN add-apt-repository -y ppa:marutter/rrutter3.5
RUN add-apt-repository -y ppa:marutter/rrutter4.0
# RUN add-apt-repository -y ppa:c2d4u.team/c2d4u4.0+

# install:
# curl
# libcurl, Java (for h2o)
Expand All @@ -29,17 +31,19 @@ RUN add-apt-repository -y ppa:marutter/rrutter3.5
# vim (for editing while in container)
# nginx (for static website hosting)
# ffmpeg (for animating figures)
RUN apt-get update && apt-get install -y \
# No need to install pandoc-citeproc -> https://github.com/jgm/pandoc-citeproc?tab=readme-ov-file#pandoc-citeproc
RUN apt-get update && apt-get install -y -qq --no-install-recommends --purge \
curl \
libcurl4-openssl-dev \
openjdk-8-jdk \
r-base \
r-base-dev \
pandoc \
pandoc-citeproc \
vim \
cmake \
nginx \
ffmpeg
# r-cran-devtools \
# vim \

RUN rm /var/www/html/index.nginx-debian.html

Expand All @@ -50,6 +54,8 @@ RUN Rscript -e 'install.packages("bookdown", repos="https://cran.rstudio.com")'
RUN Rscript -e 'install.packages("seqinr", repos="https://cran.rstudio.com")'
RUN Rscript -e 'install.packages("SuperLearner", repos="https://cran.rstudio.com")'
RUN Rscript -e 'install.packages("quadprog", repos="https://cran.rstudio.com")'
RUN Rscript -e 'install.packages("remotes", repos="https://cran.rstudio.com")'
RUN Rscript -e 'remotes::install_github("benkeser/cvma")'
# get ggplot2, dplyr, tidyr, readr, tibble, stringr, forcats (and purrr for free)
RUN Rscript -e 'install.packages("tidyverse", repos="https://cran.rstudio.com")'
RUN Rscript -e 'install.packages("cowplot", repos="https://cran.rstudio.com")'
Expand All @@ -63,55 +69,45 @@ RUN Rscript -e 'install.packages("shiny", repos="https://cran.rstudio.com")'
RUN Rscript -e 'install.packages("testthat", repos="https://cran.rstudio.com")'
RUN Rscript -e 'install.packages("RCurl", repos="https://cran.rstudio.com")'
RUN Rscript -e 'install.packages("bit64", repos="https://cran.rstudio.com")'
RUN Rscript -e 'install.packages("h2o", type = "source", repos="https://h2o-release.s3.amazonaws.com/h2o/latest_stable_R")'
RUN Rscript -e 'install.packages("vimp", repos="https://cran.rstudio.com")'

# make directories
# lib contains R source files
# dat contains data
# dat/catnap contains original catnap data
# dat/analysis contains analysis data
RUN mkdir /home/dat /home/dat/catnap /home/dat/analysis /home/out
RUN mkdir /home/slfits /home/output

# copy R scripts to do data pull, check options, run analysis, and return requested objects (and make executable)
COPY code/00_utils.R /home/lib/00_utils.R
COPY code/01_check_opts.R /home/lib/01_check_opts.R
COPY code/01_check_opts_functions.R /home/lib/01_check_opts_functions.R
COPY code/02_compile_analysis_dataset.R /home/lib/02_compile_analysis_dataset.R
COPY code/02_multi_ab.Rlib /home/lib/02_multi_ab.Rlib
COPY code/03_run_super_learners.R /home/lib/03_run_super_learners.R
COPY code/03_super_learner_libraries.R /home/lib/03_super_learner_libraries.R
COPY code/04_get_vimp.R /home/lib/04_get_vimp.R
COPY code/04_variable_groups.R /home/lib/04_variable_groups.R
COPY code/05_intrinsic_importance.R /home/lib/05_intrinsic_importance.R
COPY code/05_ml_var_importance_measures.R /home/lib/05_ml_var_importance_measures.R
COPY code/05_outcome_dist_plot.R /home/lib/05_outcome_dist_plot.R
COPY code/05_plot_one_vimp.R /home/lib/05_plot_one_vimp.R
COPY code/05_plotting_functions.R /home/lib/05_plotting_functions.R
COPY code/05_pred_importance.R /home/lib/05_pred_importance.R
COPY code/05_var_import_plot.R /home/lib/05_var_import_plot.R
COPY code/05_vimp_executive_summary_table.R /home/lib/05_vimp_executive_summary_table.R
COPY code/06_return_requested_objects.R /home/lib/06_return_requested_objects.R

RUN chmod +x /home/lib/01_check_opts.R /home/lib/02_compile_analysis_dataset.R /home/lib/03_run_super_learners.R /home/lib/04_get_vimp.R /home/lib/06_return_requested_objects.R

# copy report Rmd
COPY code/05_report.Rmd /home/lib/05_report.Rmd
COPY docs/refs.bib /home/lib/refs.bib
COPY code/run_analysis.sh /home/lib/run_analysis.sh
COPY code/05_render_report.R /home/lib/05_render_report.R
COPY code/05_report_preamble.R /home/lib/05_report_preamble.R
RUN chmod +x /home/lib/run_analysis.sh /home/lib/05_render_report.R /home/lib/05_report_preamble.R

# copy metadata Rmd
COPY code/07_metadata.Rmd /home/lib/07_metadata.Rmd
COPY code/07_render_metadata.R /home/lib/07_render_metadata.R
RUN chmod +x /home/lib/07_render_metadata.R
RUN Rscript -e 'install.packages("jsonlite", repos="https://cran.rstudio.com")'
RUN Rscript -e 'options(timeout=600); install.packages("h2o", type="source", repos="http://h2o-release.s3.amazonaws.com/h2o/rel-3.46.0/4/R"); library(h2o)'
RUN Rscript -e 'library(h2o); h2o.init()'

# RUN wget "https://h2o-release.s3.amazonaws.com/h2o/latest_stable_R"
# RUN Rscript -e 'install.packages("h2o", type = "source"); library(h2o)'
RUN Rscript -e 'install.packages("gam", repos="https://cran.rstudio.com")'

RUN Rscript -e 'remotes::install_github(repo = "bdwilliamson/vimp"); library(vimp)'
# RUN Rscript -e 'remotes::install_version("vimp", version="2.1.9", repos="https://cran.rstudio.com"); library(vimp)'

ADD slapnap slapnap/
RUN R CMD INSTALL -dc slapnap
# RUN Rscript -e 'setwd("slapnap/"); devtools::check()'
# RUN Rscript -e 'install.packages("slapnap_0.1.0.tar.gz", type="source")'
RUN Rscript -e 'library("slapnap")'

RUN rm -rf slapnap_0.1.0.tar.gz slapnap/

# Create default user and workdir
RUN useradd -ms /bin/bash slapnap
WORKDIR /home/slapnap/

RUN chown -R slapnap:slapnap /home/slapnap/

#---------------------------------------------------------------------
# Permanent options
#---------------------------------------------------------------------
RUN mkdir -p slfits output dat/analysis dat/catnap

# Slapnap data analysis directory
ENV analysis="/home/slapnap/dat/analysis/"

# Slapnap slfits directory
ENV slfits="/home/slapnap/slfits/"

# Slapnap output directory
ENV output="/home/slapnap/output/"

# which antibody to analyze
# "VRC01" is arbitrarily selected as default
ENV nab="VRC01"
Expand All @@ -122,7 +118,7 @@ ENV nab="VRC01"
# combinations of these
# For a single/multispecific bnAb, enter "sens".
# For a bnAb combination, enter "estsens" or "multsens".
ENV outcomes="ic50;sens"
ENV outcomes="ic50"

# which method to use for predicting combination IC-50 and IC-80
# possible methods are "additive" and "Bliss-Hill". For "Bliss-Hill",
Expand Down Expand Up @@ -210,11 +206,18 @@ ENV var_thresh="0"
ARG CACHEBUST=1
RUN echo "$CACHEBUST"

COPY bin/* /bin/

RUN chown -R slapnap:slapnap /home/slapnap/

# USER slapnap

# pull CATNAP data from LANL
RUN wget -O /home/dat/catnap/assay.txt "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/assay.txt"
RUN wget -O /home/dat/catnap/viruses.txt "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/viruses.txt"
RUN wget -O /home/dat/catnap/virseqs_aa.fasta "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/virseqs_aa.fasta"
RUN wget -O /home/dat/catnap/abs.txt "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/abs.txt"
RUN wget -O dat/catnap/assay.txt "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/assay.txt"
RUN wget -O dat/catnap/viruses.txt "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/viruses.txt"
# RUN wget -O dat/catnap/virseqs_aa.fasta "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/virseqs_aa.fasta"
COPY virseqs_aa.fasta dat/catnap/virseqs_aa.fasta
RUN wget -O dat/catnap/abs.txt "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/abs.txt"

# entry point to container runs run_analysis.sh
CMD /home/lib/run_analysis.sh
CMD run_analysis.sh
6 changes: 3 additions & 3 deletions code/01_check_opts.R → bin/01_check_opts.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#! /usr/bin/env Rscript
#!/usr/bin/env -S Rscript --vanilla

library("shiny")
source("/home/lib/00_utils.R")
source("/home/lib/01_check_opts_functions.R")
library("slapnap")

#---------------------
# Permanent options
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env Rscript
#!/usr/bin/env -S Rscript --vanilla

# here are our standard example cases:
# single antibody: VRC07-523-LS
Expand All @@ -10,7 +10,7 @@
# STEP -1: prepare our environment
# ---------------------------------------------------------------------------- #
library(seqinr)
path.home <- "/home"
library("slapnap") # source our function library

# antibody names are passed to docker container at run time as
# environment variable Nab, which is a semicolon-separated list
Expand All @@ -27,24 +27,21 @@ combination_method <- Sys.getenv("combination_method")
# STEP 0: load and prepare our data
# ---------------------------------------------------------------------------- #
# define our working directories
path.lib <- file.path(path.home, "lib")
path.data <- file.path(path.home, "dat")
path.data.catnap <- file.path(path.data, "catnap")
path.data.analysis <- file.path(path.data, "analysis")
path.out <- file.path(path.home, "output")

source(file.path(path.lib, "00_utils.R"))
opts <- get_global_options()
# load data
data.assay <- read.table(file.path(path.data.catnap, "assay.txt"), header=T, sep="\t", quote="\"")
data.viruses <- read.table(file.path(path.data.catnap, "viruses.txt"), header=T, sep="\t", quote="\"")
data.abs <- read.table(file.path(path.data.catnap, "abs.txt"), header=T, sep="\t", quote="\"")

# source our function library
source(file.path(path.lib, "02_multi_ab.Rlib"))
# TODO: Use opts
path.data.catnap <- Sys.getenv("catnap") #file.path(path.data, "catnap")
path.data.analysis <- Sys.getenv("analysis") #file.path(path.data, "analysis")
path.ml.slfits <- Sys.getenv("slfits") #file.path(path.data, "analysis")
path.out <- Sys.getenv("output") #file.path(path.home, "output")

# load data
data.assay <- read.table(paste0(path.data.catnap, "assay.txt"), header=T, sep="\t", quote="\"")
data.viruses <- read.table(paste0(path.data.catnap, "viruses.txt"), header=T, sep="\t", quote="\"")
data.abs <- read.csv(paste0(path.data.catnap, "abs.txt"), header=T, sep="\t", quote="\"")

# load and process virus info and sequences
data.seq <- read.fasta(file.path(path.data.catnap, "virseqs_aa.fasta"), seqtype="AA")
data.seq <- read.fasta(paste0(path.data.catnap, "virseqs_aa.fasta"), seqtype="AA")
seqname.full <- names(data.seq)
header.info <- strsplit(names(data.seq), split=".", fixed=T)
subtype <- unlist(lapply(header.info, function(x) return(x[1])))
Expand Down Expand Up @@ -241,7 +238,7 @@ saveRDS(data.final, file=final_filename)

# save missing data stats for report compilation later
nprevious <- length(data.final[,1])
saveRDS(nprevious, "/home/slfits/nprevious.rds")
saveRDS(nprevious, paste0(path.ml.slfits, "nprevious.rds", sep=""))

# first covariate column
min_cov_col_idx <- min(grep("geographic", colnames(data.final)))
Expand All @@ -260,10 +257,10 @@ ncomplete_ic80 <- sum(!is.na(data.final$pc.ic80[complete_features_idx]))
# number with complete IC50 and IC80 + complete features
ncomplete_ic5080 <- sum(!is.na(data.final$pc.ic50[complete_features_idx]) & !is.na(data.final$pc.ic80[complete_features_idx]))

saveRDS(ncomplete_features, "/home/slfits/ncomplete_features.rds")
saveRDS(ncomplete_ic50, "/home/slfits/ncomplete_ic50.rds")
saveRDS(ncomplete_ic80, "/home/slfits/ncomplete_ic80.rds")
saveRDS(ncomplete_ic5080, "/home/slfits/ncomplete_ic5080.rds")
saveRDS(ncomplete_features, paste0(path.ml.slfits, "ncomplete_features.rds", sep=""))
saveRDS(ncomplete_ic50, paste0(path.ml.slfits, "ncomplete_ic50.rds", sep=""))
saveRDS(ncomplete_ic80, paste0(path.ml.slfits, "ncomplete_ic80.rds", sep=""))
saveRDS(ncomplete_ic5080, paste0(path.ml.slfits, "ncomplete_ic5080.rds", sep=""))
# ---------------------------------------------------------------------------- #
# - 30 -
# ---------------------------------------------------------------------------- #
25 changes: 14 additions & 11 deletions code/03_run_super_learners.R → bin/03_run_super_learners.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#! /usr/bin/env Rscript
#! /usr/bin/env -S Rscript --vanilla

# (1) Run regression for the user-defined set of outcomes, in opts$outcomes: regression of outcome on all features, for use as the "best possible outcome predictor"
# (2) If "cond" is in opts$importance_grp, run regression of each outcome in opts$outcomes on each reduced set of features (created by removing the pre-defined group of interest): for use in group variable importance, conditional on all other features being in the model
Expand All @@ -13,15 +13,18 @@
# load libraries
library("SuperLearner")
library("dplyr")
source("/home/lib/04_variable_groups.R")
source("/home/lib/03_super_learner_libraries.R")
source("/home/lib/00_utils.R")
library("slapnap") # source our function library

#---------------------
# Permanent options
#---------------------
# read in options
opts <- get_global_options()
# path.home <- opts$output

# TODO: Use opts
path.data.analysis <- Sys.getenv("analysis") #file.path(path.data, "analysis")
path.ml.slfits <- Sys.getenv("slfits") # file.path(path.data, "analysis")

# If h2o is listed as a learner, initiate h2o cluster
h2o_here <- !(all(grepl("h2oboost", opts$learners) == FALSE))
Expand All @@ -34,13 +37,13 @@ if (h2o_here) {
}

# load data and subset to complete cases
analysis_data_names <- list.files("/home/dat/analysis")
analysis_data_name <- get_analysis_dataset_name(analysis_data_names, opts)
dat <- readRDS(paste0("/home/dat/analysis/", analysis_data_name))
analysis_data_names <- list.files(path.data.analysis)
analysis_data_names <- get_analysis_dataset_name(analysis_data_names, opts)
dat <- readRDS(paste0(path.data.analysis, analysis_data_names))

# make super learner library
SL.library <- make_sl_library_vector(opts = opts)

print(SL.library)
# get names of predictors
geog_idx <- min(grep("geographic.region.of", colnames(dat))) # geography first column of relevant data
pred_names <- colnames(dat)[geog_idx:ncol(dat)]
Expand Down Expand Up @@ -98,7 +101,7 @@ for (i in 1:length(outcome_names)) {
if (V <= 1) {
sample_splitting_folds <- vimp::make_folds(y = dat[, this_outcome_name], V = 2)
}
saveRDS(sample_splitting_folds, paste0("/home/slfits/ss_folds_", this_outcome_name, ".rds"))
saveRDS(sample_splitting_folds, paste0(path.ml.slfits, "ss_folds_", this_outcome_name, ".rds"))
# do the fitting, if there are enough outcomes
if (run_sl_vimp_bools2$run_sl[i]) {
print(paste0("Fitting ", nice_outcomes[i]))
Expand Down Expand Up @@ -126,7 +129,7 @@ if (("cond" %in% opts$importance_grp) | ("marg" %in% opts$importance_grp | "marg
this_outcome_name <- outcome_names[i]
sl_opts <- get_sl_options(this_outcome_name, V = V)
# set up validation rows for CV SuperLearner
cross_fitting_folds <- readRDS(paste0("/home/slfits/cvfolds_", this_outcome_name, ".rds"))
cross_fitting_folds <- readRDS(paste0(path.ml.slfits, "cvfolds_", this_outcome_name, ".rds"))
sl_opts$ctrl$validRows <- cross_fitting_folds
# only do this if we have enough obs to run it
if (run_sl_vimp_bools2$run_vimp[i]) {
Expand Down Expand Up @@ -193,7 +196,7 @@ if (("cond" %in% opts$importance_ind) | ("marg" %in% opts$importance_ind)) {
this_outcome_name <- outcome_names[i]
sl_opts <- get_sl_options(this_outcome_name, V = V)
# set up validation rows for CV SuperLearner
cross_fitting_folds <- readRDS(paste0("/home/slfits/cvfolds_", this_outcome_name, ".rds"))
cross_fitting_folds <- readRDS(paste0(path.ml.slfits, "cvfolds_", this_outcome_name, ".rds"))
sl_opts$ctrl$validRows <- cross_fitting_folds
if (run_sl_vimp_bools2$run_vimp[i]) {
print(paste0("Fitting reduced learners for individual variable importance for outcome ", nice_outcomes[i]))
Expand Down
Loading