benkeser · Rudolph-afk · Aug 30, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -20,7 +20,9 @@ RUN apt-get update && apt-get install -y \
   software-properties-common
 
 # install R from command line; get >= R-3.5
-RUN add-apt-repository -y ppa:marutter/rrutter3.5
+RUN add-apt-repository -y ppa:marutter/rrutter4.0
+# RUN add-apt-repository -y ppa:c2d4u.team/c2d4u4.0+
+
 # install:
 #   curl
 #   libcurl, Java (for h20)
@@ -29,17 +31,19 @@ RUN add-apt-repository -y ppa:marutter/rrutter3.5
 #   vim (for editing while in container)
 #   nginx (for static website hosting)
 #   ffmpeg (for animating figures)
-RUN apt-get update && apt-get install -y \
+# No need to install pandoc-citeproc -> https://github.com/jgm/pandoc-citeproc?tab=readme-ov-file#pandoc-citeproc
+RUN apt-get update && apt-get install -y -qq --no-install-recommends --purge \
   curl \
   libcurl4-openssl-dev \
   openjdk-8-jdk \
   r-base \
   r-base-dev \
   pandoc \
-  pandoc-citeproc \
-  vim \
+  cmake \
   nginx \
-  ffmpeg
+  ffmpeg \
+  && rm -rf /var/lib/apt/lists/*
+# apt lists are deleted in the same layer to keep the image small (later apt-get installs must run apt-get update again); vim and r-cran-devtools are deliberately not installed
 
 RUN rm /var/www/html/index.nginx-debian.html
 
@@ -50,6 +54,8 @@ RUN Rscript -e 'install.packages("bookdown", repos="https://cran.rstudio.com")'
 RUN Rscript -e 'install.packages("seqinr", repos="https://cran.rstudio.com")'
 RUN Rscript -e 'install.packages("SuperLearner", repos="https://cran.rstudio.com")'
 RUN Rscript -e 'install.packages("quadprog", repos="https://cran.rstudio.com")'
+RUN Rscript -e 'install.packages("remotes", repos="https://cran.rstudio.com")'
+RUN Rscript -e 'remotes::install_github("benkeser/cvma")'
 # get ggplot2, dplyr, tidyr, readr, tibble, stringr, forcats (and purrr for free)
 RUN Rscript -e 'install.packages("tidyverse", repos="https://cran.rstudio.com")'
 RUN Rscript -e 'install.packages("cowplot", repos="https://cran.rstudio.com")'
@@ -63,55 +69,45 @@ RUN Rscript -e 'install.packages("shiny", repos="https://cran.rstudio.com")'
 RUN Rscript -e 'install.packages("testthat", repos="https://cran.rstudio.com")'
 RUN Rscript -e 'install.packages("RCurl", repos="https://cran.rstudio.com")'
 RUN Rscript -e 'install.packages("bit64", repos="https://cran.rstudio.com")'
-RUN Rscript -e 'install.packages("h2o", type = "source", repos="https://h2o-release.s3.amazonaws.com/h2o/latest_stable_R")'
-RUN Rscript -e 'install.packages("vimp", repos="https://cran.rstudio.com")'
-
-# make directories
-# lib contains R source files
-# dat contains data
-# dat/catnap contains original catnap data
-# dat/analysis contains analysis data
-RUN mkdir /home/dat /home/dat/catnap /home/dat/analysis /home/out
-RUN mkdir /home/slfits /home/output
-
-# copy R scripts to do do data pull, check options, run analysis, and return requested objects (and make executable)
-COPY code/00_utils.R /home/lib/00_utils.R
-COPY code/01_check_opts.R /home/lib/01_check_opts.R
-COPY code/01_check_opts_functions.R /home/lib/01_check_opts_functions.R
-COPY code/02_compile_analysis_dataset.R /home/lib/02_compile_analysis_dataset.R
-COPY code/02_multi_ab.Rlib /home/lib/02_multi_ab.Rlib
-COPY code/03_run_super_learners.R /home/lib/03_run_super_learners.R
-COPY code/03_super_learner_libraries.R /home/lib/03_super_learner_libraries.R
-COPY code/04_get_vimp.R /home/lib/04_get_vimp.R
-COPY code/04_variable_groups.R /home/lib/04_variable_groups.R
-COPY code/05_intrinsic_importance.R /home/lib/05_intrinsic_importance.R
-COPY code/05_ml_var_importance_measures.R /home/lib/05_ml_var_importance_measures.R
-COPY code/05_outcome_dist_plot.R /home/lib/05_outcome_dist_plot.R
-COPY code/05_plot_one_vimp.R /home/lib/05_plot_one_vimp.R
-COPY code/05_plotting_functions.R /home/lib/05_plotting_functions.R
-COPY code/05_pred_importance.R /home/lib/05_pred_importance.R
-COPY code/05_var_import_plot.R /home/lib/05_var_import_plot.R
-COPY code/05_vimp_executive_summary_table.R /home/lib/05_vimp_executive_summary_table.R
-COPY code/06_return_requested_objects.R /home/lib/06_return_requested_objects.R
-
-RUN chmod +x /home/lib/01_check_opts.R /home/lib/02_compile_analysis_dataset.R /home/lib/03_run_super_learners.R /home/lib/04_get_vimp.R /home/lib/06_return_requested_objects.R
-
-# copy report Rmd
-COPY code/05_report.Rmd /home/lib/05_report.Rmd
-COPY docs/refs.bib /home/lib/refs.bib
-COPY code/run_analysis.sh /home/lib/run_analysis.sh
-COPY code/05_render_report.R /home/lib/05_render_report.R
-COPY code/05_report_preamble.R /home/lib/05_report_preamble.R
-RUN chmod +x /home/lib/run_analysis.sh /home/lib/05_render_report.R /home/lib/05_report_preamble.R
-
-# copy metadata Rmd
-COPY code/07_metadata.Rmd /home/lib/07_metadata.Rmd
-COPY code/07_render_metadata.R /home/lib/07_render_metadata.R
-RUN chmod +x /home/lib/07_render_metadata.R
+RUN Rscript -e 'install.packages("jsonlite", repos="https://cran.rstudio.com")'
+# h2o is pinned to the rel-3.46.0/4 build and downloaded over HTTPS so the package source cannot be altered in transit.
+# The download timeout is raised to 600 s; the trailing library(h2o) makes the build fail if the install did not succeed.
+RUN Rscript -e 'options(timeout=600); install.packages("h2o", type="source", repos="https://h2o-release.s3.amazonaws.com/h2o/rel-3.46.0/4/R"); library(h2o)'
+RUN Rscript -e 'library(h2o); h2o.init()'
+
+RUN Rscript -e 'install.packages("gam", repos="https://cran.rstudio.com")'
+
+# vimp comes from GitHub (bdwilliamson/vimp); CRAN alternative: remotes::install_version("vimp", version="2.1.9", repos="https://cran.rstudio.com")
+RUN Rscript -e 'remotes::install_github(repo = "bdwilliamson/vimp"); library(vimp)'
+
+# Install the local slapnap R package from the build context.
+# COPY replaces ADD: the source is a plain local directory, so ADD's archive/URL handling is not wanted.
+COPY slapnap slapnap/
+# Install, confirm the package loads (a failing library() aborts the build), and delete
+# the sources in a single RUN so the check and the cleanup do not create extra layers.
+# Commands and flags are the same as before; only the instruction boundaries changed.
+RUN R CMD INSTALL -dc slapnap && Rscript -e 'library("slapnap")' && rm -rf slapnap_0.1.0.tar.gz slapnap/
+
+# Create default user and workdir
+RUN useradd -ms /bin/bash slapnap
+WORKDIR /home/slapnap/
+
+RUN chown -R slapnap:slapnap /home/slapnap/
 
 #---------------------------------------------------------------------
 # Permanent options
 #---------------------------------------------------------------------
+RUN mkdir -p slfits output dat/analysis dat/catnap
+
+# Directories read by the bin/ scripts through Sys.getenv(). Each value keeps its
+# trailing "/" because the scripts build file names with paste0(dir, file).
+# catnap is exported too: 02_compile_analysis_dataset.R reads Sys.getenv("catnap"), and an
+# unset variable yields "", so assay.txt etc. would be looked up in the working directory.
+ENV catnap="/home/slapnap/dat/catnap/" \
+    analysis="/home/slapnap/dat/analysis/" \
+    slfits="/home/slapnap/slfits/" \
+    output="/home/slapnap/output/"
+
 # which antibody to analyze
 #   "VRC01" is arbitrarily selected as default
 ENV nab="VRC01"
@@ -122,7 +118,7 @@ ENV nab="VRC01"
 #   combinations of these
 #   For a single/multispecific bnAb, enter "sens".
 #   For a bnAb combination, enter "estsens" or "multsens".
-ENV outcomes="ic50;sens"
+ENV outcomes="ic50"
 
 # which method to use for predicting combination IC-50 and IC-80
 #   possible methods are "additive" and "Bliss-Hill". For "Bliss-Hill",
@@ -210,11 +206,18 @@ ENV var_thresh="0"
 ARG CACHEBUST=1
 RUN echo "$CACHEBUST"
 
+COPY bin/* /bin/
+
+RUN chown -R slapnap:slapnap /home/slapnap/
+
+# USER slapnap
+
 # pull CATNAP data from LANL
-RUN wget -O /home/dat/catnap/assay.txt "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/assay.txt"
-RUN wget -O /home/dat/catnap/viruses.txt "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/viruses.txt"
-RUN wget -O /home/dat/catnap/virseqs_aa.fasta "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/virseqs_aa.fasta"
-RUN wget -O /home/dat/catnap/abs.txt "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/abs.txt"
+RUN wget -O dat/catnap/assay.txt "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/assay.txt"
+RUN wget -O dat/catnap/viruses.txt "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/viruses.txt"
+# RUN wget -O dat/catnap/virseqs_aa.fasta "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/virseqs_aa.fasta"
+COPY virseqs_aa.fasta dat/catnap/virseqs_aa.fasta
+RUN wget -O dat/catnap/abs.txt "https://www.hiv.lanl.gov/cgi-bin/common_code/download.cgi?/scratch/NEUTRALIZATION/abs.txt"
 
 # entry point to container runs run_analysis.sh
-CMD /home/lib/run_analysis.sh
+CMD run_analysis.sh
diff --git a/code/01_check_opts.R → bin/01_check_opts.R b/code/01_check_opts.R → bin/01_check_opts.R
@@ -1,7 +1,7 @@
-#! /usr/bin/env Rscript
+#!/usr/bin/env -S Rscript --vanilla
+
 library("shiny")
-source("/home/lib/00_utils.R")
-source("/home/lib/01_check_opts_functions.R")
+library("slapnap")
 
 #---------------------
 # Permanent options

diff --git a/code/02_compile_analysis_dataset.R → bin/02_compile_analysis_dataset.R b/code/02_compile_analysis_dataset.R → bin/02_compile_analysis_dataset.R
@@ -1,4 +1,4 @@
-#!/usr/bin/env Rscript
+#!/usr/bin/env -S Rscript --vanilla
 
 # here are our standard example cases:
 #   single antibody:  VRC07-523-LS
@@ -10,7 +10,7 @@
 # STEP -1:  prepare our environment
 # ---------------------------------------------------------------------------- #
 library(seqinr)
-path.home <- "/home"
+library("slapnap") # source our function library
 
 # antibody names are passed to docker container at run time as
 # environment variable Nab, which is a semicolon-separated list
@@ -27,24 +27,21 @@ combination_method <- Sys.getenv("combination_method")
 # STEP 0:  load and prepare our data
 # ---------------------------------------------------------------------------- #
 # define our working directories
-path.lib <- file.path(path.home, "lib")
-path.data <- file.path(path.home, "dat")
-path.data.catnap <- file.path(path.data, "catnap")
-path.data.analysis <- file.path(path.data, "analysis")
-path.out <- file.path(path.home, "output")
-
-source(file.path(path.lib, "00_utils.R"))
 opts <- get_global_options()
-# load data
-data.assay <- read.table(file.path(path.data.catnap, "assay.txt"), header=T, sep="\t", quote="\"")
-data.viruses <- read.table(file.path(path.data.catnap, "viruses.txt"), header=T, sep="\t", quote="\"")
-data.abs <- read.table(file.path(path.data.catnap, "abs.txt"), header=T, sep="\t", quote="\"")
 
-# source our function library
-source(file.path(path.lib, "02_multi_ab.Rlib"))
+# TODO: Use opts
+# Directory paths come from environment variables; each value must end in "/" because file names are appended with paste0().
+path.data.catnap <- Sys.getenv("catnap", unset = "/home/slapnap/dat/catnap/") # fallback when unset: where the Dockerfile downloads the CATNAP files
+path.data.analysis <- Sys.getenv("analysis")
+path.ml.slfits <- Sys.getenv("slfits")
+path.out <- Sys.getenv("output")
+# load data
+data.assay <- read.table(paste0(path.data.catnap, "assay.txt"), header=T, sep="\t", quote="\"")
+data.viruses <- read.table(paste0(path.data.catnap, "viruses.txt"), header=T, sep="\t", quote="\"")
+data.abs <- read.csv(paste0(path.data.catnap, "abs.txt"), header=T, sep="\t", quote="\"")
 
 # load and process virus info and sequences
-data.seq <- read.fasta(file.path(path.data.catnap, "virseqs_aa.fasta"), seqtype="AA")
+data.seq <- read.fasta(paste0(path.data.catnap, "virseqs_aa.fasta"), seqtype="AA")
 seqname.full <- names(data.seq)
 header.info <- strsplit(names(data.seq), split=".", fixed=T)
 subtype <- unlist(lapply(header.info, function(x) return(x[1])))
@@ -241,7 +238,7 @@ saveRDS(data.final, file=final_filename)
 
 # save missing data stats for report compilation later
 nprevious <- length(data.final[,1])
-saveRDS(nprevious, "/home/slfits/nprevious.rds")
+saveRDS(nprevious, paste0(path.ml.slfits, "nprevious.rds")) # paste0() has no sep argument; path.ml.slfits already ends in "/"
 
 # first covariate column
 min_cov_col_idx <- min(grep("geographic", colnames(data.final)))
@@ -260,10 +257,10 @@ ncomplete_ic80 <- sum(!is.na(data.final$pc.ic80[complete_features_idx]))
 # number with complete IC50 and IC80 + complete features
 ncomplete_ic5080 <- sum(!is.na(data.final$pc.ic50[complete_features_idx]) & !is.na(data.final$pc.ic80[complete_features_idx]))
 
-saveRDS(ncomplete_features, "/home/slfits/ncomplete_features.rds")
-saveRDS(ncomplete_ic50, "/home/slfits/ncomplete_ic50.rds")
-saveRDS(ncomplete_ic80, "/home/slfits/ncomplete_ic80.rds")
-saveRDS(ncomplete_ic5080, "/home/slfits/ncomplete_ic5080.rds")
+saveRDS(ncomplete_features, paste0(path.ml.slfits, "ncomplete_features.rds")) # paste0() has no sep argument, so the stray sep="" was dropped
+saveRDS(ncomplete_ic50, paste0(path.ml.slfits, "ncomplete_ic50.rds"))
+saveRDS(ncomplete_ic80, paste0(path.ml.slfits, "ncomplete_ic80.rds"))
+saveRDS(ncomplete_ic5080, paste0(path.ml.slfits, "ncomplete_ic5080.rds"))
 # ---------------------------------------------------------------------------- #
 #                                    - 30 -
 # ---------------------------------------------------------------------------- #
diff --git a/code/03_run_super_learners.R → bin/03_run_super_learners.R b/code/03_run_super_learners.R → bin/03_run_super_learners.R
@@ -1,4 +1,4 @@
-#! /usr/bin/env Rscript
+#!/usr/bin/env -S Rscript --vanilla
 
 #  (1) Run regression for the user-defined set of outcomes, in opts$outcomes: regression of outcome on all features, for use as the "best possible outcome predictor"
 #  (2) If "cond" is in opts$importance_grp, run regression of each outcome in opts$outcomes on each reduced set of features (created by removing the pre-defined group of interest): for use in group variable importance, conditional on all other features being in the model
@@ -13,15 +13,18 @@
 # load libraries
 library("SuperLearner")
 library("dplyr")
-source("/home/lib/04_variable_groups.R")
-source("/home/lib/03_super_learner_libraries.R")
-source("/home/lib/00_utils.R")
+library("slapnap") # source our function library
 
 #---------------------
 # Permanent options
 #---------------------
 # read in options
 opts <- get_global_options()
+# path.home <- opts$output
+
+# TODO: Use opts
+path.data.analysis <- Sys.getenv("analysis") # directory listed with list.files() and read with readRDS() below to load the analysis dataset
+path.ml.slfits <- Sys.getenv("slfits") # directory where fold assignments are saved (ss_folds_*) and read back (cvfolds_*)
 
 # If h2o is listed as a learner, initiate h2o cluster
 h2o_here <- !(all(grepl("h2oboost", opts$learners) == FALSE))
@@ -34,13 +37,13 @@ if (h2o_here) {
 }
 
 # load data and subset to complete cases
-analysis_data_names <- list.files("/home/dat/analysis")
+analysis_data_names <- list.files(path.data.analysis) # every file name in the analysis directory
 analysis_data_name <- get_analysis_dataset_name(analysis_data_names, opts)
-dat <- readRDS(paste0("/home/dat/analysis/", analysis_data_name))
+dat <- readRDS(paste0(path.data.analysis, analysis_data_name)) # path.data.analysis is expected to end in "/"
 
 # make super learner library
 SL.library <- make_sl_library_vector(opts = opts)
-
+print(SL.library)
 # get names of predictors
 geog_idx <- min(grep("geographic.region.of", colnames(dat))) # geography first column of relevant data
 pred_names <- colnames(dat)[geog_idx:ncol(dat)]
@@ -98,7 +101,7 @@ for (i in 1:length(outcome_names)) {
     if (V <= 1) {
         sample_splitting_folds <- vimp::make_folds(y = dat[, this_outcome_name], V = 2)
     }
-    saveRDS(sample_splitting_folds, paste0("/home/slfits/ss_folds_", this_outcome_name, ".rds"))
+    saveRDS(sample_splitting_folds, paste0(path.ml.slfits, "ss_folds_", this_outcome_name, ".rds"))
     # do the fitting, if there are enough outcomes
     if (run_sl_vimp_bools2$run_sl[i]) {
         print(paste0("Fitting ", nice_outcomes[i]))
@@ -126,7 +129,7 @@ if (("cond" %in% opts$importance_grp) | ("marg" %in% opts$importance_grp | "marg
         this_outcome_name <- outcome_names[i]
         sl_opts <- get_sl_options(this_outcome_name, V = V)
         # set up validation rows for CV SuperLearner
-        cross_fitting_folds <- readRDS(paste0("/home/slfits/cvfolds_", this_outcome_name, ".rds"))
+        cross_fitting_folds <- readRDS(paste0(path.ml.slfits, "cvfolds_", this_outcome_name, ".rds"))
         sl_opts$ctrl$validRows <- cross_fitting_folds
         # only do this if we have enough obs to run it
         if (run_sl_vimp_bools2$run_vimp[i]) {
@@ -193,7 +196,7 @@ if (("cond" %in% opts$importance_ind) | ("marg" %in% opts$importance_ind)) {
         this_outcome_name <- outcome_names[i]
         sl_opts <- get_sl_options(this_outcome_name, V = V)
         # set up validation rows for CV SuperLearner
-        cross_fitting_folds <- readRDS(paste0("/home/slfits/cvfolds_", this_outcome_name, ".rds"))
+        cross_fitting_folds <- readRDS(paste0(path.ml.slfits, "cvfolds_", this_outcome_name, ".rds"))
         sl_opts$ctrl$validRows <- cross_fitting_folds
         if (run_sl_vimp_bools2$run_vimp[i]) {
             print(paste0("Fitting reduced learners for individual variable importance for outcome ", nice_outcomes[i]))