From bbdda1b4a44a3d6a22041e03eed38f27319d8f32 Mon Sep 17 00:00:00 2001 From: wlandau-lilly Date: Fri, 5 Apr 2024 15:03:16 -0400 Subject: [PATCH] To align with https://github.com/ropensci/targets/issues/1244 and https://github.com/ropensci/targets/pull/1262, switch the hashing functions from digest::digest() to secretbase::siphash13(). --- DESCRIPTION | 8 +- NAMESPACE | 3 +- NEWS.md | 8 +- R/tar_stan_package.R | 6 +- R/utils_data.R | 2 +- R/utils_output.R | 2 +- codemeta.json | 183 ++++++++++++++++++++++++++------------- vignettes/simulation.Rmd | 28 +++--- 8 files changed, 154 insertions(+), 86 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b7ca1aa..9c25285 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,7 +14,7 @@ Description: Bayesian data analysis usually incurs long runtimes both single-fit workflows and multi-rep simulation studies. For the statistical methodology, please refer to 'Stan' documentation (Stan Development Team 2020) . -Version: 0.1.0.9001 +Version: 0.1.1 License: MIT + file LICENSE URL: https://docs.ropensci.org/stantargets/, https://github.com/ropensci/stantargets BugReports: https://github.com/ropensci/stantargets/issues @@ -44,16 +44,16 @@ Depends: R (>= 3.5.0) Imports: cmdstanr (>= 0.5.0), - digest (>= 0.6.25), fs (>= 1.5.0), fst (>= 0.9.2), posterior (>= 1.0.1), purrr (>= 0.3.4), qs (>= 0.23.2), rlang (>= 0.4.10), + secretbase (>= 0.4.0), stats, - targets (>= 1.5.1.9001), - tarchetypes (>= 0.7.12.9001), + targets (>= 1.6.0), + tarchetypes (>= 0.8.0), tibble (>= 3.0.1), tidyselect, withr (>= 2.1.2) diff --git a/NAMESPACE b/NAMESPACE index 7f24e50..eca34fd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,7 +30,6 @@ export(tar_stan_vb_rep_run) export(tar_stan_vb_rep_summary) export(tar_stan_vb_run) importFrom(cmdstanr,cmdstan_model) -importFrom(digest,digest) importFrom(fs,path_ext_remove) importFrom(fs,path_rel) importFrom(fst,read_fst) @@ -42,6 +41,7 @@ importFrom(qs,qread) importFrom(rlang,check_installed) importFrom(rlang,expr) importFrom(rlang,quo_squash) +importFrom(secretbase,siphash13) importFrom(stats,rnorm) importFrom(stats,runif) importFrom(tarchetypes,tar_combine_raw) @@ -61,6 +61,7 @@ importFrom(targets,tar_load) importFrom(targets,tar_option_get) importFrom(targets,tar_read) importFrom(targets,tar_script) +importFrom(targets,tar_seed_create) importFrom(targets,tar_seed_get) importFrom(targets,tar_target) importFrom(targets,tar_target_raw) diff --git a/NEWS.md b/NEWS.md index 4627fcf..0ed1367 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,10 @@ -# stantargets 0.1.0.9001 (development) +# stantargets 0.1.1 + +## Invalidating changes + +* To align with https://github.com/ropensci/targets/issues/1244 and https://github.com/ropensci/targets/pull/1262, switch the hashing functions from `digest::digest()` to `secretbase::siphash13()`. + +## Other changes * Remove temporary files generated CmdStan. * Add the new `description` arguments of `tar_target()` (`targets >= 1.5.1.9001). diff --git a/R/tar_stan_package.R b/R/tar_stan_package.R index 108722f..1878b8f 100644 --- a/R/tar_stan_package.R +++ b/R/tar_stan_package.R @@ -14,21 +14,21 @@ #' @name stantargets-package #' @seealso , [tar_stan_mcmc()] #' @importFrom cmdstanr cmdstan_model -#' @importFrom digest digest #' @importFrom fs path_ext_remove path_rel #' @importFrom fst read_fst #' @importFrom qs qread #' @importFrom posterior as_draws_df #' @importFrom purrr map map_dbl map2_dfr #' @importFrom rlang check_installed expr quo_squash +#' @importFrom secretbase siphash13 #' @importFrom stats rnorm runif #' @importFrom targets tar_assert_chr tar_assert_nonempty #' tar_assert_not_dir tar_assert_not_in #' tar_assert_nzchar tar_assert_path #' tar_assert_scalar tar_assert_unique #' tar_cue tar_deparse_safe tar_dir tar_load tar_option_get -#' tar_read tar_script tar_seed_get tar_target tar_target_raw tar_test -#' tar_tidy_eval tar_throw_validate +#' tar_read tar_script tar_seed_create tar_seed_get tar_target +#' tar_target_raw tar_test tar_tidy_eval tar_throw_validate #' @importFrom tarchetypes tar_combine_raw tar_map #' @importFrom tidyselect any_of #' @importFrom withr local_message_sink local_output_sink diff --git a/R/utils_data.R b/R/utils_data.R index 1c28148..dae46d2 100644 --- a/R/utils_data.R +++ b/R/utils_data.R @@ -43,7 +43,7 @@ produce_seed_rep <- function(name, batch, rep, reps) { return(NA_integer_) } scalar <- paste(name, rep + reps * (batch - 1)) - abs(digest::digest2int(as.character(scalar), seed = seed)) + abs(targets::tar_seed_create(as.character(scalar), global_seed = seed)) } list_nonempty <- function(list) { diff --git a/R/utils_output.R b/R/utils_output.R index 5781d1d..54f9a1c 100644 --- a/R/utils_output.R +++ b/R/utils_output.R @@ -41,7 +41,7 @@ tar_stan_output <- function( ) out <- tibble::as_tibble(out) out <- tar_stan_output_rep_scalars(out, data, data_copy) - out$.rep <- digest::digest(stats::runif(1), algo = "xxhash32") + out$.rep <- secretbase::siphash13(stats::runif(1)) out$.dataset_id <- data$.dataset_id out$.seed <- if_any(length(seed) == 1L, seed, list(seed)) out diff --git a/codemeta.json b/codemeta.json index ea35c75..b980641 100644 --- a/codemeta.json +++ b/codemeta.json @@ -1,46 +1,41 @@ { - "@context": [ - "https://doi.org/10.5063/schema/codemeta-2.0", - "http://schema.org" - ], + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "stantargets", - "description": "Bayesian data analysis usually incurs long runtimes\n and cumbersome custom code. A specialized pipeline toolkit for\n Bayesians, the 'stantargets' R package leverages\n 'targets' and 'cmdstanr' to ease these burdens.\n 'stantargets' makes it super easy to set up useful scalable\n Stan pipelines that automatically parallelize the computation\n and skip expensive steps when the results are already up to date.\n Minimal custom code is required, and there is no need to manually\n configure branching, so usage is much easier than 'targets' alone.\n 'stantargets' can access all of 'cmdstanr''s major algorithms\n (MCMC, variational Bayes, and optimization) and it supports\n both single-fit workflows and multi-rep simulation studies.\n For the statistical methodology, please refer to 'Stan' documentation\n (Stan Development Team 2020) .", + "description": "Bayesian data analysis usually incurs long runtimes and cumbersome custom code. A pipeline toolkit tailored to Bayesian statisticians, the 'stantargets' R package leverages 'targets' and 'cmdstanr' to ease these burdens. 'stantargets' makes it super easy to set up scalable Stan pipelines that automatically parallelize the computation and skip expensive steps when the results are already up to date. Minimal custom code is required, and there is no need to manually configure branching, so usage is much easier than 'targets' alone. 'stantargets' can access all of 'cmdstanr''s major algorithms (MCMC, variational Bayes, and optimization) and it supports both single-fit workflows and multi-rep simulation studies. For the statistical methodology, please refer to 'Stan' documentation (Stan Development Team 2020) .", "name": "stantargets: Targets for Stan Workflows", + "relatedLink": "https://docs.ropensci.org/stantargets/", "codeRepository": "https://github.com/ropensci/stantargets", - "relatedLink": ["https://wlandau.github.io/stantargets/", "https://docs.ropensci.org/stantargets/"], "issueTracker": "https://github.com/ropensci/stantargets/issues", "license": "https://spdx.org/licenses/MIT", - "version": "0.0.0.9002", + "version": "0.1.1", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", "url": "https://r-project.org" }, - "runtimePlatform": "R version 4.0.3 (2020-10-10)", + "runtimePlatform": "R version 4.3.2 (2023-10-31)", "author": [ { "@type": "Person", "givenName": ["William", "Michael"], "familyName": "Landau", - "email": "will.landau@gmail.com", + "email": "will.landau.oss@gmail.com", "@id": "https://orcid.org/0000-0003-1878-3253" } ], - "contributor": {}, "copyrightHolder": [ { "@type": "Organization", "name": "Eli Lilly and Company" } ], - "funder": {}, "maintainer": [ { "@type": "Person", "givenName": ["William", "Michael"], "familyName": "Landau", - "email": "will.landau@gmail.com", + "email": "will.landau.oss@gmail.com", "@id": "https://orcid.org/0000-0003-1878-3253" } ], @@ -58,6 +53,19 @@ }, "sameAs": "https://CRAN.R-project.org/package=dplyr" }, + { + "@type": "SoftwareApplication", + "identifier": "ggplot2", + "name": "ggplot2", + "version": ">= 3.0.0", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=ggplot2" + }, { "@type": "SoftwareApplication", "identifier": "knitr", @@ -97,6 +105,13 @@ }, "sameAs": "https://CRAN.R-project.org/package=rmarkdown" }, + { + "@type": "SoftwareApplication", + "identifier": "SBC", + "name": "SBC", + "version": ">= 0.2.0", + "sameAs": "https://github.com/hyunjimoon/SBC" + }, { "@type": "SoftwareApplication", "identifier": "testthat", @@ -110,6 +125,19 @@ }, "sameAs": "https://CRAN.R-project.org/package=testthat" }, + { + "@type": "SoftwareApplication", + "identifier": "tidyr", + "name": "tidyr", + "version": ">= 1.0.0", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=tidyr" + }, { "@type": "SoftwareApplication", "identifier": "visNetwork", @@ -124,34 +152,21 @@ "sameAs": "https://CRAN.R-project.org/package=visNetwork" } ], - "softwareRequirements": [ - { + "softwareRequirements": { + "1": { "@type": "SoftwareApplication", "identifier": "R", "name": "R", "version": ">= 3.5.0" }, - { + "2": { "@type": "SoftwareApplication", "identifier": "cmdstanr", "name": "cmdstanr", - "version": ">= 0.2.0", + "version": ">= 0.5.0", "sameAs": "https://github.com/stan-dev/cmdstanr" }, - { - "@type": "SoftwareApplication", - "identifier": "digest", - "name": "digest", - "version": ">= 0.6.25", - "provider": { - "@id": "https://cran.r-project.org", - "@type": "Organization", - "name": "Comprehensive R Archive Network (CRAN)", - "url": "https://cran.r-project.org" - }, - "sameAs": "https://CRAN.R-project.org/package=digest" - }, - { + "3": { "@type": "SoftwareApplication", "identifier": "fs", "name": "fs", @@ -164,7 +179,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=fs" }, - { + "4": { "@type": "SoftwareApplication", "identifier": "fst", "name": "fst", @@ -177,14 +192,20 @@ }, "sameAs": "https://CRAN.R-project.org/package=fst" }, - { + "5": { "@type": "SoftwareApplication", "identifier": "posterior", "name": "posterior", - "version": ">= 0.1.2", - "sameAs": "https://github.com/stan-dev/posterior" + "version": ">= 1.0.1", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=posterior" }, - { + "6": { "@type": "SoftwareApplication", "identifier": "purrr", "name": "purrr", @@ -197,7 +218,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=purrr" }, - { + "7": { "@type": "SoftwareApplication", "identifier": "qs", "name": "qs", @@ -210,7 +231,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=qs" }, - { + "8": { "@type": "SoftwareApplication", "identifier": "rlang", "name": "rlang", @@ -223,16 +244,29 @@ }, "sameAs": "https://CRAN.R-project.org/package=rlang" }, - { + "9": { + "@type": "SoftwareApplication", + "identifier": "secretbase", + "name": "secretbase", + "version": ">= 0.4.0", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=secretbase" + }, + "10": { "@type": "SoftwareApplication", "identifier": "stats", "name": "stats" }, - { + "11": { "@type": "SoftwareApplication", "identifier": "targets", "name": "targets", - "version": ">= 0.0.1", + "version": ">= 1.5.1.9001", "provider": { "@id": "https://cran.r-project.org", "@type": "Organization", @@ -241,11 +275,11 @@ }, "sameAs": "https://CRAN.R-project.org/package=targets" }, - { + "12": { "@type": "SoftwareApplication", "identifier": "tarchetypes", "name": "tarchetypes", - "version": ">= 0.0.1", + "version": ">= 0.7.12.9001", "provider": { "@id": "https://cran.r-project.org", "@type": "Organization", @@ -254,7 +288,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=tarchetypes" }, - { + "13": { "@type": "SoftwareApplication", "identifier": "tibble", "name": "tibble", @@ -267,7 +301,19 @@ }, "sameAs": "https://CRAN.R-project.org/package=tibble" }, - { + "14": { + "@type": "SoftwareApplication", + "identifier": "tidyselect", + "name": "tidyselect", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=tidyselect" + }, + "15": { "@type": "SoftwareApplication", "identifier": "withr", "name": "withr", @@ -279,29 +325,44 @@ "url": "https://cran.r-project.org" }, "sameAs": "https://CRAN.R-project.org/package=withr" + }, + "SystemRequirements": "CmdStan >= 2.25.0" + }, + "fileSize": "982.447KB", + "citation": [ + { + "@type": "ScholarlyArticle", + "datePublished": "2021", + "author": [ + { + "@type": "Person", + "givenName": ["William", "Michael"], + "familyName": "Landau" + } + ], + "name": "The stantargets {R} package: a workflow framework for efficient reproducible {S}tan-powered {B}ayesian data analysis pipelines", + "url": "https://doi.org/10.21105/joss.03193", + "pagination": "3193", + "isPartOf": { + "@type": "PublicationIssue", + "issueNumber": "60", + "datePublished": "2021", + "isPartOf": { + "@type": ["PublicationVolume", "Periodical"], + "volumeNumber": "6", + "name": "Journal of Open Source Software" + } + } } ], + "releaseNotes": "https://github.com/ropensci/stantargets/blob/master/NEWS.md", "readme": "https://github.com/ropensci/stantargets/blob/main/README.md", - "fileSize": "964.37KB", - "contIntegration": "https://codecov.io/gh/ropensci/stantargets", + "contIntegration": ["https://github.com/ropensci/stantargets/actions?query=workflow%3Acheck", "https://app.codecov.io/gh/ropensci/stantargets", "https://github.com/ropensci/stantargets/actions?query=workflow%3Alint"], "developmentStatus": "https://www.repostatus.org/#active", - "keywords": [ - "r", - "rstats", - "reproducibility", - "high-performance-computing", - "stan", - "bayesian", - "statistics", - "targets", - "make", - "rstats-package", - "r-targetopia" - ], - "releaseNotes": "https://github.com/ropensci/stantargets/blob/master/NEWS.md", "review": { "@type": "Review", "url": "https://github.com/ropensci/software-review/issues/430", "provider": "https://ropensci.org" - } + }, + "keywords": ["r", "rstats", "reproducibility", "high-performance-computing", "stan", "bayesian", "statistics", "targets", "make", "rstats-package", "r-targetopia"] } diff --git a/vignettes/simulation.Rmd b/vignettes/simulation.Rmd index 5937571..f8b18cb 100644 --- a/vignettes/simulation.Rmd +++ b/vignettes/simulation.Rmd @@ -53,7 +53,7 @@ Visit for an example This particular example uses the concept of calibration that Bob Carpenter [explains here](https://statmodeling.stat.columbia.edu/2017/04/12/bayesian-posteriors-calibrated/) [@carpenter2017]. The goal is to simulate multiple datasets from the model below, analyze each dataset, and assess how often the estimated posterior intervals cover the true parameters from the prior predictive simulations. If coverage is no systematically different from nominal, this is evidence that the model was implemented correctly. The quantile method by @cook2006 generalizes this concept, and simulation-based calibration [@talts2020] generalizes further. The interval-based technique featured in this vignette is not as robust as SBC, but it may be more expedient for large models because it does not require visual inspection of multiple histograms. See a later section in this vignette for an example of simulation-based calibration on this same model. -```{r} +```{r, eval = TRUE} lines <- "data { int n; vector[n] x; @@ -71,7 +71,7 @@ writeLines(lines, "model.stan") Next, we define a pipeline to simulate multiple datasets and fit each dataset with the model. In our data-generating function, we put the true parameter values of each simulation in a special `.join_data` list. `stantargets` will automatically join the elements of `.join_data` to the correspondingly named variables in the summary output. This will make it super easy to check how often our posterior intervals capture the truth. As for scale, generate 10 datasets (5 batches with 2 replications each) and run the model on each of the 10 datasets.^[Internally, each batch is a [dynamic branch target](https://books.ropensci.org/targets/dynamic.html), and the number of replications determines the amount of work done within a branch. In the general case, [batching](https://books.ropensci.org/targets/dynamic.html#batching) is a way to find the right compromise between target-specific overhead and the horizontal scale of the pipeline.] By default, each of the 10 model runs computes 4 MCMC chains with 2000 MCMC iterations each (including burn-in) and you can adjust with the `chains`, `iter_sampling`, and `iter_warmup` arguments of `tar_stan_mcmc_rep_summary()`. -```{r, echo = FALSE} +```{r, echo = FALSE, eval = TRUE} library(targets) tar_script({ library(stantargets) @@ -147,26 +147,26 @@ list( We now have a pipeline that runs the model 10 times: 5 batches (branch targets) with 2 replications per batch. -```{r} +```{r, eval = TRUE} tar_visnetwork() ``` Run the computation with `tar_make()` -```{r, output = FALSE, warning = FALSE} +```{r, output = FALSE, warning = FALSE, eval = TRUE} tar_make() ``` The result is an aggregated data frame of summary statistics, where the `.rep` column distinguishes among individual replicates. We have the posterior intervals for `beta` in columns `q2.5` and `q97.5`. And thanks to `.join_data` in `simulate_data()`, there is a special `.join_data` column in the output to indicate the true value of each parameter from the simulation. -```{r} +```{r, eval = TRUE} tar_load(model) model ``` Now, let's assess how often the estimated 95% posterior intervals capture the true values of `beta`. If the model is implemented correctly, the coverage value below should be close to 95%. (Ordinarily, we would [increase the number of batches and reps per batch](https://books.ropensci.org/targets/dynamic.html#batching) and [run batches in parallel computing](https://books.ropensci.org/targets/hpc.html).) -```{r} +```{r, eval = TRUE} library(dplyr) model %>% group_by(variable) %>% @@ -174,7 +174,7 @@ model %>% ``` For maximum reproducibility, we should express the coverage assessment as a custom function and a target in the pipeline. -```{r, echo = FALSE} +```{r, echo = FALSE, eval = TRUE} library(targets) tar_script({ library(stantargets) @@ -261,17 +261,17 @@ list( The new `coverage` target should the only outdated target, and it should be connected to the upstream `model` target. -```{r} +```{r, eval = TRUE} tar_visnetwork() ``` When we run the pipeline, only the coverage assessment should run. That way, we skip all the expensive computation of simulating datasets and running MCMC multiple times. -```{r, output = FALSE, warning = FALSE} +```{r, output = FALSE, warning = FALSE, eval = TRUE} tar_make() ``` -```{r} +```{r, eval = TRUE} tar_read(coverage) ``` @@ -279,7 +279,7 @@ tar_read(coverage) `tar_stan_rep_mcmc_summary()` and similar functions allow you to supply multiple Stan models. If you do, each model will share the the same collection of datasets, and the `.dataset_id` column of the model target output allows for custom analyses that compare different models against each other. Suppose we have a new model, `model2.stan`. -```{r} +```{r, eval = TRUE} lines <- "data { int n; vector[n] x; @@ -298,7 +298,7 @@ writeLines(lines, "model2.stan") To set up the simulation workflow to run on both models, we add `model2.stan` to the `stan_files` argument of `tar_stan_rep_mcmc_summary()`. And in the coverage summary below, we group by `.name` to compute a coverage statistic for each model. -```{r, echo = FALSE} +```{r, echo = FALSE, eval = TRUE} library(targets) tar_script({ library(stantargets) @@ -385,7 +385,7 @@ list( In the graph below, notice how targets `model_model` and `model_model2` are both connected to `model_data` upstream. Downstream, `model` is equivalent to `dplyr::bind_rows(model_model, model_model2)`, and it will have special columns `.name` and `.file` to distinguish among all the models. -```{r} +```{r, eval = TRUE} tar_visnetwork() ``` @@ -444,7 +444,7 @@ get_ranks(data = data, draws = draws) To put this into practice in a pipeline, we supply the symbol `get_ranks` to the `transform` argument of `tar_stan_mcmc_rep_draws()`. That way, instead of a full set of draws, each replication will return only the output of `get_ranks()` on those draws (plus a few helper columns). If supplied, the `transform` argument of `tar_stan_mcmc_rep_draws()` must be the name of a function in the pipeline. This function must accept arguments `data` and `draws`, and it must return a data frame. -```{r, echo = FALSE} +```{r, echo = FALSE, eval = TRUE} library(targets) tar_script({ library(stantargets)