diff --git a/R/DESCRIPTION b/R/DESCRIPTION index fbebcc50e..0b4bf0263 100644 --- a/R/DESCRIPTION +++ b/R/DESCRIPTION @@ -1,14 +1,14 @@ Package: Robyn Type: Package Title: Semi-Automated Marketing Mix Modeling (MMM) from Meta Marketing Science -Version: 3.11.1.9004 +Version: 3.12.0.9000 Authors@R: c( - person("Gufeng", "Zhou", , "gufeng@meta.com", c("aut")), - person("Bernardo", "Lares", , "laresbernardo@gmail.com", c("cre","aut")), - person("Leonel", "Sentana", , "leonelsentana@meta.com", c("aut")), + person("Gufeng", "Zhou", , "gufeng@meta.com", c("cre", "aut")), person("Igor", "Skokan", , "igorskokan@meta.com", c("aut")), + person("Bernardo", "Lares", , "laresbernardo@gmail.com", c("aut")), + person("Leonel", "Sentana", , "leonelsentana@meta.com", c("aut")), person("Meta Platforms, Inc.", role = c("cph", "fnd"))) -Maintainer: Bernardo Lares +Maintainer: Gufeng Zhou Description: Semi-Automated Marketing Mix Modeling (MMM) aiming to reduce human bias by means of ridge regression and evolutionary algorithms, enables actionable decision making providing a budget allocation and diminishing returns curves and allows ground-truth calibration to account for causation. Depends: R (>= 4.0.0) @@ -23,7 +23,6 @@ Imports: jsonlite, lares, lubridate, - minpack.lm, nloptr, patchwork, prophet, diff --git a/R/NAMESPACE b/R/NAMESPACE index d7a3907ca..4b9ee8c36 100644 --- a/R/NAMESPACE +++ b/R/NAMESPACE @@ -22,6 +22,7 @@ export(pareto_front) export(plot_adstock) export(plot_saturation) export(robyn_allocator) +export(robyn_calibrate) export(robyn_clusters) export(robyn_converge) export(robyn_csv) @@ -43,6 +44,7 @@ export(robyn_update) export(robyn_write) export(run_transformations) export(saturation_hill) +export(set_default_hyppar) export(transform_adstock) export(ts_validation) import(ggplot2) @@ -63,6 +65,7 @@ importFrom(dplyr,distinct) importFrom(dplyr,ends_with) importFrom(dplyr,everything) importFrom(dplyr,filter) +importFrom(dplyr,full_join) importFrom(dplyr,group_by) importFrom(dplyr,lag) importFrom(dplyr,left_join) @@ -110,7 +113,6 @@ importFrom(lares,v2t) importFrom(lubridate,day) importFrom(lubridate,floor_date) importFrom(lubridate,is.Date) -importFrom(minpack.lm,nlsLM) importFrom(nloptr,nloptr) importFrom(parallel,detectCores) importFrom(patchwork,guide_area) diff --git a/R/R/allocator.R b/R/R/allocator.R index 947081fd8..3a0e94d21 100644 --- a/R/R/allocator.R +++ b/R/R/allocator.R @@ -50,7 +50,7 @@ #' \code{channel_constr_low = 0.7} means minimum spend of the variable is 70% of historical #' average, using non-zero spend values, within \code{date_min} and \code{date_max} date range. #' Both constrains must be length 1 (same for all values) OR same length and order as -#' \code{paid_media_spends}. It's not recommended to 'exaggerate' upper bounds, especially +#' \code{paid_media_selected}. It's not recommended to 'exaggerate' upper bounds, especially #' if the new level is way higher than historical level. Lower bound must be >=0.01, #' and upper bound should be < 5. #' @param channel_constr_multiplier Numeric. Default to 3. For example, if @@ -111,7 +111,7 @@ robyn_allocator <- function(robyn_object = NULL, quiet = FALSE, ui = FALSE, ...) 
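# ---- [Editor's illustrative sketch; not part of the diff] ----
# Usage of the constraint arguments documented above: bound every channel to
# 70%-150% of its historical average spend. `InputCollect`/`OutputCollect` and
# the model ID "1_92_12" are hypothetical here and would come from a prior
# robyn_run()/robyn_outputs() call.
# AllocatorCollect <- robyn_allocator(
#   InputCollect = InputCollect,
#   OutputCollect = OutputCollect,
#   select_model = "1_92_12",
#   scenario = "max_response",
#   channel_constr_low = 0.7,
#   channel_constr_up = 1.5
# )
# ---- [End of sketch] ----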
{ - ### Use previously exported model using json_file + ## Use previously exported model using json_file if (!is.null(json_file)) { if (is.null(InputCollect)) { InputCollect <- robyn_inputs( @@ -131,37 +131,24 @@ robyn_allocator <- function(robyn_object = NULL, if (is.null(select_model)) select_model <- OutputCollect$selectID } - ## Collect inputs - # if (!is.null(robyn_object) && (is.null(InputCollect) && is.null(OutputCollect))) { - # if ("robyn_exported" %in% class(robyn_object)) { - # imported <- robyn_object - # robyn_object <- imported$robyn_object - # } else { - # imported <- robyn_load(robyn_object, select_build, quiet) - # } - # InputCollect <- imported$InputCollect - # OutputCollect <- imported$OutputCollect - # select_model <- imported$select_model - # } else { + ## set local variables, sort & prompt + # paid_media_spends <- InputCollect$paid_media_spends + paid_media_selected <- InputCollect$paid_media_selected + dep_var_type <- InputCollect$dep_var_type if (is.null(select_model) && length(OutputCollect$allSolutions == 1)) { select_model <- OutputCollect$allSolutions } - if (any(is.null(InputCollect), is.null(OutputCollect), is.null(select_model))) { - stop("When 'robyn_object' is not provided, then InputCollect, OutputCollect, select_model must be provided") - } - # } - - if (length(InputCollect$paid_media_spends) <= 1) { - stop("Must have a valid model with at least two 'paid_media_spends'") - } - if (!quiet) message(paste(">>> Running budget allocator for model ID", select_model, "...")) - - ## Set local data & params values - paid_media_spends <- InputCollect$paid_media_spends - media_order <- order(paid_media_spends) - mediaSpendSorted <- paid_media_spends[media_order] - dep_var_type <- InputCollect$dep_var_type + media_order <- order(paid_media_selected) + # mediaSpendSorted <- paid_media_spends[media_order] + mediaSelectedSorted <- paid_media_selected[media_order] + + ## Checks and constraints + if ("max_historical_response" %in% scenario) scenario <- "max_response" #legacy + check_allocator( + OutputCollect, select_model, paid_media_selected, scenario, + channel_constr_low, channel_constr_up, constr_mode + ) if (is.null(channel_constr_low)) { channel_constr_low <- case_when( scenario == "max_response" ~ 0.5, @@ -174,43 +161,31 @@ robyn_allocator <- function(robyn_object = NULL, scenario == "target_efficiency" ~ 10 ) } - if (length(channel_constr_low) == 1) channel_constr_low <- rep(channel_constr_low, length(paid_media_spends)) - if (length(channel_constr_up) == 1) channel_constr_up <- rep(channel_constr_up, length(paid_media_spends)) - check_allocator_constrains(channel_constr_low, channel_constr_up) - names(channel_constr_low) <- paid_media_spends - names(channel_constr_up) <- paid_media_spends - channel_constr_low <- channel_constr_low[media_order] - channel_constr_up <- channel_constr_up[media_order] - dt_hyppar <- filter(OutputCollect$resultHypParam, .data$solID == select_model) - dt_bestCoef <- filter(OutputCollect$xDecompAgg, .data$solID == select_model, .data$rn %in% paid_media_spends) - - ## Check inputs and parameters - scenario <- check_allocator( - OutputCollect, select_model, paid_media_spends, scenario, - channel_constr_low, channel_constr_up, constr_mode - ) - - ## Sort media - dt_coef <- select(dt_bestCoef, .data$rn, .data$coef) - get_rn_order <- order(dt_bestCoef$rn) - dt_coefSorted <- dt_coef[get_rn_order, ] - dt_bestCoef <- dt_bestCoef[get_rn_order, ] - coefSelectorSorted <- dt_coefSorted$coef > 0 - names(coefSelectorSorted) <- dt_coefSorted$rn - - 
dt_hyppar <- select(dt_hyppar, hyper_names(InputCollect$adstock, mediaSpendSorted)) %>% + if (length(channel_constr_low) == 1) channel_constr_low <- rep(channel_constr_low, length(paid_media_selected)) + if (length(channel_constr_up) == 1) channel_constr_up <- rep(channel_constr_up, length(paid_media_selected)) + #check_allocator_constrains(channel_constr_low, channel_constr_up) + names(channel_constr_low) <- names(channel_constr_up) <- paid_media_selected + channelConstrLowSorted <- channel_constr_low[mediaSelectedSorted] + channelConstrUpSorted <- channel_constr_up[mediaSelectedSorted] + + ## get hill parameters and coefs + dt_hyppar_sorted <- OutputCollect$resultHypParam %>% + filter(.data$solID == select_model) %>% + select(c(hyper_names(InputCollect$adstock, mediaSelectedSorted), + paste0(mediaSelectedSorted, "_inflexion"), + paste0(mediaSelectedSorted, "_inflation"))) %>% select(sort(colnames(.))) - dt_bestCoef <- dt_bestCoef[dt_bestCoef$rn %in% mediaSpendSorted, ] - channelConstrLowSorted <- channel_constr_low[mediaSpendSorted] - channelConstrUpSorted <- channel_constr_up[mediaSpendSorted] - - ## Get hill parameters for each channel - hills <- get_hill_params( - InputCollect, OutputCollect, dt_hyppar, dt_coef, mediaSpendSorted, select_model - ) - alphas <- hills$alphas - inflexions <- hills$inflexions - coefs_sorted <- hills$coefs_sorted + dt_coef_sorted <- OutputCollect$xDecompAgg %>% + filter(.data$solID == select_model & .data$rn %in% mediaSelectedSorted) %>% + select("rn", "coef") %>% + arrange(.data$rn) + non_zero_coef_sorted <- dt_coef_sorted$coef > 0 + names(non_zero_coef_sorted) <- dt_coef_sorted$rn + alphas <- dt_hyppar_sorted %>% select(contains("alphas")) %>% unlist + inflexions <- dt_hyppar_sorted %>% select(contains("inflexion")) %>% unlist + inflations <- dt_hyppar_sorted %>% select(contains("inflation")) %>% unlist + coefs_sorted <- dt_coef_sorted$coef + names(coefs_sorted) <- dt_coef_sorted$rn # Spend values based on date range set window_loc <- InputCollect$rollingWindowStartWhich:InputCollect$rollingWindowEndWhich @@ -225,21 +200,21 @@ robyn_allocator <- function(robyn_object = NULL, if (date_max > max(dt_optimCost$ds)) date_max <- max(dt_optimCost$ds) histFiltered <- filter(dt_optimCost, .data$ds >= date_min & .data$ds <= date_max) - histSpendAll <- unlist(summarise_all(select(dt_optimCost, any_of(mediaSpendSorted)), sum)) + histSpendAll <- unlist(summarise_all(select(dt_optimCost, any_of(mediaSelectedSorted)), sum)) histSpendAllTotal <- sum(histSpendAll) - histSpendAllUnit <- unlist(summarise_all(select(dt_optimCost, any_of(mediaSpendSorted)), mean)) + histSpendAllUnit <- unlist(summarise_all(select(dt_optimCost, any_of(mediaSelectedSorted)), mean)) histSpendAllUnitTotal <- sum(histSpendAllUnit) histSpendAllShare <- histSpendAllUnit / histSpendAllUnitTotal - histSpendWindow <- unlist(summarise_all(select(histFiltered, any_of(mediaSpendSorted)), sum)) + histSpendWindow <- unlist(summarise_all(select(histFiltered, any_of(mediaSelectedSorted)), sum)) histSpendWindowTotal <- sum(histSpendWindow) - initSpendUnit <- histSpendWindowUnit <- unlist(summarise_all(select(histFiltered, any_of(mediaSpendSorted)), mean)) + initSpendUnit <- histSpendWindowUnit <- unlist(summarise_all(select(histFiltered, any_of(mediaSelectedSorted)), mean)) histSpendWindowUnitTotal <- sum(histSpendWindowUnit) histSpendWindowShare <- histSpendWindowUnit / histSpendWindowUnitTotal - simulation_period <- initial_mean_period <- unlist(summarise_all(select(histFiltered, any_of(mediaSpendSorted)), 
length)) - nDates <- lapply(mediaSpendSorted, function(x) histFiltered$ds) - names(nDates) <- mediaSpendSorted + simulation_period <- initial_mean_period <- unlist(summarise_all(select(histFiltered, any_of(mediaSelectedSorted)), length)) + nDates <- lapply(mediaSelectedSorted, function(x) histFiltered$ds) + names(nDates) <- mediaSelectedSorted if (!quiet) { message(sprintf( "Date Window: %s:%s (%s %ss)", @@ -267,13 +242,13 @@ robyn_allocator <- function(robyn_object = NULL, initResponseMargUnit <- NULL hist_carryover <- list() qa_carryover <- list() - for (i in seq_along(mediaSpendSorted)) { + for (i in seq_along(mediaSelectedSorted)) { resp <- robyn_response( json_file = json_file, # robyn_object = robyn_object, select_build = select_build, select_model = select_model, - metric_name = mediaSpendSorted[i], + metric_name = mediaSelectedSorted[i], # metric_value = initSpendUnit[i] * simulation_period[i], # date_range = date_range, dt_hyppar = OutputCollect$resultHypParam, @@ -299,17 +274,17 @@ robyn_allocator <- function(robyn_object = NULL, x_input <- initSpendUnit[i] resp_simulate <- fx_objective( x = x_input, - coeff = coefs_sorted[[mediaSpendSorted[i]]], - alpha = alphas[[paste0(mediaSpendSorted[i], "_alphas")]], - inflexion = inflexions[[paste0(mediaSpendSorted[i], "_gammas")]], + coeff = coefs_sorted[[mediaSelectedSorted[i]]], + alpha = alphas[[paste0(mediaSelectedSorted[i], "_alphas")]], + inflexion = inflexions[[paste0(mediaSelectedSorted[i], "_inflexion")]], x_hist_carryover = mean(hist_carryover_temp), get_sum = FALSE ) resp_simulate_plus1 <- fx_objective( x = x_input + 1, - coeff = coefs_sorted[[mediaSpendSorted[i]]], - alpha = alphas[[paste0(mediaSpendSorted[i], "_alphas")]], - inflexion = inflexions[[paste0(mediaSpendSorted[i], "_gammas")]], + coeff = coefs_sorted[[mediaSelectedSorted[i]]], + alpha = alphas[[paste0(mediaSelectedSorted[i], "_alphas")]], + inflexion = inflexions[[paste0(mediaSelectedSorted[i], "_inflexion")]], x_hist_carryover = mean(hist_carryover_temp), get_sum = FALSE ) @@ -317,7 +292,7 @@ robyn_allocator <- function(robyn_object = NULL, initResponseMargUnit <- c(initResponseMargUnit, resp_simulate_plus1 - resp_simulate) } qa_carryover <- do.call(cbind, qa_carryover) %>% as.data.frame() - names(initResponseUnit) <- names(hist_carryover) <- names(qa_carryover) <- mediaSpendSorted + names(initResponseUnit) <- names(hist_carryover) <- names(qa_carryover) <- mediaSelectedSorted # QA adstock: simulated adstock should be identical to model adstock # qa_carryover_origin <- OutputCollect$mediaVecCollect %>% # filter(.data$solID == select_model & .data$type == "adstockedMedia") %>% @@ -339,22 +314,6 @@ robyn_allocator <- function(robyn_object = NULL, 1 + (channelConstrUpSorted - 1) * channel_constr_multiplier ) - target_value_ext <- target_value - if (scenario == "target_efficiency") { - channelConstrLowSortedExt <- channelConstrLowSorted - channelConstrUpSortedExt <- channelConstrUpSorted - if (dep_var_type == "conversion") { - if (is.null(target_value)) { - target_value <- sum(initSpendUnit) / sum(initResponseUnit) * 1.2 - } - target_value_ext <- target_value * 1.5 - } else { - if (is.null(target_value)) { - target_value <- sum(initResponseUnit) / sum(initSpendUnit) * 0.8 - } - target_value_ext <- 1 - } - } temp_init <- temp_init_all <- initSpendUnit # if no spend within window as initial spend, use historical average if (length(zero_spend_channel) > 0) temp_init_all[zero_spend_channel] <- histSpendAllUnit[zero_spend_channel] @@ -371,15 +330,15 @@ robyn_allocator <- 
function(robyn_object = NULL, ## Exclude 0 coef and 0 constraint channels for the optimisation skip_these <- (channel_constr_low == 0 & channel_constr_up == 0) - zero_constraint_channel <- mediaSpendSorted[skip_these] + zero_constraint_channel <- mediaSelectedSorted[skip_these] if (any(skip_these) && !quiet) { message( "Excluded variables (constrained to 0): ", paste(zero_constraint_channel, collapse = ", ") ) } - if (!all(coefSelectorSorted) & !keep_zero_coefs) { - zero_coef_channel <- setdiff(names(coefSelectorSorted), mediaSpendSorted[coefSelectorSorted]) + if (!all(non_zero_coef_sorted) & !keep_zero_coefs) { + zero_coef_channel <- setdiff(names(non_zero_coef_sorted), mediaSelectedSorted[non_zero_coef_sorted]) if (!quiet) { message( "Excluded variables (coefficients are 0): ", @@ -389,8 +348,8 @@ robyn_allocator <- function(robyn_object = NULL, } else { zero_coef_channel <- as.character() } - channel_to_drop_loc <- mediaSpendSorted %in% c(zero_coef_channel, zero_constraint_channel) - channel_for_allocation <- mediaSpendSorted[!channel_to_drop_loc] + channel_to_drop_loc <- mediaSelectedSorted %in% c(zero_coef_channel, zero_constraint_channel) + channel_for_allocation <- mediaSelectedSorted[!channel_to_drop_loc] if (any(channel_to_drop_loc)) { temp_init <- temp_init_all[channel_for_allocation] temp_ub <- temp_ub_all[channel_for_allocation] @@ -410,16 +369,36 @@ robyn_allocator <- function(robyn_object = NULL, x0_ext <- lb_ext <- temp_init * temp_lb_ext ub_ext <- temp_init * temp_ub_ext + target_value_ext <- target_value + if (scenario == "target_efficiency") { + # channelConstrLowSortedExt <- channelConstrLowSorted + # channelConstrUpSortedExt <- channelConstrUpSorted + x0_ext <- lb_ext <- temp_init * 0.9 + if (dep_var_type == "conversion") { + if (is.null(target_value)) { + target_value <- sum(initSpendUnit) / sum(initResponseUnit) * 1.2 + } + target_value_ext <- target_value * 1.5 + } else { + if (is.null(target_value)) { + target_value <- sum(initResponseUnit) / sum(initSpendUnit) * 0.8 + } + target_value_ext <- 1 + } + } + # Gather all values that will be used internally on optim (nloptr) coefs_eval <- coefs_sorted[channel_for_allocation] alphas_eval <- alphas[paste0(channel_for_allocation, "_alphas")] - inflexions_eval <- inflexions[paste0(channel_for_allocation, "_gammas")] + inflexions_eval <- inflexions[paste0(channel_for_allocation, "_inflexion")] hist_carryover_eval <- hist_carryover[channel_for_allocation] + inflations_eval <- inflations[paste0(channel_for_allocation, "_inflation")] eval_list <- list( coefs_eval = coefs_eval, alphas_eval = alphas_eval, inflexions_eval = inflexions_eval, + inflation_eval = inflations_eval, # mediaSpendSortedFiltered = mediaSpendSorted, total_budget = total_budget, total_budget_unit = total_budget_unit, @@ -487,7 +466,7 @@ robyn_allocator <- function(robyn_object = NULL, eval_g_eq = if (constr_mode == "eq") eval_g_eq_effi else NULL, eval_g_ineq = if (constr_mode == "ineq") eval_g_eq_effi else NULL, lb = lb, - ub = x0 * channel_constr_up[1], # Large enough, but not infinite (customizable) + ub = ub, # Large enough, but not infinite (customizable) opts = list( "algorithm" = "NLOPT_LD_AUGLAG", "xtol_rel" = 1.0e-10, @@ -498,12 +477,12 @@ robyn_allocator <- function(robyn_object = NULL, ) ## unbounded optimisation nlsModUnbound <- nloptr::nloptr( - x0 = x0, + x0 = x0_ext, eval_f = eval_f, eval_g_eq = if (constr_mode == "eq") eval_g_eq_effi else NULL, eval_g_ineq = if (constr_mode == "ineq") eval_g_eq_effi else NULL, - lb = lb, - ub = x0 * 
channel_constr_up[1], # Large enough, but not infinite (customizable) + lb = lb_ext, + ub = ub_ext, # Large enough, but not infinite (customizable) opts = list( "algorithm" = "NLOPT_LD_AUGLAG", "xtol_rel" = 1.0e-10, @@ -545,7 +524,7 @@ robyn_allocator <- function(robyn_object = NULL, names(optmSpendUnit) <- names(optmResponseUnit) <- names(optmResponseMargUnit) <- names(optmSpendUnitUnbound) <- names(optmResponseUnitUnbound) <- names(optmResponseMargUnitUnbound) <- channel_for_allocation - mediaSpendSorted %in% names(optmSpendUnit) + mediaSelectedSorted %in% names(optmSpendUnit) optmSpendUnitOut <- optmResponseUnitOut <- optmResponseMargUnitOut <- optmSpendUnitUnboundOut <- optmResponseUnitUnboundOut <- optmResponseMargUnitUnboundOut <- initSpendUnit @@ -565,7 +544,7 @@ robyn_allocator <- function(robyn_object = NULL, dt_optimOut <- data.frame( solID = select_model, dep_var_type = dep_var_type, - channels = mediaSpendSorted, + channels = mediaSelectedSorted, date_min = date_min, date_max = date_max, periods = sprintf("%s %ss", initial_mean_period, InputCollect$intervalType), @@ -693,7 +672,7 @@ robyn_allocator <- function(robyn_object = NULL, x = simulate_spend, coeff = eval_list$coefs_eval[[i]], alpha = eval_list$alphas_eval[[paste0(i, "_alphas")]], - inflexion = eval_list$inflexions_eval[[paste0(i, "_gammas")]], + inflexion = eval_list$inflexions_eval[[paste0(i, "_inflexion")]], x_hist_carryover = 0, get_sum = FALSE ) @@ -701,7 +680,7 @@ robyn_allocator <- function(robyn_object = NULL, x = mean(carryover_vec), coeff = eval_list$coefs_eval[[i]], alpha = eval_list$alphas_eval[[paste0(i, "_alphas")]], - inflexion = eval_list$inflexions_eval[[paste0(i, "_gammas")]], + inflexion = eval_list$inflexions_eval[[paste0(i, "_inflexion")]], x_hist_carryover = 0, get_sum = FALSE ) @@ -862,6 +841,7 @@ eval_f <- function(X, target_value) { inflexions_eval <- eval_list[["inflexions_eval"]] # mediaSpendSortedFiltered <- eval_list[["mediaSpendSortedFiltered"]] hist_carryover_eval <- eval_list[["hist_carryover_eval"]] + inflations_eval <- eval_list[["inflations_eval"]] objective <- -sum(mapply( fx_objective, @@ -897,7 +877,7 @@ eval_f <- function(X, target_value) { return(optm) } -fx_objective <- function(x, coeff, alpha, inflexion, x_hist_carryover, get_sum = TRUE) { +fx_objective <- function(x, coeff, alpha, inflexion, x_hist_carryover, inflation = NULL, get_sum = TRUE) { # Apply Michaelis Menten model to scale spend to exposure # if (criteria) { # xScaled <- mic_men(x = x, Vmax = vmax, Km = km) # vmax * x / (km + x) @@ -909,6 +889,7 @@ fx_objective <- function(x, coeff, alpha, inflexion, x_hist_carryover, get_sum = # Adstock scales xAdstocked <- x + mean(x_hist_carryover) + # xAdstocked <- x * inflation # Hill transformation if (get_sum) { xOut <- coeff * sum((1 + inflexion**alpha / xAdstocked**alpha)**-1) @@ -919,7 +900,7 @@ fx_objective <- function(x, coeff, alpha, inflexion, x_hist_carryover, get_sum = } # https://www.derivative-calculator.net/ on the objective function 1/(1+gamma^alpha / x^alpha) -fx_gradient <- function(x, coeff, alpha, inflexion, x_hist_carryover +fx_gradient <- function(x, coeff, alpha, inflexion, x_hist_carryover, inflation = NULL # , chnName, vmax, km, criteria ) { # Apply Michaelis Menten model to scale spend to exposure @@ -933,11 +914,12 @@ fx_gradient <- function(x, coeff, alpha, inflexion, x_hist_carryover # Adstock scales xAdstocked <- x + mean(x_hist_carryover) + # xAdstocked <- x * inflation xOut <- -coeff * sum((alpha * (inflexion**alpha) * (xAdstocked**(alpha - 1))) / 
(xAdstocked**alpha + inflexion**alpha)**2) return(xOut) } -fx_objective.chanel <- function(x, coeff, alpha, inflexion, x_hist_carryover +fx_objective.chanel <- function(x, coeff, alpha, inflexion, x_hist_carryover, inflation = NULL # , chnName, vmax, km, criteria ) { # Apply Michaelis Menten model to scale spend to exposure @@ -951,6 +933,7 @@ fx_objective.chanel <- function(x, coeff, alpha, inflexion, x_hist_carryover # Adstock scales xAdstocked <- x + mean(x_hist_carryover) + # xAdstocked <- x * inflation xOut <- -coeff * sum((1 + inflexion**alpha / xAdstocked**alpha)**-1) return(xOut) } @@ -1028,30 +1011,3 @@ get_adstock_params <- function(InputCollect, dt_hyppar) { } return(getAdstockHypPar) } - -get_hill_params <- function(InputCollect, OutputCollect = NULL, dt_hyppar, dt_coef, mediaSpendSorted, select_model, chnAdstocked = NULL) { - hillHypParVec <- unlist(select(dt_hyppar, na.omit(str_extract(names(dt_hyppar), ".*_alphas|.*_gammas")))) - alphas <- hillHypParVec[paste0(mediaSpendSorted, "_alphas")] - gammas <- hillHypParVec[paste0(mediaSpendSorted, "_gammas")] - if (is.null(chnAdstocked)) { - chnAdstocked <- filter( - OutputCollect$mediaVecCollect, - .data$type == "adstockedMedia", - .data$solID == select_model - ) %>% - select(all_of(mediaSpendSorted)) %>% - slice(InputCollect$rollingWindowStartWhich:InputCollect$rollingWindowEndWhich) - } - inflexions <- unlist(lapply(seq(ncol(chnAdstocked)), function(i) { - c(range(chnAdstocked[, i]) %*% c(1 - gammas[i], gammas[i])) - })) - names(inflexions) <- names(gammas) - coefs <- dt_coef$coef - names(coefs) <- dt_coef$rn - coefs_sorted <- coefs[mediaSpendSorted] - return(list( - alphas = alphas, - inflexions = inflexions, - coefs_sorted = coefs_sorted - )) -} diff --git a/R/R/auxiliary.R b/R/R/auxiliary.R index ed320186c..95a7cd53f 100644 --- a/R/R/auxiliary.R +++ b/R/R/auxiliary.R @@ -94,3 +94,24 @@ baseline_vars <- function(InputCollect, baseline_level) { } return(x) } + +# Calculate dot product +.dot_product <- function(range, proportion) { + mapply(function(proportion) { + c(range %*% c(1 - proportion, proportion)) + }, + proportion = proportion) } + +# Calculate quantile interval +.qti <- function(x, interval = 0.95) { + check_qti(interval) + int_low <- (1 - interval)/2 + int_up <- 1 - int_low + qt_low <- quantile(x, int_low) + qt_up <- quantile(x, int_up) + return(c(qt_low, qt_up)) } + +# Calculate MSE +.mse_loss <- function(y, y_hat) mean((y - y_hat)^2) diff --git a/R/R/calibration.R b/R/R/calibration.R index f0cc4c26f..ba283de6d 100644 --- a/R/R/calibration.R +++ b/R/R/calibration.R @@ -3,15 +3,505 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -robyn_calibrate <- function(calibration_input, - df_raw, # df_raw = InputCollect$dt_mod - dayInterval, # dayInterval = InputCollect$dayInterval - xDecompVec, # xDecompVec = decompCollect$xDecompVec - coefs, # coefs = decompCollect$coefsOutCat - hypParamSam, - wind_start = 1, - wind_end = nrow(df_raw), - adstock) { +#################################################################### +#' Robyn Calibration Function - BETA +#' +#' \code{robyn_calibrate()} consumes source of truth or proxy data for +#' saturation or adstock curve estimation. This is an experimental feature and +#' can be used independently from Robyn's main model. +#' +#' @inheritParams robyn_run +#' @param df_curve data.frame. Requires two columns named "spend_cumulated" and +#' "response_cumulated", plus a "freq_bucket" column for reach & frequency data. +#' Recommended sources of truth are Halo R&F or Meta conversion lift. 
+#' @param curve_type Character. Currently only allows "saturation_reach_hill" +#' and only supports the Hill function. +#' @param force_shape Character. Allows c("c", "s") with default NULL, meaning no +#' shape forcing. Offline media are recommended to use the concave "c" shape, +#' while online media can use "s" or NULL. Shape forcing only works if hp_bounds +#' is NULL. +#' @param hp_bounds list. Currently only supports Hill for saturation. Provides +#' the ranges for the Hill parameters alpha and gamma. If NULL, hp_bounds takes +#' on default ranges. +#' @param max_trials integer. Each trial has a different starting point and +#' provides a diversified sampling path. Default to 10. +#' @param max_iters integer. The loss is minimised as iterations increase. +#' Default to 2500. +#' @param loss_min_step_rel numeric. Default to 0.0001 and value is between 0-0.1. +#' For example, 0.01 means the optimisation is considered converged if the error +#' reduction per step is <1 percent of the maximal error. +#' @param loss_stop_rel numeric. Default to 0.05 and value is between 0-0.5. +#' 0.05 means the last 5 percent of max_iters is used as the window of iterations +#' for calculating the mean error for convergence. +#' @param burn_in_rel numeric. Default to 0.1 and value is between 0-0.5. 0.1 +#' means the first 10 percent of iterations is used as the burn-in period. +#' @param sim_n integer. Number of simulations for plotting the fitted curve. +#' Default to 30. +#' @param hp_interval numeric. Default to 0.5 and value is between 0.5-1. For +#' example, 0.95 means the 2.5-97.5 percentiles are used as the parameter range +#' for the output. +#' @examples +#' \dontrun{ +#' # Dummy input data for Meta spend. This is derived from Halo's reach & frequency data. +#' # Note that spend and response need to be cumulative metrics. +#' data("df_curve_reach_freq") +#' +#' # Using reach saturation from Halo as proxy +#' curve_out <- robyn_calibrate( +#' df_curve = df_curve_reach_freq, +#' curve_type = "saturation_reach_hill" +#' ) +#' # For the simulated reach and frequency dataset, it's recommended to use +#' # "reach 1+" for gamma lower bound and "reach 10+" for gamma upper bound +#' facebook_I_gammas <- c( +#' curve_out[["curve_collect"]][["reach 1+"]][["hill"]][["gamma_best"]], +#' curve_out[["curve_collect"]][["reach 10+"]][["hill"]][["gamma_best"]]) +#' print(facebook_I_gammas) +#' } +#' @return List. Class: \code{curve_out}. Contains the results of all trials +#' and iterations modeled. +#' @export +robyn_calibrate <- function( + df_curve = NULL, + curve_type = NULL, + force_shape = NULL, + hp_bounds = NULL, + max_trials = 10, + max_iters = 2500, + loss_min_step_rel = 0.0001, + loss_stop_rel = 0.05, + burn_in_rel = 0.1, + sim_n = 30, + hp_interval = 0.5, + quiet = FALSE, + ...) 
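# ---- [Editor's illustrative sketch; not part of the diff] ----
# What robyn_calibrate() fits below is a Hill saturation of the form
# response = coef * x^alpha / (x^alpha + inflexion^alpha), where the inflexion
# point is assumed to be gamma * max(cumulative spend) (cf. `inflexion_max` in
# the returned curve_out). A minimal hand-built input in the expected format,
# with made-up "ground truth" parameters the optimiser should roughly recover:
true_alpha <- 0.8; true_gamma <- 0.6; true_coef <- 9000
spend <- seq(0, 50000, length.out = 12)
inflexion <- true_gamma * max(spend)
df_curve_demo <- data.frame(
  spend_cumulated = spend,
  response_cumulated = true_coef * spend^true_alpha / (spend^true_alpha + inflexion^true_alpha),
  freq_bucket = "reach 1+" # single bucket, as required by the "saturation_reach_hill" branch
)
# Requires the Python 'nevergrad' module via reticulate:
# curve_out <- robyn_calibrate(df_curve = df_curve_demo, curve_type = "saturation_reach_hill")
# ---- [End of sketch] ----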
{ + ## check all inputs + # df_curve df format + # curve types + # hp_bounds format + # hp_interval + + if (curve_type == "saturation_reach_hill") { + curve_collect <- list() + for (i in unique(df_curve$freq_bucket)) { + message(">>> Fitting ", i) + df_curve_i <- df_curve %>% filter(.data$freq_bucket == i) + curve_collect[[i]] <- robyn_calibrate_single_dim( + df_curve = df_curve_i, + curve_type, + force_shape = "c", # assumption: reach saturation is always concave + hp_bounds, + max_trials, + max_iters, + loss_min_step_rel, + loss_stop_rel, + burn_in_rel, + sim_n, + hp_interval, + quiet) + } + + df_curve_plot <- bind_rows(lapply(curve_collect, function(x) x$df_out)) + + p_rnf <- ggplot(df_curve_plot, aes(x = .data$spend_cumulated)) + + geom_point(aes(y = .data$response_cumulated, colour = .data$freq_bucket)) + + geom_line(aes(y = .data$response_pred, colour = .data$freq_bucket), alpha = 0.5) + + labs( + title = "Cumulative reach & frequency saturation fitting", + subtitle = "The dots are input R&F data. The lines are fitted curves.", + x = "cumulative spend", + y = "cumulative reach" + ) + + #theme_lares(background = "white")+ + #scale_alpha_discrete(range = c(1, 0.2)) + scale_colour_discrete(h =c(120,260)) + + return(list(curve_collect = curve_collect, + plot_reach_freq = p_rnf)) + } else { + curve_collect <- robyn_calibrate_single_dim( + df_curve, + curve_type, + force_shape, + hp_bounds, + max_trials, + max_iters, + loss_min_step_rel, + loss_stop_rel, + burn_in_rel, + sim_n, + hp_interval, + quiet) + return(list(curve_collect = curve_collect)) + } +} + + +robyn_calibrate_single_dim <- function( + df_curve, + curve_type, + force_shape, + hp_bounds, + max_trials, + max_iters, + loss_min_step_rel, + loss_stop_rel, + burn_in_rel, + sim_n, + hp_interval, + quiet, + ...) 
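# ---- [Editor's illustrative sketch; not part of the diff] ----
# The function body below drives Nevergrad through reticulate in an ask/tell
# loop. A minimal standalone version of that pattern on a toy quadratic loss
# (assumes the Python 'nevergrad' module is installed):
ng_demo <- reticulate::import("nevergrad")
param <- ng_demo$p$Array(shape = reticulate::tuple(3L), lower = 0, upper = 1)
opt <- ng_demo$optimizers$registry["TwoPointsDE"](param, budget = 200L)
target <- c(0.2, 0.5, 0.8)
for (k in seq_len(200)) {
  cand <- opt$ask()                    # optimizer proposes a candidate in [0, 1]^3
  loss <- sum((cand$value - target)^2) # toy loss; the real code uses the RMSE of the Hill fit
  opt$tell(cand, loss)                 # report the loss back to the optimizer
}
opt$provide_recommendation()$value     # should approach `target`
# ---- [End of sketch] ----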
{ + spend_cum_sot <- df_curve[["spend_cumulated"]] + response_cum_sot <- df_curve[["response_cumulated"]] + # amend 0 if not available + if (!any(spend_cum_sot == 0)) { + spend_cum_sot <- c(0, spend_cum_sot) + response_cum_sot <- c(0, response_cum_sot) + } + + ## get hyperparameter bounds + if (is.null(hp_bounds)) { + hp_bounds <- list(hill = list(alpha = c(0, 10), gamma = c(0, 1)), coef = c(0, max(response_cum_sot) / 0.01)) + hp_bounds_loop <- hp_bounds[["hill"]] + hp_bounds_loop[["coef"]] <- hp_bounds[["coef"]] + if (!is.null(force_shape) && force_shape == "s") { + hp_bounds_loop[["alpha"]] <- c(1, 10) + } else if (!is.null(force_shape) && force_shape == "c") { + hp_bounds_loop[["alpha"]] <- c(0, 1) + } + } else { + hp_bounds_loop <- hp_bounds[["hill"]] + hp_bounds_loop[["coef"]] <- hp_bounds[["coef"]] + } + + ## initiate Nevergrad + if (reticulate::py_module_available("nevergrad")) { + ng <- reticulate::import("nevergrad", delay_load = TRUE) + } + + ## trial loop + ng_hp <- list() + loss_collect <- c() + pred_collect <- list() + loss_stop_abs <- round(max_iters * loss_stop_rel) + max_iters_vec <- rep(max_iters, max_trials) + + for (j in seq(max_trials)) { + my_tuple <- reticulate::tuple(as.integer(3)) + instrumentation <- ng$p$Array(shape = my_tuple, lower = 0, upper = 1) + optimizer <- ng$optimizers$registry["TwoPointsDE"](instrumentation, budget = max_iters) + + ## inner while loop that stops when converged + ng_hp_i <- list() + loss_collect_i <- c() + pred_collect_i <- list() + if (!quiet) pb_cf <- txtProgressBar(min = 0, max = max_iters_vec[j], style = 3) + loop_continue <- TRUE + i <- 0 + + while (loop_continue) { + i <- i + 1 + if (!quiet) setTxtProgressBar(pb_cf, i) + + ## Nevergrad ask sample + ng_hp_i[[i]] <- optimizer$ask() + ng_hp_val <- ng_hp_i[[i]]$value + ng_hp_val_scaled <- mapply( + function(hpb, hp) { + qunif(hp, min = min(hpb), max = max(hpb)) + }, + hpb = hp_bounds_loop, + hp = ng_hp_val + ) + alpha <- ng_hp_val_scaled["alpha"] + gamma <- ng_hp_val_scaled["gamma"] + coeff <- ng_hp_val_scaled["coef"] + + ## predict saturation vector + total_cum_spend <- max(spend_cum_sot) + response_pred <- coeff * saturation_hill(x = total_cum_spend, alpha, gamma, x_marginal = spend_cum_sot)[["x_saturated"]] + # response_sot_scaled <- .min_max_norm(response_cum_sot) + + ## get loss + loss_iter <- sqrt(.mse_loss(y = response_cum_sot, y_hat = response_pred)) + max_loss <- ifelse(i == 1, loss_iter, max(max_loss, loss_iter)) + loss_min_step_abs <- max_loss * loss_min_step_rel + + ## collect loop results + pred_collect_i[[i]] <- response_pred + loss_collect_i[i] <- loss_iter + + ## Nevergrad tell loss + optimizer$tell(ng_hp_i[[i]], reticulate::tuple(loss_iter)) + + ## Loop config & prompting + if ((i >= (loss_stop_abs * 2))) { + if ((i == max_iters_vec[j])) { + loop_continue <- FALSE + if (!quiet) { + close(pb_cf) + message(paste0( + "Trial ", j, " didn't converge after ", i, + " iterations. Increase iterations or adjust convergence criteria." 
+ )) + } + } else { + current_unit <- (i - loss_stop_abs + 1):i + previous_unit <- current_unit - loss_stop_abs + loss_unit_change <- (mean(loss_collect_i[current_unit]) - mean(loss_collect_i[previous_unit])) + loop_continue <- !all(loss_unit_change > 0, loss_unit_change <= loss_min_step_abs) + + if (loop_continue == FALSE) { + if (!quiet) { + close(pb_cf) + message(paste0( + "Trial ", j, " converged & stopped at iteration ", i, + " from ", max_iters_vec[j] + )) + } + max_iters_vec[j] <- i + } + } + } + } + ng_hp[[j]] <- ng_hp_i + loss_collect[[j]] <- loss_collect_i + pred_collect[[j]] <- pred_collect_i + if (!quiet) close(pb_cf) + } + + ## collect loop output + best_loss_iters <- mapply(function(x) which.min(x), x = loss_collect) + best_loss_vals <- mapply(function(x) min(x), x = loss_collect) + best_loss_trial <- which.min(best_loss_vals) + best_loss_iter <- best_loss_iters[best_loss_trial] + best_loss_val <- best_loss_vals[best_loss_trial] + best_hp <- ng_hp[[best_loss_trial]][[best_loss_iter]]$value + best_pred_response <- pred_collect[[best_loss_trial]][[best_loss_iter]] + + ## saturation hill + hp_alpha <- hp_bounds_loop[["alpha"]] + hp_gamma <- hp_bounds_loop[["gamma"]] + hp_coef <- hp_bounds_loop[["coef"]] + best_alpha <- qunif(best_hp[1], min = min(hp_alpha), max = max(hp_alpha)) + best_gamma <- qunif(best_hp[2], min = min(hp_gamma), max = max(hp_gamma)) + best_coef <- qunif(best_hp[3], min = min(hp_coef), max = max(hp_coef)) + # best_response_pred <- saturation_hill(spend_cum_sot, best_alpha, best_gamma)[["x_saturated"]] + # best_inflexion <- saturation_hill(spend_cum_sot, best_alpha, best_gamma)[["inflexion"]] + alpha_collect <- lapply(ng_hp, FUN = function(x) { + sapply(x, FUN = function(y) qunif(y$value[1], min = min(hp_alpha), max = max(hp_alpha))) + }) + gamma_collect <- lapply(ng_hp, FUN = function(x) { + sapply(x, FUN = function(y) qunif(y$value[2], min = min(hp_gamma), max = max(hp_gamma))) + }) + coef_collect <- lapply(ng_hp, FUN = function(x) { + sapply(x, FUN = function(y) qunif(y$value[3], min = min(hp_coef), max = max(hp_coef))) + }) + + ## slice by convergence + burn_in_abs <- rep(max_iters * burn_in_rel, max_trials) + alpha_collect_converged <- unlist(mapply( + function(x, start, end) x[start:end], + x = alpha_collect, start = burn_in_abs, + end = max_iters_vec, SIMPLIFY = FALSE + )) + gamma_collect_converged <- unlist(mapply( + function(x, start, end) x[start:end], + x = gamma_collect, start = burn_in_abs, + end = max_iters_vec, SIMPLIFY = FALSE + )) + coef_collect_converged <- unlist(mapply( + function(x, start, end) x[start:end], + x = coef_collect, start = burn_in_abs, + end = max_iters_vec, SIMPLIFY = FALSE + )) + + ## get calibration range for hyparameters + p_alpha <- data.frame(alpha = alpha_collect_converged) %>% + ggplot(aes(x = alpha)) + geom_density(fill = "grey99", color = "grey") + alpha_den <- .den_interval(p_alpha, hp_interval, best_alpha) + + p_gamma <- data.frame(gamma = gamma_collect_converged) %>% ggplot(aes(x = gamma)) + + geom_density(fill = "grey99", color = "grey") + gamma_den <- .den_interval(p_gamma, hp_interval, best_gamma) + + p_coef <- data.frame(coef = coef_collect_converged) %>% ggplot(aes(x = coef)) + + geom_density(fill = "grey99", color = "grey") + coef_den <- .den_interval(p_coef, hp_interval, best_coef) + + # qt_alpha_out <- .qti(x = alpha_collect_converged, interval = hp_interval) + # qt_gamma_out <- .qti(x = gamma_collect_converged, interval = hp_interval) + # qt_coef_out <- .qti(x = coef_collect_converged, interval = 
hp_interval) + + ## plotting & prompting + #coef_response <- max(response_cum_sot) / max(response_sot_scaled) + df_sot_plot <- data.frame( + spend = spend_cum_sot, + response = response_cum_sot, + response_pred = best_pred_response) + temp_spend <- seq(0, max(spend_cum_sot), length.out = sim_n) + temp_sat <- best_coef * saturation_hill(x = total_cum_spend, alpha = best_alpha, gamma = best_gamma, x_marginal = temp_spend)[["x_saturated"]] + df_pred_sim_plot <- data.frame(spend = temp_spend, response = temp_sat) + + sim_alphas <- alpha_collect_converged[ + alpha_collect_converged > alpha_den$interval[1] & + alpha_collect_converged < alpha_den$interval[2] + ] + sim_alphas <- sample(sim_alphas, sim_n, replace = TRUE) + sim_gammas <- gamma_collect_converged[ + gamma_collect_converged > gamma_den$interval[1] & + gamma_collect_converged < gamma_den$interval[2] + ] + sim_gammas <- sample(sim_gammas, sim_n, replace = TRUE) + + # simulation for plotting + sim_collect <- list() + for (i in 1:sim_n) { + sim_collect[[i]] <- best_coef * saturation_hill(x = total_cum_spend, alpha = sim_alphas[i], gamma = sim_gammas[i], x_marginal = temp_spend)[["x_saturated"]] + } + sim_collect <- data.frame( + sim = as.character(c(sapply(1:sim_n, function(x) rep(x, length(temp_spend))))), + sim_spend = rep(temp_spend, sim_n), + sim_saturation = unlist(sim_collect) + ) + + y_lab <- "response proxy" + p_lines <- ggplot() + + geom_line( + data = sim_collect, + aes( + x = .data$sim_spend, y = .data$sim_saturation, + color = .data$sim + ), size = 2, alpha = 0.2 + ) + + scale_colour_grey() + + geom_point( + data = df_sot_plot, + aes(x = .data$spend, y = .data$response) + ) + + geom_line( + data = df_pred_sim_plot, + aes(x = .data$spend, y = .data$response), color = "blue" + ) + + labs(title = paste0("Spend to ", y_lab, " saturation curve estimation")) + + ylab(y_lab) + + xlab("Spend") + + theme_lares(legend = "none", ...) + + df_mse <- data.frame( + mse = unlist(loss_collect), + iterations = unlist(mapply(function(x) seq(x), max_iters_vec, SIMPLIFY = FALSE)), + trials = as.character(unlist( + mapply(function(x, y) rep(x, y), + x = 1:max_trials, y = max_iters_vec + ) + )) + ) + p_mse <- df_mse %>% + mutate(trials = factor(.data$trials, levels = seq(max_trials))) %>% + ggplot(aes(x = .data$iterations, y = .data$mse)) + + geom_line(size = 0.2) + + facet_grid(.data$trials ~ .) + + labs( + title = paste0( + "Loss convergence with error reduction of ", + round((1 - best_loss_val / max_loss), 4) * 100, "%" + ), + x = "Iterations", y = "MSE" + ) + + theme_lares(grid = "Xx", ...) + + scale_x_abbr() + + theme( + axis.text.y = element_blank(), + axis.ticks.y = element_blank() + ) + + p_alpha <- p_alpha + + labs( + title = paste0("Alpha (Hill) density after ", round(burn_in_rel * 100), "% burn-in"), + subtitle = paste0(round(hp_interval*100), "% center density: ", round(alpha_den$interval[1], 4), "-", round(alpha_den$interval[2], 4), + "\nBest alpha: ", round(best_alpha,4)) + ) + + theme_lares(...) + + scale_y_abbr() + p_alpha <- geom_density_ci(p_alpha, alpha_den$interval[1], alpha_den$interval[2], fill = "lightblue") + + p_gamma <- p_gamma + + labs( + title = paste0("Gamma (Hill) density after ", round(burn_in_rel * 100), "% burn-in"), + subtitle = paste0(round(hp_interval*100), "% center density: ", round(gamma_den$interval[1], 4), "-", round(gamma_den$interval[2], 4), + "\nBest gamma: ", round(best_gamma,4)) + ) + + theme_lares(...) 
+ + scale_y_abbr() + p_gamma <- geom_density_ci(p_gamma, gamma_den$interval[1], gamma_den$interval[2], fill = "lightblue") + + # p_coef <- p_coef + + # labs( + # title = paste0("Beta coefficient density after ", round(burn_in_rel * 100), "% burn-in"), + # subtitle = paste0(round(hp_interval*100), "% center density: ", round(exp(coef_den$interval[1])), "-", round(exp(coef_den$interval[2]))), + # x = "log(coef)" + # ) + + # theme_lares(...) + + # scale_y_abbr() + # p_coef <- geom_density_ci(p_coef, coef_den$interval[1], coef_den$interval[2], fill = "lightblue") + + if (!quiet) { + message( + paste0( + "\nBest alpha: ", round(best_alpha, 4), " (", + paste0(round(alpha_den$interval, 4), collapse = "-"), ")", + ", Best gamma: ", round(best_gamma, 4), " (", + paste0(round(gamma_den$interval, 4), collapse = "-"), ")", + ", Best coef: ", round(best_coef), " (", + paste0(round(coef_den$interval), collapse = "-"), ")", + ", Total spend: ", max(spend_cum_sot), ", Best loss: ", + round(best_loss_val, 4), "\n" + ) + ) + } + + curve_out <- list( + hill = list(alpha_range = c(alpha_den$interval), + alpha_best = best_alpha, + gamma_range = c(gamma_den$interval), + gamma_best = best_gamma, + coef_range = c(coef_den$interval), + coef_best = best_coef, + inflexion_max = total_cum_spend), + plot = p_lines / p_mse / (p_alpha + p_gamma) + + plot_annotation( + theme = theme_lares(background = "white", ...) + ), + df_out = df_curve %>% + mutate(response_pred = best_pred_response), + df_out_sim = df_pred_sim_plot %>% + mutate(response_pred = .data$response) + ) + return(curve_out) +} + + +.den_interval <- function(plot_object, hp_interval, best_val) { + get_den <- ggplot_build(plot_object)$data[[1]] + # mode_loc <- which.max(get_den$y) + mode_loc <- which.min(abs(get_den$x - best_val)) + mode_wing <- sum(get_den$y) * hp_interval /2 + int_left <- mode_loc - which.min(abs(cumsum(get_den$y[mode_loc:1]) - mode_wing)) + 1 + int_left <- ifelse(is.na(int_left) | int_left < 1, 1, int_left) + int_right <- mode_loc + which.min(abs(cumsum(get_den$y[(mode_loc+1):length(get_den$y)]) - mode_wing)) + int_right <- ifelse(length(int_right) == 0 , length(get_den$y), int_right) + return(list(interval = c(get_den$x[int_left], get_den$x[int_right]), + mode = get_den$x[mode_loc])) +} + + +lift_calibration <- function( + calibration_input, + df_raw, # df_raw = InputCollect$dt_mod + dayInterval, # dayInterval = InputCollect$dayInterval + xDecompVec, # xDecompVec = decompCollect$xDecompVec + coefs, # coefs = decompCollect$coefsOutCat + hypParamSam, + wind_start = 1, + wind_end = nrow(df_raw), + adstock) { ds_wind <- df_raw$ds[wind_start:wind_end] include_study <- any( calibration_input$liftStartDate >= min(ds_wind) & @@ -24,7 +514,7 @@ robyn_calibrate <- function(calibration_input, calibration_input, pred = NA, pred_total = NA, decompStart = NA, decompEnd = NA ) - split_channels <- strsplit(calibration_input$channel, split = "\\+") + split_channels <- strsplit(calibration_input$channel_selected, split = "\\+") for (l_study in seq_along(split_channels)) { get_channels <- split_channels[[l_study]] @@ -56,24 +546,27 @@ robyn_calibrate <- function(calibration_input, scale <- hypParamSam[paste0(get_channels[l_chn], "_scales")][[1]][[1]] } x_list <- transform_adstock(m, adstock, theta = theta, shape = shape, scale = scale) + x_list_cal <- transform_adstock(m[calib_pos], adstock, theta = theta, shape = shape, scale = scale) if (adstock == "weibull_pdf") { m_imme <- x_list$x_imme + m_imme_cal <- x_list_cal$x_imme } else { m_imme <- m + m_imme_cal 
<- m[calib_pos] } m_total <- x_list$x_decayed - m_caov <- m_total - m_imme + # m_caov <- m_total - m_imme + m_total_cal <- x_list_cal$x_decayed ## 2. Saturation - m_caov_calib <- m_caov[calib_pos] m_total_rw <- m_total[wind_start:wind_end] alpha <- hypParamSam[paste0(get_channels[l_chn], "_alphas")][[1]][[1]] gamma <- hypParamSam[paste0(get_channels[l_chn], "_gammas")][[1]][[1]] m_calib_caov_sat <- saturation_hill( m_total_rw, - alpha = alpha, gamma = gamma, x_marginal = m_caov_calib + alpha = alpha, gamma = gamma, x_marginal = m_total[calib_pos] - m_total_cal ) - m_calib_caov_decomp <- m_calib_caov_sat * coefs$s0[coefs$rn == get_channels[l_chn]] + m_calib_caov_decomp <- m_calib_caov_sat$x_saturated * coefs$s0[coefs$rn == get_channels[l_chn]] m_calib_total_decomp <- xDecompVec[calib_pos_rw, get_channels[l_chn]] m_calib_decomp <- m_calib_total_decomp - m_calib_caov_decomp } @@ -118,7 +611,7 @@ robyn_calibrate <- function(calibration_input, decompAbsTotalScaled = .data$pred_total / .data$decompDays * .data$liftDays ) %>% mutate( - liftMedia = .data$channel, + liftMedia = .data$channel_selected, liftStart = .data$liftStartDate, liftEnd = .data$liftEndDate, mape_lift = abs((.data$decompAbsScaled - .data$liftAbs) / .data$liftAbs), diff --git a/R/R/checks.R b/R/R/checks.R index 6b092e985..48c412bf0 100644 --- a/R/R/checks.R +++ b/R/R/checks.R @@ -57,39 +57,26 @@ check_allneg <- function(df) { return(df) } -check_varnames <- function(dt_input, dt_holidays, - dep_var, date_var, - context_vars, paid_media_spends, - organic_vars) { +check_varnames <- function(dt_input, dt_holidays) { dfs <- list(dt_input = dt_input, dt_holidays = dt_holidays) for (i in seq_along(dfs)) { # Which names to check by data.frame table_name <- names(dfs[i]) - if (table_name == "dt_input") { - vars <- c( - dep_var, date_var, context_vars, - paid_media_spends, organic_vars, "auto" - ) - } - if (table_name == "dt_holidays") { - vars <- c("ds", "country") # holiday? - } - df <- dfs[[i]] - vars <- vars[vars != "auto"] + temp_vars <- names(dfs[[i]]) # Duplicate names - if (length(vars) != length(unique(vars))) { - these <- names(table(vars)[table(vars) > 1]) + if (length(temp_vars) != length(unique(temp_vars))) { + these <- names(table(temp_vars)[table(temp_vars) > 1]) stop(paste( "You have duplicated variable names for", table_name, "in different parameters.", "Check:", paste(these, collapse = ", ") )) } # Names with spaces - with_space <- grepl(" ", vars) + with_space <- grepl(" ", temp_vars) if (sum(with_space) > 0) { stop(paste( "You have invalid variable names on", table_name, "with spaces.\n ", - "Please fix columns:", v2t(vars[with_space]) + "Please fix columns:", v2t(temp_vars[with_space]) )) } } @@ -297,10 +284,13 @@ check_paidmedia <- function(dt_input, paid_media_vars, paid_media_signs, paid_me " contains negative values. 
Media must be >=0" ) } + exposure_selector <- paid_media_spends != paid_media_vars + paid_media_selected <- ifelse(exposure_selector, paid_media_vars, paid_media_spends) return(invisible(list( paid_media_signs = paid_media_signs, - expVarCount = expVarCount, - paid_media_vars = paid_media_vars + paid_media_vars = paid_media_vars, + exposure_selector = exposure_selector, + paid_media_selected = paid_media_selected ))) } @@ -337,21 +327,20 @@ check_organicvars <- function(dt_input, organic_vars, organic_signs) { check_factorvars <- function(dt_input, factor_vars = NULL, context_vars = NULL) { check_vector(factor_vars) check_vector(context_vars) + if (!is.null(factor_vars)) { + if (!all(factor_vars %in% context_vars)) { + stop("Input 'factor_vars' must be any from 'context_vars' inputs") + } + } temp <- select(dt_input, all_of(context_vars)) - are_not_numeric <- !sapply(temp, is.numeric) - if (any(are_not_numeric)) { - these <- are_not_numeric[!names(are_not_numeric) %in% factor_vars] - these <- these[these] + undefined_factor <- !sapply(temp, function(x) is.integer(x) | is.numeric(x)) & !(names(temp) %in% factor_vars) + if (any(undefined_factor)) { + these <- temp[undefined_factor] if (length(these) > 0) { message("Automatically set these variables as 'factor_vars': ", v2t(names(these))) factor_vars <- c(factor_vars, names(these)) } } - if (!is.null(factor_vars)) { - if (!all(factor_vars %in% context_vars)) { - stop("Input 'factor_vars' must be any from 'context_vars' inputs") - } - } return(factor_vars) } @@ -468,10 +457,10 @@ check_adstock <- function(adstock) { return(adstock) } -check_hyperparameters <- function(hyperparameters = NULL, adstock = NULL, - paid_media_spends = NULL, organic_vars = NULL, - exposure_vars = NULL, prophet_vars = NULL, - contextual_vars = NULL) { +check_hyperparameters <- function( + hyperparameters = NULL, adstock = NULL, paid_media_selected = NULL, + paid_media_spends = NULL, organic_vars = NULL, exposure_vars = NULL, + prophet_vars = NULL, contextual_vars = NULL) { if (is.null(hyperparameters)) { message(paste( "Input 'hyperparameters' not provided yet. 
To include them, run", @@ -488,22 +477,23 @@ check_hyperparameters <- function(hyperparameters = NULL, adstock = NULL, hyperparameters_ordered <- hyperparameters[order(names(hyperparameters))] get_hyp_names <- names(hyperparameters_ordered) original_order <- sapply(names(hyperparameters), function(x) which(x == get_hyp_names)) + ref_hyp_name_selected <- hyper_names(adstock, all_media = paid_media_selected) ref_hyp_name_spend <- hyper_names(adstock, all_media = paid_media_spends) ref_hyp_name_expo <- hyper_names(adstock, all_media = exposure_vars) ref_hyp_name_org <- hyper_names(adstock, all_media = organic_vars) ref_hyp_name_other <- get_hyp_names[get_hyp_names %in% HYPS_OTHERS] # Excluding lambda (first HYPS_OTHERS) given its range is not customizable - ref_all_media <- sort(c(ref_hyp_name_spend, ref_hyp_name_org, HYPS_OTHERS)) - all_ref_names <- c(ref_hyp_name_spend, ref_hyp_name_expo, ref_hyp_name_org, HYPS_OTHERS) - all_ref_names <- all_ref_names[order(all_ref_names)] - # Adding penalty variations to the dictionary - if (any(grepl("_penalty", paste0(get_hyp_names)))) { - ref_hyp_name_penalties <- paste0( - c(paid_media_spends, organic_vars, prophet_vars, contextual_vars), "_penalty" - ) - all_ref_names <- c(all_ref_names, ref_hyp_name_penalties) - } else { - ref_hyp_name_penalties <- NULL + ref_all_media <- sort(c(ref_hyp_name_selected, ref_hyp_name_org, HYPS_OTHERS)) + all_ref_names <- sort(c(ref_hyp_name_selected, ref_hyp_name_spend, ref_hyp_name_org, HYPS_OTHERS)) + + if (!all(get_hyp_names %in% ref_all_media)) { + diff_hyp_loc <- which(!(get_hyp_names %in% ref_all_media)) + diff_hyp_names <- get_hyp_names[diff_hyp_loc] + if (all(diff_hyp_names %in% ref_hyp_name_spend)) { + updated_hyp_names <- ref_hyp_name_selected[which(diff_hyp_names %in% ref_hyp_name_spend)] + get_hyp_names[diff_hyp_loc] <- updated_hyp_names + names(hyperparameters_ordered) <- get_hyp_names + } } if (!all(get_hyp_names %in% all_ref_names)) { wrong_hyp_names <- get_hyp_names[which(!(get_hyp_names %in% all_ref_names))] @@ -512,8 +502,17 @@ check_hyperparameters <- function(hyperparameters = NULL, adstock = NULL, paste(wrong_hyp_names, collapse = ", ") ) } + # Adding penalty variations to the dictionary + if (any(grepl("_penalty", paste0(get_hyp_names)))) { + ref_hyp_name_penalties <- paste0( + c(paid_media_selected, organic_vars, prophet_vars, contextual_vars), "_penalty" + ) + all_ref_names <- c(all_ref_names, ref_hyp_name_penalties) + } else { + ref_hyp_name_penalties <- NULL + } total <- length(get_hyp_names) - total_in <- length(c(ref_hyp_name_spend, ref_hyp_name_org, ref_hyp_name_penalties, ref_hyp_name_other)) + total_in <- length(c(ref_hyp_name_selected, ref_hyp_name_org, ref_hyp_name_penalties, ref_hyp_name_other)) if (total != total_in) { stop(sprintf( paste( @@ -523,12 +522,6 @@ check_hyperparameters <- function(hyperparameters = NULL, adstock = NULL, total_in, total )) } - # Old workflow: replace exposure with spend hyperparameters - if (any(get_hyp_names %in% ref_hyp_name_expo)) { - get_expo_pos <- which(get_hyp_names %in% ref_hyp_name_expo) - get_hyp_names[get_expo_pos] <- ref_all_media[!ref_all_media %in% HYPS_OTHERS][get_expo_pos] - names(hyperparameters_ordered) <- get_hyp_names - } check_hyper_limits(hyperparameters_ordered, "thetas") check_hyper_limits(hyperparameters_ordered, "alphas") check_hyper_limits(hyperparameters_ordered, "gammas") @@ -579,7 +572,7 @@ check_hyper_limits <- function(hyperparameters, hyper) { } check_calibration <- function(dt_input, date_var, calibration_input, 
dayInterval, dep_var, - window_start, window_end, paid_media_spends, organic_vars) { + window_start, window_end, paid_media_spends, organic_vars, paid_media_selected) { if (!is.null(calibration_input)) { calibration_input <- as_tibble(as.data.frame(calibration_input)) these <- c("channel", "liftStartDate", "liftEndDate", "liftAbs", "spend", "confidence", "metric", "calibration_scope") @@ -591,6 +584,10 @@ check_calibration <- function(dt_input, date_var, calibration_input, dayInterval } all_media <- c(paid_media_spends, organic_vars) cal_media <- str_split(calibration_input$channel, "\\+|,|;|\\s") + cal_media_selected <- lapply(cal_media, function(x) sapply(x, function(y) { + ifelse(y %in% c(paid_media_selected, organic_vars), y, paid_media_selected[paid_media_spends == y]) + })) + calibration_input$channel_selected <- sapply(cal_media_selected, function(x) paste0(x, collapse = "+")) if (!all(unlist(cal_media) %in% all_media)) { these <- unique(unlist(cal_media)[which(!unlist(cal_media) %in% all_media)]) stop(sprintf( @@ -845,23 +842,7 @@ check_class <- function(x, object) { if (any(!x %in% class(object))) stop(sprintf("Input object must be class %s", x)) } -check_allocator_constrains <- function(low, upr) { - if (all(is.na(low)) || all(is.na(upr))) { - stop("You must define lower (channel_constr_low) and upper (channel_constr_up) constraints") - } - max_length <- max(c(length(low), length(upr))) - if (any(low < 0)) { - stop("Inputs 'channel_constr_low' must be >= 0") - } - if (length(upr) != length(low)) { - stop("Inputs 'channel_constr_up' and 'channel_constr_low' must have the same length or length 1") - } - if (any(upr < low)) { - stop("Inputs 'channel_constr_up' must be >= 'channel_constr_low'") - } -} - -check_allocator <- function(OutputCollect, select_model, paid_media_spends, scenario, +check_allocator <- function(OutputCollect, select_model, paid_media_selected, scenario, channel_constr_low, channel_constr_up, constr_mode) { if (!(select_model %in% OutputCollect$allSolutions)) { stop( @@ -869,51 +850,55 @@ check_allocator <- function(OutputCollect, select_model, paid_media_spends, scen paste(OutputCollect$allSolutions, collapse = ", ") ) } - if ("max_historical_response" %in% scenario) scenario <- "max_response" + if (length(paid_media_selected) <= 1) { + stop("Must have a valid model with at least two 'paid_media_selected'") + } opts <- c("max_response", "target_efficiency") # Deprecated: max_response_expected_spend if (!(scenario %in% opts)) { stop("Input 'scenario' must be one of: ", paste(opts, collapse = ", ")) } - check_allocator_constrains(channel_constr_low, channel_constr_up) - if (!(scenario == "target_efficiency" & is.null(channel_constr_low) & is.null(channel_constr_up))) { - if (length(channel_constr_low) != 1 && length(channel_constr_low) != length(paid_media_spends)) { - stop(paste( - "Input 'channel_constr_low' have to contain either only 1", - "value or have same length as 'InputCollect$paid_media_spends':", length(paid_media_spends) - )) + if ((is.null(channel_constr_low) & !is.null(channel_constr_up)) | + (!is.null(channel_constr_low) & is.null(channel_constr_up))) { + stop("channel_constr_low and channel_constr_up must be both provided or both NULL") + } else if (!is.null(channel_constr_low) & !is.null(channel_constr_up)) { + if (any(channel_constr_low < 0)) { + stop("Inputs 'channel_constr_low' must be >= 0") } - if (length(channel_constr_up) != 1 && length(channel_constr_up) != length(paid_media_spends)) { - stop(paste( - "Input 'channel_constr_up' have to 
contain either only 1", - "value or have same length as 'InputCollect$paid_media_spends':", length(paid_media_spends) - )) + if ((length(channel_constr_low) != 1 && length(channel_constr_low) != length(paid_media_selected)) | + (length(channel_constr_up) != 1 && length(channel_constr_up) != length(paid_media_selected))) { + stop("'channel_constr_low' and 'channel_constr_up' require either only 1 value or the same length as 'paid_media_selected'") + } + if (any(channel_constr_up < channel_constr_low)) { + stop("Inputs 'channel_constr_up' must be >= 'channel_constr_low'") } } opts <- c("eq", "ineq") if (!(constr_mode %in% opts)) { stop("Input 'constr_mode' must be one of: ", paste(opts, collapse = ", ")) } - return(scenario) } -check_metric_type <- function(metric_name, paid_media_spends, paid_media_vars, exposure_vars, organic_vars) { - if (metric_name %in% paid_media_spends && length(metric_name) == 1) { - metric_type <- "spend" - } else if (metric_name %in% exposure_vars && length(metric_name) == 1) { - metric_type <- "exposure" - } else if (metric_name %in% organic_vars && length(metric_name) == 1) { +check_metric_type <- function(metric_name, paid_media_spends, paid_media_vars, paid_media_selected, exposure_vars, organic_vars) { + if (metric_name %in% organic_vars && length(metric_name) == 1) { metric_type <- "organic" + metric_name_updated <- metric_name + } else if ((metric_name %in% paid_media_spends && length(metric_name) == 1) | + (metric_name %in% paid_media_vars && length(metric_name) == 1)) { + metric_type <- "paid" + name_loc <- unique(c(which(metric_name == paid_media_spends), + which(metric_name == paid_media_vars))) + metric_name_updated <- paid_media_selected[name_loc] } else { stop(paste( "Invalid 'metric_name' input:", metric_name, - "\nInput should be any media variable from paid_media_spends (spend),", - "paid_media_vars (exposure), or organic_vars (organic):", - paste("\n- paid_media_spends:", v2t(paid_media_spends, quotes = FALSE)), - paste("\n- paid_media_vars:", v2t(paid_media_vars, quotes = FALSE)), + "\nInput should be any media variable from paid_media_selected", + "or organic_vars:", + paste("\n- paid_media_spends:", v2t(paid_media_selected, quotes = FALSE)), paste("\n- organic_vars:", v2t(organic_vars, quotes = FALSE)) )) } - return(metric_type) + return(list(metric_type = metric_type, + metric_name_updated = metric_name_updated)) } check_metric_dates <- function(date_range = NULL, all_dates, dayInterval = NULL, quiet = FALSE, is_allocator = FALSE, ...) { @@ -939,54 +924,18 @@ check_metric_dates <- function(date_range = NULL, all_dates, dayInterval = NULL, date_range <- tail(all_dates, get_n) date_range_loc <- which(all_dates %in% date_range) date_range_updated <- all_dates[date_range_loc] - rg <- v2t(range(date_range_updated), sep = ":", quotes = FALSE) - } else { + } else if (is.Date(as.Date(date_range[1]))) { ## Using dates as date_range range - if (all(is.Date(as.Date(date_range, origin = "1970-01-01")))) { - date_range <- as.Date(date_range, origin = "1970-01-01") - if (length(date_range) == 1) { - ## Using only 1 date - if (all(date_range %in% all_dates)) { - date_range_updated <- date_range - date_range_loc <- which(all_dates == date_range) - if (!quiet) message("Using ds '", date_range_updated, "' as the response period") - } else { - date_range_loc <- which.min(abs(date_range - all_dates)) - date_range_updated <- all_dates[date_range_loc] - if (!quiet) warning("Input 'date_range' (", date_range, ") has no match. 
Picking closest date: ", date_range_updated) - } - } else if (length(date_range) == 2) { - ## Using two dates as "from-to" date range - date_range_loc <- unlist(lapply(date_range, function(x) which.min(abs(x - all_dates)))) - date_range_loc <- date_range_loc[1]:date_range_loc[2] - date_range_updated <- all_dates[date_range_loc] - if (!quiet & !all(date_range %in% date_range_updated)) { - warning(paste( - "At least one date in 'date_range' input do not match any date.", - "Picking closest dates for range:", paste(range(date_range_updated), collapse = ":") - )) - } - rg <- v2t(range(date_range_updated), sep = ":", quotes = FALSE) - get_n <- length(date_range_loc) - } else { - ## Manually inputting each date - date_range_updated <- date_range - if (all(date_range %in% all_dates)) { - date_range_loc <- which(all_dates %in% date_range_updated) - } else { - date_range_loc <- unlist(lapply(date_range_updated, function(x) which.min(abs(x - all_dates)))) - rg <- v2t(range(date_range_updated), sep = ":", quotes = FALSE) - } - if (all(na.omit(date_range_loc - lag(date_range_loc)) == 1)) { - date_range_updated <- all_dates[date_range_loc] - if (!quiet) warning("At least one date in 'date_range' do not match ds. Picking closest date: ", date_range_updated) - } else { - stop("Input 'date_range' needs to have sequential dates") - } - } + date_range_updated <- date_range <- as.Date(date_range, origin = "1970-01-01") + if (!all(date_range %in% all_dates)) { + date_range_loc <- range(sapply(date_range, FUN = function(x) which.min(abs(x - all_dates)))) + date_range_loc <- seq(from = date_range_loc[1], to = date_range_loc[2], by = 1) } else { - stop("Input 'date_range' must have date format '2023-01-01' or use 'last_n'") + date_range_loc <- which(all_dates %in% date_range) } + date_range_updated <- all_dates[date_range_loc] + } else { + stop("Input 'date_range' must have date format '1970-01-01' or use 'last_n'") } return(list( date_range_updated = date_range_updated, @@ -995,8 +944,9 @@ check_metric_dates <- function(date_range = NULL, all_dates, dayInterval = NULL, } check_metric_value <- function(metric_value, metric_name, all_values, metric_loc) { - get_n <- length(metric_loc) if (any(is.nan(metric_value))) metric_value <- NULL + get_n <- length(metric_loc) + metric_value_updated <- all_values[metric_loc] if (!is.null(metric_value)) { if (!is.numeric(metric_value)) { stop(sprintf( @@ -1009,18 +959,14 @@ check_metric_value <- function(metric_value, metric_name, all_values, metric_loc )) } if (get_n > 1 & length(metric_value) == 1) { - metric_value_updated <- rep(metric_value / get_n, get_n) + metric_value_updated <- metric_value * (metric_value_updated / sum(metric_value_updated)) # message(paste0("'metric_value'", metric_value, " splitting into ", get_n, " periods evenly")) - } else { - if (length(metric_value) != get_n) { - stop("robyn_response metric_value & date_range must have same length\n") - } + } else if (get_n == 1 & length(metric_value) == 1) { metric_value_updated <- metric_value + } else { + stop("robyn_response metric_value & date_range must have same length\n") } } - if (is.null(metric_value)) { - metric_value_updated <- all_values[metric_loc] - } all_values_updated <- all_values all_values_updated[metric_loc] <- metric_value_updated return(list( @@ -1099,3 +1045,9 @@ check_refresh_data <- function(Robyn, dt_input) { )) } } + +check_qti <- function(interval) { + if (interval > 1 | interval < 0.5) { + stop("Quantile interval needs to be within 0.5-1.") + } +} diff --git a/R/R/data.R 
b/R/R/data.R index a17194386..09598358d 100644
--- a/R/R/data.R
+++ b/R/R/data.R
@@ -61,3 +61,45 @@
 # lares::missingness(dt_prophet_holidays)
 # dt_prophet_holidays <- dplyr::filter(dt_prophet_holidays, !is.na(country))
 # save(dt_prophet_holidays, file = "data/dt_prophet_holidays.RData", version = 2)
+
+####################################################################
+#' Robyn Dataset: Reach & frequency simulated dataset
+#'
+#' A simulated dataset of cumulative reach and spend by frequency bucket.
+#' The headers must be kept as
+#' \code{c("spend_cumulated", "response_cumulated", "freq_bucket")}.
+#'
+#' @family Dataset
+#' @docType data
+#' @usage data(df_curve_reach_freq)
+#' @return data.frame
+#' @format An object of class \code{"data.frame"}
+#' \describe{
+#'   \item{spend_cumulated}{Cumulative spend of paid media}
+#'   \item{response_cumulated}{Cumulative reach of paid media}
+#'   \item{freq_bucket}{Frequency bucket for cumulative reach}
+#' }
+#' @examples
+#' data(df_curve_reach_freq)
+#' head(df_curve_reach_freq)
+"df_curve_reach_freq"
+
+# xSample <- round(seq(0, 100000, length.out = 10))
+# gammaSamp <- seq(0.3, 1, length.out = 20)
+# coeff <- 10000000
+# df_curve_reach_freq <- list()
+# for (i in seq_along(gammaSamp)) {
+#   df_curve_reach_freq[[i]] <- data.frame(
+#     spend_cumulated = xSample,
+#     response_predicted = (xSample**0.5 / (xSample**0.5 + (gammaSamp[i] * max(xSample))**0.5)) * coeff,
+#     gamma = gammaSamp[i],
+#     freq_bucket = as.factor(paste0("reach ", i, "+"))
+#   )
+# }
+# df_curve_reach_freq <- bind_rows(df_curve_reach_freq) %>%
+#   mutate(response_cumulated = response_predicted * (1 + runif(length(xSample) * length(gammaSamp), -0.05, 0.05))) %>%
+#   select(spend_cumulated, response_cumulated, response_predicted, freq_bucket)
+# levels(df_curve_reach_freq$freq_bucket) <- paste0("reach ", seq_along(gammaSamp), "+")
+# save(df_curve_reach_freq, file = "data/df_curve_reach_freq.RData", version = 2)
diff --git a/R/R/imports.R b/R/R/imports.R
index 4f51bdb85..22b1d619f 100644
--- a/R/R/imports.R
+++ b/R/R/imports.R
@@ -21,8 +21,8 @@
 #' @importFrom doRNG %dorng%
 #' @importFrom doParallel registerDoParallel stopImplicitCluster
 #' @importFrom dplyr across any_of arrange as_tibble bind_rows case_when contains desc distinct
-#' everything filter group_by lag left_join mutate n pull rename row_number select slice
-#' summarise summarise_all ungroup all_of bind_cols mutate_at starts_with ends_with tally n_distinct
+#' everything filter full_join group_by lag left_join mutate mutate_at n pull rename row_number select
+#' slice summarise summarise_all ungroup all_of bind_cols starts_with ends_with tally n_distinct
 #' @importFrom foreach foreach %dopar% getDoParWorkers registerDoSEQ
 #' @import ggplot2
 #' @importFrom ggridges geom_density_ridges geom_density_ridges_gradient
@@ -31,7 +31,6 @@
 #' @importFrom lares check_opts clusterKmeans formatNum freqs glued num_abbr ohse removenacols
 #' theme_lares `%>%` scale_x_abbr scale_x_percent scale_y_percent scale_y_abbr try_require v2t
 #' @importFrom lubridate is.Date day floor_date
-#' @importFrom minpack.lm nlsLM
 #' @importFrom nloptr nloptr
 #' @importFrom parallel detectCores
 #' @importFrom patchwork guide_area plot_layout plot_annotation wrap_plots
diff --git a/R/R/inputs.R b/R/R/inputs.R
index 141fdc545..a70d5c24c 100644
--- a/R/R/inputs.R
+++ b/R/R/inputs.R
@@ -200,12 +200,7 @@ robyn_inputs <- function(dt_input = NULL,
   if (!is.null(dt_holidays)) dt_holidays <- as_tibble(dt_holidays)
   ## Check vars names 
(duplicates and valid) - check_varnames( - dt_input, dt_holidays, - dep_var, date_var, - context_vars, paid_media_spends, - organic_vars - ) + check_varnames(dt_input, dt_holidays) ## Check for NA and all negative values dt_input <- check_allneg(dt_input) @@ -234,8 +229,8 @@ robyn_inputs <- function(dt_input = NULL, ## Check paid media variables (and maybe transform paid_media_signs) if (is.null(paid_media_vars)) paid_media_vars <- paid_media_spends - paidmedia <- check_paidmedia(dt_input, paid_media_vars, paid_media_signs, paid_media_spends) - paid_media_signs <- paidmedia$paid_media_signs + paid_collect <- check_paidmedia(dt_input, paid_media_vars, paid_media_signs, paid_media_spends) + paid_media_signs <- paid_collect$paid_media_signs exposure_vars <- paid_media_vars[!(paid_media_vars == paid_media_spends)] ## Check organic media variables (and maybe transform organic_signs) @@ -246,7 +241,7 @@ robyn_inputs <- function(dt_input = NULL, factor_vars <- check_factorvars(dt_input, factor_vars, context_vars) ## Check all vars - all_media <- c(paid_media_spends, organic_vars) + all_media <- c(paid_collect$paid_media_selected, organic_vars) all_ind_vars <- c(tolower(prophet_vars), context_vars, all_media) check_allvars(all_ind_vars) @@ -255,14 +250,12 @@ robyn_inputs <- function(dt_input = NULL, ## Check window_start & window_end (and transform parameters/data) windows <- check_windows(dt_input, date_var, all_media, window_start, window_end) - if (TRUE) { - window_start <- windows$window_start - rollingWindowStartWhich <- windows$rollingWindowStartWhich - refreshAddedStart <- windows$refreshAddedStart - window_end <- windows$window_end - rollingWindowEndWhich <- windows$rollingWindowEndWhich - rollingWindowLength <- windows$rollingWindowLength - } + window_start <- windows$window_start + rollingWindowStartWhich <- windows$rollingWindowStartWhich + refreshAddedStart <- windows$refreshAddedStart + window_end <- windows$window_end + rollingWindowEndWhich <- windows$rollingWindowEndWhich + rollingWindowLength <- windows$rollingWindowLength ## Check adstock adstock <- check_adstock(adstock) @@ -270,7 +263,8 @@ robyn_inputs <- function(dt_input = NULL, ## Check calibration and iters/trials calibration_input <- check_calibration( dt_input, date_var, calibration_input, dayInterval, dep_var, - window_start, window_end, paid_media_spends, organic_vars + window_start, window_end, paid_media_spends, organic_vars, + paid_collect$paid_media_selected ) ## Not used variables @@ -311,6 +305,7 @@ robyn_inputs <- function(dt_input = NULL, paid_media_vars = paid_media_vars, paid_media_signs = paid_media_signs, paid_media_spends = paid_media_spends, + paid_media_selected = paid_collect$paid_media_selected, paid_media_total = paid_media_total, exposure_vars = exposure_vars, organic_vars = organic_vars, @@ -338,7 +333,7 @@ robyn_inputs <- function(dt_input = NULL, ## Check hyperparameters hyperparameters <- check_hyperparameters( - hyperparameters, adstock, paid_media_spends, organic_vars, + hyperparameters, adstock, paid_collect$paid_media_selected, paid_media_spends, organic_vars, exposure_vars, prophet_vars, context_vars ) InputCollect <- robyn_engineering(InputCollect, ...) 
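Note on the new paid_media_selected field threaded through robyn_inputs() above: it resolves each paid channel to the metric the model will actually fit, i.e. the exposure variable where one differs from the spend variable, and the spend variable otherwise. A minimal sketch of the rule, using illustrative channel names (not package data):

# Illustrative sketch of how paid_media_selected is derived (see exposure_handling())
paid_media_spends <- c("tv_S", "ooh_S", "facebook_S")
paid_media_vars <- c("tv_S", "ooh_S", "facebook_I") # facebook modeled on impressions
exposure_selector <- paid_media_spends != paid_media_vars
paid_media_selected <- ifelse(exposure_selector, paid_media_vars, paid_media_spends)
paid_media_selected
#> "tv_S" "ooh_S" "facebook_I"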
@@ -358,7 +353,8 @@ robyn_inputs <- function(dt_input = NULL,
         window_start = InputCollect$window_start,
         window_end = InputCollect$window_end,
         paid_media_spends = InputCollect$paid_media_spends,
-        organic_vars = InputCollect$organic_vars
+        organic_vars = InputCollect$organic_vars,
+        paid_media_selected = InputCollect$paid_media_selected
       )
       ## Update calibration_input
@@ -372,12 +368,15 @@ robyn_inputs <- function(dt_input = NULL,
     ## Update & check hyperparameters
     if (is.null(InputCollect$hyperparameters)) InputCollect$hyperparameters <- hyperparameters
     InputCollect$hyperparameters <- check_hyperparameters(
-      InputCollect$hyperparameters, InputCollect$adstock, InputCollect$all_media
+      InputCollect$hyperparameters, InputCollect$adstock,
+      InputCollect$paid_media_selected, InputCollect$paid_media_spends,
+      InputCollect$organic_vars, InputCollect$exposure_vars,
+      InputCollect$prophet_vars, InputCollect$context_vars
     )
     InputCollect <- robyn_engineering(InputCollect, ...)
   }
-  # Check for no-variance columns (after filtering modeling window)
+  # Re-check for no-variance columns after feature engineering
   dt_mod_model_window <- InputCollect$dt_mod %>%
     select(-any_of(InputCollect$unused_vars)) %>%
     filter(
@@ -469,39 +468,14 @@ Adstock: {x$adstock}
 #' hyperparameters that is inserted into \code{robyn_inputs(hyperparameters = ...)}
 #'
 #' @section Guide to setup hyperparameters:
-#' \enumerate{
-#'   \item Get correct hyperparameter names:
-#'   All variables in \code{paid_media_vars} or \code{organic_vars} require hyperprameters
-#'   and will be transformed by adstock & saturation. Difference between \code{paid_media_vars}
-#'   and \code{organic_vars} is that \code{paid_media_vars} has spend that
-#'   needs to be specified in \code{paid_media_spends} specifically. Run \code{hyper_names()}
-#'   to get correct hyperparameter names. All names in hyperparameters must
-#'   equal names from \code{hyper_names()}, case sensitive.
-#'   \item Get guidance for setting hyperparameter bounds:
-#'   For geometric adstock, use theta, alpha & gamma. For both weibull adstock options,
-#'   use shape, scale, alpha, gamma.
-#'   \itemize{
-#'     \item Theta: In geometric adstock, theta is decay rate. guideline for usual media genre:
-#'     TV c(0.3, 0.8), OOH/Print/Radio c(0.1, 0.4), digital c(0, 0.3)
-#'     \item Shape: In weibull adstock, shape controls the decay shape. Recommended c(0.0001, 2).
-#'     The larger, the more S-shape. The smaller, the more L-shape. Channel-type specific
-#'     values still to be investigated
-#'     \item Scale: In weibull adstock, scale controls the decay inflexion point. Very conservative
-#'     recommended bounce c(0, 0.1), because scale can increase adstocking half-life greatly.
-#'     Channel-type specific values still to be investigated
-#'     \item Gamma: In s-curve transformation with hill function, gamma controls the inflexion point.
-#'     Recommended bounce c(0.3, 1). The larger the gamma, the later the inflection point
-#'     in the response curve
-#'   }
-#'   \item Set each hyperparameter bounds. They either contains two values e.g. 
c(0, 0.5), -#' or only one value (in which case you've "fixed" that hyperparameter) -#' } +#' See section "Hyperparameter interpretation & recommendation" in demo +#' https://github.com/facebookexperimental/Robyn/blob/main/demo/demo.R #' #' @section Helper plots: #' \describe{ -#' \item{plot_adstock}{Get adstock transformation example plot, +#' \item{plot_adstock(TRUE)}{Get adstock transformation example plot, #' helping you understand geometric/theta and weibull/shape/scale transformation} -#' \item{plot_saturation}{Get saturation curve transformation example plot, +#' \item{plot_saturation(TRUE)}{Get saturation curve transformation example plot, #' helping you understand hill/alpha/gamma transformation} #' } #' @@ -512,13 +486,13 @@ Adstock: {x$adstock} #' @param all_vars Used to check the penalties inputs, especially for refreshing models. #' @examples #' \donttest{ -#' media <- c("facebook_S", "print_S", "tv_S") +#' media <- c("facebook_I", "print_S", "tv_S") #' hyper_names(adstock = "geometric", all_media = media) #' #' hyperparameters <- list( -#' facebook_S_alphas = c(0.5, 3), # example bounds for alpha -#' facebook_S_gammas = c(0.3, 1), # example bounds for gamma -#' facebook_S_thetas = c(0, 0.3), # example bounds for theta +#' facebook_I_alphas = c(0.5, 3), # example bounds for alpha +#' facebook_I_gammas = c(0.3, 1), # example bounds for gamma +#' facebook_I_thetas = c(0, 0.3), # example bounds for theta #' print_S_alphas = c(0.5, 3), #' print_S_gammas = c(0.3, 1), #' print_S_thetas = c(0.1, 0.4), @@ -528,13 +502,13 @@ Adstock: {x$adstock} #' ) #' #' # Define hyper_names for weibull adstock -#' hyper_names(adstock = "weibull", all_media = media) +#' hyper_names(adstock = "weibull_pdf", all_media = media) #' #' hyperparameters <- list( -#' facebook_S_alphas = c(0.5, 3), # example bounds for alpha -#' facebook_S_gammas = c(0.3, 1), # example bounds for gamma -#' facebook_S_shapes = c(0.0001, 2), # example bounds for shape -#' facebook_S_scales = c(0, 0.1), # example bounds for scale +#' facebook_I_alphas = c(0.5, 3), # example bounds for alpha +#' facebook_I_gammas = c(0.3, 1), # example bounds for gamma +#' facebook_I_shapes = c(0.0001, 2), # example bounds for shape +#' facebook_I_scales = c(0, 0.1), # example bounds for scale #' print_S_alphas = c(0.5, 3), #' print_S_gammas = c(0.3, 1), #' print_S_shapes = c(0.0001, 2), @@ -607,123 +581,11 @@ robyn_engineering <- function(x, quiet = FALSE, ...) 
{ rollingWindowStartWhich <- InputCollect$rollingWindowStartWhich rollingWindowEndWhich <- InputCollect$rollingWindowEndWhich - # dt_inputRollWind - dt_inputRollWind <- dt_input[rollingWindowStartWhich:rollingWindowEndWhich, ] - - # dt_transform - dt_transform <- dt_input - colnames(dt_transform)[colnames(dt_transform) == InputCollect$date_var] <- "ds" - colnames(dt_transform)[colnames(dt_transform) == InputCollect$dep_var] <- "dep_var" - dt_transform <- arrange(dt_transform, .data$ds) - - # dt_transformRollWind - dt_transformRollWind <- dt_transform[rollingWindowStartWhich:rollingWindowEndWhich, ] - - ################################################################ - #### Model exposure metric from spend - - exposure_selector <- paid_media_spends != paid_media_vars - names(exposure_selector) <- paid_media_vars - - if (any(exposure_selector)) { - modNLSCollect <- list() - yhatCollect <- list() - plotNLSCollect <- list() - mediaCostFactor <- colSums(subset(dt_inputRollWind, select = paid_media_spends), na.rm = TRUE) / - colSums(subset(dt_inputRollWind, select = paid_media_vars), na.rm = TRUE) - - for (i in seq_along(paid_media_spends)) { - if (exposure_selector[i]) { - # Run models (NLS and/or LM) - dt_spendModInput <- subset(dt_inputRollWind, select = c(paid_media_spends[i], paid_media_vars[i])) - results <- fit_spend_exposure(dt_spendModInput, mediaCostFactor[i], paid_media_vars[i]) - # Compare NLS & LM, takes LM if NLS fits worse - mod <- results$res - exposure_selector[i] <- if (is.null(mod$rsq_nls)) FALSE else mod$rsq_nls > mod$rsq_lm - # Data to create plot - dt_plotNLS <- data.frame( - channel = paid_media_vars[i], - yhatNLS = if (exposure_selector[i]) results$yhatNLS else results$yhatLM, - yhatLM = results$yhatLM, - y = results$data$exposure, - x = results$data$spend - ) - caption <- glued(" - nls: AIC = {aic_nls} | R2 = {r2_nls} - lm: AIC = {aic_lm} | R2 = {r2_lm}", - aic_nls = signif(AIC(if (exposure_selector[i]) results$modNLS else results$modLM), 3), - r2_nls = signif(if (exposure_selector[i]) mod$rsq_nls else mod$rsq_lm, 3), - aic_lm = signif(AIC(results$modLM), 3), - r2_lm = signif(mod$rsq_lm, 3) - ) - dt_plotNLS <- dt_plotNLS %>% - pivot_longer( - cols = c("yhatNLS", "yhatLM"), - names_to = "models", values_to = "yhat" - ) %>% - mutate(models = str_remove(tolower(.data$models), "yhat")) - models_plot <- ggplot( - dt_plotNLS, aes(x = .data$x, y = .data$y, color = .data$models) - ) + - geom_point() + - geom_line(aes(y = .data$yhat, x = .data$x, color = .data$models)) + - labs( - title = "Exposure-Spend Models Fit Comparison", - x = sprintf("Spend [%s]", paid_media_spends[i]), - y = sprintf("Exposure [%s]", paid_media_vars[i]), - caption = caption, - color = "Model" - ) + - theme_lares(background = "white", legend = "top") + - scale_x_abbr() + - scale_y_abbr() - - # Save results into modNLSCollect. 
plotNLSCollect, yhatCollect - modNLSCollect[[paid_media_vars[i]]] <- mod - plotNLSCollect[[paid_media_vars[i]]] <- models_plot - yhatCollect[[paid_media_vars[i]]] <- dt_plotNLS - } - } - modNLSCollect <- bind_rows(modNLSCollect) - yhatNLSCollect <- bind_rows(yhatCollect) - yhatNLSCollect$ds <- rep(dt_transformRollWind$ds, nrow(yhatNLSCollect) / nrow(dt_transformRollWind)) - } else { - modNLSCollect <- plotNLSCollect <- yhatNLSCollect <- NULL - } - - # Give recommendations and show warnings - if (!is.null(modNLSCollect) && !quiet) { - threshold <- 0.80 - final_print <- these <- NULL # TRUE if we accumulate a common message - metrics <- c("R2 (nls)", "R2 (lm)") - names(metrics) <- c("rsq_nls", "rsq_lm") - for (m in seq_along(metrics)) { - temp <- which(modNLSCollect[[names(metrics)[m]]] < threshold) - if (length(temp) > 0) { - # warning(sprintf( - # "%s: weak relationship for %s and %s spend", - # metrics[m], - # v2t(modNLSCollect$channel[temp], and = "and"), - # ifelse(length(temp) > 1, "their", "its") - # )) - final_print <- TRUE - these <- modNLSCollect$channel[temp] - } - } - if (isTRUE(final_print)) { - message( - paste( - "NOTE: potential improvement on splitting channels for better exposure fitting.", - "Threshold (Minimum R2) =", threshold, - "\n Check: InputCollect$modNLS$plots outputs" - ), - "\n Weak relationship for: ", v2t(these), " and their spend" - ) - } - } - - ################################################################ - #### Clean & aggregate data + ## Standardise ds and dep_var cols + dt_transform <- dt_input %>% + rename("ds" = InputCollect$date_var, + "dep_var" = InputCollect$dep_var) %>% + arrange(.data$ds) ## Transform all factor variables if (length(factor_vars) > 0) { @@ -763,6 +625,7 @@ robyn_engineering <- function(x, quiet = FALSE, ...) { context_vars = InputCollect$context_vars, organic_vars = InputCollect$organic_vars, paid_media_spends = paid_media_spends, + paid_media_vars = paid_media_vars, intervalType = InputCollect$intervalType, dayInterval = InputCollect$dayInterval, custom_params = custom_params @@ -770,17 +633,23 @@ robyn_engineering <- function(x, quiet = FALSE, ...) { } ################################################################ - #### Finalize enriched input + #### Model exposure metric from spend + + ExposureCollect <- exposure_handling( + dt_transform, + window_start_loc = rollingWindowStartWhich, + window_end_loc = rollingWindowEndWhich, + paid_media_spends, + paid_media_vars, + quiet) + ################################################################ + #### Finalize enriched input + dt_transform <- ExposureCollect$dt_transform dt_transform <- subset(dt_transform, select = c("ds", "dep_var", InputCollect$all_ind_vars)) InputCollect[["dt_mod"]] <- dt_transform InputCollect[["dt_modRollWind"]] <- dt_transform[rollingWindowStartWhich:rollingWindowEndWhich, ] - InputCollect[["dt_inputRollWind"]] <- dt_inputRollWind - InputCollect[["modNLS"]] <- list( - results = modNLSCollect, - yhat = yhatNLSCollect, - plots = plotNLSCollect - ) + InputCollect[["ExposureCollect"]] <- ExposureCollect return(InputCollect) } @@ -803,7 +672,7 @@ robyn_engineering <- function(x, quiet = FALSE, ...) 
{ prophet_decomp <- function(dt_transform, dt_holidays, prophet_country, prophet_vars, prophet_signs, factor_vars, context_vars, organic_vars, paid_media_spends, - intervalType, dayInterval, custom_params) { + paid_media_vars, intervalType, dayInterval, custom_params) { check_prophet(dt_holidays, prophet_country, prophet_vars, prophet_signs, dayInterval) recurrence <- select(dt_transform, .data$ds, .data$dep_var) %>% rename("y" = "dep_var") holidays <- set_holidays(dt_transform, dt_holidays, intervalType) @@ -814,7 +683,7 @@ prophet_decomp <- function(dt_transform, dt_holidays, use_weekday <- "weekday" %in% prophet_vars | "weekly.seasonality" %in% prophet_vars dt_regressors <- bind_cols(recurrence, select( - dt_transform, all_of(c(paid_media_spends, context_vars, organic_vars)) + dt_transform, all_of(c(paid_media_spends, paid_media_vars, context_vars, organic_vars)) )) %>% mutate(ds = as.Date(.data$ds)) @@ -879,93 +748,69 @@ prophet_decomp <- function(dt_transform, dt_holidays, return(dt_transform) } -#################################################################### -#' Fit a nonlinear model for media spend and exposure -#' -#' This function is called in \code{robyn_engineering()}. It uses -#' the Michaelis-Menten function to fit the nonlinear model. Fallback -#' model is the simple linear model \code{lm()} in case the nonlinear -#' model is fitting worse. A bad fit here might result in unreasonable -#' model results. Two options are recommended: Either splitting the -#' channel into sub-channels to achieve better fit, or just use -#' spend as \code{paid_media_vars} -#' -#' @param dt_spendModInput data.frame. Containing channel spends and -#' exposure data. -#' @param mediaCostFactor Numeric vector. The ratio between raw media -#' exposure and spend metrics. -#' @param paid_media_var Character. Paid media variable. -#' @return List. Containing the all spend-exposure model results. 
-fit_spend_exposure <- function(dt_spendModInput, mediaCostFactor, paid_media_var) {
-  if (ncol(dt_spendModInput) != 2) stop("Pass only 2 columns")
-  colnames(dt_spendModInput) <- c("spend", "exposure")
-
-  # Model 1: Michaelis-Menten model Vmax * spend/(Km + spend)
-  tryCatch(
-    {
-      nlsStartVal <- list(
-        Vmax = max(dt_spendModInput$exposure),
-        Km = max(dt_spendModInput$exposure) / 2
-      )
-
-      modNLS <- nlsLM(exposure ~ Vmax * spend / (Km + spend),
-        data = dt_spendModInput,
-        start = nlsStartVal,
-        control = nls.control(warnOnly = TRUE)
-      )
-      yhatNLS <- predict(modNLS)
-      modNLSSum <- summary(modNLS)
-      rsq_nls <- get_rsq(true = dt_spendModInput$exposure, predicted = yhatNLS)
-
-      # # QA nls model prediction: check
-      # yhatNLSQA <- modNLSSum$coefficients[1,1] * dt_spendModInput$spend / (modNLSSum$coefficients[2,1] + dt_spendModInput$spend) #exposure = v * spend / (k + spend)
-      # identical(yhatNLS, yhatNLSQA)
-    },
-    error = function(cond) {
-      modNLS <- yhatNLS <- modNLSSum <- rsq_nls <- NULL
-    },
-    warning = function(cond) {
-      modNLS <- yhatNLS <- modNLSSum <- rsq_nls <- NULL
-    },
-    finally = if (!exists("modNLS")) modNLS <- yhatNLS <- modNLSSum <- rsq_nls <- NULL
-  )
+exposure_handling <- function(dt_transform,
+                              window_start_loc,
+                              window_end_loc,
+                              paid_media_spends,
+                              paid_media_vars,
+                              quiet) {
+  exposure_selector <- paid_media_spends != paid_media_vars
+  paid_media_selected <- ifelse(exposure_selector, paid_media_vars, paid_media_spends)
+  df_cpe <- list()
+  df_expo_p <- list()
+  for (i in seq_along(exposure_selector)) {
+    temp_spend <- dt_transform %>% select(paid_media_spends[i])
+    temp_expo <- dt_transform %>% select(paid_media_vars[i])
+    temp_spend_window <- temp_spend[window_start_loc:window_end_loc, ]
+    temp_expo_window <- temp_expo[window_start_loc:window_end_loc, ]
+    ## cpe = cost per exposure, an internal linear scaler between spend & exposure
+    temp_cpe <- sum(temp_spend) / sum(temp_expo)
+    temp_cpe_window <- sum(temp_spend_window) / sum(temp_expo_window)
+    temp_spend_scaled <- ifelse(exposure_selector[i], temp_expo * temp_cpe, temp_spend)
+    temp_spend_scaled_window <- ifelse(exposure_selector[i], temp_expo_window * temp_cpe_window, temp_spend_window)
+    df_cpe[[i]] <- data.frame(
+      paid_media_selected = paid_media_selected[i],
+      cpe = temp_cpe,
+      cpe_window = temp_cpe_window,
+      adj_rsq = get_rsq(true = unlist(temp_spend),
+                        predicted = unlist(temp_spend_scaled)),
+      adj_rsq_window = get_rsq(true = unlist(temp_spend_window),
+                               predicted = unlist(temp_spend_scaled_window))
+    )
+    ## Use window cpe to predict the whole dataset to keep the window spend scale right
+    spend_scaled_extrapolated <- temp_expo * temp_cpe_window
+    df_expo_p[[i]] <- data.frame(spend = unlist(temp_spend),
+                                 exposure = unlist(temp_expo),
+                                 media = paid_media_selected[i])
+    dt_transform <- dt_transform %>%
+      mutate_at(vars(paid_media_selected[i]), function(x) unlist(spend_scaled_extrapolated))
+  }
+  df_cpe <- bind_rows(df_cpe)
+  df_expo_p <- bind_rows(df_expo_p)
+  p_expo <- df_expo_p %>% ggplot(aes(x = .data$spend, y = .data$exposure)) +
+    geom_point() +
+    geom_smooth(method = "lm", formula = y ~ x) +
+    facet_wrap(~ .data$media, scales = "free") +
+    labs(title = "Spend & exposure relationship for paid media.",
+         subtitle = "Reconsider media splits if a channel shows multiple patterns.")
-  # Model 2: Build lm comparison model
-  modLM <- lm(exposure ~ spend - 1, data = dt_spendModInput)
-  yhatLM <- predict(modLM)
-  modLMSum <- summary(modLM)
-  rsq_lm <- modLMSum$adj.r.squared
-  if (is.na(rsq_lm)) stop("Please check 
if ", paid_media_var, " contains only 0s") - # if (max(rsq_lm, rsq_nls) < 0.7) { - # warning(paste( - # "Spend-exposure fitting for", paid_media_var, - # "has rsq = ", round(max(rsq_lm, rsq_nls), 4), - # "To increase the fit, try splitting the variable.", - # "Otherwise consider using spend instead." - # )) - # } - - output <- list( - res = data.frame( - channel = paid_media_var, - Vmax = if (!is.null(modNLS)) modNLSSum$coefficients[1, 1] else NA, - Km = if (!is.null(modNLS)) modNLSSum$coefficients[2, 1] else NA, - aic_nls = if (!is.null(modNLS)) AIC(modNLS) else NA, - aic_lm = AIC(modLM), - bic_nls = if (!is.null(modNLS)) BIC(modNLS) else NA, - bic_lm = BIC(modLM), - rsq_nls = if (!is.null(modNLS)) rsq_nls else 0, - rsq_lm = rsq_lm, - coef_lm = coef(modLMSum)[1] - ), - yhatNLS = yhatNLS, - modNLS = modNLS, - yhatLM = yhatLM, - modLM = modLM, - data = dt_spendModInput, - type = ifelse(is.null(modNLS), "lm", "mm") - ) - return(output) + # Give recommendations and show warnings + threshold <- 0.8 + temp_names <- df_cpe %>% filter(.data$adj_rsq_window < threshold) %>% pull(paid_media_selected) + if (!quiet & any(exposure_selector) & length(temp_names) > 1) { + message( + paste( + "NOTE: potential improvement on splitting channels for better spend exposure fitting.", + "Threshold (min.adj.R2) =", threshold, + "\n Check: InputCollect$ExposureCollect$plot_spend_exposure outputs" + ), + "\n Weak relationship for: ", v2t(temp_names), " and their spend" + ) + } + return(list(df_cpe = df_cpe, + plot_spend_exposure = p_expo, + dt_transform = dt_transform, + paid_media_selected = paid_media_selected)) } #################################################################### @@ -1012,3 +857,39 @@ set_holidays <- function(dt_transform, dt_holidays, intervalType) { return(holidays) } + +#################################################################### +#' Set default hyperparameters +#' +#' For quick setting of hyperparameter ranges. +#' +#' @param adstock Character. InputCollect$adstock +#' @param all_media Character. Provide InputCollect$all_media. +#' @param list_default A List. Default ranges for hyperparameters. +#' @return List. Expanded range of hyperparameters for all media. 
+####################################################################
+#' Set default hyperparameters
+#'
+#' For quick setting of hyperparameter ranges.
+#'
+#' @param adstock Character. Adstock type, e.g. \code{InputCollect$adstock}.
+#' @param all_media Character vector. Media variables, e.g. \code{InputCollect$all_media}.
+#' @param list_default A list. Default ranges for hyperparameters.
+#' @return List. Expanded range of hyperparameters for all media.
+#' @export
+set_default_hyppar <- function(
+    adstock = NULL,
+    all_media = NULL,
+    list_default = list(alpha = c(0.5, 3),
+                        gamma = c(0.01, 1),
+                        theta = c(0, 0.8),
+                        shape = c(0, 10),
+                        scale = c(0, 0.1),
+                        train_size = c(0.5, 0.9))) {
+  hpnames <- hyper_names(adstock = adstock, all_media = all_media)
+  hyperparameters <- list()
+  for (i in seq_along(hpnames)) {
+    hyperparameters[[i]] <- dplyr::case_when(
+      str_detect(hpnames[[i]], "_alphas") ~ list_default[["alpha"]],
+      str_detect(hpnames[[i]], "_gammas") ~ list_default[["gamma"]],
+      str_detect(hpnames[[i]], "_thetas") ~ list_default[["theta"]],
+      str_detect(hpnames[[i]], "_shapes") ~ list_default[["shape"]],
+      str_detect(hpnames[[i]], "_scales") ~ list_default[["scale"]])
+    names(hyperparameters)[[i]] <- hpnames[[i]]
+  }
+  hyperparameters[["train_size"]] <- list_default[["train_size"]]
+  return(hyperparameters)
+}
diff --git a/R/R/model.R b/R/R/model.R
index 487ea9e65..c4bed8893 100644
--- a/R/R/model.R
+++ b/R/R/model.R
@@ -371,7 +371,7 @@ robyn_train <- function(InputCollect, hyper_collect,
     OutputModels[[1]]$trial <- 1
     # Set original solID (to overwrite default 1_1_1)
     if ("solID" %in% names(dt_hyper_fixed)) {
-      these <- c("resultHypParam", "xDecompVec", "xDecompAgg", "decompSpendDist")
+      these <- c("resultHypParam", "xDecompVec", "xDecompAgg")
       for (tab in these) OutputModels[[1]]$resultCollect[[tab]]$solID <- dt_hyper_fixed$solID
     }
   } else {
@@ -407,9 +407,9 @@ robyn_train <- function(InputCollect, hyper_collect,
           seed = seed + ngt,
           quiet = quiet
         )
-        check_coef0 <- any(model_output$resultCollect$decompSpendDist$decomp.rssd == Inf)
+        check_coef0 <- any(model_output$resultCollect$resultHypParam$decomp.rssd == Inf)
         if (check_coef0) {
-          num_coef0_mod <- filter(model_output$resultCollect$decompSpendDist, is.infinite(.data$decomp.rssd)) %>%
+          num_coef0_mod <- filter(model_output$resultCollect$resultHypParam, is.infinite(.data$decomp.rssd)) %>%
             distinct(.data$iterNG, .data$iterPar) %>%
             nrow()
           num_coef0_mod <- ifelse(num_coef0_mod > iterations, iterations, num_coef0_mod)
@@ -516,6 +516,8 @@ robyn_mmm <- function(InputCollect,
   refresh_steps <- InputCollect$refresh_steps
   rollingWindowLength <- InputCollect$rollingWindowLength
   paid_media_spends <- InputCollect$paid_media_spends
+  paid_media_selected <- InputCollect$paid_media_selected
+  exposure_vars <- InputCollect$exposure_vars
   organic_vars <- InputCollect$organic_vars
   context_vars <- InputCollect$context_vars
   prophet_vars <- InputCollect$prophet_vars
@@ -535,7 +537,7 @@ robyn_mmm <- function(InputCollect,
   dt_inputTrain <- InputCollect$dt_input[rollingWindowStartWhich:rollingWindowEndWhich, ]
   temp <- select(dt_inputTrain, all_of(paid_media_spends))
   dt_spendShare <- data.frame(
-    rn = paid_media_spends,
+    rn = paid_media_selected,
     total_spend = unlist(summarise_all(temp, sum)),
     # mean_spend = unlist(summarise_all(temp, function(x) {
     #   ifelse(is.na(mean(x[x > 0])), 0, mean(x[x > 0]))
@@ -543,12 +545,15 @@ robyn_mmm <- function(InputCollect,
     mean_spend = unlist(summarise_all(temp, mean))
   ) %>%
     mutate(spend_share = .data$total_spend / sum(.data$total_spend))
+  temp <- select(dt_inputTrain, all_of(c(exposure_vars, organic_vars))) %>% summarise_all(mean) %>% unlist
+  temp <- data.frame(rn = c(exposure_vars, organic_vars), mean_exposure = temp)
+  dt_spendShare <- full_join(dt_spendShare, temp, by = "rn")
   # When not refreshing, dt_spendShareRF = dt_spendShare
   refreshAddedStartWhich <- which(dt_modRollWind$ds == refreshAddedStart)
   temp <- select(dt_inputTrain, all_of(paid_media_spends)) %>% 
slice(refreshAddedStartWhich:rollingWindowLength) dt_spendShareRF <- data.frame( - rn = paid_media_spends, + rn = paid_media_selected, total_spend = unlist(summarise_all(temp, sum)), # mean_spend = unlist(summarise_all(temp, function(x) { # ifelse(is.na(mean(x[x > 0])), 0, mean(x[x > 0])) @@ -557,8 +562,14 @@ robyn_mmm <- function(InputCollect, ) %>% mutate(spend_share = .data$total_spend / sum(.data$total_spend)) # Join both dataframes into a single one + temp <- select(dt_inputTrain, all_of(c(exposure_vars, organic_vars))) %>% + slice(refreshAddedStartWhich:rollingWindowLength) %>% + summarise_all(mean) %>% unlist + temp <- data.frame(rn = c(exposure_vars, organic_vars), mean_exposure = temp) + dt_spendShareRF <- full_join(dt_spendShareRF, temp, by = "rn") dt_spendShare <- left_join(dt_spendShare, dt_spendShareRF, "rn", suffix = c("", "_refresh")) + ################################################ #### Get lambda lambda_min_ratio <- 0.0001 # default value from glmnet @@ -662,15 +673,17 @@ robyn_mmm <- function(InputCollect, adstock <- check_adstock(adstock) #### Transform media for model fitting - temp <- run_transformations(InputCollect, hypParamSam, ...) - dt_modSaturated <- temp$dt_modSaturated - dt_saturatedImmediate <- temp$dt_saturatedImmediate - dt_saturatedCarryover <- temp$dt_saturatedCarryover + temp <- run_transformations(all_media = InputCollect$all_media, + window_start_loc = InputCollect$rollingWindowStartWhich, + window_end_loc = InputCollect$rollingWindowEndWhich, + dt_mod = InputCollect$dt_mod, + adstock = InputCollect$adstock, + dt_hyppar = hypParamSam, ...) ##################################### #### Split train & test and prepare data for modelling - dt_window <- dt_modSaturated + dt_window <- temp$dt_modSaturated ## Contrast matrix because glmnet does not treat categorical variables (one hot encoding) y_window <- dt_window$dep_var @@ -697,7 +710,7 @@ robyn_mmm <- function(InputCollect, ## Define and set sign control dt_sign <- select(dt_window, -.data$dep_var) x_sign <- c(prophet_signs, context_signs, paid_media_signs, organic_signs) - names(x_sign) <- c(prophet_vars, context_vars, paid_media_spends, organic_vars) + names(x_sign) <- c(prophet_vars, context_vars, paid_media_selected, organic_vars) check_factor <- unlist(lapply(dt_sign, is.factor)) lower.limits <- rep(0, length(prophet_signs)) upper.limits <- rep(1, length(prophet_signs)) @@ -753,9 +766,12 @@ robyn_mmm <- function(InputCollect, ## If no lift calibration, refit using best lambda mod_out <- model_refit( - x_train, y_train, - x_val, y_val, - x_test, y_test, + x_train = x_train, + y_train = y_train, + x_val = x_val, + y_val = y_val, + x_test = x_test, + y_test = y_test, lambda = lambda_scaled, lower.limits = lower.limits, upper.limits = upper.limits, @@ -768,9 +784,9 @@ robyn_mmm <- function(InputCollect, inputs = list( coefs = mod_out$coefs, y_pred = mod_out$y_pred, - dt_modSaturated = dt_modSaturated, - dt_saturatedImmediate = dt_saturatedImmediate, - dt_saturatedCarryover = dt_saturatedCarryover, + dt_modSaturated = temp$dt_modSaturated, + dt_saturatedImmediate = temp$dt_saturatedImmediate, + dt_saturatedCarryover = temp$dt_saturatedCarryover, dt_modRollWind = dt_modRollWind, refreshAddedStart = refreshAddedStart )) @@ -781,7 +797,7 @@ robyn_mmm <- function(InputCollect, ##################################### #### MAPE: Calibration error if (!is.null(calibration_input)) { - liftCollect <- robyn_calibrate( + liftCollect <- lift_calibration( calibration_input = calibration_input, df_raw = dt_mod, 
hypParamSam = hypParamSam, @@ -798,36 +814,37 @@ robyn_mmm <- function(InputCollect, ##################################### #### DECOMP.RSSD: Business error # Sum of squared distance between decomp share and spend share to be minimized - dt_decompSpendDist <- decompCollect$xDecompAgg %>% - filter(.data$rn %in% paid_media_spends) %>% + dt_loss_calc <- decompCollect$xDecompAgg %>% + filter(.data$rn %in% c(paid_media_selected, organic_vars)) %>% select( - .data$rn, .data$xDecompAgg, .data$xDecompPerc, .data$xDecompMeanNon0Perc, - .data$xDecompMeanNon0, .data$xDecompPercRF, .data$xDecompMeanNon0PercRF, - .data$xDecompMeanNon0RF - ) %>% - left_join( - select( - dt_spendShare, - .data$rn, .data$spend_share, .data$spend_share_refresh, - .data$mean_spend, .data$total_spend - ), - by = "rn" - ) %>% - mutate( - effect_share = .data$xDecompPerc / sum(.data$xDecompPerc), - effect_share_refresh = .data$xDecompPercRF / sum(.data$xDecompPercRF) - ) - dt_decompSpendDist <- left_join( - filter(decompCollect$xDecompAgg, .data$rn %in% paid_media_spends), - select(dt_decompSpendDist, .data$rn, contains("_spend"), contains("_share")), + .data$rn, .data$xDecompPerc, .data$xDecompPercRF + ) %>% left_join( + select( + dt_spendShare, + c("rn", "spend_share", "spend_share_refresh","mean_spend", + "total_spend", "mean_exposure", "mean_exposure_refresh") + ), by = "rn" ) + dt_loss_calc <- bind_rows( + dt_loss_calc %>% filter(.data$rn %in% paid_media_selected) %>% + mutate( + effect_share = .data$xDecompPerc / sum(.data$xDecompPerc), + effect_share_refresh = .data$xDecompPercRF / sum(.data$xDecompPercRF) + ), + dt_loss_calc %>% filter(.data$rn %in% organic_vars) %>% + mutate( + effect_share = NA, effect_share_refresh = NA) + ) %>% select(-c("xDecompPerc", "xDecompPercRF")) + decompCollect$xDecompAgg <- left_join( + decompCollect$xDecompAgg, dt_loss_calc, by = "rn") + dt_loss_calc <- dt_loss_calc %>% filter(.data$rn %in% paid_media_selected) if (!refresh) { - decomp.rssd <- sqrt(sum((dt_decompSpendDist$effect_share - dt_decompSpendDist$spend_share)^2)) + decomp.rssd <- sqrt(sum((dt_loss_calc$effect_share - dt_loss_calc$spend_share)^2)) # Penalty for models with more 0-coefficients if (rssd_zero_penalty) { - is_0eff <- round(dt_decompSpendDist$effect_share, 4) == 0 - share_0eff <- sum(is_0eff) / length(dt_decompSpendDist$effect_share) + is_0eff <- round(dt_loss_calc$effect_share, 4) == 0 + share_0eff <- sum(is_0eff) / length(dt_loss_calc$effect_share) decomp.rssd <- decomp.rssd * (1 + share_0eff) } } else { @@ -836,11 +853,11 @@ robyn_mmm <- function(InputCollect, by = "rn" ) decomp.rssd.media <- dt_decompRF %>% - filter(.data$rn %in% paid_media_spends) %>% + filter(.data$rn %in% paid_media_selected) %>% summarise(rssd.media = sqrt(mean((.data$decomp_perc - .data$decomp_perc_prev)^2))) %>% pull(.data$rssd.media) decomp.rssd.nonmedia <- dt_decompRF %>% - filter(!.data$rn %in% paid_media_spends) %>% + filter(!.data$rn %in% paid_media_selected) %>% summarise(rssd.nonmedia = sqrt(mean((.data$decomp_perc - .data$decomp_perc_prev)^2))) %>% pull(.data$rssd.nonmedia) decomp.rssd <- decomp.rssd.media + decomp.rssd.nonmedia / @@ -849,7 +866,8 @@ robyn_mmm <- function(InputCollect, # When all media in this iteration have 0 coefficients if (is.nan(decomp.rssd)) { decomp.rssd <- Inf - dt_decompSpendDist$effect_share <- 0 + decompCollect$xDecompAgg <- decompCollect$xDecompAgg %>% + mutate(effect_share = ifelse(is.na(.data$effect_share), NA, 0)) } ##################################### @@ -882,6 +900,8 @@ robyn_mmm <- 
function(InputCollect, resultCollect[["resultHypParam"]] <- as_tibble(hypParamSam) %>% select(-.data$lambda) %>% + bind_cols(as_tibble(t(temp$inflexions))) %>% + bind_cols(as_tibble(t(temp$inflations))) %>% bind_cols(common[, 1:split_common]) %>% mutate( pos = prod(decompCollect$xDecompAgg$pos), @@ -900,9 +920,8 @@ robyn_mmm <- function(InputCollect, bind_cols(common) } - resultCollect[["decompSpendDist"]] <- dt_decompSpendDist %>% - bind_cols(common) - + # resultCollect[["decompSpendDist"]] <- dt_decompSpendDist %>% + # bind_cols(common) resultCollect <- append(resultCollect, as.list(common)) return(resultCollect) } @@ -910,7 +929,6 @@ robyn_mmm <- function(InputCollect, ########### Parallel start nrmse.collect <- NULL decomp.rssd.collect <- NULL - best_mape <- Inf if (cores == 1) { doparCollect <- lapply(1:iterPar, robyn_iterations) } else { @@ -1009,11 +1027,11 @@ robyn_mmm <- function(InputCollect, arrange(.data$mape, .data$liftMedia, .data$liftStart)) } - resultCollect[["decompSpendDist"]] <- as_tibble(bind_rows( - lapply(resultCollectNG, function(x) { - bind_rows(lapply(x, function(y) y$decompSpendDist)) - }) - )) + # resultCollect[["decompSpendDist"]] <- as_tibble(bind_rows( + # lapply(resultCollectNG, function(x) { + # bind_rows(lapply(x, function(y) y$decompSpendDist)) + # }) + # )) resultCollect$iter <- length(resultCollect$mape) resultCollect$elapsed.min <- sysTimeDopar[3] / 60 @@ -1147,7 +1165,6 @@ model_refit <- function(x_train, y_train, x_val, y_val, x_test, y_test, intercept = intercept, ... ) # coef(mod) - df.int <- 1 ## Drop intercept if negative and intercept_sign == "non_negative" @@ -1160,6 +1177,7 @@ model_refit <- function(x_train, y_train, x_val, y_val, x_test, y_test, lambda = lambda, lower.limits = lower.limits, upper.limits = upper.limits, + type.measure = "mse", penalty.factor = penalty.factor, intercept = FALSE, ... diff --git a/R/R/outputs.R b/R/R/outputs.R index fd830ce84..661c32c19 100644 --- a/R/R/outputs.R +++ b/R/R/outputs.R @@ -180,7 +180,7 @@ robyn_outputs <- function(InputCollect, OutputModels, ) %>% left_join( pareto_results$df_caov_pct_all %>% - filter(type == "Carryover") %>% + filter(.data$type == "Carryover") %>% select("solID", "rn", "carryover_pct"), by = c("solID", "rn") ) diff --git a/R/R/pareto.R b/R/R/pareto.R index 01cc87d03..b52b2733a 100644 --- a/R/R/pareto.R +++ b/R/R/pareto.R @@ -58,9 +58,8 @@ robyn_pareto <- function(InputCollect, OutputModels, xDecompAgg <- left_join(xDecompAgg, bootstrap, by = c("rn" = "variable")) } } - - xDecompAggCoef0 <- xDecompAgg %>% - filter(.data$rn %in% InputCollect$paid_media_spends) %>% + xDecompAggPaid <- xDecompAgg %>% filter(.data$rn %in% InputCollect$paid_media_selected) + xDecompAggCoef0 <- xDecompAggPaid %>% group_by(.data$solID) %>% summarise(coef0 = min(.data$coef, na.rm = TRUE) == 0) @@ -101,13 +100,8 @@ robyn_pareto <- function(InputCollect, OutputModels, # Bind robynPareto results xDecompAgg <- left_join(xDecompAgg, select(resultHypParam, .data$robynPareto, .data$solID), by = "solID") - decompSpendDist <- bind_rows(lapply(OutModels, function(x) { - mutate(x$resultCollect$decompSpendDist, trial = x$trial) - })) %>% - { - if (!hyper_fixed) mutate(., solID = paste(.data$trial, .data$iterNG, .data$iterPar, sep = "_")) else . 
- } %>% - left_join(select(resultHypParam, .data$robynPareto, .data$solID), by = "solID") + xDecompAggMedia <- xDecompAgg %>% filter(.data$rn %in% InputCollect$all_media) %>% + select(c("rn", "solID", "coef", "mean_spend", "mean_exposure", "xDecompAgg", "total_spend", "robynPareto")) # Prepare parallel loop if (TRUE) { @@ -143,88 +137,67 @@ robyn_pareto <- function(InputCollect, OutputModels, } pareto_fronts_vec <- 1:pareto_fronts - decompSpendDistPar <- decompSpendDist[decompSpendDist$robynPareto %in% pareto_fronts_vec, ] + # decompSpendDistPar <- decompSpendDist[decompSpendDist$robynPareto %in% pareto_fronts_vec, ] resultHypParamPar <- resultHypParam[resultHypParam$robynPareto %in% pareto_fronts_vec, ] - xDecompAggPar <- xDecompAgg[xDecompAgg$robynPareto %in% pareto_fronts_vec, ] + # xDecompAggPar <- xDecompAgg[xDecompAgg$robynPareto %in% pareto_fronts_vec, ] + xDecompAggMediaPar <- xDecompAggMedia %>% filter(.data$robynPareto %in% pareto_fronts_vec) respN <- NULL } if (!quiet) { message(sprintf( ">>> Calculating response curves for all models' media variables (%s)...", - nrow(decompSpendDistPar) + nrow(xDecompAggMediaPar) )) } - run_dt_resp <- function(respN, InputCollect, OutputModels, decompSpendDistPar, resultHypParamPar, xDecompAggPar, ...) { - get_solID <- decompSpendDistPar$solID[respN] - get_spendname <- decompSpendDistPar$rn[respN] - startRW <- InputCollect$rollingWindowStartWhich - endRW <- InputCollect$rollingWindowEndWhich - - get_resp <- robyn_response( - select_model = get_solID, - metric_name = get_spendname, - # metric_value = decompSpendDistPar$total_spend[respN], - # date_range = range(InputCollect$dt_modRollWind$ds), - date_range = "all", - dt_hyppar = resultHypParamPar, - dt_coef = xDecompAggPar, - InputCollect = InputCollect, - OutputCollect = OutputModels, - quiet = TRUE, - ... - ) - # Median value (but must be within the curve) - # med_in_curve <- sort(get_resp$response_total)[round(length(get_resp$response_total) / 2)] - - ## simulate mean response adstock from get_resp$input_carryover - # mean_response <- mean(get_resp$response_total) - mean_spend_adstocked <- mean(get_resp$input_total[startRW:endRW]) - mean_carryover <- mean(get_resp$input_carryover[startRW:endRW]) - dt_hyppar <- resultHypParamPar %>% filter(.data$solID == get_solID) - chnAdstocked <- data.frame(v1 = get_resp$input_total[startRW:endRW]) - colnames(chnAdstocked) <- get_spendname - dt_coef <- xDecompAggPar %>% - filter(.data$solID == get_solID & .data$rn == get_spendname) %>% - select(c("rn", "coef")) - hills <- get_hill_params( - InputCollect, NULL, dt_hyppar, dt_coef, - mediaSpendSorted = get_spendname, - select_model = get_solID, chnAdstocked - ) - mean_response <- fx_objective( - x = decompSpendDistPar$mean_spend[respN], - coeff = hills$coefs_sorted, - alpha = hills$alphas, - inflexion = hills$inflexions, - x_hist_carryover = mean_carryover, - get_sum = FALSE - ) - dt_resp <- data.frame( - mean_response = mean_response, - mean_spend_adstocked = mean_spend_adstocked, - mean_carryover = mean_carryover, - rn = decompSpendDistPar$rn[respN], - solID = decompSpendDistPar$solID[respN] - ) - return(dt_resp) - } - if (OutputModels$cores > 1) { - resp_collect <- foreach( - respN = seq_along(decompSpendDistPar$rn), .combine = bind_rows - ) %dorng% { - run_dt_resp(respN, InputCollect, OutputModels, decompSpendDistPar, resultHypParamPar, xDecompAggPar, ...) 
- } - stopImplicitCluster() - } else { - resp_collect <- bind_rows(lapply(seq_along(decompSpendDistPar$rn), function(respN) { - run_dt_resp(respN, InputCollect, OutputModels, decompSpendDistPar, resultHypParamPar, xDecompAggPar, ...) - })) - } - decompSpendDist <- left_join( - decompSpendDist, - resp_collect, + cnt_resp <- nrow(xDecompAggMediaPar) + pb_resp <- txtProgressBar(min = 0, max = cnt_resp, style = 3) + resp_collect <- lapply( + 1:cnt_resp, + function(respN) { + setTxtProgressBar(pb_resp, respN) + get_solID <- xDecompAggMediaPar$solID[respN] + get_media_name <- xDecompAggMediaPar$rn[respN] + window_start_loc <- InputCollect$rollingWindowStartWhich + window_end_loc <- InputCollect$rollingWindowEndWhich + + get_resp <- robyn_response( + select_model = get_solID, + metric_name = get_media_name, + date_range = "all", + dt_hyppar = resultHypParamPar, + dt_coef = xDecompAggMediaPar, + InputCollect = InputCollect, + OutputCollect = OutputModels, + quiet = TRUE, + ... + ) + list_response <- list( + dt_resp = data.frame( + mean_response = get_resp$mean_response_total, + mean_spend_adstocked = get_resp$mean_input_immediate + get_resp$mean_input_carryover, + mean_carryover = get_resp$mean_input_carryover, + rn = get_media_name, + solID = get_solID + ), + dt_resp_vec = data.frame( + channel = rep(get_media_name, length(get_resp$response_total)), + response = get_resp$response_total, + response_carryover = get_resp$response_carryover, + spend = get_resp$input_total[window_start_loc:window_end_loc], + solID = rep(get_solID, length(get_resp$response_total)) + ) + ) + return(list_response) + } + ) + close(pb_resp) + dt_resp <- bind_rows(lapply(resp_collect, function(x) x[["dt_resp"]])) + dt_resp_vec <- bind_rows(lapply(resp_collect, function(x) x[["dt_resp_vec"]])) + + xDecompAgg <- xDecompAgg %>% left_join( + dt_resp, by = c("solID", "rn") ) %>% mutate( @@ -233,15 +206,6 @@ robyn_pareto <- function(InputCollect, OutputModels, cpa_mean = .data$mean_spend / .data$mean_response, cpa_total = .data$total_spend / .data$xDecompAgg ) - # decompSpendDist %>% filter(solID == select_model) %>% arrange(rn) %>% select(rn, mean_spend, mean_response, roi_mean) - xDecompAgg <- left_join( - xDecompAgg, - select( - decompSpendDist, .data$rn, .data$solID, .data$total_spend, .data$mean_spend, .data$mean_spend_adstocked, .data$mean_carryover, - .data$mean_response, .data$spend_share, .data$effect_share, .data$roi_mean, .data$roi_total, .data$cpa_total - ), - by = c("solID", "rn") - ) # Pareto loop (no plots) mediaVecCollect <- list() @@ -252,12 +216,13 @@ robyn_pareto <- function(InputCollect, OutputModels, dt_modRollWind <- InputCollect$dt_modRollWind rw_start_loc <- InputCollect$rollingWindowStartWhich rw_end_loc <- InputCollect$rollingWindowEndWhich + dt_ds <- dt_mod[rw_start_loc:rw_end_loc, "ds"] for (pf in pareto_fronts_vec) { plotMediaShare <- filter( xDecompAgg, .data$robynPareto == pf, - .data$rn %in% InputCollect$paid_media_spends + .data$rn %in% InputCollect$paid_media_selected ) uniqueSol <- unique(plotMediaShare$solID) plotWaterfall <- xDecompAgg %>% filter(.data$robynPareto == pf) @@ -265,15 +230,6 @@ robyn_pareto <- function(InputCollect, OutputModels, message(sprintf(">> Pareto-Front: %s [%s models]", pf, length(uniqueSol))) } - # # To recreate "xDecompVec", "xDecompVecImmediate", "xDecompVecCarryover" for each model - # temp <- OutputModels[names(OutputModels) %in% paste0("trial", 1:OutputModels$trials)] - # xDecompVecImmCarr <- bind_rows(lapply(temp, function(x) x$resultCollect$xDecompVec)) - # if 
(!"solID" %in% colnames(xDecompVecImmCarr)) { - # xDecompVecImmCarr <- xDecompVecImmCarr %>% - # mutate(solID = paste(.data$trial, .data$iterNG, .data$iterPar, sep = "_")) %>% - # filter(.data$solID %in% uniqueSol) - # } - # Calculations for pareto AND pareto plots for (sid in uniqueSol) { # parallelResult <- foreach(sid = uniqueSol) %dorng% { @@ -288,7 +244,7 @@ robyn_pareto <- function(InputCollect, OutputModels, c("spend_share", "effect_share", "roi_total", "cpa_total") ) %>% select(c("rn", "nrmse", "decomp.rssd", "rsq_train", "variable", "value")) %>% - mutate(rn = factor(.data$rn, levels = sort(InputCollect$paid_media_spends))) + mutate(rn = factor(.data$rn, levels = sort(InputCollect$paid_media_selected))) plotMediaShareLoopBar <- filter(temp, .data$variable %in% c("spend_share", "effect_share")) plotMediaShareLoopLine <- filter(temp, .data$variable == ifelse( InputCollect$dep_var_type == "conversion", "cpa_total", "roi_total" @@ -366,143 +322,91 @@ robyn_pareto <- function(InputCollect, OutputModels, ) ## 4. Spend response curve - dt_transformPlot <- select(dt_mod, .data$ds, all_of(InputCollect$all_media)) # independent variables - dt_transformSpend <- cbind(dt_transformPlot[, "ds"], InputCollect$dt_input[, c(InputCollect$paid_media_spends)]) # spends of indep vars - dt_transformSpendMod <- dt_transformPlot[rw_start_loc:rw_end_loc, ] - # update non-spend variables - # if (length(InputCollect$exposure_vars) > 0) { - # for (expo in InputCollect$exposure_vars) { - # sel_nls <- ifelse(InputCollect$modNLSCollect[channel == expo, rsq_nls > rsq_lm], "nls", "lm") - # dt_transformSpendMod[, (expo) := InputCollect$yhatNLSCollect[channel == expo & models == sel_nls, yhat]] - # } - # } - dt_transformAdstock <- dt_transformPlot - dt_transformSaturation <- dt_transformPlot[ - rw_start_loc:rw_end_loc, - ] - - m_decayRate <- list() - for (med in seq_along(InputCollect$all_media)) { - med_select <- InputCollect$all_media[med] - m <- dt_transformPlot[, med_select][[1]] - # Adstocking - adstock <- InputCollect$adstock - if (adstock == "geometric") { - theta <- hypParam[paste0(InputCollect$all_media[med], "_thetas")][[1]] - } - if (grepl("weibull", adstock)) { - shape <- hypParam[paste0(InputCollect$all_media[med], "_shapes")][[1]] - scale <- hypParam[paste0(InputCollect$all_media[med], "_scales")][[1]] - } - x_list <- transform_adstock(m, adstock, theta = theta, shape = shape, scale = scale) - m_adstocked <- x_list$x_decayed - dt_transformAdstock[med_select] <- m_adstocked - m_adstockedRollWind <- m_adstocked[ - rw_start_loc:rw_end_loc - ] - ## Saturation - alpha <- hypParam[paste0(InputCollect$all_media[med], "_alphas")][[1]] - gamma <- hypParam[paste0(InputCollect$all_media[med], "_gammas")][[1]] - dt_transformSaturation[med_select] <- saturation_hill( - x = m_adstockedRollWind, alpha = alpha, gamma = gamma - ) - } - dt_transformSaturationDecomp <- dt_transformSaturation - for (i in seq_along(InputCollect$all_media)) { - coef <- plotWaterfallLoop$coef[plotWaterfallLoop$rn == InputCollect$all_media[i]] - dt_transformSaturationDecomp[InputCollect$all_media[i]] <- coef * - dt_transformSaturationDecomp[InputCollect$all_media[i]] - } - dt_transformSaturationSpendReverse <- dt_transformAdstock[ - rw_start_loc:rw_end_loc, - ] - - ## Reverse MM fitting - # dt_transformSaturationSpendReverse <- copy(dt_transformAdstock[, c("ds", InputCollect$all_media), with = FALSE]) - # for (i in seq_along(InputCollect$paid_media_spends)) { - # chn <- InputCollect$paid_media_vars[i] - # if (chn %in% 
InputCollect$paid_media_vars[InputCollect$exposure_selector]) { - # # Get Michaelis Menten nls fitting param - # get_chn <- dt_transformSaturationSpendReverse[, chn, with = FALSE] - # Vmax <- InputCollect$modNLSCollect[channel == chn, Vmax] - # Km <- InputCollect$modNLSCollect[channel == chn, Km] - # # Reverse exposure to spend - # dt_transformSaturationSpendReverse[, (chn) := mic_men(x = .SD, Vmax = Vmax, Km = Km, reverse = TRUE), .SDcols = chn] # .SD * Km / (Vmax - .SD) exposure to spend, reverse Michaelis Menthen: x = y*Km/(Vmax-y) - # } else if (chn %in% InputCollect$exposure_vars) { - # coef_lm <- InputCollect$modNLSCollect[channel == chn, coef_lm] - # dt_transformSaturationSpendReverse[, (chn) := .SD / coef_lm, .SDcols = chn] - # } - # } - # dt_transformSaturationSpendReverse <- dt_transformSaturationSpendReverse[rw_start_loc:rw_end_loc] - - dt_scurvePlot <- tidyr::gather( - dt_transformSaturationDecomp, "channel", "response", - 2:ncol(dt_transformSaturationDecomp) - ) %>% - mutate(spend = tidyr::gather( - dt_transformSaturationSpendReverse, "channel", "spend", - 2:ncol(dt_transformSaturationSpendReverse) - )$spend) - - # Remove outlier introduced by MM nls fitting - dt_scurvePlot <- dt_scurvePlot[dt_scurvePlot$spend >= 0, ] + dt_resp_vec_loop <- cbind( + dt_ds, + dt_resp_vec %>% + filter(.data$solID == sid) %>% + select(c("channel", "spend","response")) + ) + dt_transformAdstock <- dt_resp_vec_loop %>% + select(c("ds","channel", "spend")) %>% + pivot_wider(values_from = "spend", names_from = "channel") + dt_transformSaturationDecomp <- dt_resp_vec_loop %>% + select(c("ds","channel", "response")) %>% + pivot_wider(values_from = "response", names_from = "channel") dt_scurvePlotMean <- plotWaterfall %>% filter(.data$solID == sid & !is.na(.data$mean_spend)) %>% - select(c(channel = "rn", "mean_spend", "mean_spend_adstocked", "mean_carryover", "mean_response", "solID")) - + select(c(channel = "rn", "mean_spend", "mean_spend_adstocked", + "mean_carryover", "mean_response", "solID")) # Exposure response curve plot4data <- list( - dt_scurvePlot = dt_scurvePlot, + dt_scurvePlot = dt_resp_vec_loop, dt_scurvePlotMean = dt_scurvePlotMean ) ## 5. 
Fitted vs actual - col_order <- c("ds", "dep_var", InputCollect$all_ind_vars) - dt_transformDecomp <- select( - dt_modRollWind, .data$ds, .data$dep_var, - any_of(c(InputCollect$prophet_vars, InputCollect$context_vars)) - ) %>% - bind_cols(select(dt_transformSaturation, all_of(InputCollect$all_media))) %>% - select(all_of(col_order)) + temp_order1 <- c("ds", "dep_var") + temp_order2 <- c("(Intercept)", InputCollect$prophet_vars, InputCollect$context_vars) + dt_transformDecomp <- dt_modRollWind %>% + mutate("(Intercept)" = 1) %>% + select(all_of(c(temp_order1, temp_order2))) xDecompVec <- xDecompAgg %>% - filter(.data$solID == sid) %>% - select(.data$solID, .data$rn, .data$coef) %>% - tidyr::spread(.data$rn, .data$coef) - if (!("(Intercept)" %in% names(xDecompVec))) xDecompVec[["(Intercept)"]] <- 0 - xDecompVec <- select(xDecompVec, c("solID", "(Intercept)", col_order[!(col_order %in% c("ds", "dep_var"))])) - intercept <- xDecompVec$`(Intercept)` - xDecompVec <- data.frame(mapply( - function(scurved, coefs) scurved * coefs, - scurved = select(dt_transformDecomp, -.data$ds, -.data$dep_var), - coefs = select(xDecompVec, -.data$solID, -.data$`(Intercept)`) - )) - xDecompVec <- mutate(xDecompVec, - intercept = intercept, - depVarHat = rowSums(xDecompVec) + intercept, solID = sid - ) - xDecompVec <- bind_cols(select(dt_transformDecomp, .data$ds, .data$dep_var), xDecompVec) + filter(.data$solID == sid & .data$rn %in% temp_order2) %>% + select(.data$rn, .data$coef) %>% + pivot_wider(values_from = "coef", names_from = "rn") %>% + mutate("(Intercept)" = ifelse( + "(Intercept)" %in% levels(plotWaterfallLoop$rn), + .data$`(Intercept)`, 0)) + xDecompVec <- bind_cols( + dt_transformDecomp %>% select(temp_order1), + data.frame(mapply( + function(vec, coefs) {vec * coefs}, + vec = select(dt_transformDecomp, -temp_order1), + coefs = xDecompVec + ), check.names = FALSE), + dt_transformSaturationDecomp %>% select(-"ds") + ) %>% + rename("intercept" = "(Intercept)") %>% + mutate(depVarHat = rowSums(select(., -temp_order1)), + solID = sid) %>% + select(c("ds", "dep_var", InputCollect$all_ind_vars, + "intercept", "depVarHat", "solID")) + xDecompVecPlot <- select(xDecompVec, .data$ds, .data$dep_var, .data$depVarHat) %>% rename("actual" = "dep_var", "predicted" = "depVarHat") - xDecompVecPlotMelted <- tidyr::gather( - xDecompVecPlot, - key = "variable", value = "value", -.data$ds - ) - rsq <- filter(xDecompAgg, .data$solID == sid) %>% - pull(.data$rsq_train) %>% - .[1] + xDecompVecPlotMelted <- xDecompVecPlot %>% + pivot_longer(names_to = "variable", values_to = "value", -.data$ds) %>% + arrange(.data$variable, .data$ds) + rsq <- filter(resultHypParam, .data$solID == sid) %>% + pull(.data$rsq_train) plot5data <- list(xDecompVecPlotMelted = xDecompVecPlotMelted, rsq = rsq) ## 6. Diagnostic: fitted vs residual plot6data <- list(xDecompVecPlot = xDecompVecPlot) ## 7. Immediate vs carryover response - plot7data <- robyn_immcarr( - InputCollect, - OutputCollect = list( - resultHypParam = resultHypParam, - xDecompAgg = xDecompAgg - ), - solID = sid, ...) 
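The robyn_immcarr() call removed here is replaced below by an inline split computed directly from the decomposed response vectors: carryover is summed per channel and the immediate part is the remainder. The arithmetic, on toy numbers (illustrative, not package data):

response <- c(10, 12, 14) # total decomposed response per period, one channel
response_carryover <- c(2, 3, 4) # carryover portion of each period
Total <- sum(response) # 36
Carryover <- sum(response_carryover) # 9
Immediate <- Total - Carryover # 27
perc_caov <- Carryover / Total # 0.25, carried through as carryover_pct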
+ + temp_p7 <- dt_resp_vec %>% + filter(.data$solID == sid) %>% + group_by(.data$channel) %>% + summarise(Total = sum(.data$response), Carryover = sum(.data$response_carryover)) %>% + mutate(Immediate = .data$Total - .data$Carryover, + perc_imme = 1 - .data$Carryover / .data$Total, + perc_caov = .data$Carryover / .data$Total, + carryover_pct = .data$Carryover / .data$Total) + plot7data <- bind_cols( + temp_p7 %>% + select(rn = "channel", "Immediate", "Carryover") %>% + pivot_longer(names_to = "type", values_to = "response", cols = -"rn"), + temp_p7 %>% + select(rn = "channel", Immediate = "perc_imme", Carryover = "perc_caov") %>% + pivot_longer(names_to = "type", values_to = "percentage", cols = -"rn") %>% + select("percentage"), + temp_p7 %>% + select(rn = "channel", Immediate = "perc_caov", Carryover = "perc_caov") %>% + pivot_longer(names_to = "type", values_to = "carryover_pct", cols = -"rn") %>% + select("carryover_pct") + ) %>% mutate(solID = sid) df_caov_pct_all <- rbind(df_caov_pct_all, plot7data) ## 8. Bootstrapped ROI/CPA with CIs @@ -510,12 +414,7 @@ robyn_pareto <- function(InputCollect, OutputModels, # Gather all results mediaVecCollect <- bind_rows(mediaVecCollect, list( - mutate(dt_transformPlot, type = "rawMedia", solID = sid), - mutate(dt_transformSpend, type = "rawSpend", solID = sid), - mutate(dt_transformSpendMod, type = "predictedExposure", solID = sid), mutate(dt_transformAdstock, type = "adstockedMedia", solID = sid), - mutate(dt_transformSaturation, type = "saturatedMedia", solID = sid), - mutate(dt_transformSaturationSpendReverse, type = "saturatedSpendReversed", solID = sid), mutate(dt_transformSaturationDecomp, type = "decompMedia", solID = sid) )) xDecompVecCollect <- bind_rows(xDecompVecCollect, xDecompVec) @@ -592,7 +491,12 @@ robyn_immcarr <- function( rollingWindow <- rollingWindowStartWhich:rollingWindowEndWhich # Calculate saturated dataframes with carryover and immediate parts hypParamSam <- OutputCollect$resultHypParam[OutputCollect$resultHypParam$solID == solID, ] - dt_saturated_dfs <- run_transformations(InputCollect, hypParamSam, ...) + dt_saturated_dfs <- run_transformations(all_media = InputCollect$all_media, + window_start_loc = InputCollect$rollingWindowStartWhich, + window_end_loc = InputCollect$rollingWindowEndWhich, + dt_mod = InputCollect$dt_mod, + adstock = InputCollect$adstock, + dt_hyppar = hypParamSam, ...) # Calculate decomposition coefs <- OutputCollect$xDecompAgg$coef[OutputCollect$xDecompAgg$solID == solID] names(coefs) <- OutputCollect$xDecompAgg$rn[OutputCollect$xDecompAgg$solID == solID] @@ -651,4 +555,3 @@ robyn_immcarr <- function( left_join(df_caov_pct, c("solID", "rn")) return(xDecompVecImmeCaov) } - diff --git a/R/R/plots.R b/R/R/plots.R index 42cded489..d043f6ffe 100644 --- a/R/R/plots.R +++ b/R/R/plots.R @@ -487,10 +487,14 @@ robyn_onepagers <- function( trim_rate <- 1.3 # maybe enable as a parameter if (trim_rate > 0) { dt_scurvePlot <- dt_scurvePlot %>% + select(-"ds") %>% + bind_rows(data.frame( + channel = unique(dt_scurvePlot$channel), spend = 0, response = 0 + )) %>% filter( .data$spend < max(dt_scurvePlotMean$mean_spend_adstocked) * trim_rate, .data$response < max(dt_scurvePlotMean$mean_response) * trim_rate, - .data$channel %in% InputCollect$paid_media_spends + .data$channel %in% InputCollect$paid_media_selected ) %>% left_join( dt_scurvePlotMean[, c("channel", "mean_carryover")], "channel" @@ -591,6 +595,7 @@ robyn_onepagers <- function( ## 7. 
Immediate vs carryover
   df_imme_caov <- temp[[sid]]$plot7data
   p7 <- df_imme_caov %>%
+    replace(is.na(.), 0) %>%
     mutate(type = factor(.data$type, levels = c("Immediate", "Carryover"))) %>%
     ggplot(aes(
       x = .data$percentage, y = .data$rn, fill = reorder(.data$type, as.integer(.data$type)),
@@ -819,13 +824,16 @@ allocation_plots <- function(
       .data$value / dplyr::first(.data$value)
     })
     metric_vals <- if (metric == "ROAS") resp_metric$total_roi else resp_metric$total_cpa
+    resp_delta <- df_roi %>% group_by(.data$type) %>%
+      summarise(resp_delta = unique(.data$total_response_lift)) %>%
+      pull(resp_delta)
     labs <- paste(
       paste(levs2, "\n"),
       paste("Spend:", formatNum(
         100 * (resp_metric$total_spend - resp_metric$total_spend[1]) / resp_metric$total_spend[1],
         signif = 3, pos = "%", sign = TRUE
       )),
-      unique(paste("Resp:", formatNum(100 * df_roi$total_response_lift, signif = 3, pos = "%", sign = TRUE))),
+      paste("Resp:", formatNum(100 * resp_delta, signif = 3, pos = "%", sign = TRUE)),
      paste(metric, ":", round(metric_vals, 2)),
      sep = "\n"
    )
@@ -1585,7 +1593,7 @@ decomp_plot <- function(
   # Sort variables by baseline first & amount of absolute impact
   levs <- df %>%
     group_by(.data$variable) %>%
-    summarize(impact = sum(abs(.data$value))) %>%
+    summarise(impact = sum(abs(.data$value))) %>%
     mutate(is_baseline = grepl("Baseline_L", .data$variable)) %>%
     arrange(desc(.data$is_baseline), desc(.data$impact)) %>%
     filter(.data$impact > 0) %>%
@@ -1600,8 +1608,8 @@ decomp_plot <- function(
     filter(.data$variable %in% levs) %>%
     mutate(variable = factor(.data$variable, levels = rev(levs)))

-  p <- ggplot(df, aes(x = as.character(ds), y = value, fill = variable)) +
-    facet_grid(solID ~ .) +
+  p <- ggplot(df, aes(x = as.character(.data$ds), y = .data$value, fill = .data$variable)) +
+    facet_grid(.data$solID ~ .) +
     labs(
       title = paste(varType, "Decomposition by Variable"),
       x = NULL, y = paste(intType, varType), fill = NULL
@@ -1619,9 +1627,32 @@ decomp_plot <- function(
   return(p)
 }

+# Custom plot for geom_density interval
+geom_density_ci <- function(
+  gg_density, # ggplot object that contains geom_density
+  ci_low,
+  ci_up,
+  fill = "grey"
+) {
+  build_object <- ggplot_build(gg_density)
+  x_dens <- build_object$data[[1]]$x
+  y_dens <- build_object$data[[1]]$y
+  ind_low <- min(which(x_dens >= ci_low))
+  ind_up <- max(which(x_dens <= ci_up))
+
+  gg_density <- gg_density +
+    geom_area(
+      data = data.frame(
+        x = x_dens[ind_low:ind_up],
+        density = y_dens[ind_low:ind_up]),
+      aes(x = .data$x, y = .data$density),
+      fill = fill,
+      alpha = 0.6)
+  return(gg_density)
+}
+
 get_evenly_separated_dates <- function(dates, n = 6) {
   dates <- sort(dates)
-  intervals <- n - 1
   indices <- round(seq(1, length(dates), length.out = n))
   selected_dates <- dates[indices]
   return(as.character(selected_dates))
diff --git a/R/R/refresh.R b/R/R/refresh.R
index a992ac548..7dac35464 100644
--- a/R/R/refresh.R
+++ b/R/R/refresh.R
@@ -276,7 +276,8 @@ robyn_refresh <- function(json_file = NULL,
       window_start = InputCollectRF$window_start,
       window_end = InputCollectRF$window_end,
       paid_media_spends = InputCollectRF$paid_media_spends,
-      organic_vars = InputCollectRF$organic_vars
+      organic_vars = InputCollectRF$organic_vars,
+      paid_media_selected = InputCollectRF$paid_media_selected
     )
     InputCollectRF$calibration_input <- calibration_input
   }
diff --git a/R/R/response.R b/R/R/response.R
index 7334a7ba0..bf22fea21 100644
--- a/R/R/response.R
+++ b/R/R/response.R
@@ -14,10 +14,10 @@
 #' @param metric_name A character.
#' Must be one value from paid_media_spends, paid_media_vars or organic_vars #' @param metric_value Numeric. Desired metric value to return a response for. -#' @param dt_hyppar A data.frame. When \code{robyn_object} is not provided, use +#' @param dt_hyppar A data.frame. When \code{json_file} is not provided, use #' \code{dt_hyppar = OutputCollect$resultHypParam}. It must be provided along #' \code{select_model}, \code{dt_coef} and \code{InputCollect}. -#' @param dt_coef A data.frame. When \code{robyn_object} is not provided, use +#' @param dt_coef A data.frame. When \code{json_file} is not provided, use #' \code{dt_coef = OutputCollect$xDecompAgg}. It must be provided along #' \code{select_model}, \code{dt_hyppar} and \code{InputCollect}. #' @examples @@ -106,7 +106,6 @@ robyn_response <- function(InputCollect = NULL, OutputCollect = NULL, json_file = NULL, - robyn_object = NULL, select_build = NULL, select_model = NULL, metric_name = NULL, @@ -133,40 +132,12 @@ robyn_response <- function(InputCollect = NULL, if (is.null(dt_hyppar)) dt_hyppar <- OutputCollect$resultHypParam if (is.null(dt_coef)) dt_coef <- OutputCollect$xDecompAgg } else { - if (!is.null(robyn_object)) { - if (!file.exists(robyn_object)) { - stop("File does not exist or is somewhere else. Check: ", robyn_object) - } else { - Robyn <- readRDS(robyn_object) - objectPath <- dirname(robyn_object) - objectName <- sub("'\\..*$", "", basename(robyn_object)) - } - select_build_all <- 0:(length(Robyn) - 1) - if (is.null(select_build)) { - select_build <- max(select_build_all) - if (!quiet && length(select_build_all) > 1) { - message( - "Using latest model: ", ifelse(select_build == 0, "initial model", paste0("refresh model #", select_build)), - " for the response function. Use parameter 'select_build' to specify which run to use" - ) - } - } - if (!(select_build %in% select_build_all) || length(select_build) != 1) { - stop("'select_build' must be one value of ", paste(select_build_all, collapse = ", ")) - } - listName <- ifelse(select_build == 0, "listInit", paste0("listRefresh", select_build)) - InputCollect <- Robyn[[listName]][["InputCollect"]] - OutputCollect <- Robyn[[listName]][["OutputCollect"]] - dt_hyppar <- OutputCollect$resultHypParam - dt_coef <- OutputCollect$xDecompAgg - } else { - # Try to get some pre-filled values + # Get pre-filled values if (is.null(dt_hyppar)) dt_hyppar <- OutputCollect$resultHypParam if (is.null(dt_coef)) dt_coef <- OutputCollect$xDecompAgg if (any(is.null(dt_hyppar), is.null(dt_coef), is.null(InputCollect), is.null(OutputCollect))) { - stop("When 'robyn_object' is not provided, 'InputCollect' & 'OutputCollect' must be provided") + stop("When 'json_file' is not provided, 'InputCollect' & 'OutputCollect' must be provided") } - } } if ("selectID" %in% names(OutputCollect)) { @@ -176,12 +147,15 @@ robyn_response <- function(InputCollect = NULL, ## Prep environment if (TRUE) { dt_input <- InputCollect$dt_input - startRW <- InputCollect$rollingWindowStartWhich - endRW <- InputCollect$rollingWindowEndWhich + dt_mod <- InputCollect$dt_mod + window_start_loc <- InputCollect$rollingWindowStartWhich + window_end_loc <- InputCollect$rollingWindowEndWhich + window_loc <- window_start_loc:window_end_loc adstock <- InputCollect$adstock - spendExpoMod <- InputCollect$modNLS$results + # spendExpoMod <- InputCollect$ExposureCollect$df_cpe paid_media_vars <- InputCollect$paid_media_vars paid_media_spends <- InputCollect$paid_media_spends + paid_media_selected <- InputCollect$paid_media_selected exposure_vars <- 
InputCollect$exposure_vars organic_vars <- InputCollect$organic_vars allSolutions <- unique(dt_hyppar$solID) @@ -198,139 +172,91 @@ robyn_response <- function(InputCollect = NULL, ## Get use case based on inputs usecase <- which_usecase(metric_value, date_range) - ## Check inputs with usecases - metric_type <- check_metric_type(metric_name, paid_media_spends, paid_media_vars, exposure_vars, organic_vars) - all_dates <- pull(dt_input, InputCollect$date_var) - all_values <- pull(dt_input, metric_name) - - if (usecase == "all_historical_vec") { - ds_list <- check_metric_dates(date_range = "all", all_dates[1:endRW], dayInterval, quiet, ...) - metric_value <- NULL - # val_list <- check_metric_value(metric_value, metric_name, all_values, ds_list$metric_loc) - } else if (usecase == "unit_metric_default_last_n") { - ds_list <- check_metric_dates(date_range = paste0("last_", length(metric_value)), all_dates[1:endRW], dayInterval, quiet, ...) - # val_list <- check_metric_value(metric_value, metric_name, all_values, ds_list$metric_loc) - } else { - ds_list <- check_metric_dates(date_range, all_dates[1:endRW], dayInterval, quiet, ...) + ## Check inputs + metric_type <- check_metric_type(metric_name, paid_media_spends, paid_media_vars, paid_media_selected, exposure_vars, organic_vars) + metric_name_updated <- metric_type$metric_name_updated + all_dates <- dt_input[[InputCollect$date_var]] + all_values <- dt_mod[[metric_name_updated]] + ds_list <- check_metric_dates(date_range = date_range, all_dates[1:window_end_loc], dayInterval, quiet, ...) + val_list <- check_metric_value(metric_value, metric_name_updated, all_values, ds_list$metric_loc) + if (!is.null(metric_value) & is.null(date_range)) { + stop("Must specify date_range when using metric_value") } - val_list <- check_metric_value(metric_value, metric_name, all_values, ds_list$metric_loc) date_range_updated <- ds_list$date_range_updated - metric_value_updated <- val_list$metric_value_updated all_values_updated <- val_list$all_values_updated - ## Transform exposure to spend when necessary - if (metric_type == "exposure") { - get_spend_name <- paid_media_spends[which(paid_media_vars == metric_name)] - # expo_vec <- dt_input[, metric_name][[1]] - # Use non-0 mean as marginal level if metric_value not provided - # if (is.null(metric_value)) { - # metric_value <- mean(expo_vec[startRW:endRW][expo_vec[startRW:endRW] > 0]) - # if (!quiet) message("Input 'metric_value' not provided. Using mean of ", metric_name, " instead") - # } - # Fit spend to exposure - # spend_vec <- dt_input[, get_spend_name][[1]] - if (is.null(spendExpoMod)) { - stop("Can't calculate exposure to spend response. Please, recreate your InputCollect object") - } - temp <- filter(spendExpoMod, .data$channel == metric_name) - nls_select <- temp$rsq_nls > temp$rsq_lm - if (nls_select) { - Vmax <- spendExpoMod$Vmax[spendExpoMod$channel == metric_name] - Km <- spendExpoMod$Km[spendExpoMod$channel == metric_name] - input_immediate <- mic_men(x = metric_value_updated, Vmax = Vmax, Km = Km, reverse = TRUE) - } else { - coef_lm <- spendExpoMod$coef_lm[spendExpoMod$channel == metric_name] - input_immediate <- metric_value_updated / coef_lm - } - all_values_updated[ds_list$metric_loc] <- input_immediate - hpm_name <- get_spend_name - } else { - # use non-0 means marginal level if spend not provided - # if (is.null(metric_value)) { - # metric_value <- mean(media_vec[startRW:endRW][media_vec[startRW:endRW] > 0]) - # if (!quiet) message("Input 'metric_value' not provided. 
Using mean of ", metric_name, " instead") - # } - input_immediate <- metric_value_updated - hpm_name <- metric_name - } - - ## Adstocking original - media_vec_origin <- dt_input[, metric_name][[1]] + ## Get hyperparameters & beta coef theta <- scale <- shape <- NULL if (adstock == "geometric") { - theta <- dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_thetas")]][[1]] + theta <- dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(metric_name_updated, "_thetas")]][[1]] } if (grepl("weibull", adstock)) { - shape <- dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_shapes")]][[1]] - scale <- dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_scales")]][[1]] + shape <- dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(metric_name_updated, "_shapes")]][[1]] + scale <- dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(metric_name_updated, "_scales")]][[1]] } - x_list <- transform_adstock(media_vec_origin, adstock, theta = theta, shape = shape, scale = scale) - m_adstocked <- x_list$x_decayed - # net_carryover_ref <- m_adstocked - media_vec_origin - - ## Adstocking simulation - x_list_sim <- transform_adstock(all_values_updated, adstock, theta = theta, shape = shape, scale = scale) - media_vec_sim <- x_list_sim$x_decayed - media_vec_sim_imme <- if (adstock == "weibull_pdf") x_list_sim$x_imme else x_list_sim$x - input_total <- media_vec_sim[ds_list$metric_loc] - input_immediate <- media_vec_sim_imme[ds_list$metric_loc] - input_carryover <- input_total - input_immediate + alpha <- head(dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(metric_name_updated, "_alphas")]], 1) + gamma <- head(dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(metric_name_updated, "_gammas")]], 1) + coeff <- dt_coef[dt_coef$solID == select_model & dt_coef$rn == metric_name_updated, ][["coef"]] - ## Saturation - m_adstockedRW <- m_adstocked[startRW:endRW] - alpha <- head(dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_alphas")]], 1) - gamma <- head(dt_hyppar[dt_hyppar$solID == select_model, ][[paste0(hpm_name, "_gammas")]], 1) - if (usecase == "all_historical_vec") { - metric_saturated_total <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma) - metric_saturated_carryover <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma) - } else { - metric_saturated_total <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma, x_marginal = input_total) - metric_saturated_carryover <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma, x_marginal = input_carryover) + ## Historical transformation + hist_transform <- transform_decomp( + all_values = all_values, + adstock, theta, shape, scale, alpha, gamma, + window_loc, coeff, metric_loc = ds_list$metric_loc) + dt_line <- data.frame( + metric = hist_transform$input_total[window_loc], + response = hist_transform$response_total, + channel = metric_name_updated) + dt_point <- data.frame( + mean_input_immediate = hist_transform$mean_input_immediate, + mean_input_carryover = hist_transform$mean_input_carryover, + mean_input_total = hist_transform$mean_input_immediate + hist_transform$mean_input_carryover, + mean_response_immediate = hist_transform$mean_response_total - hist_transform$mean_response_carryover, + mean_response_carryover = hist_transform$mean_response_carryover, + mean_response_total = hist_transform$mean_response_total + ) + if (!is.null(date_range)) { + dt_point_sim <- data.frame( + input = hist_transform$sim_mean_spend + 
hist_transform$sim_mean_carryover, + output = hist_transform$sim_mean_response) } - metric_saturated_immediate <- metric_saturated_total - metric_saturated_carryover - ## Decomp - coeff <- dt_coef[dt_coef$solID == select_model & dt_coef$rn == hpm_name, ][["coef"]] - m_saturated <- saturation_hill(x = m_adstockedRW, alpha = alpha, gamma = gamma) - m_resposne <- m_saturated * coeff - response_total <- as.numeric(metric_saturated_total * coeff) - response_carryover <- as.numeric(metric_saturated_carryover * coeff) - response_immediate <- response_total - response_carryover - - dt_line <- data.frame(metric = m_adstockedRW, response = m_resposne, channel = metric_name) - if (usecase == "all_historical_vec") { - dt_point <- data.frame(input = input_total[startRW:endRW], output = response_total, ds = date_range_updated[startRW:endRW]) - dt_point_caov <- data.frame(input = input_carryover[startRW:endRW], output = response_carryover) - dt_point_imme <- data.frame(input = input_immediate[startRW:endRW], output = response_immediate) - } else { - dt_point <- data.frame(input = input_total, output = response_total, ds = date_range_updated) - dt_point_caov <- data.frame(input = input_carryover, output = response_carryover) - dt_point_imme <- data.frame(input = input_immediate, output = response_immediate) + ## Simulated transformation + if (!is.null(metric_value)) { + hist_transform_sim <- transform_decomp( + all_values = all_values_updated, + adstock, theta, shape, scale, alpha, gamma, + window_loc, coeff, metric_loc = ds_list$metric_loc, + calibrate_inflexion = hist_transform$inflexion) + dt_point_sim <- data.frame( + input = hist_transform_sim$sim_mean_spend + hist_transform_sim$sim_mean_carryover, + output = hist_transform_sim$sim_mean_response) } ## Plot optimal response p_res <- ggplot(dt_line, aes(x = .data$metric, y = .data$response)) + geom_line(color = "steelblue") + - geom_point(data = dt_point, aes(x = .data$input, y = .data$output), size = 3) + + geom_point( + data = dt_point, + aes(x = .data$mean_input_total, y = .data$mean_response_total), + size = 3, color = "grey") + labs( title = paste( - "Saturation curve of", - ifelse(metric_type == "organic", "organic", "paid"), - "media:", metric_name, - ifelse(!is.null(date_range_updated), "adstocked", ""), - ifelse(metric_type == "spend", "spend metric", "exposure metric") + "Saturation curve of", metric_type$metric_type, + "media:", metric_type$metric_name_updated + ), + subtitle = sprintf(paste( + "Response: %s @ mean input %s", + "Response: %s @ mean input carryover %s", + "Response: %s @ mean input immediate %s", + sep = "\n"), + num_abbr(dt_point$mean_response_total), + num_abbr(dt_point$mean_input_total), + num_abbr(dt_point$mean_response_carryover), + num_abbr(dt_point$mean_input_carryover), + num_abbr(dt_point$mean_response_immediate), + num_abbr(dt_point$mean_input_immediate) ), - subtitle = ifelse(length(unique(input_total)) == 1, sprintf( - paste( - "Carryover* Response: %s @ Input %s", - "Immediate Response: %s @ Input %s", - "Total (C+I) Response: %s @ Input %s", - sep = "\n" - ), - num_abbr(dt_point_caov$output), num_abbr(dt_point_caov$input), - num_abbr(dt_point_imme$output), num_abbr(dt_point_imme$input), - num_abbr(dt_point$output), num_abbr(dt_point$input) - ), ""), x = "Input", y = "Response", caption = sprintf( "Response period: %s%s%s", @@ -342,20 +268,35 @@ robyn_response <- function(InputCollect = NULL, theme_lares(background = "white") + scale_x_abbr() + scale_y_abbr() - if (length(unique(metric_value)) == 1) { + if 
(!is.null(metric_value) | !is.null(date_range)) { p_res <- p_res + - geom_point(data = dt_point_caov, aes(x = .data$input, y = .data$output), size = 3, shape = 8) + geom_point(data = dt_point_sim, aes(x = .data$input, y = .data$output), size = 3, color = "blue") + } + if (!is.null(metric_value)) { + sim_mean_spend <- hist_transform_sim$sim_mean_spend + sim_mean_carryover <- hist_transform_sim$sim_mean_carryover + sim_mean_response <- hist_transform_sim$sim_mean_response + } else { + sim_mean_spend <- sim_mean_carryover <- sim_mean_response <- NULL } ret <- list( - metric_name = metric_name, + metric_name = metric_name_updated, date = date_range_updated, - input_total = input_total, - input_carryover = input_carryover, - input_immediate = input_immediate, - response_total = response_total, - response_carryover = response_carryover, - response_immediate = response_immediate, + input_total = hist_transform$input_total, + input_carryover = hist_transform$input_carryover, + input_immediate = hist_transform$input_immediate, + response_total = hist_transform$response_total, + response_carryover = hist_transform$response_carryover, + response_immediate = hist_transform$response_immediate, + inflexion = hist_transform$inflexion, + mean_input_immediate = hist_transform$mean_input_immediate, + mean_input_carryover = hist_transform$mean_input_carryover, + mean_response_total = hist_transform$mean_response_total, + mean_response_carryover = hist_transform$mean_response_carryover, + sim_mean_spend = sim_mean_spend, + sim_mean_carryover = sim_mean_carryover, + sim_mean_response = sim_mean_response, usecase = usecase, plot = p_res ) @@ -385,6 +326,79 @@ which_usecase <- function(metric_value, date_range) { return(usecase) } +transform_decomp <- function(all_values, adstock, theta, shape, scale, alpha, gamma, + window_loc, coeff, metric_loc, calibrate_inflexion = NULL) { + ## adstock + x_list <- transform_adstock(x = all_values, adstock, theta, shape, scale) + input_total <- x_list$x_decayed + input_immediate <- if (adstock == "weibull_pdf") x_list$x_imme else x_list$x + input_carryover <- input_total - input_immediate + input_total_rw <- input_total[window_loc] + input_carryover_rw <- input_carryover[window_loc] + ## saturation + saturated_total <- saturation_hill( + x = input_total_rw, + alpha = alpha, gamma = gamma + ) + saturated_carryover <- saturation_hill( + x = input_total_rw, + alpha = alpha, gamma = gamma, x_marginal = input_carryover_rw + ) + saturated_immediate <- saturated_total$x_saturated - saturated_carryover$x_saturated + ## simulate mean response of all_values periods + mean_input_immediate <- mean(input_immediate[window_loc]) + mean_input_carryover <- mean(input_carryover_rw) + mean_response_total <- fx_objective( + x = mean_input_immediate, + coeff = coeff, + alpha = alpha, + inflexion = saturated_total$inflexion, + x_hist_carryover = mean_input_carryover, + get_sum = FALSE + ) + mean_response_carryover <- fx_objective( + x = 0, + coeff = coeff, + alpha = alpha, + inflexion = saturated_total$inflexion, + x_hist_carryover = mean_input_carryover, + get_sum = FALSE + ) + ## simulate mean response of date_range periods + sim_mean_spend <- mean(input_immediate[metric_loc]) + sim_mean_carryover <- mean(input_carryover[metric_loc]) + if (is.null(calibrate_inflexion)) calibrate_inflexion <- saturated_total$inflexion + sim_mean_response <- fx_objective( + x = sim_mean_spend, + coeff = coeff, + alpha = alpha, + # use historical true inflexion when metric_value is provided + inflexion = 
calibrate_inflexion, + x_hist_carryover = sim_mean_carryover, + get_sum = FALSE + ) + + ret <- list( + input_total = input_total, + input_immediate = input_immediate, + input_carryover = input_carryover, + saturated_total = saturated_total$x_saturated, + saturated_carryover = saturated_carryover$x_saturated, + saturated_immediate = saturated_immediate, + response_total = saturated_total$x_saturated * coeff, + response_carryover = saturated_carryover$x_saturated * coeff, + response_immediate = saturated_immediate * coeff, + inflexion = saturated_total$inflexion, + mean_input_immediate = mean_input_immediate, + mean_input_carryover = mean_input_carryover, + mean_response_total = mean_response_total, + mean_response_carryover = mean_response_carryover, + sim_mean_spend = sim_mean_spend, + sim_mean_carryover = sim_mean_carryover, + sim_mean_response = sim_mean_response + ) + return(ret) +} # ####### SCENARIOS CHECK FOR date_range # metric_value <- 71427 # all_dates <- dt_input$DATE diff --git a/R/R/transformation.R b/R/R/transformation.R index 89c6266b3..5bb4d98f0 100644 --- a/R/R/transformation.R +++ b/R/R/transformation.R @@ -119,18 +119,14 @@ adstock_geometric <- function(x, theta) { #' peak value occurring after the first period when shape >=1, allowing lagged #' effect. #' @examples -#' adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "CDF") -#' adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "PDF") +#' adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "cdf") +#' adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "pdf") #' -#' # Wrapped function for either adstock -#' transform_adstock(rep(100, 10), "weibull_pdf", shape = 1, scale = 0.5) #' @rdname adstocks #' @export -adstock_weibull <- function(x, shape, scale, windlen = length(x), type = "cdf") { - stopifnot(length(shape) == 1) - stopifnot(length(scale) == 1) +adstock_weibull <- function(x, shape, scale, windlen = length(x), type = "pdf") { if (length(x) > 1) { - check_opts(tolower(type), c("cdf", "pdf")) + # check_opts(tolower(type), c("cdf", "pdf")) x_bin <- 1:windlen scaleTrans <- round(quantile(1:windlen, scale), 0) if (shape == 0 | scale == 0) { @@ -138,11 +134,11 @@ adstock_weibull <- function(x, shape, scale, windlen = length(x), type = "cdf") thetaVecCum <- thetaVec <- rep(0, windlen) x_imme <- x } else { - if ("cdf" %in% tolower(type)) { + if ("pdf" %in% type) { + thetaVecCum <- .normalize(dweibull(x_bin, shape = shape, scale = scaleTrans)) # plot(thetaVecCum) + } else if ("cdf" %in% type) { thetaVec <- c(1, 1 - pweibull(head(x_bin, -1), shape = shape, scale = scaleTrans)) # plot(thetaVec) thetaVecCum <- cumprod(thetaVec) # plot(thetaVecCum) - } else if ("pdf" %in% tolower(type)) { - thetaVecCum <- .normalize(dweibull(x_bin, shape = shape, scale = scaleTrans)) # plot(thetaVecCum) } x_decayed <- mapply(function(x_val, x_pos) { x.vec <- c(rep(0, x_pos - 1), rep(x_val, windlen - x_pos + 1)) @@ -171,13 +167,12 @@ adstock_weibull <- function(x, shape, scale, windlen = length(x), type = "cdf") #' @param adstock Character. One of: "geometric", "weibull_cdf", "weibull_pdf". 
#' @export
 transform_adstock <- function(x, adstock, theta = NULL, shape = NULL, scale = NULL, windlen = length(x)) {
-  check_adstock(adstock)
+  # check_adstock(adstock)
   if (adstock == "geometric") {
     x_list_sim <- adstock_geometric(x = x, theta = theta)
-  } else if (adstock == "weibull_cdf") {
-    x_list_sim <- adstock_weibull(x = x, shape = shape, scale = scale, windlen = windlen, type = "cdf")
-  } else if (adstock == "weibull_pdf") {
-    x_list_sim <- adstock_weibull(x = x, shape = shape, scale = scale, windlen = windlen, type = "pdf")
+  } else {
+    get_type <- substr(adstock, nchar(adstock)-2, nchar(adstock))
+    x_list_sim <- adstock_weibull(x = x, shape = shape, scale = scale, windlen = windlen, type = get_type)
   }
   return(x_list_sim)
 }
@@ -206,18 +201,20 @@ transform_adstock <- function(x, adstock, theta = NULL, shape = NULL, scale = NU
 #' Hill-transformed value of the x_marginal input.
 #' @examples
 #' saturation_hill(c(100, 150, 170, 190, 200), alpha = 3, gamma = 0.5)
-#' @return Numeric values. Transformed values.
+#' @return List. x_saturated as transformed values and the inflexion point.
 #' @export
 saturation_hill <- function(x, alpha, gamma, x_marginal = NULL) {
   stopifnot(length(alpha) == 1)
   stopifnot(length(gamma) == 1)
-  inflexion <- c(range(x) %*% c(1 - gamma, gamma)) # linear interpolation by dot product
+  inflexion <- max(x) * gamma
+  # inflexion <- .dot_product(range = range(x), proportion = gamma) # linear interpolation by dot product
   if (is.null(x_marginal)) {
-    x_scurve <- x**alpha / (x**alpha + inflexion**alpha) # plot(x_scurve) summary(x_scurve)
+    x_saturated <- x**alpha / (x**alpha + inflexion**alpha) # plot(x_saturated) summary(x_saturated)
   } else {
-    x_scurve <- x_marginal**alpha / (x_marginal**alpha + inflexion**alpha)
+    x_saturated <- x_marginal**alpha / (x_marginal**alpha + inflexion**alpha)
   }
-  return(x_scurve)
+  return(list(x_saturated = x_saturated,
+              inflexion = inflexion))
 }
@@ -367,18 +364,27 @@ plot_saturation <- function(plot = TRUE) {
 #### Transform media for model fitting
 #' @name transformations
 #' @inheritParams robyn_inputs
+#' @param all_media Character. Vector of all selected paid media variable names.
+#' @param window_start_loc Integer. Rolling window start location.
+#' @param window_end_loc Integer. Rolling window end location.
+#' @param dt_mod data.frame. Transformed input table for transformation.
+#' @param adstock Character. Adstock config.
+#' @param dt_hyppar data.frame. All hyperparameters for provided media.
 #' @export
-run_transformations <- function(InputCollect, hyperparameters, ...) {
-  all_media <- InputCollect$all_media
-  rollingWindowStartWhich <- InputCollect$rollingWindowStartWhich
-  rollingWindowEndWhich <- InputCollect$rollingWindowEndWhich
-  dt_modAdstocked <- select(InputCollect$dt_mod, -.data$ds)
-  adstock <- InputCollect$adstock
-
-  mediaAdstocked <- list()
-  mediaSaturated <- list()
-  mediaSaturatedImmediate <- list()
-  mediaSaturatedCarryover <- list()
+run_transformations <- function(all_media,
+                                window_start_loc,
+                                window_end_loc,
+                                dt_mod,
+                                adstock,
+                                dt_hyppar, ...)
{ + dt_modAdstocked <- select(dt_mod, -.data$ds) + window_loc <- window_start_loc:window_end_loc + adstocked_collect <- list() + saturated_total_collect <- list() + saturated_immediate_collect <- list() + saturated_carryover_collect <- list() + inflexion_collect <- list() + inflation_collect <- list() for (v in seq_along(all_media)) { ################################################ @@ -386,59 +392,68 @@ run_transformations <- function(InputCollect, hyperparameters, ...) { # Decayed/adstocked response = Immediate response + Carryover response m <- dt_modAdstocked[, all_media[v]][[1]] if (adstock == "geometric") { - theta <- hyperparameters[paste0(all_media[v], "_thetas")][[1]][[1]] + theta <- dt_hyppar[paste0(all_media[v], "_thetas")][[1]][[1]] } if (grepl("weibull", adstock)) { - shape <- hyperparameters[paste0(all_media[v], "_shapes")][[1]][[1]] - scale <- hyperparameters[paste0(all_media[v], "_scales")][[1]][[1]] + shape <- dt_hyppar[paste0(all_media[v], "_shapes")][[1]][[1]] + scale <- dt_hyppar[paste0(all_media[v], "_scales")][[1]][[1]] } x_list <- transform_adstock(m, adstock, theta = theta, shape = shape, scale = scale) - m_imme <- if (adstock == "weibull_pdf") x_list$x_imme else m - m_adstocked <- x_list$x_decayed - mediaAdstocked[[v]] <- m_adstocked - m_carryover <- m_adstocked - m_imme - # mediaImmediate[[v]] <- m_imme - # mediaCarryover[[v]] <- m_carryover + input_total <- x_list$x_decayed + input_immediate <- if (adstock == "weibull_pdf") x_list$x_imme else m + adstocked_collect[[v]] <- input_total + input_carryover <- input_total - input_immediate + # mediaImmediate[[v]] <- input_immediate + # mediaCarryover[[v]] <- input_carryover # mediaVecCum[[v]] <- x_list$thetaVecCum ################################################ ## 2. Saturation (only window data) # Saturated response = Immediate response + carryover response - m_adstockedRollWind <- m_adstocked[rollingWindowStartWhich:rollingWindowEndWhich] - m_carryoverRollWind <- m_carryover[rollingWindowStartWhich:rollingWindowEndWhich] + input_total_rw <- input_total[window_loc] + input_carryover_rw <- input_carryover[window_loc] + alpha <- dt_hyppar[paste0(all_media[v], "_alphas")][[1]][[1]] + gamma <- dt_hyppar[paste0(all_media[v], "_gammas")][[1]][[1]] - alpha <- hyperparameters[paste0(all_media[v], "_alphas")][[1]][[1]] - gamma <- hyperparameters[paste0(all_media[v], "_gammas")][[1]][[1]] - mediaSaturated[[v]] <- saturation_hill( - m_adstockedRollWind, + sat_temp_total <- saturation_hill(x = input_total_rw, alpha = alpha, gamma = gamma ) - mediaSaturatedCarryover[[v]] <- saturation_hill( - m_adstockedRollWind, - alpha = alpha, gamma = gamma, x_marginal = m_carryoverRollWind + sat_temp_caov <- saturation_hill( + x = input_total_rw, + alpha = alpha, gamma = gamma, x_marginal = input_carryover_rw ) - mediaSaturatedImmediate[[v]] <- mediaSaturated[[v]] - mediaSaturatedCarryover[[v]] - # plot(m_adstockedRollWind, mediaSaturated[[1]]) + saturated_total_collect[[v]] <- sat_temp_total[["x_saturated"]] + saturated_carryover_collect[[v]] <- sat_temp_caov[["x_saturated"]] + saturated_immediate_collect[[v]] <- saturated_total_collect[[v]] - saturated_carryover_collect[[v]] + inflexion_collect[[v]] <- sat_temp_total[["inflexion"]] + inflation_collect[[v]] <- x_list$inflation_total + # plot(input_total_rw, saturated_total_collect[[1]]) } - names(mediaAdstocked) <- names(mediaSaturated) <- names(mediaSaturatedImmediate) <- - names(mediaSaturatedCarryover) <- all_media + names(adstocked_collect) <- names(saturated_total_collect) <- 
names(saturated_immediate_collect) <- + names(saturated_carryover_collect) <- all_media dt_modAdstocked <- dt_modAdstocked %>% select(-all_of(all_media)) %>% - bind_cols(mediaAdstocked) + bind_cols(adstocked_collect) # dt_mediaImmediate <- bind_cols(mediaImmediate) # dt_mediaCarryover <- bind_cols(mediaCarryover) # mediaVecCum <- bind_cols(mediaVecCum) - dt_modSaturated <- dt_modAdstocked[rollingWindowStartWhich:rollingWindowEndWhich, ] %>% + dt_modSaturated <- dt_modAdstocked[window_loc, ] %>% select(-all_of(all_media)) %>% - bind_cols(mediaSaturated) - dt_saturatedImmediate <- bind_cols(mediaSaturatedImmediate) + bind_cols(saturated_total_collect) + dt_saturatedImmediate <- bind_cols(saturated_immediate_collect) dt_saturatedImmediate[is.na(dt_saturatedImmediate)] <- 0 - dt_saturatedCarryover <- bind_cols(mediaSaturatedCarryover) + dt_saturatedCarryover <- bind_cols(saturated_carryover_collect) dt_saturatedCarryover[is.na(dt_saturatedCarryover)] <- 0 + inflexion_collect <- unlist(inflexion_collect) + inflation_collect <- unlist(inflation_collect) + names(inflexion_collect) <- paste0(all_media, "_inflexion") + names(inflation_collect) <- paste0(all_media, "_inflation") return(list( dt_modSaturated = dt_modSaturated, dt_saturatedImmediate = dt_saturatedImmediate, - dt_saturatedCarryover = dt_saturatedCarryover + dt_saturatedCarryover = dt_saturatedCarryover, + inflexions = inflexion_collect, + inflations = inflation_collect )) } diff --git a/R/data/df_curve_reach_freq.RData b/R/data/df_curve_reach_freq.RData new file mode 100644 index 000000000..0678e6e67 Binary files /dev/null and b/R/data/df_curve_reach_freq.RData differ diff --git a/R/man/Robyn.Rd b/R/man/Robyn.Rd index d0d9f095d..e2712bd9c 100644 --- a/R/man/Robyn.Rd +++ b/R/man/Robyn.Rd @@ -2,8 +2,8 @@ % Please edit documentation in R/imports.R \docType{package} \name{Robyn} -\alias{Robyn-package} \alias{Robyn} +\alias{Robyn-package} \title{Robyn MMM Project from Meta Marketing Science} \description{ Robyn is an automated Marketing Mix Modeling (MMM) code. It aims to reduce human diff --git a/R/man/adstocks.Rd b/R/man/adstocks.Rd index 286c4bc76..0cfd3a599 100644 --- a/R/man/adstocks.Rd +++ b/R/man/adstocks.Rd @@ -9,7 +9,7 @@ \usage{ adstock_geometric(x, theta) -adstock_weibull(x, shape, scale, windlen = length(x), type = "cdf") +adstock_weibull(x, shape, scale, windlen = length(x), type = "pdf") transform_adstock( x, @@ -88,11 +88,9 @@ Run \code{plot_adstock()} to see the difference visually. 
} \examples{ adstock_geometric(rep(100, 5), theta = 0.5) -adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "CDF") -adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "PDF") +adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "cdf") +adstock_weibull(rep(100, 5), shape = 0.5, scale = 0.5, type = "pdf") -# Wrapped function for either adstock -transform_adstock(rep(100, 10), "weibull_pdf", shape = 1, scale = 0.5) } \seealso{ Other Transformations: diff --git a/R/man/df_curve_reach_freq.Rd b/R/man/df_curve_reach_freq.Rd new file mode 100644 index 000000000..a3d572118 --- /dev/null +++ b/R/man/df_curve_reach_freq.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{df_curve_reach_freq} +\alias{df_curve_reach_freq} +\title{Robyn Dataset: Reach & frequency simulated dataset} +\format{ +An object of class \code{"data.frame"} +\describe{ + \item{spend_cumulated}{cumulated spend of paid media} + \item{response_cumulated}{cumulated reach of paid media} + \item{freq_bucket}{Frequency bucket for cumulated reach} +} +} +\usage{ +data(df_curve_reach_freq) +} +\value{ +data.frame + +Dataframe. +} +\description{ +A simulated cumulated reach and spend dataset by frequency buckets. +The headers must be kept as +\code{c("spend_cumulated", "response_cumulated", "freq_bucket")}. +} +\examples{ +data(df_curve_reach_freq) +head(df_curve_reach_freq) +} +\seealso{ +Other Dataset: +\code{\link{dt_prophet_holidays}}, +\code{\link{dt_simulated_weekly}} +} +\concept{Dataset} +\keyword{datasets} diff --git a/R/man/dt_prophet_holidays.Rd b/R/man/dt_prophet_holidays.Rd index 2ec9cdeee..5fe53f57f 100644 --- a/R/man/dt_prophet_holidays.Rd +++ b/R/man/dt_prophet_holidays.Rd @@ -32,6 +32,7 @@ head(dt_prophet_holidays) } \seealso{ Other Dataset: +\code{\link{df_curve_reach_freq}}, \code{\link{dt_simulated_weekly}} } \concept{Dataset} diff --git a/R/man/dt_simulated_weekly.Rd b/R/man/dt_simulated_weekly.Rd index 026ffeadb..8c66d3ac5 100644 --- a/R/man/dt_simulated_weekly.Rd +++ b/R/man/dt_simulated_weekly.Rd @@ -31,6 +31,7 @@ head(dt_simulated_weekly) } \seealso{ Other Dataset: +\code{\link{df_curve_reach_freq}}, \code{\link{dt_prophet_holidays}} } \concept{Dataset} diff --git a/R/man/fit_spend_exposure.Rd b/R/man/fit_spend_exposure.Rd deleted file mode 100644 index 16b9d0b46..000000000 --- a/R/man/fit_spend_exposure.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/inputs.R -\name{fit_spend_exposure} -\alias{fit_spend_exposure} -\title{Fit a nonlinear model for media spend and exposure} -\usage{ -fit_spend_exposure(dt_spendModInput, mediaCostFactor, paid_media_var) -} -\arguments{ -\item{dt_spendModInput}{data.frame. Containing channel spends and -exposure data.} - -\item{mediaCostFactor}{Numeric vector. The ratio between raw media -exposure and spend metrics.} - -\item{paid_media_var}{Character. Paid media variable.} -} -\value{ -List. Containing the all spend-exposure model results. -} -\description{ -This function is called in \code{robyn_engineering()}. It uses -the Michaelis-Menten function to fit the nonlinear model. Fallback -model is the simple linear model \code{lm()} in case the nonlinear -model is fitting worse. A bad fit here might result in unreasonable -model results. 
Two options are recommended: Either splitting the -channel into sub-channels to achieve better fit, or just use -spend as \code{paid_media_vars} -} diff --git a/R/man/hyper_names.Rd b/R/man/hyper_names.Rd index 3cb0442d9..81f0911ab 100644 --- a/R/man/hyper_names.Rd +++ b/R/man/hyper_names.Rd @@ -24,54 +24,29 @@ hyperparameters that is inserted into \code{robyn_inputs(hyperparameters = ...)} } \section{Guide to setup hyperparameters}{ - \enumerate{ - \item Get correct hyperparameter names: - All variables in \code{paid_media_vars} or \code{organic_vars} require hyperprameters - and will be transformed by adstock & saturation. Difference between \code{paid_media_vars} - and \code{organic_vars} is that \code{paid_media_vars} has spend that - needs to be specified in \code{paid_media_spends} specifically. Run \code{hyper_names()} - to get correct hyperparameter names. All names in hyperparameters must - equal names from \code{hyper_names()}, case sensitive. - \item Get guidance for setting hyperparameter bounds: - For geometric adstock, use theta, alpha & gamma. For both weibull adstock options, - use shape, scale, alpha, gamma. - \itemize{ - \item Theta: In geometric adstock, theta is decay rate. guideline for usual media genre: - TV c(0.3, 0.8), OOH/Print/Radio c(0.1, 0.4), digital c(0, 0.3) - \item Shape: In weibull adstock, shape controls the decay shape. Recommended c(0.0001, 2). - The larger, the more S-shape. The smaller, the more L-shape. Channel-type specific - values still to be investigated - \item Scale: In weibull adstock, scale controls the decay inflexion point. Very conservative - recommended bounce c(0, 0.1), because scale can increase adstocking half-life greatly. - Channel-type specific values still to be investigated - \item Gamma: In s-curve transformation with hill function, gamma controls the inflexion point. - Recommended bounce c(0.3, 1). The larger the gamma, the later the inflection point - in the response curve - } - \item Set each hyperparameter bounds. They either contains two values e.g. 
c(0, 0.5),
-  or only one value (in which case you've "fixed" that hyperparameter)
-}
+See the section "Hyperparameter interpretation & recommendation" in the demo:
+https://github.com/facebookexperimental/Robyn/blob/main/demo/demo.R
 }

 \section{Helper plots}{

 \describe{
-  \item{plot_adstock}{Get adstock transformation example plot,
+  \item{plot_adstock(TRUE)}{Get adstock transformation example plot,
 helping you understand geometric/theta and weibull/shape/scale transformation}
-  \item{plot_saturation}{Get saturation curve transformation example plot,
+  \item{plot_saturation(TRUE)}{Get saturation curve transformation example plot,
 helping you understand hill/alpha/gamma transformation}
 }
 }

 \examples{
 \donttest{
-media <- c("facebook_S", "print_S", "tv_S")
+media <- c("facebook_I", "print_S", "tv_S")
 hyper_names(adstock = "geometric", all_media = media)

 hyperparameters <- list(
-  facebook_S_alphas = c(0.5, 3), # example bounds for alpha
-  facebook_S_gammas = c(0.3, 1), # example bounds for gamma
-  facebook_S_thetas = c(0, 0.3), # example bounds for theta
+  facebook_I_alphas = c(0.5, 3), # example bounds for alpha
+  facebook_I_gammas = c(0.3, 1), # example bounds for gamma
+  facebook_I_thetas = c(0, 0.3), # example bounds for theta
   print_S_alphas = c(0.5, 3),
   print_S_gammas = c(0.3, 1),
   print_S_thetas = c(0.1, 0.4),
@@ -81,13 +56,13 @@ hyperparameters <- list(
 )

 # Define hyper_names for weibull adstock
-hyper_names(adstock = "weibull", all_media = media)
+hyper_names(adstock = "weibull_pdf", all_media = media)

 hyperparameters <- list(
-  facebook_S_alphas = c(0.5, 3), # example bounds for alpha
-  facebook_S_gammas = c(0.3, 1), # example bounds for gamma
-  facebook_S_shapes = c(0.0001, 2), # example bounds for shape
-  facebook_S_scales = c(0, 0.1), # example bounds for scale
+  facebook_I_alphas = c(0.5, 3), # example bounds for alpha
+  facebook_I_gammas = c(0.3, 1), # example bounds for gamma
+  facebook_I_shapes = c(0.0001, 2), # example bounds for shape
+  facebook_I_scales = c(0, 0.1), # example bounds for scale
   print_S_alphas = c(0.5, 3),
   print_S_gammas = c(0.3, 1),
   print_S_shapes = c(0.0001, 2),
diff --git a/R/man/prophet_decomp.Rd b/R/man/prophet_decomp.Rd
index b60bae467..5a2d57dc9 100644
--- a/R/man/prophet_decomp.Rd
+++ b/R/man/prophet_decomp.Rd
@@ -14,6 +14,7 @@ prophet_decomp(
   context_vars,
   organic_vars,
   paid_media_spends,
+  paid_media_vars,
   intervalType,
   dayInterval,
   custom_params
@@ -33,6 +34,14 @@ Prophet holidays using \code{data("dt_prophet_holidays")}}
 push-notifications, social media posts etc. Compared to \code{paid_media_vars}
 \code{organic_vars} are often marketing activities without clear spends.}

+\item{paid_media_vars}{Character vector. Names of the paid media variables'
+exposure level metrics (impressions, clicks, GRP etc) other than spend.
+The values on each of these variables must be numeric. These variables are not
+used to train the model but to check the relationship and to recommend
+splitting media channels into sub-channels (e.g. fb_retargeting, fb_prospecting,
+etc.) to gain more variance. \code{paid_media_vars} must have the same
+order and length as \code{paid_media_spends} and is not required.}
+
 \item{custom_params}{List. Custom parameters passed to \code{prophet()}}
 }
 \value{
diff --git a/R/man/robyn_allocator.Rd b/R/man/robyn_allocator.Rd
index 8ac07f8a8..cbe78645c 100644
--- a/R/man/robyn_allocator.Rd
+++ b/R/man/robyn_allocator.Rd
@@ -87,7 +87,7 @@ for each paid media variable when maximizing total media response.
For example, \code{channel_constr_low = 0.7} means minimum spend of the variable is 70% of historical
 average, using non-zero spend values, within \code{date_min} and \code{date_max} date range.
 Both constrains must be length 1 (same for all values) OR same length and order as
-\code{paid_media_spends}. It's not recommended to 'exaggerate' upper bounds, especially
+\code{paid_media_selected}. It's not recommended to 'exaggerate' upper bounds, especially
 if the new level is way higher than historical level. Lower bound must be >=0.01,
 and upper bound should be < 5.}
diff --git a/R/man/robyn_calibrate.Rd b/R/man/robyn_calibrate.Rd
new file mode 100644
index 000000000..3478321c9
--- /dev/null
+++ b/R/man/robyn_calibrate.Rd
@@ -0,0 +1,91 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/calibration.R
+\name{robyn_calibrate}
+\alias{robyn_calibrate}
+\title{Robyn Calibration Function - BETA}
+\usage{
+robyn_calibrate(
+  df_curve = NULL,
+  curve_type = NULL,
+  force_shape = NULL,
+  hp_bounds = NULL,
+  max_trials = 10,
+  max_iters = 2500,
+  loss_min_step_rel = 1e-04,
+  loss_stop_rel = 0.05,
+  burn_in_rel = 0.1,
+  sim_n = 30,
+  hp_interval = 0.5,
+  quiet = FALSE,
+  ...
+)
+}
+\arguments{
+\item{df_curve}{data.frame. Requires two columns named spend and response.
+Recommended sources of truth are Halo R&F or Meta conversion lift.}
+
+\item{curve_type}{Character. Currently only allows "saturation_reach_hill"
+and only supports the Hill function.}
+
+\item{force_shape}{Character. Allows c("c", "s"); the default NULL applies no
+shape forcing. A "c" shape is recommended for offline media, while online
+media can use "s" or NULL. Shape forcing only works if hp_bounds is NULL.}
+
+\item{hp_bounds}{list. Currently only allows Hill for saturation. Ranges
+for alpha and gamma are provided as Hill parameters. If NULL, hp_bounds takes
+on default ranges.}
+
+\item{max_trials}{integer. Different trials have different starting points
+and provide diversified sampling paths. Default to 10.}
+
+\item{max_iters}{integer. Loss is minimized as iterations increase.
+Default to 2500.}
+
+\item{loss_min_step_rel}{numeric. Default to 0.01; value is between 0-0.1.
+0.01 means the optimisation is considered converged if the error reduction is
+<1 percent of the maximal error.}
+
+\item{loss_stop_rel}{numeric. Default is 0.05; value is between 0-0.5.
+0.05 means 5 percent of max_iters is used as the window length for
+calculating the mean error for convergence.}
+
+\item{burn_in_rel}{numeric. Default to 0.1; value is between 0-0.5. 0.1
+means 10 percent of iterations is used as the burn-in period.}
+
+\item{sim_n}{integer. Number of simulations for plotting the fitted curve.}
+
+\item{hp_interval}{numeric. Default to 0.95 and is between 0.8-1. 0.95 means
+the 2.5-97.5 percentiles are used as the parameter range for the output.}
+
+\item{quiet}{Boolean. Keep messages off?}
+
+\item{...}{Additional parameters passed to \code{robyn_outputs()}.}
+}
+\value{
+List. Class: \code{curve_out}. Contains the results of all trials
+and iterations modeled.
+}
+\description{
+\code{robyn_calibrate()} consumes source of truth or proxy data for
+saturation or adstock curve estimation. This is an experimental feature and
+can be used independently from Robyn's main model.
+}
+\examples{
+\dontrun{
+# Dummy input data for Meta spend. This is derived from Halo's reach & frequency data.
+# Note that spend and response need to be cumulative metrics.
+data("df_curve_reach_freq") + +# Using reach saturation from Halo as proxy +curve_out <- robyn_calibrate( + df_curve = df_curve_reach_freq, + curve_type = "saturation_reach_hill" +) +# For the simulated reach and frequency dataset, it's recommended to use +# "reach 1+" for gamma lower bound and "reach 10+" for gamma upper bound +facebook_I_gammas <- c( + curve_out[["curve_collect"]][["reach 1+"]][["hill"]][["gamma_best"]], + curve_out[["curve_collect"]][["reach 10+"]][["hill"]][["gamma_best"]]) +print(facebook_I_gammas) +} +} diff --git a/R/man/robyn_response.Rd b/R/man/robyn_response.Rd index 8bbc33a74..551621615 100644 --- a/R/man/robyn_response.Rd +++ b/R/man/robyn_response.Rd @@ -8,7 +8,6 @@ robyn_response( InputCollect = NULL, OutputCollect = NULL, json_file = NULL, - robyn_object = NULL, select_build = NULL, select_model = NULL, metric_name = NULL, @@ -32,9 +31,6 @@ recreate a model. To generate this file, use \code{robyn_write()}. If you didn't export your data in the json file as "raw_data", \code{dt_input} must be provided; \code{dt_holidays} input is optional.} -\item{robyn_object}{Character or List. Path of the \code{Robyn.RDS} object -that contains all previous modeling information or the imported list.} - \item{select_build}{Integer. Default to the latest model build. \code{select_build = 0} selects the initial model. \code{select_build = 1} selects the first refresh model.} @@ -54,11 +50,11 @@ per channel. Set one of: "all", "last", or "last_n" (where n is the last N dates available), date (i.e. "2022-03-27"), or date range (i.e. \code{c("2022-01-01", "2022-12-31")}). Default to "all".} -\item{dt_hyppar}{A data.frame. When \code{robyn_object} is not provided, use +\item{dt_hyppar}{A data.frame. When \code{json_file} is not provided, use \code{dt_hyppar = OutputCollect$resultHypParam}. It must be provided along \code{select_model}, \code{dt_coef} and \code{InputCollect}.} -\item{dt_coef}{A data.frame. When \code{robyn_object} is not provided, use +\item{dt_coef}{A data.frame. When \code{json_file} is not provided, use \code{dt_coef = OutputCollect$xDecompAgg}. It must be provided along \code{select_model}, \code{dt_hyppar} and \code{InputCollect}.} diff --git a/R/man/saturation_hill.Rd b/R/man/saturation_hill.Rd index 2d24ad9cc..e3e94d6e4 100644 --- a/R/man/saturation_hill.Rd +++ b/R/man/saturation_hill.Rd @@ -24,7 +24,7 @@ Hill-transformed value of the x_marginal input.} \item{plot}{Boolean. Do you wish to return the plot?} } \value{ -Numeric values. Transformed values. +List. x_saturated as transformed values and inflexion point } \description{ \code{saturation_hill} is a two-parametric version of the Hill diff --git a/R/man/set_default_hyppar.Rd b/R/man/set_default_hyppar.Rd new file mode 100644 index 000000000..063b03a34 --- /dev/null +++ b/R/man/set_default_hyppar.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inputs.R +\name{set_default_hyppar} +\alias{set_default_hyppar} +\title{Set default hyperparameters} +\usage{ +set_default_hyppar( + adstock = NULL, + all_media = NULL, + list_default = list(alpha = c(0.5, 3), gamma = c(0.01, 1), theta = c(0, 0.8), shape = + c(0, 10), scale = c(0, 0.1), train_size = c(0.5, 0.9)) +) +} +\arguments{ +\item{adstock}{Character. InputCollect$adstock} + +\item{all_media}{Character. Provide InputCollect$all_media.} + +\item{list_default}{A List. Default ranges for hyperparameters.} +} +\value{ +List. Expanded range of hyperparameters for all media. 
+} +\description{ +For quick setting of hyperparameter ranges. +} diff --git a/R/man/transformations.Rd b/R/man/transformations.Rd index 20f267118..2ac4ef986 100644 --- a/R/man/transformations.Rd +++ b/R/man/transformations.Rd @@ -8,7 +8,15 @@ \usage{ mic_men(x, Vmax, Km, reverse = FALSE) -run_transformations(InputCollect, hyperparameters, ...) +run_transformations( + all_media, + window_start_loc, + window_end_loc, + dt_mod, + adstock, + dt_hyppar, + ... +) } \arguments{ \item{x}{Numeric value or vector. Input media spend when @@ -22,12 +30,17 @@ GRPs, etc.) when \code{reverse = TRUE}.} \item{reverse}{Boolean. Input media spend when \code{reverse = FALSE}. Input media exposure metrics (impression, clicks, GRPs etc.) when \code{reverse = TRUE}.} -\item{InputCollect}{Default to NULL. \code{robyn_inputs}'s output when -\code{hyperparameters} are not yet set.} +\item{all_media}{Character. Vector of all selected paid media variable names.} -\item{hyperparameters}{List. Contains hyperparameter lower and upper bounds. -Names of elements in list must be identical to output of \code{hyper_names()}. -To fix hyperparameter values, provide only one value.} +\item{window_start_loc}{Integer. Rolling window start location.} + +\item{window_end_loc}{Integer. Rolling window end location.} + +\item{dt_mod}{dataframe. Transformed input table for transformation.} + +\item{adstock}{Character. Adstock config.} + +\item{dt_hyppar}{data.frame. All hyperparaters for provided media.} \item{...}{Additional parameters passed to \code{prophet} functions.} } diff --git a/demo/demo.R b/demo/demo.R index e59f3b6a3..0259bd7ad 100644 --- a/demo/demo.R +++ b/demo/demo.R @@ -4,7 +4,7 @@ # LICENSE file in the root directory of this source tree. ############################################################################################# -#################### Meta MMM Open Source: Robyn 3.11.0 ####################### +#################### Meta MMM Open Source: Robyn 3.12.0 ####################### #################### Quick demo guide ####################### ############################################################################################# @@ -32,9 +32,6 @@ packageVersion("Robyn") Sys.setenv(R_FUTURE_FORK_ENABLE = "true") options(future.fork.enable = TRUE) -# Set to FALSE to avoid the creation of files locally -create_files <- TRUE - ## IMPORTANT: Must install and setup the python library "Nevergrad" once before using Robyn ## Guide: https://github.com/facebookexperimental/Robyn/blob/main/demo/install_nevergrad.R @@ -46,7 +43,7 @@ data("dt_simulated_weekly") head(dt_simulated_weekly) ## Check holidays from Prophet -# 59 countries included. If your country is not included, please manually add it. +# 123 countries included. If your country is not included, please manually add it. # Tipp: any events can be added into this table, school break, events etc. data("dt_prophet_holidays") head(dt_prophet_holidays) @@ -59,44 +56,41 @@ robyn_directory <- "~/Desktop" #### 2a-1: First, specify input variables -## All sign control are now automatically provided: "positive" for media & organic -## variables and "default" for all others. User can still customise signs if necessary. 
-## Documentation is available, access it anytime by running: ?robyn_inputs
+## For documentation of all arguments, run ?robyn_inputs

 InputCollect <- robyn_inputs(
   dt_input = dt_simulated_weekly,
   dt_holidays = dt_prophet_holidays,
   date_var = "DATE", # date format must be "2020-01-01"
-  dep_var = "revenue", # there should be only one dependent variable
-  dep_var_type = "revenue", # "revenue" (ROI) or "conversion" (CPA)
-  prophet_vars = c("trend", "season", "holiday"), # "trend","season", "weekday" & "holiday"
-  prophet_country = "DE", # input country code. Check: dt_prophet_holidays
+  dep_var = "revenue", # currently one dependent variable allowed
+  dep_var_type = "revenue", # "revenue" or "conversion" allowed
+  prophet_vars = c("trend", "season", "holiday"), # "trend", "season", "weekday" & "holiday" allowed
+  prophet_country = "DE", # 123 countries included in dt_prophet_holidays
   context_vars = c("competitor_sales_B", "events"), # e.g. competitors, discount, unemployment etc
   paid_media_spends = c("tv_S", "ooh_S", "print_S", "facebook_S", "search_S"), # mandatory input
-  paid_media_vars = c("tv_S", "ooh_S", "print_S", "facebook_I", "search_clicks_P"), # mandatory.
-  # paid_media_vars must have same order as paid_media_spends. Use media exposure metrics like
-  # impressions, GRP etc. If not applicable, use spend instead.
-  organic_vars = "newsletter", # marketing activity without media spend
-  # factor_vars = c("events"), # force variables in context_vars or organic_vars to be categorical
+  paid_media_vars = c("tv_S", "ooh_S", "print_S", "facebook_I", "search_clicks_P"), # if provided,
+  # Robyn will use this instead of paid_media_spends for modelling. Media exposure metrics
+  # typically include, but are not limited to, impressions, GRP etc. If not applicable, use spend instead.
+  organic_vars = c("newsletter"), # marketing activity without media spend
+  factor_vars = c("events"), # indicate categorical variables in context_vars or organic_vars
   window_start = "2016-01-01",
   window_end = "2018-12-31",
-  adstock = "geometric" # geometric, weibull_cdf or weibull_pdf.
+  adstock = "geometric" # geometric or weibull_pdf
 )
 print(InputCollect)

 #### 2a-2: Second, define and add hyperparameters

-## Default media variable for modelling has changed from paid_media_vars to paid_media_spends.
-## Also, calibration_input are required to be spend names.
-## hyperparameter names are based on paid_media_spends names too. See right hyperparameter names:
+## Hyperparameter names are based on paid_media_vars. See the correct hyperparameter names:
 hyper_names(adstock = InputCollect$adstock, all_media = InputCollect$all_media)

 ## Guide to setup & understand hyperparameters
-## Robyn's hyperparameters have four components:
+## Robyn's hyperparameters have five components:
 ## - Adstock parameters (theta or shape/scale)
 ## - Saturation parameters (alpha/gamma)
 ## - Regularisation parameter (lambda). No need to specify manually
 ## - Time series validation parameter (train_size)
+## - Beta coefficient penalty factor (_penalty). No need to specify manually

 ## 1. IMPORTANT: set plot = TRUE to create example plots for adstock & saturation
 ## hyperparameters and their influence in curve transformation.
@@ -104,7 +98,7 @@ plot_adstock(plot = FALSE)
 plot_saturation(plot = FALSE)

 ## 2. Get correct hyperparameter names:
-# All variables in paid_media_spends and organic_vars require hyperparameter and will be
+# All variables in paid_media_vars and organic_vars require hyperparameters and will be
 # transformed by adstock & saturation.
# Run hyper_names(adstock = InputCollect$adstock, all_media = InputCollect$all_media)
# to get correct media hyperparameter names. All names in hyperparameters must equal
@@ -119,14 +113,6 @@ plot_saturation(plot = FALSE)
#   to convert weekly to daily we can transform the parameter to the power of (1/7),
#   so a 30% weekly decay converts to a daily decay of 0.3^(1/7) = 0.84.

-## Weibull CDF adstock: The Cumulative Distribution Function of Weibull has two parameters,
-# shape & scale, and has flexible decay rate, compared to Geometric adstock with fixed
-# decay rate. The shape parameter controls the shape of the decay curve. Recommended
-# bound is c(0, 2). The larger the shape, the more S-shape. The smaller, the more
-# L-shape. Scale controls the inflexion point of the decay curve. We recommend very
-# conservative bounce of c(0, 0.1), because scale increases the adstock half-life greatly.
-# When shape or scale is 0, adstock will be 0.
-
## Weibull PDF adstock: The Probability Density Function of the Weibull also has two
# parameters, shape & scale, and also has a flexible decay rate like Weibull CDF. The
# difference is that Weibull PDF offers lagged effect. When shape > 2, the curve peaks
@@ -144,10 +130,18 @@ plot_saturation(plot = FALSE)
# in hyperparameter spaces for Nevergrad to explore, it also requires larger iterations
# to converge. When shape or scale is 0, adstock will be 0.

+## Weibull CDF adstock (to be deprecated): The Cumulative Distribution Function of Weibull has two parameters,
+# shape & scale, and has flexible decay rate, compared to Geometric adstock with fixed
+# decay rate. The shape parameter controls the shape of the decay curve. Recommended
+# bound is c(0, 2). The larger the shape, the more S-shape. The smaller, the more
+# L-shape. Scale controls the inflexion point of the decay curve. We recommend very
+# conservative bounds of c(0, 0.1), because scale increases the adstock half-life greatly.
+# When shape or scale is 0, adstock will be 0.
+
## Hill function for saturation: Hill function is a two-parametric function in Robyn with
# alpha and gamma. Alpha controls the shape of the curve between exponential and s-shape.
# Recommended bound is c(0.5, 3). The larger the alpha, the more S-shape. The smaller, the
-# more C-shape. Gamma controls the inflexion point. Recommended bounce is c(0.3, 1). The
+# more C-shape. Gamma controls the inflexion point. Recommended bounds are c(0.01, 1). The
# larger the gamma, the later the inflection point in the response curve.

## Regularization for ridge regression: Lambda is the penalty term for regularised regression.
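## For illustration only: a minimal, self-contained sketch of geometric adstock decay
## and the weekly/daily theta conversion described above. geometric_adstock() below is
## a toy helper for intuition, not a Robyn function (Robyn's own implementation is
## transform_adstock()).
# geometric_adstock <- function(x, theta) {
#   y <- numeric(length(x))
#   y[1] <- x[1]
#   for (t in 2:length(x)) y[t] <- x[t] + theta * y[t - 1]
#   y
# }
# geometric_adstock(c(100, 0, 0, 0, 0), theta = 0.3) # 100, 30, 9, 2.7, 0.81
# theta_weekly <- 0.3
# theta_daily <- theta_weekly^(1 / 7) # ~0.84, a 30% weekly decay expressed as daily decay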
@@ -167,19 +161,19 @@ hyper_limits()

# Example hyperparameters ranges for Geometric adstock
hyperparameters <- list(
-  facebook_S_alphas = c(0.5, 3),
-  facebook_S_gammas = c(0.3, 1),
-  facebook_S_thetas = c(0, 0.3),
-  print_S_alphas = c(0.5, 3),
+  facebook_I_alphas = c(0.5, 3),
+  facebook_I_gammas = c(0.3, 1),
+  facebook_I_thetas = c(0, 0.3),
+  print_S_alphas = c(0.5, 1),
  print_S_gammas = c(0.3, 1),
  print_S_thetas = c(0.1, 0.4),
-  tv_S_alphas = c(0.5, 3),
+  tv_S_alphas = c(0.5, 1),
  tv_S_gammas = c(0.3, 1),
  tv_S_thetas = c(0.3, 0.8),
-  search_S_alphas = c(0.5, 3),
-  search_S_gammas = c(0.3, 1),
-  search_S_thetas = c(0, 0.3),
-  ooh_S_alphas = c(0.5, 3),
+  search_clicks_P_alphas = c(0.5, 3),
+  search_clicks_P_gammas = c(0.3, 1),
+  search_clicks_P_thetas = c(0, 0.3),
+  ooh_S_alphas = c(0.5, 1),
  ooh_S_gammas = c(0.3, 1),
  ooh_S_thetas = c(0.1, 0.4),
  newsletter_alphas = c(0.5, 3),
@@ -188,6 +182,8 @@ hyperparameters <- list(
  train_size = c(0.5, 0.8)
)

+# hyperparameters <- set_default_hyppar("weibull_pdf", InputCollect$all_media)
+
# Example hyperparameters ranges for Weibull CDF adstock
# facebook_S_alphas = c(0.5, 3)
# facebook_S_gammas = c(0.3, 1)
@@ -200,16 +196,50 @@ hyperparameters <- list(
# facebook_S_shapes = c(0, 10)
# facebook_S_scales = c(0, 0.1)

-#### 2a-3: Third, add hyperparameters into robyn_inputs()
+#### 2a-3: (Optional) Saturation calibration with the curve calibrator (beta feature)
+
+## Dummy input data for Meta spend. This is derived from Halo's reach & frequency data.
+## Note that spend and response need to be cumulative metrics. Headers need to be
+## kept the same as in the dummy dataset.
+
+# data("df_curve_reach_freq")
+#
+# # Currently only supports curve_type = "saturation_reach"
+# curve_out <- robyn_calibrate(
+#   df_curve = df_curve_reach_freq,
+#   curve_type = "saturation_reach"
+# )
+# curve_out$plot_reach_freq
+#
+# ## Reach & frequency calibration reasoning
+# # When calibrating response saturation with reach and frequency, it's
+# # recommended to use "reach 1+" for the gamma lower bound and a higher frequency
+# # bucket, e.g. "reach 10+" in the dummy dataset, for the gamma upper bound calibration.
+# # Assume an extreme situation where every user sees one impression and purchases
+# # immediately. In such a case, the response curve equals the reach 1+ curve. Gamma
+# # controls the inflexion point of a Hill curve and a lower gamma means earlier
+# # and faster saturation at a lower spend level. We believe that the cumulative
+# # reach 1+ curve represents the earliest inflexion, thus we believe it's a
+# # reasonable lower boundary for gamma for a selected channel. As frequency
+# # increases, the inflexion point is delayed and approaches the hidden true
+# # response curve. In the dummy dataset, we've simulated reach 10+ to represent the
+# # upper bound for gamma. Use domain expertise to further narrow down or widen
+# # the bounds. For alpha, we recommend keeping the value flexible as in the default.
+# facebook_I_gammas <- c(
+#   curve_out[["curve_collect"]][["reach 1+"]][["hill"]][["gamma_best"]],
+#   curve_out[["curve_collect"]][["reach 10+"]][["hill"]][["gamma_best"]])
+# hyperparameters$facebook_I_gammas <- facebook_I_gammas
+
+#### 2a-4: Fourth, add hyperparameters into robyn_inputs()

InputCollect <- robyn_inputs(InputCollect = InputCollect, hyperparameters = hyperparameters)
print(InputCollect)

-#### 2a-4: Fourth (optional), model calibration / add experimental input
+#### 2a-5: (Optional) Effect size calibration

-## Guide for calibration
+## Guide for effect size calibration

-# 1. Calibration channels need to be paid_media_spends or organic_vars names.
+# 1. Calibration channels need to be paid_media_vars or organic_vars names.
# 2. We strongly recommend using Weibull PDF adstock for more degrees of freedom when
# calibrating Robyn.
# 3. We strongly recommend using experimental and causal results that are considered
@@ -234,16 +264,16 @@ print(InputCollect)
# to indicate combination of channels, case sensitive.
# calibration_input <- data.frame(
-#   # channel name must in paid_media_vars
+#   # channel name should be in paid_media_vars and/or organic_vars
#   channel = c("facebook_S", "tv_S", "facebook_S+search_S", "newsletter"),
#   # liftStartDate must be within input data range
-#   liftStartDate = as.Date(c("2018-05-01", "2018-04-03", "2018-07-01", "2017-12-01")),
+#   liftStartDate = as.Date(c("2018-05-01", "2018-01-01", "2018-07-01", "2017-12-01")),
#   # liftEndDate must be within input data range
-#   liftEndDate = as.Date(c("2018-06-10", "2018-06-03", "2018-07-20", "2017-12-31")),
+#   liftEndDate = as.Date(c("2018-06-10", "2018-03-01", "2018-07-20", "2017-12-31")),
#   # Provided value must be tested on same campaign level in model and same metric as dep_var_type
-#   liftAbs = c(400000, 300000, 700000, 200),
+#   liftAbs = c(40000, 120000, 60000, 200),
#   # Spend within experiment: should match within a 10% error your spend on date range for each channel from dt_input
-#   spend = c(421000, 7100, 350000, 0),
+#   spend = c(12000, 86000, 22000, 0),
#   # Confidence: if frequentist experiment, you may use 1 - pvalue
#   confidence = c(0.85, 0.8, 0.99, 0.95),
#   # KPI measured: must match your dep_var
@@ -253,7 +283,6 @@ print(InputCollect)
# )
# InputCollect <- robyn_inputs(InputCollect = InputCollect, calibration_input = calibration_input)

-
################################################################
#### Step 2b: For known model specification, setup in one single step

@@ -279,10 +308,8 @@ print(InputCollect)
#   ,calibration_input = calibration_input # as in 2a-5 above
# )

-#### Check spend exposure fit if available
-if (length(InputCollect$exposure_vars) > 0) {
-  lapply(InputCollect$modNLS$plots, plot)
-}
+#### Check spend exposure fit and consider channel split if applicable
+InputCollect$ExposureCollect$plot_spend_exposure

##### Manually save and import InputCollect as JSON file
# robyn_write(InputCollect, dir = "~/Desktop")

@@ -300,8 +327,8 @@ OutputModels <- robyn_run(
  cores = NULL, # NULL defaults to (max available - 1)
  iterations = 2000, # 2000 recommended for the dummy dataset with no calibration
  trials = 5, # 5 recommended for the dummy dataset
-  ts_validation = TRUE, # 3-way-split time series for NRMSE validation.
-  add_penalty_factor = FALSE # Experimental feature. Use with caution.
+  ts_validation = FALSE, # 3-way-split time series for NRMSE validation.
+  add_penalty_factor = FALSE # Experimental feature to add more flexibility
)
print(OutputModels)

@@ -311,8 +338,8 @@ OutputModels$convergence$moo_distrb_plot
OutputModels$convergence$moo_cloud_plot

## Check time-series validation plot (when ts_validation == TRUE)
-# Read more and replicate results: ?ts_validation
-if (OutputModels$ts_validation) OutputModels$ts_validation_plot
+## Read more and replicate results: ?ts_validation
+# if (OutputModels$ts_validation) OutputModels$ts_validation_plot

## Calculate Pareto fronts, cluster and export results and plots.
See ?robyn_outputs OutputCollect <- robyn_outputs( @@ -322,9 +349,9 @@ OutputCollect <- robyn_outputs( # calibration_constraint = 0.1, # range c(0.01, 0.1) & default at 0.1 csv_out = "pareto", # "pareto", "all", or NULL (for none) clusters = TRUE, # Set to TRUE to cluster similar models by ROAS. See ?robyn_clusters - export = create_files, # this will create files locally + export = TRUE, # this will create files locally plot_folder = robyn_directory, # path for plots exports and files creation - plot_pareto = create_files # Set to FALSE to deactivate plotting and saving model one-pagers + plot_pareto = TRUE # Set to FALSE to deactivate plotting and saving model one-pagers ) print(OutputCollect) @@ -341,10 +368,10 @@ print(OutputCollect) ## Compare all model one-pagers and select one that mostly reflects your business reality print(OutputCollect) -select_model <- "1_122_7" # Pick one of the models from OutputCollect to proceed +select_model <- "5_257_2" # Pick one of the models from OutputCollect to proceed #### Version >=3.7.1: JSON export and import (faster and lighter than RDS files) -ExportedModel <- robyn_write(InputCollect, OutputCollect, select_model, export = create_files) +ExportedModel <- robyn_write(InputCollect, OutputCollect, select_model, export = TRUE) print(ExportedModel) # To plot any model's one-pager: @@ -367,7 +394,7 @@ print(ExportedModel) # Run ?robyn_allocator to check parameter definition # NOTE: The order of constraints should follow: -InputCollect$paid_media_spends +InputCollect$paid_media_selected # Scenario "max_response": "What's the max. return given certain spend?" # Example 1: max_response default setting: maximize response for latest month @@ -380,8 +407,7 @@ AllocatorCollect1 <- robyn_allocator( channel_constr_low = 0.7, channel_constr_up = c(1.2, 1.5, 1.5, 1.5, 1.5), # channel_constr_multiplier = 3, - scenario = "max_response", - export = create_files + scenario = "max_response" ) # Print & plot allocator's output print(AllocatorCollect1) @@ -393,12 +419,11 @@ AllocatorCollect2 <- robyn_allocator( OutputCollect = OutputCollect, select_model = select_model, date_range = "last_10", # Last 10 periods, same as c("2018-10-22", "2018-12-31") - total_budget = 5000000, # Total budget for date_range period simulation + total_budget = 1500000, # Total budget for date_range period simulation channel_constr_low = c(0.8, 0.7, 0.7, 0.7, 0.7), - channel_constr_up = c(1.2, 1.5, 1.5, 1.5, 1.5), + channel_constr_up = 1.5, channel_constr_multiplier = 5, # Customise bound extension for wider insights - scenario = "max_response", - export = create_files + scenario = "max_response" ) print(AllocatorCollect2) plot(AllocatorCollect2) @@ -412,51 +437,31 @@ AllocatorCollect3 <- robyn_allocator( InputCollect = InputCollect, OutputCollect = OutputCollect, select_model = select_model, + #channel_constr_low = 0.1, + #channel_constr_up = 10, # date_range = NULL, # Default: "all" available dates scenario = "target_efficiency", - # target_value = 2, # Customize target ROAS or CPA value - export = create_files + # target_value = 5 # Customize target ROAS or CPA value ) print(AllocatorCollect3) plot(AllocatorCollect3) # Example 4: Customize target_value for ROAS or CPA (using json_file) -json_file = "~/Desktop/Robyn_202302221206_init/RobynModel-1_117_11.json" +json_file = "~/Desktop/Robyn_202412181043_init/RobynModel-5_257_2.json" AllocatorCollect4 <- robyn_allocator( json_file = json_file, # Using json file from robyn_write() for allocation dt_input = dt_simulated_weekly, dt_holidays = 
dt_prophet_holidays,
-  date_range = NULL, # Default last month as initial period
+  # date_range = NULL,
  scenario = "target_efficiency",
  target_value = 2, # Customize target ROAS or CPA value
-  plot_folder = "~/Desktop/my_dir",
-  plot_folder_sub = "my_subdir",
-  export = create_files
+  plot_folder = "~/Desktop",
+  plot_folder_sub = "my_subdir"
)
## A csv is exported into the folder for further usage. Check schema here:
## https://github.com/facebookexperimental/Robyn/blob/main/demo/schema.R

-## QA optimal response
-# Pick any media variable: InputCollect$all_media
-select_media <- "search_S"
-# For paid_media_spends set metric_value as your optimal spend
-metric_value <- AllocatorCollect1$dt_optimOut$optmSpendUnit[
-  AllocatorCollect1$dt_optimOut$channels == select_media
-]; metric_value
-# # For paid_media_vars and organic_vars, manually pick a value
-# metric_value <- 10000
-
-## Saturation curve for adstocked metric results (example)
-robyn_response(
-  InputCollect = InputCollect,
-  OutputCollect = OutputCollect,
-  select_model = select_model,
-  metric_name = select_media,
-  metric_value = metric_value,
-  date_range = "last_5"
-)
-
################################################################
#### Step 6: Model refresh based on selected model and saved results

@@ -469,24 +474,23 @@ robyn_response(
# Provide JSON file with your InputCollect and ExportedModel specifications
# It can be any model, initial or a refresh model

-json_file <- "~/Desktop/Robyn_202211211853_init/RobynModel-1_100_6.json"
RobynRefresh <- robyn_refresh(
  json_file = json_file,
  dt_input = dt_simulated_weekly,
  dt_holidays = dt_prophet_holidays,
-  refresh_steps = 13,
-  refresh_iters = 1000, # 1k is an estimation
-  refresh_trials = 1
+  refresh_steps = 4,
+  refresh_iters = 2000,
+  refresh_trials = 5
)

# Now refreshing a refreshed model, following the same approach
-json_file_rf1 <- "~/Desktop/Robyn_202208231837_init/Robyn_202208231841_rf1/RobynModel-1_12_5.json"
+json_file_rf1 <- "~/Desktop/Robyn_202412181043_init/Robyn_202412181054_rf1/RobynModel-1_133_7.json"
RobynRefresh <- robyn_refresh(
  json_file = json_file_rf1,
  dt_input = dt_simulated_weekly,
  dt_holidays = dt_prophet_holidays,
-  refresh_steps = 7,
-  refresh_iters = 1000, # 1k is an estimation
-  refresh_trials = 1
+  refresh_steps = 4,
+  refresh_iters = 2000,
+  refresh_trials = 5
)

# Continue with refreshed new InputCollect, OutputCollect, select_model values

@@ -519,7 +523,7 @@ Response <- robyn_response(
  InputCollect = InputCollect,
  OutputCollect = OutputCollect,
  select_model = select_model,
-  metric_name = "facebook_S"
+  metric_name = "facebook_I"
)
Response$plot

@@ -532,14 +536,14 @@ Response$plot
# )

## Get the "next 100 dollar" marginal response on Spend1
-Spend1 <- 20000
+Spend1 <- 80000
Response1 <- robyn_response(
  InputCollect = InputCollect,
  OutputCollect = OutputCollect,
  select_model = select_model,
-  metric_name = "facebook_S",
+  metric_name = "facebook_I",
  metric_value = Spend1, # total budget for date_range
-  date_range = "last_1" # last two periods
+  date_range = "last_10" # last 10 periods
)
Response1$plot

@@ -548,36 +552,13 @@ Response2 <- robyn_response(
  InputCollect = InputCollect,
  OutputCollect = OutputCollect,
  select_model = select_model,
-  metric_name = "facebook_S",
+  metric_name = "facebook_I",
  metric_value = Spend2,
-  date_range = "last_1"
+  date_range = "last_10"
)

# ROAS for the 100$ from Spend1 level
-(Response2$response_total - Response1$response_total) / (Spend2 - Spend1)
-
-## Get response from for a given budget and date_range
-Spend3 <- 100000
-Response3 <- robyn_response(
-  InputCollect = InputCollect,
-  OutputCollect = OutputCollect,
-  select_model = select_model,
-  metric_name = "facebook_S",
-  metric_value = Spend3, # total budget for date_range
-  date_range = "last_5" # last 5 periods
-)
-Response3$plot
-
-## Example of getting paid media exposure response curves
-# imps <- 10000000
-# response_imps <- robyn_response(
-#   InputCollect = InputCollect,
-#   OutputCollect = OutputCollect,
-#   select_model = select_model,
-#   metric_name = "facebook_I",
-#   metric_value = imps
-# )
-# response_imps$response_total / imps * 1000
-# response_imps$plot
+(Response2$sim_mean_response - Response1$sim_mean_response) /
+  (Response2$sim_mean_spend - Response1$sim_mean_spend)

## Example of getting organic media exposure response curves
sendings <- 30000
@@ -586,11 +567,11 @@ response_sending <- robyn_response(
  OutputCollect = OutputCollect,
  select_model = select_model,
  metric_name = "newsletter",
-  metric_value = sendings
+  metric_value = sendings,
+  date_range = "last_10"
)
-# response per 1000 sendings
-response_sending$response_total / sendings * 1000
-response_sending$plot
+# Simulated cost (in sendings) per thousand response units
+response_sending$sim_mean_spend / response_sending$sim_mean_response * 1000

################################################################
#### Optional: recreate old models and replicate results

@@ -610,7 +591,7 @@ robyn_write(InputCollect, OutputCollect, select_model)
############ READ ############
# Recreate `InputCollect` and `OutputCollect` objects
# Pick any exported model (initial or refreshed)
-json_file <- "~/Desktop/Robyn_202208231837_init/RobynModel-1_100_6.json"
+json_file <- "~/Desktop/Robyn_202412181043_init/RobynModel-5_257_2.json"

# Optional: Manually read and check data stored in file
json_data <- robyn_read(json_file)

@@ -625,12 +606,12 @@ InputCollectX <- robyn_inputs(
# Re-create OutputCollect
OutputCollectX <- robyn_run(
  InputCollect = InputCollectX,
-  json_file = json_file,
-  export = create_files)
+  json_file = json_file
+  )

# Or re-create both by simply using robyn_recreate()
RobynRecreated <- robyn_recreate(
-  json_file = "~/Desktop/Robyn_202303131448_init/RobynModel-1_103_7.json",
+  json_file = json_file,
  dt_input = dt_simulated_weekly,
  dt_holidays = dt_prophet_holidays,
  quiet = FALSE)
diff --git a/website/docs/features.mdx b/website/docs/features.mdx
index 643596c6d..fc7c04009 100644
--- a/website/docs/features.mdx
+++ b/website/docs/features.mdx
@@ -5,7 +5,7 @@ title: Key Features

import useBaseUrl from '@docusaurus/useBaseUrl';

-An in-depth discussion of both the implementation and technical underpinnings of Robyn follows.
+This documentation was last updated in December 2024, based on the R dev version v3.12.0 of Robyn.

---

## Model Inputs

@@ -36,11 +36,11 @@ InputCollect <- robyn_inputs(

Refer to the [Data Collection](https://facebookexperimental.github.io/Robyn/docs/analysts-guide-to-MMM#data-collection) section of the Analyst's Guide to MMM for a detailed discussion on best practices for selecting your paid media variables.

-For paid media variables, we currently require that users designate both `paid_media_vars` and `paid_media_spends`. `paid_media_vars` refers to media exposure metrics, e.g. TV GRP, search clicks or Facebook impressions. `paid_media_spends` refers to media spending. The two vectors must have the same length and the same order of media. When exposure metrics are not available, use the same as in `paid_media_spends`.
+For paid media variables, we require users to assign values to both `paid_media_vars` and `paid_media_spends`. `paid_media_vars` refers to media exposure metrics, e.g. TV GRP, search clicks or Facebook impressions. `paid_media_spends` refers to media spending. The two vectors must have the same length and the same order of media. When exposure metrics are not available, use the same values as in `paid_media_spends`.

-**Note**: Robyn currently fits the main model using only media spend. Exposure as modeling varibale is deprecated due to the uncertainty it introduces to the budget allocation. Robyn uses the Michaelis-Menten function to translate between spend and exposure. Despite the nonlinear flexibility this function provides, the relationship between spend and exposure is often too complex for a univariate transformation. In earlier versions, this inaccuracy in spend-exposure translation has caused unreliable budget allocation. Even though we understand the narrative of using exposure metrics, we've decided to trade it off against the reliability of budget allocation. We'll re-introduce exposure fitting as soon as we have a working solution.
+Robyn currently fits the main model using a mix of media spend and exposure. When `paid_media_vars` is specified with exposure metrics, these are prioritised and used for fitting, with `paid_media_spends` as the fallback. Using exposure as a modeling variable was disabled in Robyn for a long time due to the uncertainty it introduces into budget allocation: the relationship between spend and exposure is often too complex for a univariate transformation, and in earlier versions of Robyn this inaccuracy in spend-exposure translation caused unreliable budget allocation. However, exposure fitting is reactivated in v3.12.0, not only because of user interest, but also due to the introduction of the [new curve calibrator](https://facebookexperimental.github.io/Robyn/docs/features#holistic-calibration). While the above-mentioned inaccuracy in spend-exposure translation still exists, we believe it is mitigated by calibrating the response curve against ground truth.

-Despite the deprecation mentioned above, there's still value in providing exposure metrics in Robyn. If there's weak association detected between spend & exposure, it indicates that exposure metrics have different underlying pattern than spends. In this case, Robyn will recommend to splitting the channel for potentially better modeling result. Take Meta as example, retargeting and prospecting campaigns might have very different CPMs and efficiencies. In which case, it would be meaningful to split Meta by retargeting and prospecting.
+Moreover, we highly recommend that users closely examine the relationship between exposure and spend metrics. If a weak association is detected between spend & exposure, it indicates that the exposure metric has a different underlying pattern than spend. In this case, we recommend splitting the channel for potentially better modeling results. Take Meta as an example: retargeting and prospecting campaigns might have very different CPMs and efficiencies, in which case it would be meaningful to split Meta into retargeting and prospecting. For details see the demo [here](https://github.com/facebookexperimental/Robyn/blob/main/demo/demo.R#L311).

In general, it's important to ensure that the paid media data is complete and accurate before proceeding.
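As a quick, illustrative check (a sketch using the simulated dataset shipped with Robyn, not a built-in diagnostic), one can correlate a channel's spend with its exposure metric; a low correlation or a clearly segmented scatter plot suggests splitting the channel:

```
library(Robyn)
data("dt_simulated_weekly")

# Spend vs. exposure for one channel; a weak association hints at a channel split
cor(dt_simulated_weekly$facebook_S, dt_simulated_weekly$facebook_I)
plot(dt_simulated_weekly$facebook_S, dt_simulated_weekly$facebook_I,
     xlab = "facebook_S (spend)", ylab = "facebook_I (impressions)")
```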
We will talk more about the variable transformations paid media variables will be subject to in the Variable Transformation section.
@@ -333,7 +333,7 @@ As depicted in plot 4 in [session model onepager](#model-onepager) below, the k-

---

-## Calibration with causal experiments
+## Calibration of average effect size with causal experiments

Randomised controlled trial (RCT) is an established academic gold standard to infer causality in science. By applying results from RCT in ads measurement that are considered ground truth, you can introduce causality into your marketing mix models. Robyn implements the MMM calibration as an objective function in the multi-objective optimization by parameterizing the difference between causal results and predicted media contribution.

@@ -363,6 +363,57 @@ There're two major types of experiements in ads measurement, as pointed out by t
Robyn accepts a dataframe as calibration input in the `robyn_inputs()` function. The function usage can be found in the [demo](https://github.com/facebookexperimental/Robyn/blob/main/demo/demo.R#L262).

+---
+
+## Holistic calibration
+
+### Rethinking calibration
+
+The triangulation of MMM, experiments and attribution is the centerpiece of [modern measurement](https://www.facebook.com/business/news/advanced-measurement-strategy). While there's no universally accepted definition of calibration, it often refers to the adjustment of the estimated impact of media between different measurement solutions.
+
+In MMM, calibration usually refers to adjusting the average effect size of a certain channel by causal experiments, as explained in detail above. However, the average effect size, or the beta coefficient in a regression model, is not the only estimate in an MMM system. The bare minimum set of estimates in MMM includes the **average effect size, adstock and saturation**. Just like the effect size, adstock and saturation are uncertain parametric quantities that can and should be calibrated against ground truth whenever possible. We believe that **holistic calibration** is the next step for triangulation and an integrated marketing measurement system.
+
+<img alt="Reach & frequency curve calibration" src={useBaseUrl('/img/curve_calibrator.png')} />
+
+### The curve calibrator (beta)
+
+Robyn is releasing a new feature **"the curve calibrator"** `robyn_calibrate()` as a step towards holistic calibration. The first use case is to calibrate the response saturation curve using cumulative reach and frequency data as input. This type of data is usually available as siloed media reports for most offline and online channels. The latest choice of reach and frequency data is **[Project Halo](https://wfanet.org/leadership/cross-media-measurement)**, an industry-wide collaboration that aims to deliver privacy-safe and always-on cross-channel reach deduplication. The above graphic is derived and simulated based on a real Halo dataset with cumulative spend and cumulative reach by frequency buckets. For example, "reach 3+" means users reached with at least 3 impressions per person. There's certainly a gap between the saturation of reach and the business outcome (purchase, sales etc.). However, they're also interconnected along the same conversion funnel (upper funnel -> lower funnel), while the reach & frequency saturation curve is often more available.
Therefore, **we're exploring the potential of using reach & frequency to guide response saturation estimation.**
+
+According to [a recent research paper](https://arxiv.org/abs/2408.07678) from the Wharton School and the London Business School by Dew, Padilla and Shchetkina, a common MMM cannot reliably identify saturation parameters, quote: _"as practitioners attempt to capture increasingly complex effects in MMMs, like nonlinearities and dynamics, our results suggest caution is warranted: the simple data used for building such models often cannot uniquely identify such complexity."_ In other words, **saturation should be calibrated by ground truth whenever possible.**
+
+**Our approach for saturation calibration by reach & frequency**: Assume an extreme situation where every user sees the first impression and purchases immediately. In such a case, the response saturation curve equals the reach 1+ saturation curve. The [Hill function](https://facebookexperimental.github.io/Robyn/docs/features#saturation) is a popular choice for saturation transformation and is implemented in Robyn; its gamma parameter controls the location of the inflexion point. A lower gamma means earlier and faster saturation at a lower spend level. **Our hypothesis** is that the cumulative reach 1+ curve represents the earliest inflexion and thus serves as a reasonable lower boundary for gamma for a selected channel. **As frequency increases, the reach inflexion point is delayed and approaches the lower funnel. We believe that the hidden true response curve lies within the spectrum of cumulative reach and frequency curves**. In the dummy dataset, we've simulated reach 10+ to represent the upper bound for gamma. The "best converting frequency" varies strongly across verticals. We believe that reach & frequency brings us one step closer to identifying the true saturation relationship. Use domain expertise to further narrow down or widen the bounds. For alpha, we recommend keeping the value flexible as in the default.
+
+The graphic below is an exemplary visualisation of a curve fitting process, where Nevergrad is used to estimate alpha and gamma as well as beta. Note that the distributions of alpha and gamma are often multimodal and non-normal, because they reflect Nevergrad's hyperparameter optimization path rather than the parameters' underlying distributions.
+
+<img alt="Reach & frequency curve fitting" src={useBaseUrl('/img/curve_calibrator_onepager.png')} />
+
+To try out `robyn_calibrate()`, please see [this tutorial](https://github.com/facebookexperimental/Robyn/blob/main/demo/demo.R#L200) in the demo.
+
+```
+library(Robyn)
+data("df_curve_reach_freq")
+
+# Using reach saturation as proxy
+curve_out <- robyn_calibrate(
+  df_curve = df_curve_reach_freq,
+  curve_type = "saturation_reach_hill"
+)
+# For the simulated reach and frequency dataset, it's recommended to use
+# "reach 1+" for gamma lower bound and "reach 10+" for gamma upper bound
+facebook_I_gammas <- c(
+  curve_out[["curve_collect"]][["reach 1+"]][["hill"]][["gamma_best"]],
+  curve_out[["curve_collect"]][["reach 10+"]][["hill"]][["gamma_best"]])
+print(facebook_I_gammas)
+```
+
+### Customizable for 3rd-party MMM
+**While the curve calibrator is released within the Robyn package, it can be used as a standalone feature without having built a model in Robyn.** The current beta version is piloting the two-parametric Hill function for saturation. Any MMM solution, not only Robyn, that employs the two-parametric Hill function can be calibrated by the curve calibrator.
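For reference, a minimal sketch of the two-parametric Hill transformation; mapping gamma to an inflexion point via the input range follows the convention documented above, but treat this as an illustration rather than the package's exact implementation:

```
# Two-parametric Hill saturation: alpha controls the C- vs. S-shape,
# gamma the relative position of the inflexion point within the input range
hill_saturation <- function(x, alpha, gamma) {
  inflexion <- min(x) + gamma * (max(x) - min(x))
  x^alpha / (x^alpha + inflexion^alpha)
}
spend <- seq(0, 100000, length.out = 100)
plot(spend, hill_saturation(spend, alpha = 1.5, gamma = 0.5), type = "l")
```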
+In the future, we're planning to partner with our community, advertisers, agencies and measurement vendors to further explore this area, and also to expand the curve calibrator to cover other popular nonlinear functions for saturation (e.g. exponential, arctan or power function) as well as adstock (geometric or weibull function).
+
+
---

## Model onepager

@@ -405,6 +456,17 @@ following deepdive articles:
- **"The convergence of marginal ROAS in the budget allocation in Robyn"**: [here](https://medium.com/@gufengzhou/the-convergence-of-marginal-roas-in-the-budget-allocation-in-robyn-5d407aebf021)
- **"Hitting ROAS target using Robyn’s budget allocator"**: [here](https://medium.com/@gufengzhou/hitting-roas-target-using-robyns-budget-allocator-274ace3add4f)

+---
+## The reach & frequency allocator (prototype)
+
+Reach and frequency planning is conducted frequently by advertisers and media agencies. To further harvest the power of **[Project Halo](https://wfanet.org/leadership/cross-media-measurement)** and provide more actionability for R&F data, we're experimenting with a new feature, the "reach and frequency allocator". It answers the question “what’s the optimal combination of reach & frequency” on a channel, given a total budget and an average CPM. It consumes the saturation information from R&F data and uses nonlinear optimization to find the optimal point with the highest response.
+
+Description of the graphic below: Given a $100k budget and a $6 CPM, as well as the estimated saturation curves for reach and frequency separately using Halo data, the optimum R&F combination is 5.67M reach x 2.94 frequency, with a maximum response (sales) of $317.8k.
+
+<img alt="Reach & frequency allocator prototype" src={useBaseUrl('/img/rnf_allocator_prototype.png')} />
+
+It's currently a prototype and not yet available. An MVP is expected in 2025.
+
---

## Model refresh

diff --git a/website/docs/installation.mdx b/website/docs/installation.mdx
index e3313641d..f4bb2277c 100644
--- a/website/docs/installation.mdx
+++ b/website/docs/installation.mdx
@@ -5,13 +5,7 @@ title: Installation

import useBaseUrl from '@docusaurus/useBaseUrl';

-## Installing R
----
-
-Robyn is built in R-only at the moment and requries the [R 4.0.0 version (or higher)](https://www.r-project.org/). The recommended IDE is [RStudio](https://www.rstudio.com/products/rstudio/download/).
-
-
-## Installing Robyn
+## Installing Robyn R
---

There're two versions of Robyn: The stable version on CRAN and the dev version on GitHub.

@@ -25,21 +19,9 @@ Install latest dev version, run:
```
remotes::install_github("facebookexperimental/Robyn/R")
```
-**NOTE**: Robyn requires an one-time installation of the Python library [Nevergrad](https://facebookresearch.github.io/nevergrad/) using the R package `reticulate`. Please refer to this [Nevergrad installation](https://github.com/facebookexperimental/Robyn/blob/main/demo/install_nevergrad.R) guide for R.
+**NOTE**: Robyn requires a one-time installation of the Python library [Nevergrad](https://facebookresearch.github.io/nevergrad/) using the R package `reticulate`. Please refer to this [Nevergrad installation](https://github.com/facebookexperimental/Robyn/blob/main/demo/install_nevergrad.R) guide for R. For a detailed installation and usage guide, please see [demo.R](https://github.com/facebookexperimental/Robyn/blob/main/demo/demo.R).
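For example, one possible route via `reticulate` (a sketch; exact steps vary by system, so follow the linked guide):

```
install.packages("reticulate")
reticulate::virtualenv_create("r-reticulate")
reticulate::py_install("nevergrad", pip = TRUE)
```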
-## Getting started
+## Installing Robyn Python
---

-The usage guide is detailed in the script [demo.R](https://github.com/facebookexperimental/Robyn/blob/main/demo/demo.R), where you will see working examples of all major functions, including:
-- Model input
-- Hyperparameter setting
-- Calibration input
-- Model run
-- Model output
-- Model selection and export
-- Budget allocator
-- Model refresh
-- Recreate models
-- Getting response and more...
-
----
+For a detailed Robyn Python installation and usage guide, please see [here](https://facebookexperimental.github.io/Robyn/docs/robyn-api).
diff --git a/website/docs/robyn-api.mdx b/website/docs/robyn-api.mdx
index 0e0270795..1dcf338c0 100644
--- a/website/docs/robyn-api.mdx
+++ b/website/docs/robyn-api.mdx
@@ -1,15 +1,53 @@
---
id: robyn-api
-title: Robyn API for Python
+title: Robyn Python (Beta)
---

import useBaseUrl from '@docusaurus/useBaseUrl';

-Enabling Robyn for Python has been a long-standing ask from the community. Robyn has started as an experimental R package. While we understand the needs of the users, it's difficult to maintain a natively translated Python package during active development on the R-side.
+## Alternative 1: Quick start for RobynPy (Beta)
+
+The Python version of Robyn was rewritten from Robyn's R package version 3.11.1 into Python, using object-oriented programming principles and a modular architecture for a robust solution. It was developed by utilizing various LLMs and AI workflows like Llama. As is common with any AI-based solution, there may be challenges in translating code from one language to another, and we anticipate that there could be some issues in the translation from R to Python. However, we believe in the power of community collaboration and open-source contribution. Therefore, we are opening this project to the community to participate and contribute. Together, we can address and resolve any issues that may arise, enhancing the functionality and efficiency of the Python version of Robyn. We look forward to your contributions and to the continuous improvement of this project.
+
+
+### 1. Installing the package
+
+Install the latest Robyn Python package version from PyPI:
+```
+pip3 install robynpy
+```
+
+Install from GitHub using requirements.txt:
+```
+pip3 install -r requirements.txt
+```
+### 2. Getting started
+
+The directory `python/src/robyn/tutorials` contains tutorials for the most common scenarios. Tutorials use the simulated dataset provided in the package.
+
+### 3. Running end-to-end
+
+There are two ways of running Python Robyn.
+
+**Option 1:**
+
+tutorial1.ipynb is the main notebook that runs the end-to-end flow. It is designed for the majority of users who prefer a one-click solution that runs the Robyn flow end-to-end with minimal knowledge of the underlying logic. It should run without any changes required if you wish to use the simulated dataset for testing purposes.
+
+This notebook uses APIs available in python/src/robyn/robyn.py to set the configs, run feature engineering, run model training, evaluate models with clustering, generate one-pagers and perform budget allocation.
+
+Change any of the configs directly in the notebook and avoid changing robyn.py for anything that is configurable.
+
+**Option 2:**
+
+tutorial1_src.ipynb runs the end-to-end flow of Robyn Python but with a lot more flexibility. It is designed for users who would like to have more control over which modules are and aren't run (i.e. skipping clustering, one-pager plots, budget allocation etc.).
It should run without any changes required if you wish to use the simulated dataset for testing purposes.
+
+This notebook doesn't use the APIs available in python/src/robyn/robyn.py but instead calls the modules directly with the appropriate parameters. In this way, it is more flexible, but it still expects users to understand the underlying logic, which may change when using various parameter values.
+
+## Alternative 2: The Python wrapper

The idea of a plumber API for Python was originally proposed by [Alex Rowley](https://www.facebook.com/groups/robynmmm/posts/1493036524797809/) from the Robyn community in August 2023. The Robyn team assessed the proposal and found that it is not only a great work-around for Python users to start with Robyn, but it actually allows API calls from any language. We're very grateful for the collective wisdom of the open source community.

#### Robyn API for Python beta release

-The first version of the API is released on Nov.22nd 2023 on the [Meta OST summit](https://metaostsummit23.splashthat.com/?fbclid=IwAR1SRBTZGw0GIoaF0XJq_eCWFZsZbyK0KP7P4RLKoee1IVbs8H56so3giwg). This [Jupyter notebook](https://github.com/facebookexperimental/Robyn/blob/main/robyn_api/robyn_python_notebook.ipynb) shows how to call the API from Python. In the beta version, the user needs to have the Robyn R package successfully installed first. We'll work on the migitation of installation friction in the future.
+The first version of the API was released on Nov. 22nd 2023 at the [Meta OST summit](https://metaostsummit23.splashthat.com/?fbclid=IwAR1SRBTZGw0GIoaF0XJq_eCWFZsZbyK0KP7P4RLKoee1IVbs8H56so3giwg). This [Jupyter notebook](https://github.com/facebookexperimental/Robyn/blob/main/robyn_api/robyn_python_notebook.ipynb) shows how to call the API from Python. In the beta version, the user needs to have the Robyn R package successfully installed first. We'll work on the mitigation of installation friction in the future.

Robyn API for Python Architecture

diff --git a/website/static/img/curve_calibrator.png b/website/static/img/curve_calibrator.png
new file mode 100644
index 000000000..758a2217c
Binary files /dev/null and b/website/static/img/curve_calibrator.png differ
diff --git a/website/static/img/curve_calibrator_onepager.png b/website/static/img/curve_calibrator_onepager.png
new file mode 100644
index 000000000..5769a81dd
Binary files /dev/null and b/website/static/img/curve_calibrator_onepager.png differ
diff --git a/website/static/img/rnf_allocator_prototype.png b/website/static/img/rnf_allocator_prototype.png
new file mode 100644
index 000000000..2c15e63da
Binary files /dev/null and b/website/static/img/rnf_allocator_prototype.png differ
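To illustrate the mechanism behind the plumber-based wrapper described in the robyn-api section above (a generic sketch, not the Robyn API's actual endpoints): any R function exposed through plumber becomes an HTTP endpoint callable from Python or any other language.

```
# plumber.R: generic example only, not Robyn's real endpoint schema
#* Echo a message back to the caller
#* @post /echo
function(msg = "") {
  list(reply = paste("R received:", msg))
}
# Serve it from R: plumber::pr_run(plumber::pr("plumber.R"), port = 8000)
```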