diff --git a/.travis.yml b/.travis.yml index 66d28544..4dc6e333 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,7 @@ language: r sudo: required -dist: bionic +# due to https://travis-ci.community/t/r-install-broken-travis-ci-rstudio-org-returns-403/9640/2 +# dist: bionic cache: - directories: - $HOME/R/Library diff --git a/DESCRIPTION b/DESCRIPTION index c27ecfdb..95d4b906 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -89,7 +89,6 @@ Collate: annotationOfRanges.R beta-binomial-testing.R calculatePSIValue.R - calculateStats.R countRNAseqData.R example_functions.R filterExpression.R diff --git a/NAMESPACE b/NAMESPACE index 6f01cfd7..7c317fb6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,6 +48,7 @@ export(getSplitReadCountsForAllSamples) export(hyperParams) export(injectOutliers) export(loadFraserDataSet) +export(makeSimulatedFraserDataSet) export(mapSeqlevels) export(mergeCounts) export(name) diff --git a/R/beta-binomial-testing.R b/R/beta-binomial-testing.R index db402c50..c74ce1a2 100644 --- a/R/beta-binomial-testing.R +++ b/R/beta-binomial-testing.R @@ -140,7 +140,7 @@ pvalueByBetaBinomialPerType <- function(fds, aname, psiType, pvalFun, pvalues_full[toTest,] <- pvalues # transform it to a hdf5 assay and save it to the dataset - assays(fds, type=psiType)[[aname]] <- pvalues_full + assay(fds, aname, type=psiType, withDimnames=FALSE) <- pvalues_full # save the pvalue calculations in the SE ranged object return(fds) @@ -201,7 +201,7 @@ betabinVglmTest <- function(cMat, alternative="two.sided", #' #' error/warning catching functions by martin morgan -#' http://stackoverflow.com/questions/4948361/how-do-i-save-warnings-and-errors-as-output-from-a-function +#' http://stackoverflow.com/questions/4948361 #' #' @noRd tryCatchFactory <- function(fun, timeout=300){ diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R index 43ced501..af375985 100644 --- a/R/calculatePSIValue.R +++ b/R/calculatePSIValue.R @@ -36,18 +36,12 @@ calculatePSIValues <- function(fds, types=psiTypes, overwriteCts=FALSE, overwriteCts=overwriteCts, BPPARAM=BPPARAM) } - # save Fraser object to disk - fds <- saveFraserDataSet(fds) - # calculate the delta psi value for(psiType in types){ assayName <- paste0("delta_", psiType) fds <- calculateDeltaPsiValue(fds, psiType, assayName) } - # save final Fraser object to disk - fds <- saveFraserDataSet(fds) - # return it return(fds) } @@ -88,16 +82,10 @@ calculatePSIValuePrimeSite <- function(fds, psiType, overwriteCts, BPPARAM){ # check if other counts and psi values chache file exists already cacheFile <- getOtherCountsCacheFile(sample, fds) - if(file.exists(cacheFile)){ - h5 <- HDF5Array(filepath=cacheFile, name=h5DatasetName) - if((isFALSE(overwriteCts) | - !all(paste0("rawOtherCounts_psi", c(5, 3)) %in% - assayNames(fds)) ) && - nrow(h5) == nrow(K(fds, type="psi5"))){ - - return(h5) - } - unlink(cacheFile) + ans <- checkPsiCacheFile(cFile=cacheFile, dName=h5DatasetName, + overwrite=overwriteCts, ptype=c("psi5", "psi3"), fds=fds) + if(!is.null(ans)){ + return(ans) } # add sample specific counts to the data.table (K) @@ -128,41 +116,63 @@ calculatePSIValuePrimeSite <- function(fds, psiType, overwriteCts, BPPARAM){ countData[is.na(psi5),psi5:=1] countData[is.na(psi3),psi3:=1] + # if no HDF5 is requested return it as matrix + if(dontWriteHDF5(fds)){ + return(as.matrix(countData[,.(o5,o3,psi5,psi3)])) + } + # write other counts and psi values to h5 file - # get defind chunk sizes + # get defined chunk sizes chunkDims <- c( min(nrow(countData), options()[['FRASER-hdf5-chunk-nrow']]), 1) writeHDF5Array(as.matrix(countData[,.(o5,o3,psi5,psi3)]), - filepath=cacheFile, name=h5DatasetName, + filepath=cacheFile, name=h5DatasetName, chunkdim=chunkDims, level=7, verbose=FALSE) # get counts as DelayedMatrix HDF5Array(filepath=cacheFile, name=h5DatasetName) - } ) names(psiValues) <- samples(fds) # merge it and assign it to our object - assay(fds, type="j", "psi5", withDimnames=FALSE) <- - do.call(cbind, mapply('[', psiValues, TRUE, 3, drop=FALSE)) - assay(fds, type="j", "psi3", withDimnames=FALSE) <- - do.call(cbind, mapply('[', psiValues, TRUE, 4, drop=FALSE)) + assay(fds, type="j", "psi5", withDimnames=FALSE) <- do.call(cbind, + mapply('[', psiValues, TRUE, 3, drop=FALSE, SIMPLIFY=FALSE)) + assay(fds, type="j", "psi3", withDimnames=FALSE) <- do.call(cbind, + mapply('[', psiValues, TRUE, 4, drop=FALSE, SIMPLIFY=FALSE)) if(isTRUE(overwriteCts)){ assay(fds, type="j", "rawOtherCounts_psi5", withDimnames=FALSE) <- - do.call(cbind, bplapply(psiValues, BPPARAM=BPPARAM, - function(x){ x[,1,drop=FALSE] })) + do.call(cbind, mapply('[', psiValues, TRUE, 1, + drop=FALSE, SIMPLIFY=FALSE)) assay(fds, type="j", "rawOtherCounts_psi3", withDimnames=FALSE) <- - do.call(cbind, bplapply(psiValues, BPPARAM=BPPARAM, - function(x){ x[,2,drop=FALSE] })) + do.call(cbind, mapply('[', psiValues, TRUE, 2, + drop=FALSE, SIMPLIFY=FALSE)) } return(fds) } +#' +#' Helper function to check for PSI value cached data +#' +#' @noRd +checkPsiCacheFile <- function(cFile, dName, overwrite, ptype, fds){ + if(file.exists(cFile) && dName %in% h5ls(cFile)$name){ + h5 <- HDF5Array(filepath=cFile, name=dName) + aNames <- paste0("rawOtherCounts_", ptype) + if(isFALSE(overwrite) && all( aNames %in% assayNames(fds)) && + nrow(h5) == nrow(K(fds, type=ptype[1]))){ + return(h5) + } + h5delete(cFile, name=dName) + } + return(NULL) +} + + #' #' This function calculates the site PSI values for each splice site #' based on the FraserDataSet object @@ -206,16 +216,10 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){ # get counts and psiSite values from cache file if it exists cacheFile <- getOtherCountsCacheFile(sample, fds) - if(file.exists(cacheFile) && - psiH5datasetName %in% h5ls(cacheFile)$name){ - h5 <- HDF5Array(filepath=cacheFile, name=psiH5datasetName) - if((isFALSE(overwriteCts) | !psiROCName %in% assayNames(fds)) - && nrow(h5) == nrow(K(fds, type="psiSite"))){ - - return(h5) - } else{ - h5delete(cacheFile, name=psiH5datasetName) - } + ans <- checkPsiCacheFile(cFile=cacheFile, dName=psiH5datasetName, + overwrite=overwriteCts, ptype="psiSite", fds=fds) + if(!is.null(ans)){ + return(ans) } # add sample specific counts to the data.table @@ -226,7 +230,7 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){ sdata[,os:=sum(k)-k, by="spliceSiteID"] # remove the junction part since we only want to calculate the - # psi values for the splice sites themselfs + # psi values for the splice sites themselves sdata <- sdata[type=="spliceSite"] # calculate psi value @@ -235,8 +239,13 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){ # if psi is NA this means there were no reads at all so set it to 1 sdata[is.na(psiValue),psiValue:=1] + # if no HDF5 is requested return it as matrix + if(dontWriteHDF5(fds)){ + return(as.matrix(sdata[,.(os, psiValue)])) + } + # write other counts and psi values to h5 file - # get defind chunk sizes + # get defined chunk sizes chunkDims <- c( min(nrow(sdata), options()[['FRASER-hdf5-chunk-nrow']]), 2) @@ -251,12 +260,13 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){ names(psiSiteValues) <- samples(fds) # merge it and assign it to our object - assay(fds, type="ss", psiName, withDimnames=FALSE) <- - do.call(cbind, mapply('[', psiSiteValues, TRUE, 2, drop=FALSE)) + assay(fds, type="ss", psiName, withDimnames=FALSE) <- do.call(cbind, + mapply('[', psiSiteValues, TRUE, 2, drop=FALSE, + SIMPLIFY=FALSE)) if(isTRUE(overwriteCts)){ - assay(fds, type="ss", psiROCName, withDimnames=FALSE) <- - do.call(cbind, bplapply(psiSiteValues, BPPARAM=BPPARAM, - function(x) { x[,1,drop=FALSE] })) + assay(fds, type="ss", psiROCName, withDimnames=FALSE) <- do.call(cbind, + mapply('[', psiSiteValues, TRUE, 1, drop=FALSE, + SIMPLIFY=FALSE)) } return(fds) @@ -310,4 +320,4 @@ getOtherCountsCacheFolder <- function(fds){ # return it return(cachedir) -} \ No newline at end of file +} diff --git a/R/calculateStats.R b/R/calculateStats.R deleted file mode 100644 index 669ac009..00000000 --- a/R/calculateStats.R +++ /dev/null @@ -1,53 +0,0 @@ -## -## @author Christian Mertes \email{mertes@@in.tum.de} -## -## This file contains all functions for calculating the statistics -## First it starts with calculating the Z-score for each -## site and then the p-values are calculated dependend on the -## given method in the setting file -## - -#' -#' Calculate the zscore for each PSI value. -#' -#' @noRd -#' @return FraserDataSet -#' @examples -#' fds <- createTestFraserDataSet() -#' fds <- calculatePSIValues(fds) -#' fds <- calculateZScores(fds) -calculateZScores <- function(fds, type=psiTypes){ - - # check input - stopifnot(is(fds, "FraserDataSet")) - - # calculate zscore for each psi type - for(pt in type){ - fds <- calculateZScorePerDataSet(fds, pt) - } - - return(fds) -} - -#' -#' calculates the zscore for a given data type and a given psi type -#' and adds it directly to the dataset itself -#' -#' @noRd -calculateZScorePerDataSet <- function(fds, psiType){ - - message(date(), ": Calculate the Zscore for ", psiType, " values ...") - - # get raw data and replace NA's with zeros - psiVal <- assay(fds, psiType) - - # z = ( x - mean ) / sd - rowmean <- rowMeans(psiVal, na.rm = TRUE) - rowsd <- apply(psiVal, 1, sd, na.rm = TRUE) - zscores <- (psiVal - rowmean) / rowsd - - # use as.matrix to rewrite it as a new hdf5 array - zScores(fds, type=psiType) <- as.matrix(zscores) - - return(fds) -} diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R index 366b3551..3212da98 100644 --- a/R/countRNAseqData.R +++ b/R/countRNAseqData.R @@ -475,6 +475,7 @@ countSplitReads <- function(sampleID, fds, NcpuPerSample=1, genome=NULL, } # remove chromosomes with different seqlength than in genome (if provided) + # and remove non annotation existing chromosomes from counting if(!is.null(genome)){ if(is.character(genome)){ genome <- getBSgenome(genome) @@ -483,19 +484,28 @@ countSplitReads <- function(sampleID, fds, NcpuPerSample=1, genome=NULL, chrLengths <- extractChromosomeLengths(bamFile) mismatchChrs <- which(seqlengths(genome)[chromosomes] != chrLengths) if(length(mismatchChrs) > 0){ - warning("Not counting chromosome(s) ", chromosomes[mismatchChrs], + warning("Not counting chromosome(s) ", + paste(chromosomes[mismatchChrs], collapse=", "), " in sample ", sampleID, " as it has a different length", " in the bamFile of this sample than in the provided", " genome.") chromosomes <- chromosomes[-mismatchChrs] } + missingChrs <- which(!chromosomes %in% seqnames(genome)) + if(length(missingChrs) > 0){ + warning("Not counting chromosome(s) ", + paste(chromosomes[missingChrs], collapse=", "), + " in sample ", sampleID, " as it is not specified in", + " the provided genome.") + chromosomes <- chromosomes[-missingChrs] + } } # extract the counts per chromosome countsList <- bplapply(chromosomes, FUN=countSplitReadsPerChromosome, bamFile=bamFile, pairedEnd=pairedEnd, settings=fds, genome=genome, BPPARAM=getBPParam(NcpuPerSample, length(chromosomes))) - + # sort and merge the results befor returning/saving countsGR <- sort(unlist(GRangesList(countsList))) saveRDS(countsGR, cacheFile) @@ -815,9 +825,6 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds, # unstranded case: for counting only non spliced reads we # skip this information isPairedEnd <- pairedEnd(fds[,samples(fds) == sampleID])[[1]] - if(isFALSE(as.logical(strandSpecific(fds)))){ - isPairedEnd <- FALSE - } doAutosort <- isPairedEnd # check cache if available diff --git a/R/find_encoding_dimensions.R b/R/find_encoding_dimensions.R index ca75a82d..4b1ad757 100644 --- a/R/find_encoding_dimensions.R +++ b/R/find_encoding_dimensions.R @@ -30,6 +30,9 @@ predict_outliers <- function(fds, type, implementation, BPPARAM){ } eval_prot <- function(fds, type){ + + message(date(), ": Calculating AUC-PR ...") + index <- getSiteIndex(fds, type) idx <- !duplicated(index) @@ -96,19 +99,21 @@ findEncodingDim <- function(i, fds, type, params, implementation, #' @param delayed If FALSE, count matrices will be loaded into memory (faster #' calculations), otherwise the function works on the delayedMatrix #' representations (more memory efficient). The default value depends on the -#' number of samples in the fds-object. +#' number of samples in the fds-object. +#' @param ... Additional parameters passed to \code{injectOutliers}. #' #' @return FraserDataSet #' #' @examples #' # generate data -#' fds <- createTestFraserDataSet() +#' fds <- makeSimulatedFraserDataSet(m=15, j=20) #' #' # run hyperparameter optimization -#' fds <- optimHyperParams(fds, type="psi5", implementation="PCA") +#' fds <- optimHyperParams(fds, type="psi5", q_param=c(2, 5)) #' #' # get estimated optimal dimension of the latent space #' bestQ(fds, type="psi5") +#' hyperParams(fds, type="psi5") #' #' @export optimHyperParams <- function(fds, type, implementation="PCA", @@ -116,7 +121,7 @@ optimHyperParams <- function(fds, type, implementation="PCA", noise_param=0, minDeltaPsi=0.1, iterations=5, setSubset=15000, injectFreq=1e-2, BPPARAM=bpparam(), internalThreads=1, plot=TRUE, - delayed=ifelse(ncol(fds) <= 300, FALSE, TRUE)){ + delayed=ifelse(ncol(fds) <= 300, FALSE, TRUE), ...){ if(isFALSE(needsHyperOpt(implementation))){ message(date(), ": For correction '", implementation, "' no hyper ", "parameter optimization is needed.") @@ -168,8 +173,7 @@ optimHyperParams <- function(fds, type, implementation="PCA", currentType(fds_copy) <- type # inject outliers - fds_copy <- injectOutliers(fds_copy, type=type, freq=injectFreq, - minDpsi=minDeltaPsi, method="samplePSI") + fds_copy <- injectOutliers(fds_copy, type=type, freq=injectFreq, ...) if(sum(getAssayMatrix(fds_copy, type=type, "trueOutliers") != 0) == 0){ warning(paste0("No outliers could be injected so the ", diff --git a/R/fitCorrectionMethods.R b/R/fitCorrectionMethods.R index 79bcc659..9c573791 100644 --- a/R/fitCorrectionMethods.R +++ b/R/fitCorrectionMethods.R @@ -22,7 +22,7 @@ fit <- function(fds, implementation=c("PCA", "PCA-BB-Decoder", "AE", counts(fds, type=type, side="ofInterest")) # check q is set - if(method != "BB" & (missing(q) | is.null(q))){ + if(method != "BB" && (missing(q) | is.null(q))){ stop("Please provide a q to define the size of the latent space!") } @@ -168,7 +168,7 @@ fit <- function(fds, implementation=c("PCA", "PCA-BB-Decoder", "AE", nSubset = nSubset, minDeltaPsi = minDeltaPsi ), - BB = fitBB(fds = fds, psiType = type) + BB = fitBB(fds=fds, psiType=type, BPPARAM=BPPARAM) ) return(fds) @@ -208,7 +208,7 @@ getHyperOptimCorrectionMethod <- function(correction){ } fitPCA <- function(fds, q, psiType, rhoRange=c(1e-5, 1-1e-5), noiseAlpha=NULL, - BPPARAM=bpparam(), subset=FALSE, minDeltaPsi, + BPPARAM=bpparam(), subset=FALSE, minDeltaPsi=0.1, nSubset=15000, useLM=FALSE){ counts(fds, type=psiType, side="other", HDF5=FALSE) <- as.matrix( counts(fds, type=psiType, side="other")) @@ -279,15 +279,14 @@ fitPCA <- function(fds, q, psiType, rhoRange=c(1e-5, 1-1e-5), noiseAlpha=NULL, return(fds) } -fitBB <- function(fds, psiType){ +fitBB <- function(fds, psiType, BPPARAM=bpparam()){ currentType(fds) <- psiType fds <- pvalueByBetaBinomialPerType(fds=fds, - aname=paste0("pvalues_BB_", psiType), - psiType=psiType, - pvalFun=betabinVglmTest) + aname=paste0("pvalues_BB_", psiType), + psiType=psiType, pvalFun=betabinVglmTest, BPPARAM=BPPARAM) # predictedMeans(fds, type=psiType) <- rowMeans( # getAssayMatrix(fds, type=psiType)) - predictedMeans(fds, type=psiType) <- + predictedMeans(fds, type=psiType, withDimnames=FALSE) <- mcols(fds, type=psiType)[,paste0(psiType, "_alpha")] / ( mcols(fds, type=psiType)[,paste0(psiType, "_alpha")] + mcols(fds, type=psiType)[,paste0(psiType, "_beta")] ) diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index ee89607c..7c0beea0 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -280,11 +280,7 @@ pVals <- function(fds, type=currentType(fds), level="site", #' @export padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), ...){ dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) - if( paste(paste0("padj", dist), type, sep="_") %in% assayNames(fds)){ - return(getAssayMatrix(fds, paste0("padj", dist), type=type, ...)) - } else{ - return(getAssayMatrix(fds, paste0("pajd", dist), type=type, ...)) - } + return(getAssayMatrix(fds, paste0("padj", dist), type=type, ...)) } `padjVals<-` <- function(fds, type=currentType(fds), @@ -422,7 +418,11 @@ bestNoise <- function(fds, type=currentType(fds)){ #' assays should be stored as hdf5 files. #' @export dontWriteHDF5 <- function(fds){ - return(metadata(fds)[['dontWriteHDF5']]) + ans <- metadata(fds)[['dontWriteHDF5']] + if(is.null(ans)){ + ans <- FALSE + } + return(ans) } #' @describeIn getter_setter_functions Sets whether the assays should be stored diff --git a/R/makeSimulatedDataset.R b/R/makeSimulatedDataset.R index dad3caf4..a27dabdc 100644 --- a/R/makeSimulatedDataset.R +++ b/R/makeSimulatedDataset.R @@ -25,21 +25,21 @@ #' #' @rdname makeSimulatedFraserDataSet #' @aliases makeSimulatedFraserDataSet -#' @noRd +#' @export makeSimulatedFraserDataSet <- function(m=100, j=500, q=10, - distribution=c("BB", "DM"), ...){ + distribution=c("BB", "DM"), ...){ distribution <- match.arg(distribution) fds <- switch (distribution, BB = makeSimulatedFraserDataSet_BetaBinomial(m=m, j=j, q=q, ...), DM = makeSimulatedFraserDataSet_Multinomial(m=m, j=j, q=q, ...)) - + fds } makeSimulatedFraserDataSet_BetaBinomial <- function(m=200, j=10000, q=10, - nbMeanLog=8, nbSize=0.55, rhoMeanlog=-5, rhoSdlog=1, - Hmean=0, Hsd=100, Dmean=0, Dsd=0.007, ...){ - + nbMeanLog=8, nbSize=0.55, rhoMeanlog=-5, rhoSdlog=1, + Hmean=0, Hsd=100, Dmean=0, Dsd=0.007, ...){ + # Simulate total coverage N = nonSplit + n at each junction using lognormal junction_n <- rlnorm(j, meanlog=nbMeanLog, sdlog=nbSize) N <- t(vapply(junction_n, function(x){ @@ -48,124 +48,124 @@ makeSimulatedFraserDataSet_BetaBinomial <- function(m=200, j=10000, q=10, double(m))) # N <- matrix(rnbinom(j*m, mu=nbMean, size=nbSize), nrow=j, ncol=m) mode(N) <- 'integer' - + # # Simulate covariates for SE # H_true <- matrix(rnorm(m*q, mean=Hmean, sd=Hsd), nrow=m, ncol=q) D_true <- matrix(rnorm(j*q, mean=Dmean, sd=Dsd), nrow=j, ncol=q) b_true <- sample(c(rnorm(round(j*(1/6)), mean=3.5, sd=1.5), - rnorm(round(j*(5/6)), mean=-2.5, sd=2.5))) + rnorm(j - round(j*(1/6)), mean=-2.5, sd=2.5))) y_se <- t(predictYCpp(H_true, D_true, b_true)) mu_se <- predictMuCpp(y_se) # betaBin dispersion rho_se <- rlnorm(j, meanlog=rhoMeanlog, sdlog=rhoSdlog) - + # Draw nonSplit reads from BB nonSplit <-matrix(rbetabinom(j*m, size=N, prob=mu_se, rho=rho_se), nrow=j, ncol=m) mode(nonSplit) <- 'integer' - + # Set n = N - nonSplit n <- N - nonSplit - + # # Simulate covariates for PSI3=PSI5 # H_true <- matrix(rnorm(m*q, mean=Hmean, sd=Hsd), nrow=m, ncol=q) D_true <- matrix(rnorm(j*q, mean=Dmean, sd=Dsd), nrow=j, ncol=q) b_true <- sample(c(rnorm(round(j*(1/6)), mean=-2.5, sd=2.5), - rnorm(round(j*(5/6)), mean=3.5, sd=1.5))) + rnorm(j-round(j*(1/6)), mean=3.5, sd=1.5))) y_psi <- t(predictYCpp(H_true, D_true, b_true)) - mu_psi <- predictMuCpp(y_psi) + mu_psi <- predictMuCpp(y_psi) # betaBin dispersion rho_psi <- rlnorm(j, meanlog=rhoMeanlog, sdlog=rhoSdlog) - + # Draw nonSplit reads from BB - k <-matrix(rbetabinom(j*m, size=n, prob=mu_psi, rho=rho_psi), + k <- matrix(rbetabinom(j*m, size=n, prob=mu_psi, rho=rho_psi), nrow=j, ncol=m) mode(k) <- 'integer' - - + + # # Create FRASER data set # sampleIDs <- paste0("sample", seq_len(m)) anno <- data.table(sampleID = sampleIDs, bamFile=rep(NA, m)) fds <- FraserDataSet(colData=anno, ...) - + # put in n as rawcountsJ first so it doesn't complain later # when assinging k to it junctionData <- SummarizedExperiment( colData=colData(fds), assays=list(rawCountsJ=k), rowRanges=GRanges(seqnames=rep("chr1", j), - ranges=IRanges(start=seq_len(j), width=1 )) - ) + ranges=IRanges(start=seq_len(j), width=1 ))) + nonSplitData <- SummarizedExperiment( colData=colData(fds), assays=list(rawCountsSS=nonSplit), rowRanges=GRanges(seqnames=rep("chr1", j), - ranges=IRanges(start=seq_len(j), width=1 )) - ) + ranges=IRanges(start=seq_len(j), width=1 ))) + fds <- new("FraserDataSet", junctionData, name = name(fds), bamParam = scanBamParam(fds), strandSpecific = strandSpecific(fds), workingDir = workingDir(fds), - nonSplicedReads = nonSplitData - ) + nonSplicedReads = nonSplitData) + metadata(fds)[['optimalEncDim']] <- q metadata(fds)[['encDimTable']] <- data.table( encodingDimension=q, evaluationLoss=1, evalMethod='simulation') - + # Store "other" counts counts(fds, type="psi3", side="other", withDimnames=FALSE) <- n - k counts(fds, type="psi5", side="other", withDimnames=FALSE) <- n - k counts(fds, type="psiSite", side="other", withDimnames=FALSE) <- n - + # store information about the simulation in the fds # (same values for psi3 and psi5) for(type in c("psi3", "psi5")){ setAssayMatrix(fds=fds, name="truePSI", type=type, - withDimnames=FALSE) <- mu_psi + withDimnames=FALSE) <- mu_psi setAssayMatrix(fds=fds, name="trueLogitPSI", type=type, - withDimnames=FALSE) <- y_psi - mcols(fds, type=type)[,"trueRho"] <- rho_psi - + withDimnames=FALSE) <- y_psi + mcols(fds, type=type)[,"trueRho"] <- rho_psi + # needed so that subsetting the fds works later mcols(fds, type=type)[["startID"]] <- seq_len(nrow(mcols(fds, type=type))) mcols(fds, type=type)[["endID"]] <- seq_len(nrow(mcols(fds, type=type))) } - + # store info for SE setAssayMatrix(fds=fds, name="truePSI", type="psiSite", - withDimnames=FALSE) <- mu_se + withDimnames=FALSE) <- mu_se setAssayMatrix(fds=fds, name="trueLogitPSI", type="psiSite", - withDimnames=FALSE) <- y_se + withDimnames=FALSE) <- y_se mcols(fds, type="psiSite")[,"trueRho"] <- rho_se - + # needed so that subsetting the fds works later siteIDs <- seq_row(mcols(fds, type="psiSite")) mcols(fds, type="psiSite")[["startID"]] <- siteIDs mcols(fds, type="psiSite")[["endID"]] <- siteIDs mcols(fds, type="psiSite")[["spliceSiteID"]] <- siteIDs - + return(fds) - + } # # Generate a simulated fds using a Dirichlet-Mulitnomial distribution # makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, - groups=round(j*0.65), nbMeanLog=8, nbSize=0.55, - rhoMeanlog=-5, rhoSdlog=1, Hmean=0, Hsd=100, Dmean=0, - Dsd=0.007, ...){ - + groups=round(j*0.65), nbMeanLog=8, nbSize=0.55, + rhoMeanlog=-5, rhoSdlog=1, Hmean=0, Hsd=100, Dmean=0, + Dsd=0.007, ...){ + # # Simulate groups of junctions having the same donor/acceptor site # @@ -173,12 +173,12 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, donorGroups <- seq_len(d) # ids of the groups (1,2,...,d) # assign junction to donor/acceptor groups junctionGroups <- sort(sample(x = donorGroups, size=j, replace = TRUE)) - + # Set d to the actual number of groups (some might not have been assigned # any junction at all) donorGroups <- donorGroups[donorGroups %in% unique(junctionGroups)] d <- length(donorGroups) - + # # SE simulated as additional junction for every group # @@ -186,8 +186,8 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, psiJunctionGroups <- junctionGroups # store for later j <- j + d # add one junction for every group junctionGroups <- sort(c(junctionGroups, donorGroups)) - - + + # # Simulate n for each junction using lognormal and negative binomial # @@ -197,7 +197,7 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, n <- rnbinom(m, mu=x, size=nbSize)}, double(m))) # n <- matrix(rnbinom(j*m, mu=nMean, size=theta), nrow=j, ncol=m) - + # Sum up the values of n within one donor/acceptor group to get the n value # of the group dt_n <- as.data.table(n) @@ -205,16 +205,16 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, res <- lapply(colnames(dt_n),function(x){ sum_n[,c(paste0(x, "_sum")):=sum(get(x)),by=junctionGroups] }) - + # Extract final n matrix (all junctions within a group have the same n # value) sum_cols <- paste0(colnames(dt_n), "_sum") n <- as.matrix(sum_n[,..sum_cols]) colnames(n) <- NULL mode(n) <- 'integer' - - - + + + # # Simulate betaBin dispersion for every donor/acceptor group, same # dispersion for all junctions within a group @@ -225,9 +225,9 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, dt2 <- data.table(group=junctionGroups) rhoFull <- merge(dt1, dt2, by="group") rho <- rhoFull[,rho] # set dispersion for every junction - - - + + + # # Simulate covariates. # @@ -236,7 +236,7 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, b_true <- double(j) y_true <- t(predictYCpp(H_true, D_true, b_true)) ae_mu <- predictMuCpp(y_true) - + # Use softmax on mu to get mu within one group to sum up to 1 softmax <- function(x){ sum <- sum(exp(x)) @@ -247,170 +247,170 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, return( apply(group_mu, 2, softmax) ) }) mu <- do.call(rbind, softmax_mu) - + # Calcluate alpha values alpha <- mu * (1-rho)/rho - - - + + + # # Simulate psi3=psi5 and SE jointly with Dirchilet-Multinomial # res <- vapply(donorGroups, function(groupID){ - + # Get the indices of the junctions in this group pos <- which(junctionGroups == groupID) - + # get alpha and n values for this group group_alpha <- alpha[pos,] group_n <- n[pos,] - + # draw counts from a dirichlet multinomial distribution # (jointly for PSi and SE) counts <- t(rdirmnom(m, group_n[1,], t(group_alpha))) - + # First row of resulting counts represents non split reads for SE, # the other rows are the counts for the "real" junctions nonSplit <- counts[1,] k <- counts[-1,] - + # Substract nonSplit reads from total N to get n for PSI (=other for SE) group_n <- group_n - matrix(nonSplit, nrow=nrow(group_n), - ncol=ncol(group_n), byrow = TRUE) - + ncol=ncol(group_n), byrow = TRUE) + # Also get mu and rho for this group (to split it into PSI and SE part # for storing later) group_mu <- mu[pos,] group_rho <- rho[pos] - + # First row/value is the one for SE se_n <- group_n[1,] se_mu <- group_mu[1,] se_alpha <- group_alpha[1,] se_rho <- group_rho[1] - + # The other rows/values are the ones relevant for PSI psi_n <- group_n[-1,] psi_alpha <- group_alpha[-1,] psi_rho <- group_rho[-1] psi_mu <- group_mu[-1,] if(is.null(nrow(psi_mu))){ # only one junction in the group, convert - # to matrix with one row so that colSums - # works + # to matrix with one row so that colSums + # works psi_mu <- matrix(psi_mu, nrow=1) } # scale mu's so that they sum up to 1 again (after mu for SE is removed) psi_mu <- psi_mu / matrix(colSums(psi_mu), nrow=nrow(psi_mu), - ncol=ncol(psi_mu), byrow = TRUE) - + ncol=ncol(psi_mu), byrow = TRUE) + # return everyting relevant for storing counts, n, alpha, mu, # rho split into the parts for PSI and SE return(list(k=k, nonSplit=nonSplit, n=psi_n, group_n=se_n, - psi_mu=psi_mu, se_mu=se_mu, psi_alpha=psi_alpha, - se_alpha=se_alpha, psi_rho=psi_rho, se_rho=se_rho)) + psi_mu=psi_mu, se_mu=se_mu, psi_alpha=psi_alpha, + se_alpha=se_alpha, psi_rho=psi_rho, se_rho=se_rho)) }, FUN.VALUE=list(k=numeric(m), nonSplit=numeric(m), n=numeric(m), - group_n=numeric(m), psi_mu=numeric(m), - se_mu=numeric(m), psi_alpha=numeric(m), - se_alpha=numeric(m), psi_rho=numeric(1), - se_rho=numeric(1))) - + group_n=numeric(m), psi_mu=numeric(m), + se_mu=numeric(m), psi_alpha=numeric(m), + se_alpha=numeric(m), psi_rho=numeric(1), + se_rho=numeric(1))) + # Extract k and nonSplit reads k <- do.call(rbind, res[1,]) nonSplit <- do.call(rbind, res[2,]) mode(k) <- 'integer' mode(nonSplit) <- 'integer' - + # Extract n value for each junction and group n value for SE psi_n <- do.call(rbind, res[3,]) se_n <- do.call(rbind, res[4,]) - + # Extract mu values for PSI and SE psi_mu <- do.call(rbind, res[5,]) se_mu <- do.call(rbind, res[6,]) - + # Extract alpha values for PSI and SE psi_alpha <- do.call(rbind, res[7,]) se_alpha <- do.call(rbind, res[8,]) - + # Extract rho values for PSI and SE psi_rho <- unlist(res[9,]) se_rho <- unlist(res[10,]) - - - + + + # # Create FRASER data set # sampleIDs <- paste0("sample", seq_len(m)) anno <- data.table(sampleID = sampleIDs, bamFile=rep(NA, m)) fds <- FraserDataSet(colData=anno, ...) - + # put in k as rawcountsJ (= nr reads spanning each junction) junctionData <- SummarizedExperiment( colData=colData(fds), assays=list(rawCountsJ=k), rowRanges=GRanges(seqnames=rep("chr1", nrJunctions), - ranges=IRanges(start=psiJunctionGroups, width=1)) - ) + ranges=IRanges(start=psiJunctionGroups, width=1))) + nonSplitData <- SummarizedExperiment( colData=colData(fds), assays=list(rawCountsSS=nonSplit), rowRanges=GRanges(seqnames=rep("chr1", d), - ranges=IRanges(start=donorGroups, width=1)) - ) + ranges=IRanges(start=donorGroups, width=1))) + fds <- new("FraserDataSet", - junctionData, - name = name(fds), - bamParam = scanBamParam(fds), - strandSpecific = strandSpecific(fds), - workingDir = workingDir(fds), - nonSplicedReads = nonSplitData - ) + junctionData, + name = name(fds), + bamParam = scanBamParam(fds), + strandSpecific = strandSpecific(fds), + workingDir = workingDir(fds), + nonSplicedReads = nonSplitData) + metadata(fds)[['optimalEncDim']] <- q - - + + # store other counts (n-k) for psi3=psi5 and SE (other=n) counts(fds, type="psi3", side="other", withDimnames=FALSE) <- psi_n - k counts(fds, type="psi5", side="other", withDimnames=FALSE) <- psi_n - k counts(fds, type="psiSite", side="other", withDimnames=FALSE) <- se_n - + for(type in c("psi3", "psi5")){ # store information about the simulation in the fds # (same values for psi3 and psi5) setAssayMatrix(fds=fds, name="truePSI", type=type, - withDimnames=FALSE) <- psi_mu + withDimnames=FALSE) <- psi_mu setAssayMatrix(fds=fds, name="trueLogitPSI", type=type, - withDimnames=FALSE) <- qlogis(psi_mu) + withDimnames=FALSE) <- qlogis(psi_mu) setAssayMatrix(fds=fds, name="trueAlpha", type=type, - withDimnames=FALSE) <- psi_alpha - mcols(fds, type=type)[,"trueRho"] <- psi_rho - + withDimnames=FALSE) <- psi_alpha + mcols(fds, type=type)[,"trueRho"] <- psi_rho + # needed so that subsetting the fds works later mcols(fds, type=type)[["startID"]] <- - psiJunctionGroups # 1:nrow(mcols(fds, type=type)) + psiJunctionGroups # 1:nrow(mcols(fds, type=type)) mcols(fds, type=type)[["endID"]] <- - psiJunctionGroups # 1:nrow(mcols(fds, type=type)) + psiJunctionGroups # 1:nrow(mcols(fds, type=type)) } - + # store info for SE setAssayMatrix(fds=fds, name="truePSI", type="psiSite", - withDimnames=FALSE) <- se_mu + withDimnames=FALSE) <- se_mu setAssayMatrix(fds=fds, name="trueLogitPSI", type="psiSite", - withDimnames=FALSE) <- qlogis(se_mu) + withDimnames=FALSE) <- qlogis(se_mu) setAssayMatrix(fds=fds, name="trueAlpha", type="psiSite", - withDimnames=FALSE) <- se_alpha + withDimnames=FALSE) <- se_alpha mcols(fds, type="psiSite")[,"trueRho"] <- se_rho - + # needed so that subsetting the fds works later mcols(fds, type="psiSite")[["startID"]] <- - donorGroups # 1:nrow(mcols(fds, type="psiSite")) + donorGroups # 1:nrow(mcols(fds, type="psiSite")) mcols(fds, type="psiSite")[["endID"]] <- - seq_len(nrow(mcols(fds, type="psiSite"))) + seq_len(nrow(mcols(fds, type="psiSite"))) mcols(fds, type="psiSite")[["spliceSiteID"]] <- - donorGroups # 1:nrow(mcols(fds, type="psiSite")) - + donorGroups # 1:nrow(mcols(fds, type="psiSite")) + return(fds) - + } #' @@ -430,18 +430,20 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, #' adding to the mean psi of the junction over all samples or "samplePSI" to #' add to the psi value of the junction in the specific sample. "simulatedPSI" #' is only possible if a simulated dataset is used. +#' @param BPPARAM A BiocParallel object to run the computation in parallel #' #' @return FraserDataSet #' #' @examples -#' # An example dataset -#' fds <- createTestFraserDataSet() +#' # A generic dataset +#' fds <- makeSimulatedFraserDataSet() #' fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3) #' @export injectOutliers <- function(fds, type=c("psi5", "psi3", "psiSite"), freq=1E-3, minDpsi=0.2, minCoverage=2, deltaDistr="uniformDistr", verbose=FALSE, - method=c('meanPSI', 'samplePSI', 'simulatedPSI')){ + method=c('samplePSI', 'meanPSI', 'simulatedPSI'), + BPPARAM=bpparam()){ type <- match.arg(type, several.ok=TRUE) method <- match.arg(method) @@ -449,278 +451,297 @@ injectOutliers <- function(fds, type=c("psi5", "psi3", "psiSite"), for(t in type){ fds <- injectOutliers(fds, type=t, freq=freq, minDpsi=minDpsi, minCoverage=minCoverage, deltaDistr=deltaDistr, - verbose=verbose, method=method) + verbose=verbose, method=method, BPPARAM=BPPARAM) } return(fds) } - - # get infos from the fds - if(paste0("originalCounts_", type) %in% assayNames(fds)){ - message("Use existing original counts and reinject outliers.") - k <- as.matrix(getAssayMatrix(fds, type=type, "originalCounts")) - o <- as.matrix(getAssayMatrix(fds, type=type, "originalOtherCounts")) - } else { - k <- as.matrix(counts(fds, type=type, side="ofI")) - o <- as.matrix(counts(fds, type=type, side="other")) + # copy original k and o + if(type == "psiSite"){ + setAssayMatrix(fds, type="psiSite", "originalCounts", + withDimnames=FALSE) <- + counts(fds, type="psiSite", side="ofInterest") + setAssayMatrix(fds, type="psiSite", "originalOtherCounts", + withDimnames=FALSE) <- + counts(fds, type="psiSite", side="other") } - n <- k + o - m <- ncol(k) - j <- nrow(k) - - # compute psi - psi <- switch(method, + else{ + setAssayMatrix(fds, type=type, "originalCounts", + withDimnames=FALSE) <- + counts(fds, type=type, side="ofInterest") + setAssayMatrix(fds, type="psi5", "originalOtherCounts", + withDimnames=FALSE) <- + counts(fds, type="psi5", side="other") + setAssayMatrix(fds, type="psi3", "originalOtherCounts", + withDimnames=FALSE) <- + counts(fds, type="psi3", side="other") + } + + # get infos from the fds + m <- ncol(fds) + j <- nrow(mcols(fds, type=type)) + + k <- as.matrix(K(fds, type=type)) + n <- as.matrix(N(fds, type=type)) + o <- as.matrix(counts(fds, type=type, side="other")) + + psi <- switch(match.arg(method), samplePSI = (k + pseudocount())/(n + 2*pseudocount()), meanPSI = matrix(nrow=j, ncol=m, - rowMeans( (k + pseudocount())/(n + 2*pseudocount()) )), - simulatedPSI = getAssayMatrix(fds, "truePSI", type=type)) - - # - # matrix of donor/acceptor for possible injection - # - tmpIndex <- getSiteIndex(fds, type) - dt <- as.data.table(rowRanges(fds, type=type))[,.( - chr=seqnames, start, end, strand, idxInCount=.I, - junctionID=tmpIndex)] - dt[,nPerGroup:=.N,by=junctionID] - dt[,idxGroup:=.GRP, by=junctionID] - dt[,idxInGroup:=seq_len(.N),by=junctionID] - - if(max(dt$idxGroup) * m * freq < 10){ - freq <- 10/(max(dt$idxGroup) * m) - warning("Injection-frequency is to low. Increasing it to `", - signif(freq, 2), "` so we can inject at least 10 events ", - "into the data set!") - } - - # where do we inject and how - indexOut_groups <- matrix(ncol=m, sample(c(0,1,-1), - max(dt$idxGroup)*m, replace=TRUE, - prob=c(1-freq, freq/2, freq/2))) - - # positions where outliers will be injected - list_index <- which(indexOut_groups != 0, arr.ind=TRUE) - - - # sample primary injection - primary <- NULL - primaryInjection <- merge(as.data.table(list_index), dt, - sort=FALSE, by.x="row", by.y="idxGroup", allow.cartesian=TRUE) - primaryInjection[,primary:=idxInGroup==sample(idxInGroup,1),by="row,col"] - primaryInjection[,n:=n[cbind(idxInCount, col)]] - primaryInjection[,k:=k[cbind(idxInCount, col)]] - primaryInjection[,nOk:=length(unique(n)) == 1, by="row,col"] - if(nrow(primaryInjection[nOk == FALSE]) / nrow(primaryInjection) > 0.05){ - # this can happen in the lower range if the prediction of the direction - # did not work properly. Where the donor/acceptor goes the wrong way. - # But this should not be too frequent. So ignore if it is below 5% - warning("It looks like you have a problem with your indexes. ", - "Please report it on github to the developer.") + rowMeans( (k + pseudocount()) / (n + 2*pseudocount()))), + simulatedPSI = getAssayMatrix(fds, "truePSI", type=type) ) + + # nedded to have comparably many outliers when type is SE + if(type == "psiSite"){ + freq <- freq/10 } - primaryInjection <- primaryInjection[nOk == TRUE] - primaryInjection[,I:=cumsum(primary)] - primaryInjection[,I:=max(I),by="row,col"] - - - # get matrix accessors (row, col) aka (junctionID, sampleID) - primaryIndexInReal <- as.matrix( - primaryInjection[primary == TRUE, .(idxInCount, col)]) - primaryIndexInGroup <- as.matrix( - primaryInjection[primary == TRUE, .(row, col)]) - secondaryIndexInReal <- as.matrix( - primaryInjection[primary != TRUE, .(idxInCount, col)]) - secondaryIndexInGroup <- as.matrix( - primaryInjection[primary != TRUE, .(row, col)]) - # init with empty for psiSite. Function rep does not - # support times of length zero eg: rep(1, integer(0)) - secondaryPrimaryIndex <- as.matrix(primaryInjection[FALSE,.( - row, col, idxInCount, junctionID, nPerGroup, I, rep=integer(.N))]) - if(nrow(primaryInjection[primary == TRUE & nPerGroup > 1]) > 0){ - secondaryPrimaryIndex <- as.matrix(primaryInjection[ - primary == TRUE & nPerGroup > 1, .( - idxInCount, junctionID, nPerGroup, I, - rep=rep(TRUE, nPerGroup-1)), - by="row,col"]) + + # data table with information about chr, start, end, + # ... for calculating groups + dt <- if(type == "psiSite"){ + data.table( + junctionID = 1:j, + chr = as.factor(seqnames(nonSplicedReads(fds))), + start = start(nonSplicedReads(fds)), + end = end(nonSplicedReads(fds)), + strand = as.factor(strand(nonSplicedReads(fds))) ) + }else{ + data.table( + junctionID = 1:j, + chr = as.factor(seqnames(fds)), + start = start(fds), + end = end(fds), + strand = as.factor(strand(fds)) ) } + dt[, donorGroupSize:=length(junctionID), by=c("chr", "start", "strand")] + dt[, acceptorGroupSize:=length(junctionID), by=c("chr", "end", "strand")] + dt[, donorGroupID := .GRP, by=c("chr", "start", "strand")] + dt[, acceptorGroupID:=.GRP, by=c("chr", "end", "strand")] + + # Get groups where outlier can be injected + available_groups <- switch(type, + psi3 = dt[acceptorGroupSize > 1, acceptorGroupID, + by=acceptorGroupID]$acceptorGroupID, + psi5 = dt[donorGroupSize > 1, donorGroupID, + by=donorGroupID]$donorGroupID, + psiSite = seq_len(j)) - # get direction of injection - injDirection <- indexOut_groups[primaryIndexInGroup] - # table(injDirection) - - # get current psi of this junction and calculate maximal possible delta psi - junction_psi <- psi[primaryIndexInReal] - maxDpsi <- ifelse(injDirection > 0, 1 - junction_psi, junction_psi) - - # if not possible to inject here with at least minDpsi, - # change injection direction - injectionPossible <- maxDpsi > minDpsi - # table(injectionPossible) - injDirection[!injectionPossible] <- -injDirection[!injectionPossible] - maxDpsi <- ifelse(injDirection > 0, 1 - junction_psi, junction_psi) - - # ensure that injected points becomes an outlier by adding - # mean delta psi as an offset - meanDpsi <- rowMeans2(abs(psi[primaryIndexInReal[,"idxInCount"],] - - rowMeans2(psi[primaryIndexInReal[,"idxInCount"],]))) - - # sample delta psi for injection from uniform distribution between - # min and max dpsi - injMinDpsi <- ifelse(minDpsi + meanDpsi < maxDpsi, - minDpsi + meanDpsi, maxDpsi) - injDpsi <- injDirection * switch(deltaDistr, - uniformDistr = runif(length(injMinDpsi), injMinDpsi, maxDpsi), - ifelse(deltaDistr > maxDpsi, maxDpsi, deltaDistr)) - - # get N of this junction - n_ji <- n[primaryIndexInReal] - - # inject new primary k_ij -> change k based on n and new_psi - # (and take pseudocounts into account): (k+1)/(n+2)=psi -> k = psi*(n+2)-1 - new_primary_psi <- junction_psi + injDpsi - new_primary_k <- pmax(pmin(round( - new_primary_psi*(n_ji + 2*pseudocount()) - pseudocount()), n_ji), 0) - - # inject secondary k_ij -> change k based on n and the new_psi - second_delta_psi <- - injDpsi[secondaryPrimaryIndex[,"I"]] * ( - psi[secondaryIndexInReal] / ( - 1-psi[secondaryPrimaryIndex[,c("idxInCount", "col"),drop=FALSE]])) - new_second_psi <- psi[secondaryIndexInReal] + second_delta_psi - - # sanity check for injected psi - if(any(new_second_psi < -0.0001) | any(new_second_psi > 1.0001)){ - warning("Have to cut injected delta psi for: ", - sum(new_second_psi < -0.0001), " and ", - sum(new_second_psi > 1.0001), " instances!") + # e.g. for psi3/5: no donor/acceptor + # groups with at least 2 junctions (e.g in simulationBB) + if(length(available_groups) == 0){ + available_groups <- seq_len(j) + freq <- freq/10 } - new_second_psi <- pmin(pmax(new_second_psi, 0), 1) - new_second_k <- pmax(pmin(round(new_second_psi * (( - n[secondaryIndexInReal] + 2 * pseudocount()) - - pseudocount())), - n[secondaryIndexInReal]), 0) - - # - # check if we really have an outlier or not - # and get primiary/secondary injections - # - goodInjections <- n[primaryIndexInReal] >= minCoverage & - abs(k[primaryIndexInReal] - new_primary_k) > 2 - goodJunctionIds <- primaryInjection[primary == TRUE][ - goodInjections,junctionID] - goodSecondary <- primaryInjection[primary == FALSE][, - junctionID %in% goodJunctionIds] - - message(paste0(date(), ": Injecting outliers: ", sum(goodInjections), - " / ", sum(goodSecondary), " (primary/secondary)")) - - # - # prepare the injection into the dataset only for those which passed QC - # - - # set counts (k and o) - k[primaryIndexInReal[goodInjections,,drop=FALSE]] <- - new_primary_k[goodInjections] - k[secondaryIndexInReal[goodSecondary,,drop=FALSE]] <- - new_second_k[goodSecondary] - o <- n - k - - # set injection status (direction, primary, secondary) - indexOut <- matrix(0, nrow=nrow(k), ncol=ncol(k)) - indexOut[primaryIndexInReal[goodInjections,]] <- - injDirection[goodInjections] - indexOut[secondaryIndexInReal[goodSecondary,]] <- - 2 * injDirection[secondaryPrimaryIndex[,"I"][goodSecondary]] - - # set injected delta psi - indexDeltaPsi <- matrix(0, nrow=nrow(k), ncol=ncol(k)) - indexDeltaPsi[primaryIndexInReal[goodInjections,]] <- - injDpsi[goodInjections] - indexDeltaPsi[secondaryIndexInReal[goodSecondary,]] <- - second_delta_psi[goodSecondary] - - # - # do the injection and save the additional informations in the object - # - # copy original k and o - if(type == "psiSite"){ - setAssayMatrix(fds, type="psiSite", "originalCounts", - withDimnames=FALSE) <- - counts(fds, type="psiSite", side="ofInterest") - setAssayMatrix(fds, type="psiSite", "originalOtherCounts", - withDimnames=FALSE) <- - counts(fds, type="psiSite", side="other") - } else { - setAssayMatrix(fds, type=type, "originalCounts", - withDimnames=FALSE) <- - counts(fds, type=type, side="ofInterest") - setAssayMatrix(fds, type="psi5", "originalOtherCounts", - withDimnames=FALSE) <- - counts(fds, type="psi5", side="other") - setAssayMatrix(fds, type="psi3", "originalOtherCounts", - withDimnames=FALSE) <- - counts(fds, type="psi3", side="other") + list_index <- data.frame(row=integer(0), col=integer(0)) + count <- 0 + while(nrow(list_index) < 1){ + if(count == 20){ + stop("Could not inject at least 2 outliers.", + " Please make sure you have enough junctions and samples", + " in you dataset.") + } + count <- count + 1 + + indexOut_groups <- matrix(sample(c(0,1,-1), length(available_groups)*m, + replace=TRUE, prob=c(1-freq, freq/2, freq/2)), ncol=m) + + # positions where outliers will be injected + list_index <- which(indexOut_groups != 0, arr.ind = TRUE) } - # store new k and o counts including the outlier counts - counts(fds, type=type, side="ofInterest", withDimnames=FALSE) <- k - counts(fds, type=type, side="other", withDimnames=FALSE) <- o + # apply injection function to each outlier + message(date(), ": Injecting ", nrow(list_index), " outliers ...") + result <- bplapply(seq_len(nrow(list_index)), list_index=list_index, + indexOut_groups=indexOut_groups, type=type, psi=psi, n=n, dt=dt, + minDpsi=minDpsi, verbose=verbose, BPPARAM=SerialParam(), + FUN=function(j, list_index, indexOut_groups, type, + psi, n, dt=dt, minDpsi, verbose){ + # extract group, sample and injecetion + # direction (i.e +1/up or -1/down) + row <- list_index[j,'row'] + group <- available_groups[row] + sample <- list_index[j,'col'] + injDirection <- indexOut_groups[row, sample] + + # sample one junction from all junction within this group + group_junctions <- if(type == "psi3"){ + dt[acceptorGroupID == group, junctionID] } + else { + dt[donorGroupID == group, junctionID] } + junction <- if(length(group_junctions)==1){ + group } + else { + sample(group_junctions, 1) } + + # get current psi of this junction and calculate + # maximla possible value of delta psi for the injection + junction_psi <- psi[junction, sample] + maxDpsi <- if(injDirection > 0){ + 1 - junction_psi } + else{ + junction_psi } + + # if not possible to inject here with at least minDpsi, + # change injection direction + if(maxDpsi < minDpsi){ + injDirection <- -injDirection + indexOut_groups[row, sample] <- injDirection + maxDpsi <- if(injDirection > 0){ + 1 - junction_psi } + else{ + junction_psi } + } + + # sample delta psi for injection from uniform + # distribution between min and max dpsi + minDpsi <- ifelse(minDpsi < maxDpsi, minDpsi, maxDpsi) + injDpsi <- injDirection * switch(as.character(deltaDistr), + uniformDistr = runif(1, minDpsi, maxDpsi), + ifelse(as.double(deltaDistr) > maxDpsi, + maxDpsi, as.double(deltaDistr)) ) + + # get N of this junction + n_ji <- n[junction,sample] + + # new counts after injection + newKs <- integer(length(group_junctions)) + indexDpsi <- double(length(group_junctions)) + indOut <- integer(length(group_junctions)) + + # for all other junctions in this group + for(i in seq_len(length(group_junctions))){ + junction_k <- group_junctions[i] + # get new_psi and change k based on n and new_psi + # (and take pseudocounts into account): + # (k+1)/(n+2)=psi -> k = psi*(n+2) - 1 + if(junction_k == junction){ + new_psi <- junction_psi + injDpsi + new_k <- round( new_psi * + (n_ji + 2*pseudocount()) - pseudocount() ) + + # store position of outlier + indOut[i] <- injDirection + indexDpsi[i] <- injDpsi + } + else{ + deltaPSI_k <- - (psi[junction_k,sample] / + (1-junction_psi)) * injDpsi + new_psi <- psi[junction_k,sample] + deltaPSI_k + new_k <- round( new_psi*(n_ji + 2*pseudocount()) - + pseudocount() ) + + # also store position of other junction used in swap + indOut[i] <- -injDirection * 2 + indexDpsi[i] <- deltaPSI_k + } + # for SE: ensure new_k <= n_ij + # (so that o=n-k is always >= 0) + # (not needed for psi3/5 because o + # will recalculated from k's there) + if(length(group_junctions)==1){ # if(type == "psiSite"){ + new_k <- min(new_k, n_ji) + } + # ensure new_k >= 0 and assign k_ij <- new_k + newKs[i] <- max(0, new_k) + } + + if(verbose){ + print(paste("injected outlier", j, "with delta PSI of", + injDpsi, "at junction", junction, "and sample", + sample)) + } + + return(list(newKs = newKs, newOs=sum(newKs)-newKs, + junctions = group_junctions, injDeltaPSI = indexDpsi, + injDirections = indOut, + sample = rep(sample, length(group_junctions)))) + }) + + # get all junctions, samples, ... where outlier injection changed the counts + junctions <- unlist(lapply(result, "[[", 'junctions')) + samples <- unlist(lapply(result, "[[", 'sample')) + newKs <- unlist(lapply(result, "[[", 'newKs')) + newOs <- unlist(lapply(result, "[[", 'newOs')) + injDirection <- unlist(lapply(result, "[[", 'injDirections')) + injDeltaPSI <- unlist(lapply(result, "[[", 'injDeltaPSI')) + + # matrices to store indices of outliers and their delta psi + indexOut <- matrix(0, nrow = j, ncol = m) + indexDeltaPSI <- matrix(0, nrow = j, ncol = m) + # set counts to changed counts after the injection + replaceIndices <- matrix(c(junctions,samples), ncol=2) + k[replaceIndices] <- newKs + if(length(available_groups) == j){ + o <- n - k + } else{ + o[replaceIndices] <- newOs + } + indexOut[replaceIndices] <- injDirection + indexDeltaPSI[replaceIndices] <- injDeltaPSI + # store positions of true outliers and their true delta PSI value setAssayMatrix(fds=fds, name="trueOutliers", type=type, - withDimnames=FALSE) <- indexOut + withDimnames=FALSE) <- indexOut setAssayMatrix(fds=fds, name="trueDeltaPSI", type=type, - withDimnames=FALSE) <- indexDeltaPsi - + withDimnames=FALSE) <- indexDeltaPSI + + # store new k counts which include the outlier counts + counts(fds, type=type, side="ofInterest", withDimnames=FALSE) <- k + + # store modified other counts + counts(fds, type=type, side="other", withDimnames=FALSE) <- o + return(fds) } - removeInjectedOutliers <- function(fds, type){ - + # copy injected k and o counts if(type == "psiSite"){ - setAssayMatrix(fds, type="psiSite", "outlierCounts") <- - counts(fds, type="psiSite", side="ofInterest") - setAssayMatrix(fds, type="psiSite", "outlierOtherCounts") <- - counts(fds, type="psiSite", side="other") + setAssayMatrix(fds, type="psiSite", "outlierCounts", + withDimnames=FALSE) <- counts(fds, type="psiSite", side="ofInteres") + setAssayMatrix(fds, type="psiSite", "outlierOtherCounts", + withDimnames=FALSE) <- counts(fds, type="psiSite", side="other") } else{ - setAssayMatrix(fds, type="psi5", "outlierCounts") <- - counts(fds, type="psi5", side="ofInterest") - setAssayMatrix(fds, type="psi5", "outlierOtherCounts") <- - counts(fds, type="psi5", side="other") - setAssayMatrix(fds, type="psi3", "outlierOtherCounts") <- - counts(fds, type="psi3", side="other") + setAssayMatrix(fds, type="psi5", "outlierCounts", + withDimnames=FALSE) <- counts(fds, type="psi5", side="ofInterest") + setAssayMatrix(fds, type="psi5", "outlierOtherCounts", + withDimnames=FALSE) <- counts(fds, type="psi5", side="other") + setAssayMatrix(fds, type="psi3", "outlierOtherCounts", + withDimnames=FALSE) <- counts(fds, type="psi3", side="other") } - + # assign original k and o to rawCountsJ and rawOtherCounts if(type == "psiSite"){ - counts(fds, type="psiSite", side="ofInterest") <- - getAssayMatrix(fds, type="psiSite", "originalCounts") - counts(fds, type="psiSite", side="other") <- - getAssayMatrix(fds, type="psiSite", "originalOtherCounts") - + counts(fds, type="psiSite", side="ofInterest", withDimnames=FALSE) <- + getAssayMatrix(fds, type="psiSite", "originalCounts") + counts(fds, type="psiSite", side="other", withDimnames=FALSE) <- + getAssayMatrix(fds, type="psiSite", "originalOtherCounts") + assays(fds)[['originalCounts_psiSite']] <- NULL assays(fds)[['originalOtherCounts_psiSite']] <- NULL } else{ - counts(fds, type="psi5", side="ofInterest") <- - getAssayMatrix(fds, type="psi5", "originalCounts") - counts(fds, type="psi5", side="other") <- - getAssayMatrix(fds, type="psi5", "originalOtherCounts") - counts(fds, type="psi3", side="other") <- - getAssayMatrix(fds, type="psi3", "originalOtherCounts") - + counts(fds, type="psi5", side="ofInterest", withDimnames=FALSE) <- + getAssayMatrix(fds, type="psi5", "originalCounts") + counts(fds, type="psi5", side="other", withDimnames=FALSE) <- + getAssayMatrix(fds, type="psi5", "originalOtherCounts") + counts(fds, type="psi3", side="other", withDimnames=FALSE) <- + getAssayMatrix(fds, type="psi3", "originalOtherCounts") + assays(fds)[['originalCounts_psi5']] <- NULL assays(fds)[['originalOtherCounts_psi5']] <- NULL assays(fds)[['originalOtherCounts_psi3']] <- NULL } - + return(fds) - + } restoreInjectedOutliers <- function(fds, type){ - + # copy original k and o counts if(type == "psiSite"){ setAssayMatrix(fds, type="psiSite", "originalCounts") <- @@ -736,14 +757,14 @@ restoreInjectedOutliers <- function(fds, type){ setAssayMatrix(fds, type="psi3", "originalOtherCounts") <- counts(fds, type="psi3", side="other") } - + # assign injected k and o to rawCountsJ and rawOtherCounts if(type == "psiSite"){ counts(fds, type="psiSite", side="ofInterest") <- getAssayMatrix(fds, type="psiSite", "outlierCounts") counts(fds, type="psiSite", side="other") <- getAssayMatrix(fds, type="psiSite", "outlierOtherCounts") - + assays(fds)[['outlierCounts_psiSite']] <- NULL assays(fds)[['outlierOtherCounts_psiSite']] <- NULL } @@ -754,14 +775,14 @@ restoreInjectedOutliers <- function(fds, type){ getAssayMatrix(fds, type="psi5", "outlierOtherCounts") counts(fds, type="psi3", side="other") <- getAssayMatrix(fds, type="psi3", "outlierOtherCounts") - + assays(fds)[['outlierCounts_psi5']] <- NULL assays(fds)[['outlierOtherCounts_psi5']] <- NULL assays(fds)[['outlierOtherCounts_psi3']] <- NULL } - + return(fds) - + } diff --git a/R/plotMethods.R b/R/plotMethods.R index cd568ad5..0dacc3e5 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -1003,7 +1003,9 @@ getColDataAsDFFactors <- function(fds, names){ } #' @noRd -qlogisWithCap <- function(x){ +qlogisWithCap <- function(x, digits=2){ + x <- round(x, digits) + x <- pmin(pmax(x, 10^-digits), 1-10^-digits) ans <- qlogis(x) ans[is.infinite(ans)] <- NA rowm <- rowMaxs(ans, na.rm=TRUE) diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index a468fc67..28bddb33 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -27,7 +27,7 @@ calculateZscore <- function(fds, type=currentType(fds), logit=TRUE){ zscores <- (residual - rowMeans(residual)) / rowSds(residual) zScores(fds, withDimnames=FALSE) <- zscores - + return(fds) } @@ -59,7 +59,8 @@ calculatePvalues <- function(fds, type=currentType(fds), fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, pvals=pvals, index, BPPARAM=BPPARAM) fwer_pvals <- do.call(cbind, fwer_pval) - pVals(fds, type=type, dist="BetaBinomial") <- fwer_pvals + pVals(fds, type=type, dist="BetaBinomial", + withDimnames=FALSE) <- fwer_pvals return(fds) } diff --git a/R/saveHDF5Objects.R b/R/saveHDF5Objects.R index a40552e6..3dedee6c 100644 --- a/R/saveHDF5Objects.R +++ b/R/saveHDF5Objects.R @@ -66,7 +66,7 @@ loadFraserDataSet <- function(dir, name=NULL, file=NULL, upgrade=FALSE){ } fds <- readRDS(fdsFile) - # needs to be here due to our FRASER -> FRASER package change. + # needs to be here due to our FraseR -> FRASER package change. # can be removed later if the full pipeline is rerun attributes(fds)$class <- structure("FraserDataSet", package="FRASER") @@ -97,7 +97,7 @@ loadFraserDataSet <- function(dir, name=NULL, file=NULL, upgrade=FALSE){ # set the correct path of the assay seed file (if folder changed) for(aname in assayNames(fds)){ - if(is(assay(fds, aname), "matrix")){ + if(is(assay(fds, aname, withDimnames=FALSE), "matrix")){ next } message("Loading assay: ", aname) @@ -105,9 +105,14 @@ loadFraserDataSet <- function(dir, name=NULL, file=NULL, upgrade=FALSE){ if(!file.exists(afile)){ warning(paste("Can not find assay file: ", aname, ".", "The assay will be removed from the object.")) - assay(fds, aname) <- NULL - } else if(afile != path(assay(fds, aname))) { - path(assay(fds, aname)) <- afile + assay(fds, aname, withDimnames=FALSE) <- NULL + } else if(afile != path(assay(fds, aname, withDimnames=FALSE))) { + if(R.Version()$major == "3"){ + path(assay(fds, aname, withDimnames=FALSE)) <- afile + } else { + slot(assay(fds, aname, withDimnames=FALSE), + "seed")@seed@filepath <- afile + } } } @@ -131,6 +136,7 @@ saveFraserDataSet <- function(fds, dir=NULL, name=NULL, rewrite=FALSE) { # over each assay object name(fds) <- name + workingDir(fds) <- dir for(aname in assayNames(fds)){ assay <- assay(fds, aname) assay(fds, aname, withDimnames=FALSE) <- saveAsHDF5( diff --git a/R/updateRho.R b/R/updateRho.R index 54d8b9dc..f5a0d429 100644 --- a/R/updateRho.R +++ b/R/updateRho.R @@ -15,7 +15,8 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){ double(1), "minimum") if(isTRUE(verbose)){ - print(summary(rho(fds))) + stxt <- capture.output(summary(rho(fds))) + message(date(), ": rho fit:\n\t", paste(stxt, collapse="\n\t")) } validObject(fds) diff --git a/appveyor.yml b/appveyor.yml index e47fd258..1f318d9f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -35,10 +35,13 @@ environment: # BIOC_USE_DEVEL: TRUE # GCC_PATH: mingw_32 # R_ARCH: i386 -# - R_VERSION: devel -# BIOC_USE_DEVEL: TRUE + - R_VERSION: devel + BIOC_USE_DEVEL: TRUE - R_VERSION: release - - R_VERSION: 4.0.0 + # dont test 4.0.0 due to: + # * https://github.com/r-lib/roxygen2/issues/1128 + # * https://github.com/RcppCore/Rcpp/issues/1095 + - R_VERSION: 4.0.2 - R_VERSION: 3.6.0 build_script: diff --git a/inst/extdata/bam/sample1.bam b/inst/extdata/bam/sample1.bam index 7e2e9316..522026bd 100644 Binary files a/inst/extdata/bam/sample1.bam and b/inst/extdata/bam/sample1.bam differ diff --git a/inst/extdata/bam/sample1.bam.bai b/inst/extdata/bam/sample1.bam.bai index a38fcd37..f23ccd90 100644 Binary files a/inst/extdata/bam/sample1.bam.bai and b/inst/extdata/bam/sample1.bam.bai differ diff --git a/inst/extdata/bam/sample2.bam b/inst/extdata/bam/sample2.bam index 75076a77..dd9ebde9 100644 Binary files a/inst/extdata/bam/sample2.bam and b/inst/extdata/bam/sample2.bam differ diff --git a/inst/extdata/bam/sample2.bam.bai b/inst/extdata/bam/sample2.bam.bai index 9cdeeddf..9b40feb7 100644 Binary files a/inst/extdata/bam/sample2.bam.bai and b/inst/extdata/bam/sample2.bam.bai differ diff --git a/inst/extdata/bam/sample3.bam b/inst/extdata/bam/sample3.bam index 0fbe16ca..780dac42 100644 Binary files a/inst/extdata/bam/sample3.bam and b/inst/extdata/bam/sample3.bam differ diff --git a/inst/extdata/bam/sample3.bam.bai b/inst/extdata/bam/sample3.bam.bai index 9952417b..e785ba84 100644 Binary files a/inst/extdata/bam/sample3.bam.bai and b/inst/extdata/bam/sample3.bam.bai differ diff --git a/inst/extdata/sampleTable.tsv b/inst/extdata/sampleTable.tsv index e6eefa24..748199c9 100644 --- a/inst/extdata/sampleTable.tsv +++ b/inst/extdata/sampleTable.tsv @@ -1,4 +1,4 @@ -sampleID bamFile group gene -sample1 extdata/bam/sample1.bam 1 TIMMDC1 -sample2 extdata/bam/sample2.bam 3 CLPP -sample3 extdata/bam/sample3.bam 2 MCOLN1 +sampleID bamFile group gene pairedEnd +sample1 extdata/bam/sample1.bam 1 TIMMDC1 TRUE +sample2 extdata/bam/sample2.bam 3 CLPP TRUE +sample3 extdata/bam/sample3.bam 2 MCOLN1 TRUE diff --git a/man/injectOutliers.Rd b/man/injectOutliers.Rd index 4fa9bf29..186a1371 100644 --- a/man/injectOutliers.Rd +++ b/man/injectOutliers.Rd @@ -12,7 +12,8 @@ injectOutliers( minCoverage = 2, deltaDistr = "uniformDistr", verbose = FALSE, - method = c("meanPSI", "samplePSI", "simulatedPSI") + method = c("samplePSI", "meanPSI", "simulatedPSI"), + BPPARAM = bpparam() ) } \arguments{ @@ -37,6 +38,8 @@ i.e. to which value the delta psi of the injection is added: "meanPSI" for adding to the mean psi of the junction over all samples or "samplePSI" to add to the psi value of the junction in the specific sample. "simulatedPSI" is only possible if a simulated dataset is used.} + +\item{BPPARAM}{A BiocParallel object to run the computation in parallel} } \value{ FraserDataSet @@ -45,7 +48,7 @@ FraserDataSet Inject artificial outliers in an existing fds } \examples{ -# An example dataset -fds <- createTestFraserDataSet() +# A generic dataset +fds <- makeSimulatedFraserDataSet() fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3) } diff --git a/man/makeSimulatedFraserDataSet.Rd b/man/makeSimulatedFraserDataSet.Rd new file mode 100644 index 00000000..bcc785a9 --- /dev/null +++ b/man/makeSimulatedFraserDataSet.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/makeSimulatedDataset.R +\name{makeSimulatedFraserDataSet} +\alias{makeSimulatedFraserDataSet} +\title{Create an simulated example data set for FRASER} +\usage{ +makeSimulatedFraserDataSet( + m = 100, + j = 500, + q = 10, + distribution = c("BB", "DM"), + ... +) +} +\arguments{ +\item{m}{Number of simulated samples} + +\item{j}{Number of simulated junctions} + +\item{q}{number of simulated latent variables.} + +\item{distribution}{Either "BB" for a beta-binomial simulation or "DM" for a +dirichlet-multinomial simulation.} + +\item{...}{Further arguments used to construct the FraserDataSet.} +} +\value{ +An FraserDataSet containing an example dataset based on + simulated data +} +\description{ +Simulates a data set based on random counts following a +beta binomial (or Dirichlet-Multinomial) distribution. +} +\examples{ +# A generic dataset +fds1 <- makeSimulatedFraserDataSet() +fds1 + +# A generic dataset with specificed sample size and injection method +fds2 <- makeSimulatedFraserDataSet(m=10, j=100, q=3) +fds2 + +} diff --git a/man/optimHyperParams.Rd b/man/optimHyperParams.Rd index 0a2fbc65..55c7686f 100644 --- a/man/optimHyperParams.Rd +++ b/man/optimHyperParams.Rd @@ -17,7 +17,8 @@ optimHyperParams( BPPARAM = bpparam(), internalThreads = 1, plot = TRUE, - delayed = ifelse(ncol(fds) <= 300, FALSE, TRUE) + delayed = ifelse(ncol(fds) <= 300, FALSE, TRUE), + ... ) } \arguments{ @@ -57,6 +58,8 @@ the hyperparameter optimization finishes.} calculations), otherwise the function works on the delayedMatrix representations (more memory efficient). The default value depends on the number of samples in the fds-object.} + +\item{...}{Additional parameters passed to \code{injectOutliers}.} } \value{ FraserDataSet @@ -67,12 +70,13 @@ ratios while maximizing the precision-recall curve. } \examples{ # generate data - fds <- createTestFraserDataSet() + fds <- makeSimulatedFraserDataSet(m=15, j=20) # run hyperparameter optimization - fds <- optimHyperParams(fds, type="psi5", implementation="PCA") + fds <- optimHyperParams(fds, type="psi5", q_param=c(2, 5)) # get estimated optimal dimension of the latent space bestQ(fds, type="psi5") + hyperParams(fds, type="psi5") } diff --git a/setupEnv.R b/setupEnv.R index b32cb01d..dca663f8 100644 --- a/setupEnv.R +++ b/setupEnv.R @@ -39,6 +39,10 @@ if("windows" == .Platform$OS.type){ # install needed packages # add testthat to pre installation dependencies # due to: https://github.com/r-lib/pkgload/issues/89 +if(!requireNamespace("XML", quietly=TRUE) & R.version[['major']] == "3"){ + installIfReq(p="devtools", type=BTYPE, Ncpus=NCPUS) + devtools::install_version("XML", version="3.99-0.3") +} for(p in c("getopt", "XML", "xml2", "testthat", "devtools", "covr", "roxygen2", "BiocCheck", "R.utils", "GenomeInfoDbData", "rtracklayer", "hms")){ diff --git a/src/loss_n_gradient_functions.cpp b/src/loss_n_gradient_functions.cpp index 3d47631f..07eadcca 100644 --- a/src/loss_n_gradient_functions.cpp +++ b/src/loss_n_gradient_functions.cpp @@ -62,7 +62,7 @@ arma::vec estMu(arma::vec y, arma::uvec pos){ // [[Rcpp::export()]] arma::mat predictYCpp(arma::mat H, arma::mat D, arma::vec b){ - arma::mat y, ey, mu; + arma::mat y; y = H * D.t(); y = y.each_row() + b.t(); diff --git a/tests/testthat.R b/tests/testthat.R index a9909efc..9867d085 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,9 @@ library(testthat) library(FRASER) +# to speed up the testing on windows do it in serial mode +if(.Platform$OS.type != "unix") { + register(SerialParam()) +} + test_check("FRASER") diff --git a/tests/testthat/test_counting.R b/tests/testthat/test_counting.R index 2c8c0da1..76410119 100644 --- a/tests/testthat/test_counting.R +++ b/tests/testthat/test_counting.R @@ -1,7 +1,7 @@ context("Test counting") test_that("Count junctions", { - attach(test_generate_count_example()) + out <- capture.output(attach(test_generate_count_example())) expect_is(test_fdsSample3, "FraserDataSet") @@ -15,7 +15,7 @@ test_that("Count junctions", { }) test_that("Strand spcific counting", { - attach(test_generate_strand_specific_count_example()) + suppressMessages(attach(test_generate_strand_specific_count_example())) expect_is(test_fdsSample3_stranded, "FraserDataSet") @@ -40,13 +40,17 @@ test_that("Strand spcific counting", { test_that("test minAnchor", { fds <- createTestFraserSettings() - features <- makeGRangesFromDataFrame(data.table(GeneID=1:5, Chr="chr19", - Start=c(7592514, 7592749, 7594598, 7595171, 7595320), - End=c(7592515, 7592750, 7594599, 7595172, 7595321), Strand="*")) - expect_equivalent(c(9, 9, 10, 10, 0, 0, 0, 0, 9, 9), countNonSplicedReads( - "sample3", features, fds, minAnchor=5, recount=TRUE)[,1]) - expect_equivalent(c(6, 5, 10, 10, 0, 0, 0, 0, 9, 8), countNonSplicedReads( - "sample3", features, fds, minAnchor=15, recount=TRUE)[,1]) + features <- makeGRangesFromDataFrame(data.table(GeneID=1:3, Chr="chr19", + Start=c(7592515, 7594599, 7594599), + End=c(7592749, 7595171, 7595320), Strand="*")) + + out <- capture.output({ctnNS5 <- as.matrix(countNonSplicedReads( + "sample3", features, fds, minAnchor=5, recount=TRUE)) }) + out <- capture.output({ctnNS25 <- as.matrix(countNonSplicedReads( + "sample3", features, fds, minAnchor=25, recount=TRUE)) }) + + expect_equivalent(c(7, 8, 0, 0, 7), ctnNS5[,1]) + expect_equivalent(c(5, 8, 0, 0, 6), ctnNS25[,1]) }) test_that("Test psi values", { diff --git a/tests/testthat/test_fraser_pipeline.R b/tests/testthat/test_fraser_pipeline.R index b7234550..069a217d 100644 --- a/tests/testthat/test_fraser_pipeline.R +++ b/tests/testthat/test_fraser_pipeline.R @@ -1,15 +1,6 @@ context("Test FRASER pipeline") test_that("FRASER function", { - # fds <- getFraser() - # fds1 <- FRASER(fds=fds, q=2, iterations=2) - # - # expect_is(fds, "FraserDataSet") - # expect_equal(dim(pVals(fds1, "psi5")), dim(pVals(fds, "psi5"))) - # expect_equal(dim(pVals(fds1, "psiSite")), dim(pVals(fds, "psiSite"))) -}) - -test_that("FraserDataSet create settings", { fds <- createTestFraserDataSet() expect_is(fds, "FraserDataSet") anames <- c(psiTypes, paste0(c("delta", "predictedMeans", diff --git a/tests/testthat/test_getter_setter.R b/tests/testthat/test_getter_setter.R index fe2db13f..3e283452 100644 --- a/tests/testthat/test_getter_setter.R +++ b/tests/testthat/test_getter_setter.R @@ -1,14 +1,14 @@ context("testing getter/setter functions") test_that("counts", { - fds <- getFraser() + fds <- createTestFraserDataSet() expect_equal(counts(fds, type="psi5"), K(fds, "psi5")) expect_equal(counts(fds, type="psi3"), K(fds, "psi3")) expect_equal(counts(fds, type="psiSite"), K(fds, "psiSite")) - expect_equal(as.matrix(counts(fds, type="psi5", side='other')), - as.matrix(N(fds, "psi5") - K(fds, "psi5"))) - expect_equal(as.matrix(counts(fds, type="psi3", side='other')), - as.matrix(N(fds, "psi3") - K(fds, "psi3"))) - expect_equal(as.matrix(counts(fds, type="psiSite", side='other')), - as.matrix(N(fds, "psiSite") - K(fds, "psiSite"))) + expect_equal(counts(fds, type="psi5", side='other'), + N(fds, "psi5") - K(fds, "psi5")) + expect_equal(counts(fds, type="psi3", side='other'), + N(fds, "psi3") - K(fds, "psi3")) + expect_equal(counts(fds, type="psiSite", side='other'), + N(fds, "psiSite") - K(fds, "psiSite")) }) diff --git a/tests/testthat/test_hyperParams.R b/tests/testthat/test_hyperParams.R index 46cd7250..26331409 100644 --- a/tests/testthat/test_hyperParams.R +++ b/tests/testthat/test_hyperParams.R @@ -1,46 +1,19 @@ context("Test hyper param optimization") -test_that("Getter/Setter hyper params", { - attach(test_generate_count_example()) - - expect_is(test_fdsSample3, "FraserDataSet") - - # test how many ranges we found - expect_equal(length(test_rangeFDS), 3) - expect_equal(length(nonSplicedReads(test_rangeFDS)), 5) - - # test the manually counted positions - expect_equal(as.vector(counts(test_rangeFDS, type="j")), test_rawCountsJ) - # expect_equal(as.vector(counts(test_rangeFDS, type="ss")), test_rawCountsSS) -}) - -test_that("Test psi values", { - attach(test_generate_count_example()) - - expect_equal(as.vector(counts(test_rangeFDS, type="psi3")), test_rawCountsJ) - expect_equal(test_p3rawOCounts, - as.vector(counts(test_rangeFDS, type="psi3", side="other")) - ) - - expect_equal(as.vector(counts(test_rangeFDS, type="psi5")), test_rawCountsJ) - expect_equal(test_p5rawOCounts, - as.vector(counts(test_rangeFDS, type="psi5", side="other")) - ) - - #expect_equal(as.vector(counts(test_rangeFDS, type="psiSite")), test_rawCountsSS) - expect_equal(test_pSrawOCounts, - as.vector(counts(test_rangeFDS, type="psiSite", side="other")) - ) - - expect_equal(as.vector(assays(test_rangeFDS)[["psi3"]]), - test_rawCountsJ / (test_rawCountsJ + test_p3rawOCounts) - ) - - expect_equal(as.vector(assays(test_rangeFDS)[["psi5"]]), - test_rawCountsJ / (test_rawCountsJ + test_p5rawOCounts) - ) - - #expect_equal(as.vector(assays(test_rangeFDS)[["psiSite"]]), - # test_rawCountsSS / (test_rawCountsSS + test_pSrawOCounts) - #) +test_that("Test hyper param testing", { + fds <- makeSimulatedFraserDataSet(m=15, j=20, dist="BB") + + # test BB no hyper params and accessors + fds <- optimHyperParams(fds, type="psi3", implementation="BB") + expect_true(is.na(hyperParams(fds, type="psi3")[,q])) + + fds <- optimHyperParams(fds, type="psi5", minDeltaPsi=0.01, plot=FALSE, + q_param = c(2,3), noise_param=c(1), iterations=2, BPPARAM=bpparam()) + + expect_equal(c(2,5), dim(hyperParams(fds, type="psi5", all=TRUE))) + expect_equal(c(1,5), dim(hyperParams(fds, type="psi5", all=FALSE))) + expect_equal(metadata(fds)$hyperParams_psi5[order(-aroc)][1,q], + bestQ(fds, type="psi5")) + expect_equal(1, bestNoise(fds, type="psi5")) + expect_is(suppressWarnings(plotEncDimSearch(fds, "psi5")), "ggplot") }) diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R index 9f14ee6f..8e76cc19 100644 --- a/tests/testthat/test_stats.R +++ b/tests/testthat/test_stats.R @@ -1,34 +1,32 @@ context("Test stats calculations") test_that("PSI value calculation", { - # get subset to speed up test - #fds <- getFraser()[,1] - #fds <- fds[mcols(fds, type="psi3")$startID %in% c(1,2,6,7),] - # expect it from the toy data after subsetting - #expect_equal(length(fds), 10) - #expect_equal(length(nonSplicedReads(fds)), 12) - - # set predifiend values - #assays(fds)[["rawCountsJ"]] <- rep(c(0,5), c(1,9)) - #assays(fds)[["rawCountsSS"]] <- rep(c(0,5), c(1,11)) - - # calculate psi values - #fds <- calculatePSIValues(fds) - - #assays(fds)[["psiSite"]] - #counts(fds, type="psi3") - - #expect_equal("ok", "ok") - #settings <- createTestFraserDataSet() - #expect_is(settings, "FraserDataSet") + fds <- getFraser() + + psiVal <- K(fds, "psi5") / N(fds, "psi5") + + # no pseudo counts in psi values + expect_equal(psiVal[!is.na(psiVal)], assay(fds, "psi5")[!is.na(psiVal)]) + + # NAs turned into 1 + expect_equal(assay(fds, "psi5")[is.na(psiVal)], rep(1, sum(is.na(psiVal)))) + + # NAs where N==0 + expect_true(all(N(fds, "psi5")[!is.na(psiVal)] != 0)) + expect_true(all(N(fds, "psi5")[ is.na(psiVal)] == 0)) }) test_that("Zscore calculation", { - #fds <- getFraser() - #fds <- FRASER(fds) - - #expect_is(fds, "FraserDataSet") - #expect_equal(dim(fds), c(94, 12)) - #expect_equal(dim(nonSplicedReads(fds)), c(111, 12)) + fds <- getFraser(clean = TRUE) + + # prepare zScore input for logit scale + psiVal <- (K(fds, "psi5") + pseudocount())/(N(fds, "psi5") + 2*pseudocount()) + mu <- predictedMeans(fds, "psi5") + residual <- qlogis(psiVal) - qlogis(mu) + + # compute zscore + zscores <- (residual - rowMeans(residual)) / rowSds(residual) + + expect_equal(zscores, zScores(fds, "psi5")) }) diff --git a/tests/testthat/test_write_read_objects.R b/tests/testthat/test_write_read_objects.R new file mode 100644 index 00000000..bd8e1b31 --- /dev/null +++ b/tests/testthat/test_write_read_objects.R @@ -0,0 +1,32 @@ +context("testing HDF5 write/read functionality") + +test_that("save/load/move fds object", { + # temp folder + dir1 <- tempfile("fraser_test_1_") + dir2 <- tempfile("fraser_test_2_") + on.exit(unlink(dir1, force=TRUE, recursive=TRUE)) + on.exit(unlink(dir2, force=TRUE, recursive=TRUE)) + + # create test object (force to write hdf5) + options(FRASER.maxJunctionsNoHDF5=10) + on.exit(options(FRASER.maxJunctionsNoHDF5=1000)) + fds <- makeSimulatedFraserDataSet(workingDir=dir1, j=11) + fds <- saveFraserDataSet(fds) + + # load from created folder + fds_l1 <- loadFraserDataSet(file=file.path( + dir1, "savedObjects", "Data_Analysis", "fds-object.RDS")) + + expect_equivalent(fds_l1, fds) + expect_equal(file_path_as_absolute(dir1), + dirname(dirname(dirname(path(assay(fds, "rawCountsJ")))))) + + # rename and check if still the same + renameFile(dir1, dir2) + fds_l2 <- loadFraserDataSet(file=file.path( + dir2, "savedObjects", "Data_Analysis", "fds-object.RDS")) + expect_equivalent(fds_l2, fds) + expect_equal(file_path_as_absolute(dir2), + dirname(dirname(dirname(path(assay(fds_l2, "rawCountsJ")))))) +}) + diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw index 33f54ed7..b1bffd3e 100644 --- a/vignettes/FRASER.Rnw +++ b/vignettes/FRASER.Rnw @@ -6,7 +6,13 @@ \documentclass[11pt]{article} <>= -BiocStyle::latex() +BiocStyle::latex(relative.path = TRUE) +@ + +<>= +library(BiocParallel) +# for speed we run everything in serial mode +register(SerialParam()) @ <>= @@ -43,9 +49,8 @@ opts_chunk$set( <>= opts_chunk$set(concordance=TRUE) -@ -% \SweaveOpts{concordance=TRUE} +@ \maketitle @@ -184,7 +189,7 @@ Additionally, the user can create several analysis plots directly from the fitted \fds{} object. These plotting functions are described in section \ref{sec:result-vis}. -<>= +<>= # load FRASER library library(FRASER) @@ -214,10 +219,10 @@ fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb) # alternatively, we also provide a way to use biomart for the annotation: # fds <- annotateRanges(fds) -# get results: usually, using p-value cutoff is recommended, but on the small -# example dataset, this would give no results, so we use z-scores here instead -# res <- results(fds, zScoreCutoff=NA, padjCutoff=0.05, deltaPsiCutoff=0.1) -res <- results(fds, zScoreCutoff=1, padjCutoff=NA, deltaPsiCutoff=0.02) +# get results: we recommend to use an FDR cutoff 0.05, but due to the small +# dataset size we extract all events and their associated values +# eg: res <- results(fds, zScoreCutoff=NA, padjCutoff=0.05, deltaPsiCutoff=0.3) +res <- results(fds, zScoreCutoff=NA, padjCutoff=NA, deltaPsiCutoff=NA) res # result visualization @@ -323,7 +328,7 @@ resulting \Rclass{FraserDataSet} object contains two \Rclass{SummarizedExperiment} objects for each the junctions and the splice sites. -<>= +<>= # example of how to use parallelization: use 10 cores or the maximal number of # available cores if fewer than 10 are available and use Snow if on Windows if(.Platform$OS.type == "unix") { @@ -331,7 +336,9 @@ if(.Platform$OS.type == "unix") { } else { register(SnowParam(workers=min(10, multicoreWorkers()))) } +@ +<>= # count reads fds <- countRNAData(settings) fds @@ -345,16 +352,16 @@ together with the sample annotation from above to create the \fds: <>= # example sample annoation for precalculated count matrices sampleTable <- fread(system.file("extdata", "sampleTable_countTable.tsv", - package="FRASER", mustWork=TRUE)) + package="FRASER", mustWork=TRUE)) head(sampleTable) # get raw counts junctionCts <- fread(system.file("extdata", "raw_junction_counts.tsv.gz", - package="FRASER", mustWork=TRUE)) + package="FRASER", mustWork=TRUE)) head(junctionCts) spliceSiteCts <- fread(system.file("extdata", "raw_site_counts.tsv.gz", - package="FRASER", mustWork=TRUE)) + package="FRASER", mustWork=TRUE)) head(spliceSiteCts) # create FRASER object