Skip to content

Commit

Permalink
FRASER revision (#13)
Browse files Browse the repository at this point in the history
* update and adjust `injectOutlier` and `hyperParameter` functions

* option to compute z scores in logit space or not

* add cap value [0.01,0.99] to logit function

* bugfixes (withDimnames, dirname in save function, non existing chromosomes)

* dont force HDF5

* proper pairedEnd counting

Co-authored-by: Ines Scheller <[email protected]>
  • Loading branch information
c-mertes and ischeller authored Aug 27, 2020
1 parent eb45f63 commit faa0a20
Show file tree
Hide file tree
Showing 36 changed files with 679 additions and 612 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
language: r
sudo: required
dist: bionic
# due to https://travis-ci.community/t/r-install-broken-travis-ci-rstudio-org-returns-403/9640/2
# dist: bionic
cache:
- directories:
- $HOME/R/Library
Expand Down
1 change: 0 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@ Collate:
annotationOfRanges.R
beta-binomial-testing.R
calculatePSIValue.R
calculateStats.R
countRNAseqData.R
example_functions.R
filterExpression.R
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ export(getSplitReadCountsForAllSamples)
export(hyperParams)
export(injectOutliers)
export(loadFraserDataSet)
export(makeSimulatedFraserDataSet)
export(mapSeqlevels)
export(mergeCounts)
export(name)
Expand Down
4 changes: 2 additions & 2 deletions R/beta-binomial-testing.R
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ pvalueByBetaBinomialPerType <- function(fds, aname, psiType, pvalFun,
pvalues_full[toTest,] <- pvalues

# transform it to a hdf5 assay and save it to the dataset
assays(fds, type=psiType)[[aname]] <- pvalues_full
assay(fds, aname, type=psiType, withDimnames=FALSE) <- pvalues_full

# save the pvalue calculations in the SE ranged object
return(fds)
Expand Down Expand Up @@ -201,7 +201,7 @@ betabinVglmTest <- function(cMat, alternative="two.sided",

#'
#' error/warning catching functions by martin morgan
#' http://stackoverflow.com/questions/4948361/how-do-i-save-warnings-and-errors-as-output-from-a-function
#' http://stackoverflow.com/questions/4948361
#'
#' @noRd
tryCatchFactory <- function(fun, timeout=300){
Expand Down
100 changes: 55 additions & 45 deletions R/calculatePSIValue.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,12 @@ calculatePSIValues <- function(fds, types=psiTypes, overwriteCts=FALSE,
overwriteCts=overwriteCts, BPPARAM=BPPARAM)
}

# save Fraser object to disk
fds <- saveFraserDataSet(fds)

# calculate the delta psi value
for(psiType in types){
assayName <- paste0("delta_", psiType)
fds <- calculateDeltaPsiValue(fds, psiType, assayName)
}

# save final Fraser object to disk
fds <- saveFraserDataSet(fds)

# return it
return(fds)
}
Expand Down Expand Up @@ -88,16 +82,10 @@ calculatePSIValuePrimeSite <- function(fds, psiType, overwriteCts, BPPARAM){

# check if other counts and psi values chache file exists already
cacheFile <- getOtherCountsCacheFile(sample, fds)
if(file.exists(cacheFile)){
h5 <- HDF5Array(filepath=cacheFile, name=h5DatasetName)
if((isFALSE(overwriteCts) |
!all(paste0("rawOtherCounts_psi", c(5, 3)) %in%
assayNames(fds)) ) &&
nrow(h5) == nrow(K(fds, type="psi5"))){

return(h5)
}
unlink(cacheFile)
ans <- checkPsiCacheFile(cFile=cacheFile, dName=h5DatasetName,
overwrite=overwriteCts, ptype=c("psi5", "psi3"), fds=fds)
if(!is.null(ans)){
return(ans)
}

# add sample specific counts to the data.table (K)
Expand Down Expand Up @@ -128,41 +116,63 @@ calculatePSIValuePrimeSite <- function(fds, psiType, overwriteCts, BPPARAM){
countData[is.na(psi5),psi5:=1]
countData[is.na(psi3),psi3:=1]

# if no HDF5 is requested return it as matrix
if(dontWriteHDF5(fds)){
return(as.matrix(countData[,.(o5,o3,psi5,psi3)]))
}

# write other counts and psi values to h5 file
# get defind chunk sizes
# get defined chunk sizes
chunkDims <- c(
min(nrow(countData), options()[['FRASER-hdf5-chunk-nrow']]),
1)
writeHDF5Array(as.matrix(countData[,.(o5,o3,psi5,psi3)]),
filepath=cacheFile, name=h5DatasetName,
filepath=cacheFile, name=h5DatasetName,
chunkdim=chunkDims, level=7, verbose=FALSE)

# get counts as DelayedMatrix
HDF5Array(filepath=cacheFile, name=h5DatasetName)

}
)
names(psiValues) <- samples(fds)

# merge it and assign it to our object
assay(fds, type="j", "psi5", withDimnames=FALSE) <-
do.call(cbind, mapply('[', psiValues, TRUE, 3, drop=FALSE))
assay(fds, type="j", "psi3", withDimnames=FALSE) <-
do.call(cbind, mapply('[', psiValues, TRUE, 4, drop=FALSE))
assay(fds, type="j", "psi5", withDimnames=FALSE) <- do.call(cbind,
mapply('[', psiValues, TRUE, 3, drop=FALSE, SIMPLIFY=FALSE))
assay(fds, type="j", "psi3", withDimnames=FALSE) <- do.call(cbind,
mapply('[', psiValues, TRUE, 4, drop=FALSE, SIMPLIFY=FALSE))

if(isTRUE(overwriteCts)){
assay(fds, type="j", "rawOtherCounts_psi5", withDimnames=FALSE) <-
do.call(cbind, bplapply(psiValues, BPPARAM=BPPARAM,
function(x){ x[,1,drop=FALSE] }))
do.call(cbind, mapply('[', psiValues, TRUE, 1,
drop=FALSE, SIMPLIFY=FALSE))
assay(fds, type="j", "rawOtherCounts_psi3", withDimnames=FALSE) <-
do.call(cbind, bplapply(psiValues, BPPARAM=BPPARAM,
function(x){ x[,2,drop=FALSE] }))
do.call(cbind, mapply('[', psiValues, TRUE, 2,
drop=FALSE, SIMPLIFY=FALSE))
}

return(fds)
}


#'
#' Helper function to check for PSI value cached data
#'
#' @noRd
checkPsiCacheFile <- function(cFile, dName, overwrite, ptype, fds){
if(file.exists(cFile) && dName %in% h5ls(cFile)$name){
h5 <- HDF5Array(filepath=cFile, name=dName)
aNames <- paste0("rawOtherCounts_", ptype)
if(isFALSE(overwrite) && all( aNames %in% assayNames(fds)) &&
nrow(h5) == nrow(K(fds, type=ptype[1]))){
return(h5)
}
h5delete(cFile, name=dName)
}
return(NULL)
}


#'
#' This function calculates the site PSI values for each splice site
#' based on the FraserDataSet object
Expand Down Expand Up @@ -206,16 +216,10 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){

# get counts and psiSite values from cache file if it exists
cacheFile <- getOtherCountsCacheFile(sample, fds)
if(file.exists(cacheFile) &&
psiH5datasetName %in% h5ls(cacheFile)$name){
h5 <- HDF5Array(filepath=cacheFile, name=psiH5datasetName)
if((isFALSE(overwriteCts) | !psiROCName %in% assayNames(fds))
&& nrow(h5) == nrow(K(fds, type="psiSite"))){

return(h5)
} else{
h5delete(cacheFile, name=psiH5datasetName)
}
ans <- checkPsiCacheFile(cFile=cacheFile, dName=psiH5datasetName,
overwrite=overwriteCts, ptype="psiSite", fds=fds)
if(!is.null(ans)){
return(ans)
}

# add sample specific counts to the data.table
Expand All @@ -226,7 +230,7 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){
sdata[,os:=sum(k)-k, by="spliceSiteID"]

# remove the junction part since we only want to calculate the
# psi values for the splice sites themselfs
# psi values for the splice sites themselves
sdata <- sdata[type=="spliceSite"]

# calculate psi value
Expand All @@ -235,8 +239,13 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){
# if psi is NA this means there were no reads at all so set it to 1
sdata[is.na(psiValue),psiValue:=1]

# if no HDF5 is requested return it as matrix
if(dontWriteHDF5(fds)){
return(as.matrix(sdata[,.(os, psiValue)]))
}

# write other counts and psi values to h5 file
# get defind chunk sizes
# get defined chunk sizes
chunkDims <- c(
min(nrow(sdata), options()[['FRASER-hdf5-chunk-nrow']]),
2)
Expand All @@ -251,12 +260,13 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){
names(psiSiteValues) <- samples(fds)

# merge it and assign it to our object
assay(fds, type="ss", psiName, withDimnames=FALSE) <-
do.call(cbind, mapply('[', psiSiteValues, TRUE, 2, drop=FALSE))
assay(fds, type="ss", psiName, withDimnames=FALSE) <- do.call(cbind,
mapply('[', psiSiteValues, TRUE, 2, drop=FALSE,
SIMPLIFY=FALSE))
if(isTRUE(overwriteCts)){
assay(fds, type="ss", psiROCName, withDimnames=FALSE) <-
do.call(cbind, bplapply(psiSiteValues, BPPARAM=BPPARAM,
function(x) { x[,1,drop=FALSE] }))
assay(fds, type="ss", psiROCName, withDimnames=FALSE) <- do.call(cbind,
mapply('[', psiSiteValues, TRUE, 1, drop=FALSE,
SIMPLIFY=FALSE))
}

return(fds)
Expand Down Expand Up @@ -310,4 +320,4 @@ getOtherCountsCacheFolder <- function(fds){

# return it
return(cachedir)
}
}
53 changes: 0 additions & 53 deletions R/calculateStats.R

This file was deleted.

17 changes: 12 additions & 5 deletions R/countRNAseqData.R
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ countSplitReads <- function(sampleID, fds, NcpuPerSample=1, genome=NULL,
}

# remove chromosomes with different seqlength than in genome (if provided)
# and remove non annotation existing chromosomes from counting
if(!is.null(genome)){
if(is.character(genome)){
genome <- getBSgenome(genome)
Expand All @@ -483,19 +484,28 @@ countSplitReads <- function(sampleID, fds, NcpuPerSample=1, genome=NULL,
chrLengths <- extractChromosomeLengths(bamFile)
mismatchChrs <- which(seqlengths(genome)[chromosomes] != chrLengths)
if(length(mismatchChrs) > 0){
warning("Not counting chromosome(s) ", chromosomes[mismatchChrs],
warning("Not counting chromosome(s) ",
paste(chromosomes[mismatchChrs], collapse=", "),
" in sample ", sampleID, " as it has a different length",
" in the bamFile of this sample than in the provided",
" genome.")
chromosomes <- chromosomes[-mismatchChrs]
}
missingChrs <- which(!chromosomes %in% seqnames(genome))
if(length(missingChrs) > 0){
warning("Not counting chromosome(s) ",
paste(chromosomes[missingChrs], collapse=", "),
" in sample ", sampleID, " as it is not specified in",
" the provided genome.")
chromosomes <- chromosomes[-missingChrs]
}
}

# extract the counts per chromosome
countsList <- bplapply(chromosomes, FUN=countSplitReadsPerChromosome,
bamFile=bamFile, pairedEnd=pairedEnd, settings=fds, genome=genome,
BPPARAM=getBPParam(NcpuPerSample, length(chromosomes)))

# sort and merge the results befor returning/saving
countsGR <- sort(unlist(GRangesList(countsList)))
saveRDS(countsGR, cacheFile)
Expand Down Expand Up @@ -815,9 +825,6 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds,
# unstranded case: for counting only non spliced reads we
# skip this information
isPairedEnd <- pairedEnd(fds[,samples(fds) == sampleID])[[1]]
if(isFALSE(as.logical(strandSpecific(fds)))){
isPairedEnd <- FALSE
}
doAutosort <- isPairedEnd

# check cache if available
Expand Down
16 changes: 10 additions & 6 deletions R/find_encoding_dimensions.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ predict_outliers <- function(fds, type, implementation, BPPARAM){
}

eval_prot <- function(fds, type){

message(date(), ": Calculating AUC-PR ...")

index <- getSiteIndex(fds, type)
idx <- !duplicated(index)

Expand Down Expand Up @@ -96,27 +99,29 @@ findEncodingDim <- function(i, fds, type, params, implementation,
#' @param delayed If FALSE, count matrices will be loaded into memory (faster
#' calculations), otherwise the function works on the delayedMatrix
#' representations (more memory efficient). The default value depends on the
#' number of samples in the fds-object.
#' number of samples in the fds-object.
#' @param ... Additional parameters passed to \code{injectOutliers}.
#'
#' @return FraserDataSet
#'
#' @examples
#' # generate data
#' fds <- createTestFraserDataSet()
#' fds <- makeSimulatedFraserDataSet(m=15, j=20)
#'
#' # run hyperparameter optimization
#' fds <- optimHyperParams(fds, type="psi5", implementation="PCA")
#' fds <- optimHyperParams(fds, type="psi5", q_param=c(2, 5))
#'
#' # get estimated optimal dimension of the latent space
#' bestQ(fds, type="psi5")
#' hyperParams(fds, type="psi5")
#'
#' @export
optimHyperParams <- function(fds, type, implementation="PCA",
q_param=seq(2, min(40, ncol(fds)), by=3),
noise_param=0, minDeltaPsi=0.1,
iterations=5, setSubset=15000, injectFreq=1e-2,
BPPARAM=bpparam(), internalThreads=1, plot=TRUE,
delayed=ifelse(ncol(fds) <= 300, FALSE, TRUE)){
delayed=ifelse(ncol(fds) <= 300, FALSE, TRUE), ...){
if(isFALSE(needsHyperOpt(implementation))){
message(date(), ": For correction '", implementation, "' no hyper ",
"parameter optimization is needed.")
Expand Down Expand Up @@ -168,8 +173,7 @@ optimHyperParams <- function(fds, type, implementation="PCA",
currentType(fds_copy) <- type

# inject outliers
fds_copy <- injectOutliers(fds_copy, type=type, freq=injectFreq,
minDpsi=minDeltaPsi, method="samplePSI")
fds_copy <- injectOutliers(fds_copy, type=type, freq=injectFreq, ...)

if(sum(getAssayMatrix(fds_copy, type=type, "trueOutliers") != 0) == 0){
warning(paste0("No outliers could be injected so the ",
Expand Down
Loading

0 comments on commit faa0a20

Please sign in to comment.