Skip to content

Commit

Permalink
feat: handle AIRSIS ESAM_MULTI2022 data format
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathancallahan committed Jun 8, 2023
1 parent 606d69d commit e1efba5
Show file tree
Hide file tree
Showing 11 changed files with 478 additions and 5 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Type: Package
Package: PWFSLSmoke
Version: 1.2.118
Version: 1.2.119
Title: Utilities for Working with Air Quality Monitoring Data
Authors@R: c(
person("Jonathan", "Callahan", email="[email protected]", role=c("aut","cre")),
Expand Down Expand Up @@ -75,4 +75,4 @@ URL: https://github.com/MazamaScience/PWFSLSmoke
BugReports: https://github.com/MazamaScience/PWFSLSmoke/issues
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.2
RoxygenNote: 7.2.3
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ export(airsis_EBAM_MULTI2QualityControl)
export(airsis_EBAM_MULTI2_BQualityControl)
export(airsis_EBAM_PLUS_MULTIQualityControl)
export(airsis_ESAMQualityControl)
export(airsis_ESAM_MULTI2022QualityControl)
export(airsis_ESAM_MULTIQualityControl)
export(airsis_availableUnits)
export(airsis_createDataDataframe)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# PWFSLSmoke 1.2.119

* Updated `airsis_identifyMonitorType()` to handle a new data format.

# PWFSLSmoke 1.2.118

* Updated `wrcc_identifyMonitorType()` to handle a new data format.
Expand Down
303 changes: 303 additions & 0 deletions R/airsis_ESAM_MULTI2022QualityControl.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,303 @@
#' @keywords AIRSIS
#' @export
#' @import MazamaCoreUtils
#'
#' @title Apply Quality Control to raw AIRSIS E-Sampler dataframe
#'
#' @param tbl single site tibble created by \code{airsis_downloadData()}
#' @param valid_Longitude range of valid Longitude values
#' @param valid_Latitude range of valid Latitude values
#' @param remove_Lon_zero flag to remove rows where Longitude == 0
#' @param remove_Lat_zero flag to remove rows where Latitude == 0
#' @param valid_Flow range of valid Flow values
#' @param valid_AT range of valid AT values
#' @param valid_RHi range of valid RHi values
#' @param valid_Conc range of valid ConcHr values
#' @param flagAndKeep flag, rather then remove, bad data during the QC process
#' @description Perform various QC measures on AIRSIS E-Sampler data.
#'
#' The following columns of data are tested against valid ranges:
#' \itemize{
#' \item{\code{Flow}}
#' \item{\code{AT}}
#' \item{\code{RHi}}
#' \item{\code{ConcHr}}
#' }
#'
#' A \code{POSIXct datetime} column (UTC) is also added based on \code{TimeStamp}.
#'
#' @return Cleaned up tibble of AIRSIS monitor data.
#' @seealso \code{\link{airsis_qualityControl}}

airsis_ESAM_MULTI2022QualityControl <- function(
tbl,
valid_Longitude = c(-180, 180),
valid_Latitude = c(-90, 90),
remove_Lon_zero = TRUE,
remove_Lat_zero = TRUE,
valid_Flow = c(1.999, 2.001), # anything other than 2 is bad
valid_AT = c(-Inf, 150),
valid_RHi = c(-Inf, 55),
valid_Conc = c(-Inf, 5),
flagAndKeep = FALSE
) {

logger.debug(" ----- airsis_ESAM_MULTI2022QualityControl() ----- ")

# > print(names(tbl), width = 90)
# [1] "MasterTable_ID" "Alias" "Latitude"
# [4] "Longitude" "COncRT" "ConcHr"
# [7] "Flow" "AT" "BP.PA."
# [10] "RHx" "RHi" "W.S"
# [13] "W.D" "BV" "Alarm"
# [16] "Oceaneering.Unit.Voltage" "FT" "TimeStamp"
# [19] "PDate" "monitorName" "monitorType"
# [22] "monitorSubtype" "Date.Time.GMT"

### NOTE: ESAM (non-MULTI) looks like this:
### > names(tbl)
### [1] "MasterTable_ID" "Alias" "Latitude" "Longitude"
### [5] "Conc.mg.m3." "Flow.l.m." "AT.C." "BP.PA."
### [9] "RHx..." "RHi..." "WS.M.S." "WD.Deg."
### [13] "BV.V." "Alarm" "Start.Date.Time..GMT." "Serial.Number"
### [17] "System.Volts" "Data.1" "Data.2" "TimeStamp"
### [21] "PDate" "monitorName" "monitorType"

monitorName <- tbl$monitorName[1]

# ----- Missing Values ------------------------------------------------------

# Handle various missing value flags

# ----- Setup for flagAndKeep argument utility ------------------------------

if ( flagAndKeep ) {
# verb for logging messages
verb <- "Flagging"

tbl$rowID <- as.integer(rownames(tbl))

# duplicate tbl and add columns for flags
tblFlagged <- tbl
tblFlagged$QCFlag_anyBad <- FALSE
tblFlagged$QCFlag_reasonCode <- NA
tblFlagged$QCFlag_badLon <- FALSE
tblFlagged$QCFlag_badLat <- FALSE
tblFlagged$QCFlag_badType <- FALSE # no type info for ESAMs
tblFlagged$QCFlag_badFlow <- FALSE
tblFlagged$QCFlag_badAT <- FALSE
tblFlagged$QCFlag_badRHi <- FALSE
tblFlagged$QCFlag_badConcHr <- FALSE
tblFlagged$QCFlag_badDateAndTime <- FALSE
tblFlagged$QCFlag_duplicateHr <- FALSE
} else {
# verb for logging messages
verb <- "Discarding"
}

# ----- Location ------------------------------------------------------------

# Latitude and longitude must be in range
if ( remove_Lon_zero ) {
goodLonMask <- !is.na(tbl$Longitude) & (tbl$Longitude >= valid_Longitude[1]) & (tbl$Longitude <= valid_Longitude[2]) & (tbl$Longitude != 0)
} else {
goodLonMask <- !is.na(tbl$Longitude) & (tbl$Longitude >= valid_Longitude[1]) & (tbl$Longitude <= valid_Longitude[2])
}

if ( remove_Lat_zero ) {
goodLatMask <- !is.na(tbl$Latitude) & (tbl$Latitude >= valid_Latitude[1]) & (tbl$Latitude <= valid_Latitude[2]) & (tbl$Latitude != 0)
} else {
goodLatMask <- !is.na(tbl$Latitude) & (tbl$Latitude >= valid_Latitude[1]) & (tbl$Latitude <= valid_Latitude[2])
}

badRows <- !(goodLonMask & goodLatMask)
badRowCount <- sum(badRows)
if ( badRowCount > 0 ) {
logger.trace(paste(verb,"%s rows with invalid location information"), badRowCount)
badLocations <- paste('(',tbl$Longitude[badRows],',',tbl$Latitude[badRows],')',sep = '')
logger.trace("Bad locations: %s", paste0(badLocations, collapse = ", "))
if ( flagAndKeep ) {
# apply flags
tblFlagged$QCFlag_badLon[tbl$rowID[!goodLonMask]] <- TRUE
tblFlagged$QCFlag_badLat[tbl$rowID[!goodLatMask]] <- TRUE
tblFlagged$QCFlag_anyBad <- tblFlagged$QCFlag_anyBad | tblFlagged$QCFlag_badLon | tblFlagged$QCFlag_badLat
# apply reason codes
tblFlagged$QCFlag_reasonCode[tbl$rowID[!goodLonMask]] <- paste(tblFlagged$QCFlag_reasonCode[tbl$rowID[!goodLonMask]],"badLon")
tblFlagged$QCFlag_reasonCode[tbl$rowID[!goodLatMask]] <- paste(tblFlagged$QCFlag_reasonCode[tbl$rowID[!goodLatMask]],"badLat")
}
}

tbl <- tbl[goodLonMask & goodLatMask,]

# Sanity check -- row count
if ( nrow(tbl) < 1 && !flagAndKeep ) {
err_msg <- paste0("No valid PM2.5 data for ", monitorName)
logger.warn(err_msg) # This is more of a warning than some error in the data.
stop(err_msg, call. = FALSE)
}

# ----- Time ----------------------------------------------------------------

# NOTE: Assume all times downloaded from are UTC

# NOTE: It appears the ESAM "TimeStamp" data drifts throughout the day, with >60 minutes between timestamps during most
# NOTE: hours in the day, and then a daily re-synch. For now we are assuming this is a communication issue rather than
# NOTE: an issue in the actual sampling period. For example, we are assuming that a record that is received at 4:44pm is
# NOTE: actually a record for 4:00pm (which is really representative of data during the 3:00 hour -- see NOTE below).

# Add a POSIXct datetime
tbl$datetime <- lubridate::floor_date(lubridate::mdy_hms(tbl$TimeStamp), unit = "hour") - lubridate::dhours(1)
if ( flagAndKeep ) {
# TODO: Unable to get datetime moved from tbl to tblFlagged without timezone and/or display getting messed up.
# For now just duplicating the calculation, then assigning row values to NA after the fact for rows that were
# removed from tbl prior to calculating datetime above. Clean up later if possible.
tblFlagged$datetime <- lubridate::floor_date(lubridate::mdy_hms(tblFlagged$TimeStamp), unit = "hour") - lubridate::dhours(1)
tblFlagged$datetime[ which(!(tblFlagged$rowID %in% tbl$rowID)) ] <- NA
}

# NOTE: The time above truncates the timestamp to the top of an hour, and then subtracts one hour,
# NOTE: since the measurement that comes in at a few minutes past the hour is actually representative
# NOTE: of the data over the previous hour (e.g. reading received at 12:04 is actually the average of
# NOTE: the data during Hour 11). This allows for a simpler understanding of the averages, since an
# NOTE: hour's average will be representative of the data within that hour (this is similar to
# NOTE: how an average over a year, such as 2016, is referred to as 2016's value, not 2017's, even
# NOTE: though the average wasn't available until the beginning of 2017).

# Leland Tarnay QC -----------------------------------------------------------

# NOTE: Override ConcHr high value with 5000 as per conversation with Mike Broughton

###tmp.2013_NIFC_GOES65_wrcc$concQA <- with(tmp.2013_NIFC_GOES65_wrcc,
### ifelse(Flow < 2 "FlowLow",
### ifelse(Flow > 2, "FlowHigh",
### ifelse(AT > 150, "HighTemp",
### ifelse(RHi > 55,"HighRHi",
### ifelse(ConcHr < 0, "Negative",
### ifelse(ConcHr > 984, "HighConc", 'OK')))))))
####create a ConcHr numerical column, with NA values that aren't verbose about errors..
###
###tmp.2013_NIFC_GOES65_wrcc$ConcHr <- with(tmp.2013_NIFC_GOES65_wrcc,
### ifelse(concQA == 'Negative', 0,
### ifelse(concQA == 'OK', ConcHr, NA)))

goodFlow <- !is.na(tbl$Flow) & tbl$Flow >= valid_Flow[1] & tbl$Flow <= valid_Flow[2]
goodAT <- !is.na(tbl$AT) & tbl$AT >= valid_AT[1] & tbl$AT <= valid_AT[2]
goodRHi <- !is.na(tbl$RHi) & tbl$RHi >= valid_RHi[1] & tbl$RHi <= valid_RHi[2]
goodConcHr <- !is.na(tbl$ConcHr) & tbl$ConcHr >= valid_Conc[1] & tbl$ConcHr <= valid_Conc[2]
gooddatetime <- !is.na(tbl$datetime) & tbl$datetime < lubridate::now(tzone = "UTC") # saw a future date once

logger.trace("Flow has %s missing or out of range values", sum(!goodFlow))
if (sum(!goodFlow) > 0) logger.trace("Bad Flow values: %s", paste0(sort(unique(tbl$Flow[!goodFlow]),na.last=TRUE), collapse = ", "))
logger.trace("AT has %s missing or out of range values", sum(!goodAT))
if (sum(!goodAT) > 0) logger.trace("Bad AT values: %s", paste0(sort(unique(tbl$AT[!goodAT]),na.last=TRUE), collapse = ", "))
logger.trace("RHi has %s missing or out of range values", sum(!goodRHi))
if (sum(!goodRHi) > 0) logger.trace("Bad RHi values: %s", paste0(sort(unique(tbl$RHi[!goodRHi]),na.last=TRUE), collapse = ", "))
logger.trace("Conc has %s missing or out of range values", sum(!goodConcHr))
if (sum(!goodConcHr) > 0) logger.trace("Bad Conc values: %s", paste0(sort(unique(tbl$Conc.ug.m3.[!goodConcHr]),na.last=TRUE), collapse = ", "))
logger.trace("datetime has %s missing or out of range values", sum(!gooddatetime))
if (sum(!gooddatetime) > 0) logger.trace("Bad datetime values: %s", paste0(sort(unique(tbl$datetime[!gooddatetime]),na.last=TRUE), collapse = ", "))

goodMask <- goodFlow & goodAT & goodRHi & goodConcHr & gooddatetime
badQCCount <- sum(!goodMask)

if ( badQCCount > 0 ) {
logger.trace(paste(verb,"%s rows because of QC logic"), badQCCount)
if ( flagAndKeep ) {
# apply flags
tblFlagged$QCFlag_badFlow[tbl$rowID[!goodFlow]] <- TRUE
tblFlagged$QCFlag_badAT[tbl$rowID[!goodAT]] <- TRUE
tblFlagged$QCFlag_badRHi[tbl$rowID[!goodRHi]] <- TRUE
tblFlagged$QCFlag_badConcHr[tbl$rowID[!goodConcHr]] <- TRUE
tblFlagged$QCFlag_badDateAndTime[tbl$rowID[!gooddatetime]] <- TRUE
tblFlagged$QCFlag_anyBad <- (tblFlagged$QCFlag_anyBad | tblFlagged$QCFlag_badFlow | tblFlagged$QCFlag_badAT |
tblFlagged$QCFlag_badRHi | tblFlagged$QCFlag_badConcHr | tblFlagged$QCFlag_badDateAndTime)
# apply reason codes
tblFlagged$QCFlag_reasonCode[tbl$rowID[!goodFlow]] <- paste(tblFlagged$QCFlag_reasonCode[tbl$rowID[!goodFlow]],"badFlow")
tblFlagged$QCFlag_reasonCode[tbl$rowID[!goodAT]] <- paste(tblFlagged$QCFlag_reasonCode[tbl$rowID[!goodAT]],"badAT")
tblFlagged$QCFlag_reasonCode[tbl$rowID[!goodRHi]] <- paste(tblFlagged$QCFlag_reasonCode[tbl$rowID[!goodRHi]],"badRHi")
tblFlagged$QCFlag_reasonCode[tbl$rowID[!goodConcHr]] <- paste(tblFlagged$QCFlag_reasonCode[tbl$rowID[!goodConcHr]],"badConcHr")
tblFlagged$QCFlag_reasonCode[tbl$rowID[!gooddatetime]] <- paste(tblFlagged$QCFlag_reasonCode[tbl$rowID[!gooddatetime]],"badDateAndTime")
}
}

tbl <- tbl[goodMask,]

# Sanity check -- row count
if (nrow(tbl) < 1 && !flagAndKeep) {
err_msg <- paste0("No valid PM2.5 data for ", monitorName)
logger.warn(err_msg) # This is more of a warning than some error in the data.
stop(err_msg, call.=FALSE)
}

# ----- Duplicate Hours -----------------------------------------------------

# For hours with multiple records, discard all but the one with the latest processing date/time
# NOTE: Current setup for this section assumes that the last entry will be the latest one. May
# NOTE: want to build in functionality to ensure that the latest is picked if more than one exists
# NOTE: (for example, if the data is not in order by timestamp for whatever reason)

dupHrMask <- duplicated(tbl$datetime,fromLast = TRUE)
dupHrCount <- sum(dupHrMask)
uniqueHrMask <- !dupHrMask

if ( dupHrCount > 0 ) {
logger.trace(paste(verb,"%s duplicate time entries"), dupHrCount)
logger.trace("Duplicate Hours (may be >1 per timestamp): %s", paste0(sort(unique(tbl$TimeStamp[dupHrMask])), collapse = ", "))
if ( flagAndKeep ) {
# apply flags
tblFlagged$QCFlag_duplicateHr[tbl$rowID[dupHrMask]] <- TRUE
tblFlagged$QCFlag_anyBad <- tblFlagged$QCFlag_anyBad | tblFlagged$QCFlag_duplicateHr
# apply reason code
tblFlagged$QCFlag_reasonCode[tbl$rowID[dupHrMask]] <- paste(tblFlagged$QCFlag_reasonCode[tbl$rowID[dupHrMask]],"duplicateHr")
}
}

tbl <- tbl[uniqueHrMask,]

# Sanity check -- row count
if (nrow(tbl) < 1 && !flagAndKeep) {
err_msg <- paste0("No valid PM2.5 data for ", monitorName)
logger.warn(err_msg) # This is more of a warning than some error in the data.
stop(err_msg, call.=FALSE)
}

# ----- More QC -------------------------------------------------------------

# NOTE: Additional QC would go here

if ( flagAndKeep ) {
logger.trace("Retaining %d rows of measurements; %d bad rows flagged", nrow(tbl), sum(tblFlagged$QCFlag_anyBad))
} else {
logger.trace("Retaining %d rows of validated measurements", nrow(tbl))
}

# ----- Final cleanup -------------------------------------------------------

if ( flagAndKeep ) {
tblFlagged$QCFlag_reasonCode <- stringr::str_sub(tblFlagged$QCFlag_reasonCode, 3)
tblFlagged$QCFlag_reasonCode <- stringr::str_trim(tblFlagged$QCFlag_reasonCode)
tbl <- tblFlagged
tbl$rowID <- NULL
}

return(tbl)

}

# ===== DEBUGGING ==============================================================

if ( FALSE ) {

# tbl <-

valid_Longitude = c(-180, 180)
valid_Latitude = c(-90, 90)
remove_Lon_zero = TRUE
remove_Lat_zero = TRUE
valid_Flow = c(1.999, 2.001) # anything other than 2 is bad
valid_AT = c(-Inf, 150)
valid_RHi = c(-Inf, 55)
valid_Conc = c(-Inf, 5)
flagAndKeep = FALSE

}
6 changes: 5 additions & 1 deletion R/airsis_createDataDataframe.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ airsis_createDataDataframe <- function(
if ( monitorType == 'EBAM' ) {
pm25Var <- 'ConcHr'
} else if ( monitorType == 'ESAM' ) {
pm25Var <- 'Conc.mg.m3.'
if ( monitorSubtype == 'MULTI2022' ) {
pm25Var <- 'ConcHr'
} else {
pm25Var <- 'Conc.mg.m3.'
}
} else if ( monitorType == 'BAM1020' ) {
pm25Var <- 'Conc..\u00B5g.m3.'
} else {
Expand Down
4 changes: 2 additions & 2 deletions R/airsis_createMonitorObject.R
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,12 @@ if ( FALSE ) {

logger.setLevel(TRACE)

# 2021 "EBAM MULTI2_B" format -- usfs.1085
# 2021 "EBAM MULTI2022" format -- usfs.1072

startdate = strftime(lubridate::now(tzone = "UTC"), "%Y0101", tz = "UTC")
enddate = strftime(lubridate::now(tzone = "UTC"), "%Y%m%d", tz = "UTC")
provider = 'USFS'
unitID = 1085
unitID = 1072
clusterDiameter = 1000
zeroMinimum = TRUE
baseUrl = "http://xxxx.airsis.com/vision/common/CSVExport.aspx?"
Expand Down
12 changes: 12 additions & 0 deletions R/airsis_identifyMonitorType.R
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,12 @@ airsis_identifyMonitorType <- function(df) {
olderEbam_2_names <- make.names(olderEbam_2_rawNames)
olderEbam_2_types <- 'ccddcc'

# provider = "USFS"; unitID = "1072"; year = 2022
esamMulti2022_header <- "MasterTable_ID,Alias,Latitude,Longitude,ConcRT,ConcHR,Flow,AT,BP(PA),RHx,RHi,W/S,W/D,BV,Alarm,Oceaneering Unit Voltage,FT,TimeStamp,PDate"
esamMulti2022_rawNames <- unlist(stringr::str_split(esamMulti2022_header, ','))
esamMulti2022_names <- make.names(esamMulti2022_rawNames)
esamMulti2022_types <- 'ccddddddddddddcddcc'

# Need to duplicate these to handle the addition of the 'UnitID' column before 'Alias'

# ARB2_ebamMulti (Starting in August, 2018)
Expand Down Expand Up @@ -235,6 +241,12 @@ airsis_identifyMonitorType <- function(df) {
rawNames <- bam1020_rawNames
columnNames <- bam1020_names
columnTypes <- bam1020_types
} else if ( dplyr::setequal(esamMulti2022_names, colNames) ) {
monitorType <- "ESAM"
monitorSubtype <- "MULTI2022"
rawNames <- esamMulti2022_rawNames
columnNames <- esamMulti2022_names
columnTypes <- esamMulti2022_types
}

monitorTypeList <- list(monitorType = monitorType,
Expand Down
Loading

0 comments on commit e1efba5

Please sign in to comment.