From 0179daaff3b6d2ae4c6937b8c6e80c4fe3adbc3b Mon Sep 17 00:00:00 2001 From: Alex Cebulski Date: Mon, 11 Sep 2023 11:58:35 -0600 Subject: [PATCH 1/3] added functions that uses a rolling window check to find and delete outliers --- R/deleteSpikesStdevWindow.R | 78 +++++++++++++++++ R/findSpikesStdevWindow.R | 153 +++++++++++++++++++++++++++++++++ man/deleteSpikesStdevWindow.Rd | 49 +++++++++++ man/findSpikesStdevWindow.Rd | 60 +++++++++++++ 4 files changed, 340 insertions(+) create mode 100644 R/deleteSpikesStdevWindow.R create mode 100644 R/findSpikesStdevWindow.R create mode 100644 man/deleteSpikesStdevWindow.Rd create mode 100644 man/findSpikesStdevWindow.Rd diff --git a/R/deleteSpikesStdevWindow.R b/R/deleteSpikesStdevWindow.R new file mode 100644 index 0000000..626724d --- /dev/null +++ b/R/deleteSpikesStdevWindow.R @@ -0,0 +1,78 @@ +#' Deletes spikes using rolling window stdev filtering +#' +#' @param obs Required. A \pkg{CRHMr} obs data frame. +#' @param colnum Optional. The number of the column to test for spikes, not +#' including the \code{datetime}. +#' @param min_frac_records Optional. The fraction of records required in a +#' window for a sucessful computation, otherwise the current value is flagged. +#' Default is 0.8. +#' @param lead_window Optional. A list of values corresponding to the offset +#' from the current value. Default is 1:10. +#' @param lag_window Optional. A list of values corresponding to the offset from +#' the current value. Default is 1:10. +#' @param number_sd Optional. The number of standard deviations away from the +#' mean required for the current value to be flagged. +#' @param logfile Optional. Name of the file to be used for logging the action. +#' Normally not used. +#' +#' @return If successful, returns a data frame consisting of the datetime and the original obs values, where +#' all of the spike values have been set to be \code{NA_real_}. If no spikes are found a message is +#' printed and the function returns the value \code{FALSE}. +#' @export +#' +#' @examples +#' deleteSpikesStdevWindow(BadLake7376, 1, min_frac_records = 0.5) +deleteSpikesStdevWindow <- + function(obs, + colnum = 1, + min_frac_records = .8, + lead_window = list(1:10), + lag_window = list(-1:-10), + number_sd = 10, + logfile = ""){ + + if (nrow(obs) == 0) { + stop("Missing obs values") + } + obsName <- deparse(substitute(obs)) + + if (any(is.na(obs[, colnum + 1]))) { + stop("Missing values. Remove before searching for spikes") + } + + if (number_sd == 0) { + stop("sd is <= 0. Set before searching for spikes") + } + + spikeDatetimes <- findSpikesStdevWindow(obs, + colnum, + min_frac_records, + lead_window, + lag_window, + number_sd) + + if (length(spikeDatetimes) == 1) { + if (spikeDatetimes == 0) { + cat("No spikes found\n") + return(FALSE) + } + if (spikeDatetimes == FALSE) { + stop("Error in finding spikes") + } + } + + spikeLocs <- match(spikeDatetimes, obs$datetime) + obs[spikeLocs, colnum + 1] <- NA_real_ + + # output to logfile + outputMessage <- paste(length(spikeDatetimes), " values set to NA_real_") + comment <- paste("findSpikesStdevWindow obs:", obsName, outputMessage, sep = "") + result <- logAction(comment, logfile) + + if (!result) { + return(result) + } else { + return(obs) + } + + } diff --git a/R/findSpikesStdevWindow.R b/R/findSpikesStdevWindow.R new file mode 100644 index 0000000..47f1582 --- /dev/null +++ b/R/findSpikesStdevWindow.R @@ -0,0 +1,153 @@ +#' Finds spikes using rolling window stdev filtering +#' +#' @description Finds spikes using a rolling window with a width defined as a +#' lead and lag offset from the current value. Currently is set up to define +#' spikes within the window given a number of standard deviations away from +#' the mean. NOTE: the first few and last few values will automatically be +#' removed due to the stdev function requiring a minimum of 3 vals. +#' +#' @param obs Required. A \pkg{CRHMr} obs data frame. +#' @param colnum Optional. The number of the column to test for spikes, not +#' including the \code{datetime}. +#' @param min_frac_records Optional. The fraction of records required in a +#' window for a sucessful computation, otherwise the current value is flagged. +#' Default is 0.8. +#' @param lead_window Optional. A list of values corresponding to the offset +#' from the current value. Default is 1:10. +#' @param lag_window Optional. A list of values corresponding to the offset from +#' the current value. Default is 1:10. +#' @param number_sd Optional. The number of standard deviations away from the +#' mean required for the current value to be flagged. +#' @param logfile Optional. Name of the file to be used for logging the action. +#' Normally not used. +#' +#' @return If successful and there are no spikes, returns \code{0}. If there are +#' spikes, returns their \code{datetime} values. If unsuccessful returns +#' \code{FALSE}. +#' @author Alex Cebulski +#' @seealso \code{\link{deleteSpikes}} \code{\link{findGaps}} +#' \code{\link{findDupes}} +#' @export +#' +#' @examples +#' findSpikesStdevWindow(BadLake7376, 1, min_frac_records = 0.5) +findSpikesStdevWindow <- + function(obs, + colnum = 1, + min_frac_records = .8, + lead_window = list(1:10), + lag_window = list(-1:-10), + number_sd = 10, + logfile = "" + ){ + + if (nrow(obs) == 0) { + stop("Error: missing obs values") + } + + obsName <- deparse(substitute(obs)) + + if (any(is.na(obs[, colnum + 1]))) { + stop("Missing values. Remove before searching for spikes") + } + + if (number_sd == 0) { + stop("sd is <= 0. Set before searching for spikes") + } + + datetime_col <- 1 + var_col <- colnum + 1 # raw colnum does not include datetime + select_cols <- c(datetime_col, var_col) + + min_records_window <- max(lead_window[[1]])*min_frac_records + + obs_fltr <- obs[,select_cols] + + # count the number of records in the leading window + obs_fltr$lead_count <- zoo::rollapply(obs_fltr[,var_col], + width=lead_window, + FUN = function(x) sum(!is.na(x)), + fill=NA, + partial = T) + + obs_fltr$lag_count <- zoo::rollapply(obs_fltr[,var_col], + width=lag_window, + FUN = function(x) sum(!is.na(x)), + fill=NA, + partial = T) + + obs_fltr$lead_count_filter <- + ifelse(obs_fltr$lead_count obs_fltr$lead_fltr_min) & + (obs_fltr$lag_count_filter == 1 & + obs_fltr[,var_col] < obs_fltr$lag_fltr_max & + obs_fltr[,var_col] > obs_fltr$lag_fltr_min)] <- F + + locs <- obs_fltr[obs_fltr$sd_filter, 1] + numSpikes <- sum(obs_fltr$sd_filter) + + if (numSpikes == 0) { + outputMessage <- " No spikes found" + returnvalue <- 0 + } + else { + outputMessage <- paste(" ", numSpikes, " spikes found with standard deviations ", + number_sd, " above/below the mean using the rolling window method ", + sep = "") + returnvalue <- locs + } + + # output to logfile + comment <- paste("findStdevSpikeWindow obs:", obsName, outputMessage, sep = "") + result <- logAction(comment, logfile) + if (!result) { + return(result) + } else { + return(returnvalue) + } + } diff --git a/man/deleteSpikesStdevWindow.Rd b/man/deleteSpikesStdevWindow.Rd new file mode 100644 index 0000000..6a4edce --- /dev/null +++ b/man/deleteSpikesStdevWindow.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deleteSpikesStdevWindow.R +\name{deleteSpikesStdevWindow} +\alias{deleteSpikesStdevWindow} +\title{Deletes spikes using rolling window stdev filtering} +\usage{ +deleteSpikesStdevWindow( + obs, + colnum = 1, + min_frac_records = 0.8, + lead_window = list(1:10), + lag_window = list(-1:-10), + number_sd = 10, + logfile = "" +) +} +\arguments{ +\item{obs}{Required. A \pkg{CRHMr} obs data frame.} + +\item{colnum}{Optional. The number of the column to test for spikes, not +including the \code{datetime}.} + +\item{min_frac_records}{Optional. The fraction of records required in a +window for a sucessful computation, otherwise the current value is flagged. +Default is 0.8.} + +\item{lead_window}{Optional. A list of values corresponding to the offset +from the current value. Default is 1:10.} + +\item{lag_window}{Optional. A list of values corresponding to the offset from +the current value. Default is 1:10.} + +\item{number_sd}{Optional. The number of standard deviations away from the +mean required for the current value to be flagged.} + +\item{logfile}{Optional. Name of the file to be used for logging the action. +Normally not used.} +} +\value{ +If successful, returns a data frame consisting of the datetime and the original obs values, where +all of the spike values have been set to be \code{NA_real_}. If no spikes are found a message is +printed and the function returns the value \code{FALSE}. +} +\description{ +Deletes spikes using rolling window stdev filtering +} +\examples{ +deleteSpikesStdevWindow(BadLake7376, 1, min_frac_records = 0.5) +} diff --git a/man/findSpikesStdevWindow.Rd b/man/findSpikesStdevWindow.Rd new file mode 100644 index 0000000..62f069c --- /dev/null +++ b/man/findSpikesStdevWindow.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/findSpikesStdevWindow.R +\name{findSpikesStdevWindow} +\alias{findSpikesStdevWindow} +\title{Finds spikes using rolling window stdev filtering} +\usage{ +findSpikesStdevWindow( + obs, + colnum = 1, + min_frac_records = 0.8, + lead_window = list(1:10), + lag_window = list(-1:-10), + number_sd = 10, + logfile = "" +) +} +\arguments{ +\item{obs}{Required. A \pkg{CRHMr} obs data frame.} + +\item{colnum}{Optional. The number of the column to test for spikes, not +including the \code{datetime}.} + +\item{min_frac_records}{Optional. The fraction of records required in a +window for a sucessful computation, otherwise the current value is flagged. +Default is 0.8.} + +\item{lead_window}{Optional. A list of values corresponding to the offset +from the current value. Default is 1:10.} + +\item{lag_window}{Optional. A list of values corresponding to the offset from +the current value. Default is 1:10.} + +\item{number_sd}{Optional. The number of standard deviations away from the +mean required for the current value to be flagged.} + +\item{logfile}{Optional. Name of the file to be used for logging the action. +Normally not used.} +} +\value{ +If successful and there are no spikes, returns \code{0}. If there are + spikes, returns their \code{datetime} values. If unsuccessful returns + \code{FALSE}. +} +\description{ +Finds spikes using a rolling window with a width defined as a + lead and lag offset from the current value. Currently is set up to define + spikes within the window given a number of standard deviations away from + the mean. NOTE: the first few and last few values will automatically be + removed due to the stdev function requiring a minimum of 3 vals. +} +\examples{ +findSpikesStdevWindow(BadLake7376, 1, min_frac_records = 0.5) +} +\seealso{ +\code{\link{deleteSpikes}} \code{\link{findGaps}} + \code{\link{findDupes}} +} +\author{ +Alex Cebulski +} From 351e3362a040fdde1d02e584bf4d8683661cf7be Mon Sep 17 00:00:00 2001 From: Alex Cebulski Date: Mon, 11 Sep 2023 12:02:33 -0600 Subject: [PATCH 2/3] add function to plot flags found using other CRHMr qaqc functions --- R/plotFlags.R | 56 ++++++++++++++++++++++++++++++++++++++++++++++++ man/plotFlags.Rd | 52 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 R/plotFlags.R create mode 100644 man/plotFlags.Rd diff --git a/R/plotFlags.R b/R/plotFlags.R new file mode 100644 index 0000000..e805071 --- /dev/null +++ b/R/plotFlags.R @@ -0,0 +1,56 @@ +#' Helper function to plot identified flags +#' +#' @description This function takes a vector of datetimes identified by other +#' CRHMr qaqc functions and illustrates these on top of the original data +#' using GGPLOT. It is also easy to pass this ggplot output onto plotly for +#' interactive viewing by running plotly::ggplotly() after this function. +#' +#' @param obs Required. A \pkg{CRHMr} obs data frame. +#' @param datetime_flags Required. A vector of datetimes in POSIXct format, for +#' example the direct output of the findSpikes() function. +#' @param colnum Optional. The number of the column to test for spikes, not +#' including the \code{datetime}. +#' @param gg_flag_shape Optional. An integer representing the ggplot shape type +#' for the flagged values. +#' @param gg_flag_colour Optional. A string representing the ggplot colour +#' for the flagged values. +#' @param gg_flag_size Optional. An integer representing the ggplot shape size +#' for the flagged values. +#' +#' @return a ggplot object. +#' @author Alex Cebulski +#' @seealso \code{\link{findSpikes}} \code{\link{findGaps}} +#' \code{\link{findFlatlines}} \code{\link{findSpikesStdevWindow}} +#' @export +#' +#' @examples +#' plotFlags(BadLake7376, c(as.POSIXct("1974-09-11 12:00:00"))) +plotFlags <- + function(obs, + datetime_flags, + colnum = 1, + gg_flag_shape = 4, + gg_flag_colour = 'red', + gg_flag_size = 1) { + if (colnames(obs)[1] != 'datetime') { + stop("First column of the obs file must be datetime.") + } + + if (!"POSIXct" %in% class(datetime_flags)) { + stop("datetime_flags must be of type POSIXct.") + } + + obs_fltr <- obs[obs$datetime %in% datetime_flags, ] + + gg_ylab <- colnames(obs)[colnum + 1] + + ggplot2::ggplot(obs, aes(datetime, obs[, colnum + 1])) + + geom_line() + + geom_point( + data = obs_fltr, + aes(x = datetime, y = obs_fltr[, colnum + 1]), + shape = gg_flag_shape, + colour = gg_flag_colour + ) + + ggplot2::ylab(gg_ylab) + } diff --git a/man/plotFlags.Rd b/man/plotFlags.Rd new file mode 100644 index 0000000..bf4a82b --- /dev/null +++ b/man/plotFlags.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotFlags.R +\name{plotFlags} +\alias{plotFlags} +\title{Helper function to plot identified flags} +\usage{ +plotFlags( + obs, + datetime_flags, + colnum = 1, + gg_flag_shape = 4, + gg_flag_colour = "red", + gg_flag_size = 1 +) +} +\arguments{ +\item{obs}{Required. A \pkg{CRHMr} obs data frame.} + +\item{datetime_flags}{Required. A vector of datetimes in POSIXct format, for +example the direct output of the findSpikes() function.} + +\item{colnum}{Optional. The number of the column to test for spikes, not +including the \code{datetime}.} + +\item{gg_flag_shape}{Optional. An integer representing the ggplot shape type +for the flagged values.} + +\item{gg_flag_colour}{Optional. A string representing the ggplot colour +for the flagged values.} + +\item{gg_flag_size}{Optional. An integer representing the ggplot shape size +for the flagged values.} +} +\value{ +a ggplot object. +} +\description{ +This function takes a vector of datetimes identified by other + CRHMr qaqc functions and illustrates these on top of the original data + using GGPLOT. It is also easy to pass this ggplot output onto plotly for + interactive viewing by running plotly::ggplotly() after this function. +} +\examples{ +plotFlags(BadLake7376, c(as.POSIXct("1974-09-11 12:00:00"))) +} +\seealso{ +\code{\link{findSpikes}} \code{\link{findGaps}} + \code{\link{findFlatlines}} \code{\link{findSpikesStdevWindow}} +} +\author{ +Alex Cebulski +} From 2bdcebbdda8657bb7e83be35c54b33f3a054c047 Mon Sep 17 00:00:00 2001 From: Alex Cebulski Date: Mon, 11 Sep 2023 12:04:25 -0600 Subject: [PATCH 3/3] update description, namespace and some minor spelling and error fixes --- DESCRIPTION | 2 +- NAMESPACE | 3 +++ R/deleteSpikes.R | 4 ++-- R/findFlatlines.R | 2 +- R/regress.R | 2 +- man/deleteFlatLines.Rd | 2 +- man/deleteSpikes.Rd | 4 ++-- man/findFlatLines.Rd | 2 +- 8 files changed, 12 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 187b3f8..d534c93 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,7 +17,7 @@ License: GPL-3 LazyData: true Encoding: UTF-8 URL: https://github.com/CentreForHydrology/CRHMr -RoxygenNote: 7.2.1 +RoxygenNote: 7.2.3 NeedsCompilation: no Packaged: 2016-07-06 18:16:30 UTC; kevin VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 6e7970b..bba5d34 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,7 @@ export(deDupe) export(deleteFlatLines) export(deletePrjHRUs) export(deleteSpikes) +export(deleteSpikesStdevWindow) export(deltaStorage) export(distributeInst) export(distributeMean) @@ -39,6 +40,7 @@ export(findDupes) export(findFlatLines) export(findGaps) export(findSpikes) +export(findSpikesStdevWindow) export(hruGroupWaterSummary) export(hydroYear) export(hydrograph) @@ -61,6 +63,7 @@ export(phaseCorrect) export(phiv) export(phivt) export(plot2runs) +export(plotFlags) export(plotObs) export(plotPrecipsByYear) export(plotTempsByYear) diff --git a/R/deleteSpikes.R b/R/deleteSpikes.R index 42f5de8..15b6b5c 100644 --- a/R/deleteSpikes.R +++ b/R/deleteSpikes.R @@ -9,9 +9,9 @@ #' will be considered to be a spike. #' @param logfile Optional. Name of the file to be used for logging the action. Normally not used. #' -#' @return If successful, returns a data frame consiting of the datetime and the original obs values, where +#' @return If successful, returns a data frame consisting of the datetime and the original obs values, where #' all of the spike values have been set to be \code{NA_real_}. If no spikes are found a message is -#' printed and the funtion returns the value \code{FALSE}. +#' printed and the function returns the value \code{FALSE}. #' @author Kevin Shook #' @seealso \code{\link{findSpikes}} #' @export diff --git a/R/findFlatlines.R b/R/findFlatlines.R index f1d3719..848d0fc 100644 --- a/R/findFlatlines.R +++ b/R/findFlatlines.R @@ -8,7 +8,7 @@ #' @param logfile Optional. Name of the file to be used for logging the action. #' Normally not used. #' -#' @return obs dataframe with col flat lines replaced with NANs +#' @return datetime vector where flatlines exist for colnum. #' @export #' #' @author Alex Cebulski diff --git a/R/regress.R b/R/regress.R index 8e43b11..e44ccd3 100644 --- a/R/regress.R +++ b/R/regress.R @@ -204,7 +204,7 @@ function(primaryCRHM, primary.columns=1, ggplot2::facet_wrap(~variable, scales="free") } - return(p) + return(p + geom_abline(colour = 'blue')) } diff --git a/man/deleteFlatLines.Rd b/man/deleteFlatLines.Rd index df87fc5..ab227db 100644 --- a/man/deleteFlatLines.Rd +++ b/man/deleteFlatLines.Rd @@ -25,7 +25,7 @@ obs dataframe with col flat lines replaced with NANs Deletes values in a obs dataframe column that have a flatline } \examples{ -findFlatLines(BadLake7376, colnum = 3) +deleteFlatLines(BadLake7376, colnum = 3) } \author{ diff --git a/man/deleteSpikes.Rd b/man/deleteSpikes.Rd index 2cbe710..674e653 100644 --- a/man/deleteSpikes.Rd +++ b/man/deleteSpikes.Rd @@ -25,9 +25,9 @@ will be considered to be a spike.} \item{logfile}{Optional. Name of the file to be used for logging the action. Normally not used.} } \value{ -If successful, returns a data frame consiting of the datetime and the original obs values, where +If successful, returns a data frame consisting of the datetime and the original obs values, where all of the spike values have been set to be \code{NA_real_}. If no spikes are found a message is -printed and the funtion returns the value \code{FALSE}. +printed and the function returns the value \code{FALSE}. } \description{ Finds spikes, and sets their values to be NA_real_. diff --git a/man/findFlatLines.Rd b/man/findFlatLines.Rd index 3a39327..33a3600 100644 --- a/man/findFlatLines.Rd +++ b/man/findFlatLines.Rd @@ -19,7 +19,7 @@ required to be the same before is flagged as a flat line.} Normally not used.} } \value{ -obs dataframe with col flat lines replaced with NANs +datetime vector where flatlines exist for colnum. } \description{ Finds values in a obs dataframe column that have a flatline