From f1e8ced59b29e78efcda2bf1c2dacc2376df3d84 Mon Sep 17 00:00:00 2001 From: philchalmers Date: Thu, 15 Aug 2024 12:30:12 -0400 Subject: [PATCH] add dirs arg --- NEWS.md | 3 ++- R/SimCheck.R | 23 ++++++++++++++++++----- R/SimCollect.R | 30 +++++++++++++++++++----------- R/runArraySimulation.R | 16 +++++----------- man/SimCheck.Rd | 15 ++++++++++----- man/SimCollect.Rd | 23 ++++++++++++----------- man/runArraySimulation.Rd | 16 +++++----------- vignettes/HPC-computing.Rmd | 11 +++++------ 8 files changed, 76 insertions(+), 61 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4701a349..957eafb6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,7 +15,8 @@ and `runArraySimulation()` - `SimCollect()` more efficient when combining a large number of files (e.g., - greater than 5000 `.rds` files stored via `runArraySimulation()`) + greater than 5000 `.rds` files stored via `runArraySimulation()`). Gains a + `dir` argument for this purpose as well so that a full directory can be specified - `SimCheck()` repurposed to check for missing files for `runArraySimulation()` diff --git a/R/SimCheck.R b/R/SimCheck.R index ed7016df..7b01c18c 100644 --- a/R/SimCheck.R +++ b/R/SimCheck.R @@ -4,9 +4,11 @@ #' evaluation check whether all \code{.rds} files have been saved. If missing #' the missing row condition numbers will be returned #' +#' @param dir character vector input indicating the directory +#' containing the \code{.rds} files (see \code{files}) +#' #' @param files vector of file names referring to the saved simulation files. -#' E.g. \code{c('mysim-1.rds', 'mysim-2.rds', ...)}. Default assumes -#' that all the files are available in the current working directory +#' E.g. \code{c('mysim-1.rds', 'mysim-2.rds', ...)} #' #' @param min minimum number after the \code{'-'} deliminator. 
Default is 1 #' @@ -28,13 +30,24 @@ #' @examples #' \dontrun{ #' +#' # if files are in mysimfiles/ directory +#' SimCheck('mysimfiles') +#' +#' # specifying files explicitly #' setwd('mysimfiles/') -#' files <- dir() -#' SimCheck() +#' SimCheck(files=dir()) #' #' } #' -SimCheck <- function(files = dir(), min = 1L, max = NULL){ +SimCheck <- function(dir = NULL, files = NULL, min = 1L, max = NULL){ + if(is.null(dir) && is.null(files)) + stop('either dir or files must be specified') + if(!is.null(dir) && !is.null(files)) + stop('dir OR files must be specified, not both') + if(!is.null(dir)){ + files <- dir(dir) + } else dir <- './' + files <- paste0(dir, files) filename <- strsplit(files[1], '-')[[1L]][1L] if(is.null(max)){ subfiles <- gsub(paste0(filename, '-'), files, replacement = '') diff --git a/R/SimCollect.R b/R/SimCollect.R index 3e78e468..8c8206d0 100644 --- a/R/SimCollect.R +++ b/R/SimCollect.R @@ -8,13 +8,14 @@ #' use as the random numbers will tend to correlate the more it is used) or run independently across different #' nodes/computing cores (e.g., see \code{\link{runArraySimulation}}. #' +#' +#' @param dir a \code{character} vector pointing to the directory name containing the \code{.rds} files. +#' All \code{.rds} files in this directory will be used. For greater specificity use the +#' \code{files} argument instead #' @param files a \code{character} vector containing the names of the simulation's final \code{.rds} files. -#' Default assumes that the current working directory contains all the files #' #' @param filename (optional) name of .rds file to save aggregate simulation file to. If not specified -#' then the results will only be returned in the R console. Note that you probably -#' want to save this one level higher than where the files are listed (e.g., -#' \code{filename = '../mysim'}) +#' then the results will only be returned in the R console. 
#' #' @param select a character vector indicating columns to variables to select from the #' \code{SimExtract(what='results')} information. This is mainly useful when RAM is an issue @@ -50,7 +51,7 @@ #' Carlo simulation. \code{Journal of Statistics Education, 24}(3), 136-156. #' \doi{10.1080/10691898.2016.1246953} #' -#' @seealso \code{\link{runSimulation}} +#' @seealso \code{\link{runSimulation}}, \code{\link{runArraySimulation}} #' #' @export #' @@ -125,25 +126,32 @@ #' }) |> invisible() #' #' # check that all replications satisfy target -#' files <- paste0('sim_files/job-', 1:nrow(Design), ".rds") -#' SimCollect(files = files, check.only = TRUE) +#' SimCollect('sim_files/', check.only = TRUE) #' #' # this would have been returned were the target.rep supposed to be 1000 -#' SimCollect(files = files, check.only = TRUE, target.reps=1000) +#' SimCollect('sim_files/', check.only = TRUE, target.reps=1000) #' #' # aggregate into single object -#' sim <- SimCollect(files = paste0('sim_files/job-', 1:nrow(Design), ".rds")) +#' sim <- SimCollect('sim_files/') #' sim #' #' SimClean(dir='sim_files/') #' #' } -SimCollect <- function(files = dir(), filename = NULL, +SimCollect <- function(dir=NULL, files = NULL, filename = NULL, select = NULL, check.only = FALSE, target.reps = NULL, warning_details = FALSE, error_details = TRUE){ + if(is.null(dir) && is.null(files)) + stop('either dir or files must be specified') + if(!is.null(dir) && !is.null(files)) + stop('dir OR files must be specified, not both') if(check.only) select <- 'REPLICATIONS' + if(!is.null(dir)){ + files <- dir(dir) + } else dir <- './' + files <- paste0(dir, files) oldfiles <- files files <- oldfiles if(!is.null(files)){ @@ -152,7 +160,7 @@ SimCollect <- function(files = dir(), filename = NULL, return(invisible(NULL)) } if(!is.null(filename)) - if(filename %in% dir()) + if(filename %in% dir(dir)) stop(sprintf('File \'%s\' already exists in working directory', filename), call.=FALSE) replications_only <- 
ifelse(!is.null(select) && length(select) == 1L && diff --git a/R/runArraySimulation.R b/R/runArraySimulation.R index 64bc89ce..94bd3485 100644 --- a/R/runArraySimulation.R +++ b/R/runArraySimulation.R @@ -228,19 +228,17 @@ #' dir('sim/') #' #' # check that all files saved -#' setwd('sim') -#' SimCheck() +#' SimCheck('sim/') #' -#' condition14 <- readRDS('condition-14.rds') +#' condition14 <- readRDS('sim/condition-14.rds') #' condition14 #' SimResults(condition14) #' #' # aggregate simulation results into single file -#' final <- SimCollect(files=dir()) +#' final <- SimCollect('sim/') #' final #' #' # clean simulation directory -#' setwd('..') #' SimClean(dirs='sim/') #' #' @@ -269,23 +267,19 @@ #' # list saved files #' dir('sim/') #' -#' setwd('sim') -#' #' # note that all row conditions are still stored separately, though note that #' # arrayID is now 2 instead -#' condition14 <- readRDS('condition-14.rds') +#' condition14 <- readRDS('sim/condition-14.rds') #' condition14 #' SimResults(condition14) #' #' # aggregate simulation results into single file -#' final <- SimCollect(files=dir()) +#' final <- SimCollect('sim/') #' final #' #' # clean simulation directory -#' setwd('..') #' SimClean(dirs='sim/') #' -#' #' } #' runArraySimulation <- function(design, ..., replications, diff --git a/man/SimCheck.Rd b/man/SimCheck.Rd index 72fed7d7..e1c0cd22 100644 --- a/man/SimCheck.Rd +++ b/man/SimCheck.Rd @@ -4,12 +4,14 @@ \alias{SimCheck} \title{Check for missing files in array simulations} \usage{ -SimCheck(files = dir(), min = 1L, max = NULL) +SimCheck(dir = NULL, files = NULL, min = 1L, max = NULL) } \arguments{ +\item{dir}{character vector input indicating the directory +containing the \code{.rds} files (see \code{files})} + \item{files}{vector of file names referring to the saved simulation files. -E.g. \code{c('mysim-1.rds', 'mysim-2.rds', ...)}. Default assumes -that all the files are available in the current working directory} +E.g. 
\code{c('mysim-1.rds', 'mysim-2.rds', ...)}} \item{min}{minimum number after the \code{'-'} deliminator. Default is 1} @@ -24,9 +26,12 @@ the missing row condition numbers will be returned \examples{ \dontrun{ +# if files are in mysimfiles/ directory +SimCheck('mysimfiles') + +# specifying files explicitly setwd('mysimfiles/') -files <- dir() -SimCheck() +SimCheck(files=dir()) } diff --git a/man/SimCollect.Rd b/man/SimCollect.Rd index 6518f760..573585a7 100644 --- a/man/SimCollect.Rd +++ b/man/SimCollect.Rd @@ -6,7 +6,8 @@ \title{Collapse separate simulation files into a single result} \usage{ SimCollect( - files = dir(), + dir = NULL, + files = NULL, filename = NULL, select = NULL, check.only = FALSE, @@ -18,13 +19,14 @@ SimCollect( aggregate_simulations(...) } \arguments{ -\item{files}{a \code{character} vector containing the names of the simulation's final \code{.rds} files. -Default assumes that the current working directory contains all the files} +\item{dir}{a \code{character} vector pointing to the directory name containing the \code{.rds} files. +All \code{.rds} files in this directory will be used. For greater specificity use the +\code{files} argument instead} + +\item{files}{a \code{character} vector containing the names of the simulation's final \code{.rds} files.} \item{filename}{(optional) name of .rds file to save aggregate simulation file to. If not specified -then the results will only be returned in the R console. Note that you probably -want to save this one level higher than where the files are listed (e.g., -\code{filename = '../mysim'})} +then the results will only be returned in the R console.} \item{select}{a character vector indicating columns to variables to select from the \code{SimExtract(what='results')} information. 
This is mainly useful when RAM is an issue @@ -130,14 +132,13 @@ sapply(1:nrow(Design_long), \(i) { }) |> invisible() # check that all replications satisfy target -files <- paste0('sim_files/job-', 1:nrow(Design), ".rds") -SimCollect(files = files, check.only = TRUE) +SimCollect('sim_files/', check.only = TRUE) # this would have been returned were the target.rep supposed to be 1000 -SimCollect(files = files, check.only = TRUE, target.reps=1000) +SimCollect('sim_files/', check.only = TRUE, target.reps=1000) # aggregate into single object -sim <- SimCollect(files = paste0('sim_files/job-', 1:nrow(Design), ".rds")) +sim <- SimCollect('sim_files/') sim SimClean(dir='sim_files/') @@ -154,7 +155,7 @@ Carlo simulation. \code{Journal of Statistics Education, 24}(3), 136-156. \doi{10.1080/10691898.2016.1246953} } \seealso{ -\code{\link{runSimulation}} +\code{\link{runSimulation}}, \code{\link{runArraySimulation}} } \author{ Phil Chalmers \email{rphilip.chalmers@gmail.com} diff --git a/man/runArraySimulation.Rd b/man/runArraySimulation.Rd index 3e447296..b1d65fe3 100644 --- a/man/runArraySimulation.Rd +++ b/man/runArraySimulation.Rd @@ -238,19 +238,17 @@ sapply(1:nrow(Design5), \(arrayID) dir('sim/') # check that all files saved -setwd('sim') -SimCheck() +SimCheck('sim/') -condition14 <- readRDS('condition-14.rds') +condition14 <- readRDS('sim/condition-14.rds') condition14 SimResults(condition14) # aggregate simulation results into single file -final <- SimCollect(files=dir()) +final <- SimCollect('sim/') final # clean simulation directory -setwd('..') SimClean(dirs='sim/') @@ -279,23 +277,19 @@ sapply(1:3, \(arrayID) # list saved files dir('sim/') -setwd('sim') - # note that all row conditions are still stored separately, though note that # arrayID is now 2 instead -condition14 <- readRDS('condition-14.rds') +condition14 <- readRDS('sim/condition-14.rds') condition14 SimResults(condition14) # aggregate simulation results into single file -final <- SimCollect(files=dir()) 
+final <- SimCollect('sim/') final # clean simulation directory -setwd('..') SimClean(dirs='sim/') - } } diff --git a/vignettes/HPC-computing.Rmd b/vignettes/HPC-computing.Rmd index 0ec85c3d..2b084c29 100644 --- a/vignettes/HPC-computing.Rmd +++ b/vignettes/HPC-computing.Rmd @@ -321,11 +321,10 @@ After some time has elapsed, and the job evaluation is now complete, you'll have ```{r eval=FALSE} -setwd('mysimfiles') library(SimDesign) -SimCheck() # check whether all the .rds files in the simulation are present +SimCheck('mysimfiles/') # check whether all the .rds files in the simulation are present -Final <- SimCollect(files=dir()) +Final <- SimCollect('mysimfiles/') Final ``` @@ -464,7 +463,7 @@ Related to early termination issue above is what to do about the missing replica To start, locate the simulation conditions in the aggregated result that do not meet the target replication criteria. This could be obtained via inspection of the aggregated results ```{r eval=FALSE} -Final <- SimCollect(files=dir()) +Final <- SimCollect('mysimfiles/') Final ``` @@ -547,9 +546,9 @@ to Submit this job to compute all the missing replication information, which stores these files into the same working directory but with the new information stored as `mysim-301.rds` through `mysim-400.rds`. In this example, there will now be a total of 400 files that have been saved. Once complete, run ```{r eval=FALSE} # See if any missing still -SimCollect(files=dir(), check.only=TRUE) +SimCollect('mysimfiles/', check.only=TRUE) # Obtain complete simulation results -Final <- SimCollect(files=dir()) +Final <- SimCollect('mysimfiles/') ``` one last time, which now reads in the complete set of 400 stored files instead of the previous 300, thereby obtaining the complete set of high-quality simulation results. Rinse and repeat if the same issue appears yet again on the second submission.