From e027ad79b560c58b982338c475db68c974df6d7f Mon Sep 17 00:00:00 2001 From: Daniel Sjoberg Date: Thu, 16 Jan 2025 10:19:58 -0800 Subject: [PATCH] Adding cumulative counts and percents to `ard_categorical()` (#373) **What changes are proposed in this pull request?** * The `ard_categorical()` function can now return cumulative counts and percentages with `ard_categorical(statistic = varname ~ c('n_cum', 'p_cum'))`. (#145) **Reference GitHub issue associated with pull request.** _e.g., 'closes #'_ closes #145 -------------------------------------------------------------------------------- Pre-review Checklist (if item does not apply, mark it as complete) - [x] **All** GitHub Action workflows pass with a :white_check_mark: - [x] PR branch has pulled the most recent updates from master branch: `usethis::pr_merge_main()` - [x] If a bug was fixed, a unit test was added. - [x] Code coverage is suitable for any new functions/features (generally, 100% coverage for new code): `devtools::test_coverage()` - [x] Request a reviewer Reviewer Checklist (if item does not apply, mark it as complete) - [ ] If a bug was fixed, a unit test was added. - [ ] Run `pkgdown::build_site()`. Check the R console for errors, and review the rendered website. - [ ] Code coverage is suitable for any new functions/features: `devtools::test_coverage()` When the branch is ready to be merged: - [ ] Update `NEWS.md` with the changes from this pull request under the heading "`# cards (development version)`". If there is an issue associated with the pull request, reference it in parentheses at the end of the update (see `NEWS.md` for examples). - [ ] **All** GitHub Action workflows pass with a :white_check_mark: - [ ] Approve Pull Request - [ ] Merge the PR. Please use "Squash and merge" or "Rebase and merge".
_Optional Reverse Dependency Checks_: - Install `checked` with `pak::pak("Genentech/checked")` or `pak::pak("checked")` - Check dev versions of `cardx`, `gtsummary`, and `tfrmt` which are in the `ddsjoberg` R Universe ```shell Rscript -e "options(checked.check_envvars = c(NOT_CRAN = TRUE)); checked::check_rev_deps(path = '.', repos = c('https://ddsjoberg.r-universe.dev', 'https://cloud.r-project.org'))" ``` - Check CRAN reverse dependencies but run tests skipped on CRAN ```shell Rscript -e "options(checked.check_envvars = c(NOT_CRAN = TRUE)); checked::check_rev_deps(path = '.', repos = 'https://cloud.r-project.org')" ``` - Check CRAN reverse dependencies in a CRAN-like environment ```shell Rscript -e "options(checked.check_envvars = c(NOT_CRAN = FALSE), checked.check_build_args = '--as-cran'); checked::check_rev_deps(path = '.', repos = 'https://cloud.r-project.org')" ``` --------- Co-authored-by: Becca Krouse <14199771+bzkrouse@users.noreply.github.com> --- .github/PULL_REQUEST_TEMPLATE.md | 6 +- .github/workflows/test-coverage.yaml | 29 ++- DESCRIPTION | 2 +- R/ard_categorical.R | 94 +++++++-- R/default_stat_labels.R | 2 + README.Rmd | 2 +- README.md | 2 +- man/ard_categorical.Rd | 28 ++- man/ard_dichotomous.Rd | 26 ++- man/ard_hierarchical.Rd | 26 ++- man/ard_stack_hierarchical.Rd | 4 +- man/dot-calculate_tabulation_statistics.Rd | 10 +- man/dot-process_denominator.Rd | 6 +- tests/testthat/_snaps/ard_categorical.md | 9 + tests/testthat/test-ard_categorical.R | 223 +++++++++++++++++++++ 15 files changed, 396 insertions(+), 73 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 39ea650d4..4d84b3a4f 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -33,17 +33,17 @@ _Optional Reverse Dependency Checks_: - Check dev versions of `cardx`, `gtsummary`, and `tfrmt` which are in the `ddsjoberg` R Universe ```shell - Rscript -e "options(checked.check_envvars = c(NOT_CRAN = TRUE)); 
checked::check_rev_deps(path = '.', repos = c('https://ddsjoberg.r-universe.dev', 'https://cloud.r-project.org'))" + Rscript -e "options(checked.check_envvars = c(NOT_CRAN = TRUE)); checked::check_rev_deps(path = '.', n = parallel::detectCores() - 2L, repos = c('https://ddsjoberg.r-universe.dev', 'https://cloud.r-project.org'))" ``` - Check CRAN reverse dependencies but run tests skipped on CRAN ```shell - Rscript -e "options(checked.check_envvars = c(NOT_CRAN = TRUE)); checked::check_rev_deps(path = '.', repos = 'https://cloud.r-project.org')" + Rscript -e "options(checked.check_envvars = c(NOT_CRAN = TRUE)); checked::check_rev_deps(path = '.', n = parallel::detectCores() - 2, repos = 'https://cloud.r-project.org')" ``` - Check CRAN reverse dependencies in a CRAN-like environment ```shell - Rscript -e "options(checked.check_envvars = c(NOT_CRAN = FALSE), checked.check_build_args = '--as-cran'); checked::check_rev_deps(path = '.', repos = 'https://cloud.r-project.org')" + Rscript -e "options(checked.check_envvars = c(NOT_CRAN = FALSE), checked.check_build_args = '--as-cran'); checked::check_rev_deps(path = '.', n = parallel::detectCores() - 2, repos = 'https://cloud.r-project.org')" ``` diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 1bfd07001..e050312ff 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -4,9 +4,10 @@ on: push: branches: [main, master] pull_request: - branches: [main, master] -name: test-coverage +name: test-coverage.yaml + +permissions: read-all jobs: test-coverage: @@ -15,7 +16,7 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 with: @@ -23,28 +24,38 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::covr - needs: coverage, check + extra-packages: any::covr, any::xml2 + needs: coverage - name: Test coverage run: | - 
covr::codecov( + cov <- covr::package_coverage( quiet = FALSE, clean = FALSE, - install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package") + install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") ) + covr::to_cobertura(cov) shell: Rscript {0} + - uses: codecov/codecov-action@v4 + with: + # Fail if error if not on PR, or if on PR and token is given + fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }} + file: ./cobertura.xml + plugin: noop + disable_search: true + token: ${{ secrets.CODECOV_TOKEN }} + - name: Show testthat output if: always() run: | ## -------------------------------------------------------------------- - find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true + find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true shell: bash - name: Upload test results if: failure() - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: coverage-test-failures path: ${{ runner.temp }}/package diff --git a/DESCRIPTION b/DESCRIPTION index dd075745c..e396c71a3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,7 +31,7 @@ Suggests: spelling (>= 2.2.0), testthat (>= 3.2.0), withr (>= 3.0.0) -Config/Needs/check: hms +Config/Needs/coverage: hms Config/Needs/website: rmarkdown, jsonlite, yaml, gtsummary, tfrmt, insightsengineering/nesttemplate Config/testthat/edition: 3 diff --git a/R/ard_categorical.R b/R/ard_categorical.R index 9a0f672ad..f2ad03740 100644 --- a/R/ard_categorical.R +++ b/R/ard_categorical.R @@ -17,13 +17,13 @@ #' Arguments may be used in conjunction with one another. #' @param variables ([`tidy-select`][dplyr::dplyr_tidy_select])\cr #' columns to include in summaries. Default is `everything()`. -#' @param denominator (`data.frame`, `integer`)\cr -#' Specify this *optional* argument to change the denominator, -#' e.g. the `"N"` statistic. Default is `NULL`. See below for details. 
+#' @param denominator (`string`, `data.frame`, `integer`)\cr +#' Specify this argument to change the denominator, +#' e.g. the `"N"` statistic. Default is `'column'`. See below for details. #' @param statistic ([`formula-list-selector`][syntax])\cr #' a named list, a list of formulas, -#' or a single formula where the list element one or more of `c("n", "N", "p")` -#' (or the RHS of a formula). +#' or a single formula where the list element is one or more of `c("n", "N", "p", "n_cum", "p_cum")` +#' (on the RHS of a formula). #' @param stat_label ([`formula-list-selector`][syntax])\cr #' a named list, a list of formulas, or a single formula where #' the list element is either a named list or a list of formulas defining the @@ -45,14 +45,18 @@ #' In such cases, use the `denominator` argument to specify a new definition #' of `"N"`, and subsequently `"p"`. #' The argument expects one of the following inputs: +#' - a string: one of `"column"`, `"row"`, or `"cell"`. +#' - `"column"`, the default, returns percentages where the sum is equal to +#' one within the variable after the data frame has been subset with `by`/`strata`. +#' - `"row"` gives 'row' percentages where `by`/`strata` columns are the 'top' +#' of a cross table, and the variables are the rows. This is well-defined +#' for a single `by` or `strata` variable, and care must be taken when there +#' are more to ensure the results are as you expect. +#' - `"cell"` gives percentages where the denominator is the number of non-missing +#' rows in the source data frame. #' - a data frame. Any columns in the data frame that overlap with the `by`/`strata` #' columns will be used to calculate the new `"N"`. #' - an integer. This single integer will be used as the new `"N"` -#' - a string: one of `"column"`, `"row"`, or `"cell"`. `"column"` is equivalent -#' to `denominator=NULL`. `"row"` gives 'row' percentages where `by`/`strata` -#' columns are the 'top' of a cross table, and the variables are the rows.
-#' `"cell"` gives percentages where the denominator is the number of non-missing -#' rows in the source data frame. #' - a structured data frame. The data frame will include columns from `by`/`strata`. #' The last column must be named `"...ard_N..."`. The integers in this column will #' be used as the updated `"N"` in the calculations. @@ -104,7 +108,7 @@ ard_categorical.data.frame <- function(data, by = dplyr::group_vars(data), strata = NULL, statistic = everything() ~ c("n", "p", "N"), - denominator = NULL, + denominator = "column", fmt_fn = NULL, stat_label = everything() ~ default_stat_labels(), ...) { @@ -137,8 +141,8 @@ ard_categorical.data.frame <- function(data, ) check_list_elements( x = statistic, - predicate = \(x) is.character(x) && all(x %in% c("n", "p", "N")), - error_msg = "Elements passed in the {.arg statistic} argument must be one or more of {.val {c('n', 'p', 'N')}}" + predicate = \(x) is.character(x) && all(x %in% c("n", "p", "N", "n_cum", "p_cum")), + error_msg = "Elements passed in the {.arg statistic} argument must be one or more of {.val {c('n', 'p', 'N', 'n_cum', 'p_cum')}}" ) # return empty ARD if no variables selected ---------------------------------- @@ -247,7 +251,7 @@ ard_categorical.data.frame <- function(data, imap( statistics_tabulation, function(x, variable) { - if (any(c("N", "p") %in% x[["tabulation"]])) { + if (any(c("N", "p", "p_cum") %in% x[["tabulation"]])) { TRUE } else { NULL @@ -282,7 +286,7 @@ ard_categorical.data.frame <- function(data, )) } } - if ("p" %in% tab_stats[["tabulation"]]) { + if (any(c("p", "p_cum") %in% tab_stats[["tabulation"]])) { df_result_tabulation <- df_result_tabulation |> dplyr::mutate( @@ -290,14 +294,24 @@ ard_categorical.data.frame <- function(data, ) } + df_result_tabulation <- + .add_cum_count_stats( + df_result_tabulation, + variable = variable, + by = by, + strata = strata, + denominator = denominator, + tab_stats = tab_stats + ) + df_result_tabulation |> 
.nesting_rename_ard_columns(variable = variable, by = by, strata = strata) |> dplyr::mutate( - across(any_of(c("...ard_n...", "...ard_N...", "...ard_p...")), as.list), + across(any_of(c("...ard_n...", "...ard_N...", "...ard_p...", "...ard_n_cum...", "...ard_p_cum...")), as.list), across(c(matches("^group[0-9]+_level$"), any_of("variable_level")), as.list) ) |> tidyr::pivot_longer( - cols = any_of(c("...ard_n...", "...ard_N...", "...ard_p...")), + cols = any_of(c("...ard_n...", "...ard_N...", "...ard_p...", "...ard_n_cum...", "...ard_p_cum...")), names_to = "stat_name", values_to = "stat" ) |> @@ -334,6 +348,52 @@ ard_categorical.data.frame <- function(data, ) } + + +.add_cum_count_stats <- function(x, variable, by, strata, denominator, tab_stats) { + # if no cumulative stats were requested, return the object + if (!any(c("p_cum", "n_cum") %in% tab_stats[["tabulation"]])) { + return(x) + } + + # to return cumulative stats, the denominator must be 'column' or 'row' + if (!is_string(denominator) || !denominator %in% c("column", "row")) { + cli::cli_abort( + "The {.arg denominator} argument must be one of {.val {c(\"column\", \"row\")}} + when cumulative statistics {.val n_cum} or {.val p_cum} are specified, which + were requested for variable {.var {variable}}.", + call = get_cli_abort_call() + ) + } + + # calculate the cumulative statistics + if (denominator %in% "column") { + x <- x |> + dplyr::mutate( + .by = any_of(c(by, strata)), + ...ard_n_cum... = switch("n_cum" %in% tab_stats[["tabulation"]], + cumsum(.data$...ard_n...) + ), + ...ard_p_cum... = switch("p_cum" %in% tab_stats[["tabulation"]], + cumsum(.data$...ard_p...) + ) + ) + } else if (denominator %in% "row") { + x <- x |> + dplyr::mutate( + .by = any_of(variable), + ...ard_n_cum... = switch("n_cum" %in% tab_stats[["tabulation"]], + cumsum(.data$...ard_n...) + ), + ...ard_p_cum... = switch("p_cum" %in% tab_stats[["tabulation"]], + cumsum(.data$...ard_p...) 
+ ) + ) + } + + x +} + #' Results from `table()` as Data Frame #' #' Takes the results from [table()] and returns them as a data frame. diff --git a/R/default_stat_labels.R b/R/default_stat_labels.R index 082e3d28d..1bffdb780 100644 --- a/R/default_stat_labels.R +++ b/R/default_stat_labels.R @@ -21,6 +21,8 @@ default_stat_labels <- function() { n = "n", N = "N", p = "%", + n_cum = "Cumulative n", + p_cum = "Cumulative %", N_obs = "Vector Length", N_miss = "N Missing", N_nonmiss = "N Non-missing", diff --git a/README.Rmd b/README.Rmd index d1e593fbc..b75eed65c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -17,7 +17,7 @@ knitr::opts_chunk$set( [![CRAN status](https://www.r-pkg.org/badges/version/cards)](https://CRAN.R-project.org/package=cards) -[![Codecov test coverage](https://codecov.io/gh/insightsengineering/cards/branch/main/graph/badge.svg)](https://app.codecov.io/gh/insightsengineering/cards?branch=main) +[![Codecov test coverage](https://codecov.io/gh/insightsengineering/cards/graph/badge.svg)](https://app.codecov.io/gh/insightsengineering/cards) [![Downloads](https://cranlogs.r-pkg.org/badges/cards)](https://cran.r-project.org/package=cards) [![R-CMD-check](https://github.com/insightsengineering/cards/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/insightsengineering/cards/actions/workflows/R-CMD-check.yaml) [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) diff --git a/README.md b/README.md index c5dcaa031..0e8ca51c8 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![CRAN status](https://www.r-pkg.org/badges/version/cards)](https://CRAN.R-project.org/package=cards) [![Codecov test -coverage](https://codecov.io/gh/insightsengineering/cards/branch/main/graph/badge.svg)](https://app.codecov.io/gh/insightsengineering/cards?branch=main) 
+coverage](https://codecov.io/gh/insightsengineering/cards/graph/badge.svg)](https://app.codecov.io/gh/insightsengineering/cards) [![Downloads](https://cranlogs.r-pkg.org/badges/cards)](https://cran.r-project.org/package=cards) [![R-CMD-check](https://github.com/insightsengineering/cards/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/insightsengineering/cards/actions/workflows/R-CMD-check.yaml) [![Lifecycle: diff --git a/man/ard_categorical.Rd b/man/ard_categorical.Rd index 597162d51..df1513c93 100644 --- a/man/ard_categorical.Rd +++ b/man/ard_categorical.Rd @@ -13,7 +13,7 @@ ard_categorical(data, ...) by = dplyr::group_vars(data), strata = NULL, statistic = everything() ~ c("n", "p", "N"), - denominator = NULL, + denominator = "column", fmt_fn = NULL, stat_label = everything() ~ default_stat_labels(), ... @@ -42,12 +42,12 @@ Arguments may be used in conjunction with one another.} \item{statistic}{(\code{\link[=syntax]{formula-list-selector}})\cr a named list, a list of formulas, -or a single formula where the list element one or more of \code{c("n", "N", "p")} -(or the RHS of a formula).} +or a single formula where the list element is one or more of \code{c("n", "N", "p", "n_cum", "p_cum")} +(on the RHS of a formula).} -\item{denominator}{(\code{data.frame}, \code{integer})\cr -Specify this \emph{optional} argument to change the denominator, -e.g. the \code{"N"} statistic. Default is \code{NULL}. See below for details.} +\item{denominator}{(\code{string}, \code{data.frame}, \code{integer})\cr +Specify this argument to change the denominator, +e.g. the \code{"N"} statistic. Default is \code{'column'}. See below for details.} \item{fmt_fn}{(\code{\link[=syntax]{formula-list-selector}})\cr a named list, a list of formulas, @@ -83,14 +83,20 @@ In such cases, use the \code{denominator} argument to specify a new definition of \code{"N"}, and subsequently \code{"p"}.
The argument expects one of the following inputs: \itemize{ +\item a string: one of \code{"column"}, \code{"row"}, or \code{"cell"}. +\itemize{ +\item \code{"column"}, the default, returns percentages where the sum is equal to +one within the variable after the data frame has been subset with \code{by}/\code{strata}. +\item \code{"row"} gives 'row' percentages where \code{by}/\code{strata} columns are the 'top' +of a cross table, and the variables are the rows. This is well-defined +for a single \code{by} or \code{strata} variable, and care must be taken when there +are more to ensure the results are as you expect. +\item \code{"cell"} gives percentages where the denominator is the number of non-missing +rows in the source data frame. +} \item a data frame. Any columns in the data frame that overlap with the \code{by}/\code{strata} columns will be used to calculate the new \code{"N"}. \item an integer. This single integer will be used as the new \code{"N"} -\item a string: one of \code{"column"}, \code{"row"}, or \code{"cell"}. \code{"column"} is equivalent -to \code{denominator=NULL}. \code{"row"} gives 'row' percentages where \code{by}/\code{strata} -columns are the 'top' of a cross table, and the variables are the rows. -\code{"cell"} gives percentages where the denominator is the number of non-missing -rows in the source data frame. \item a structured data frame. The data frame will include columns from \code{by}/\code{strata}. The last column must be named \code{"...ard_N..."}. The integers in this column will be used as the updated \code{"N"} in the calculations.
diff --git a/man/ard_dichotomous.Rd b/man/ard_dichotomous.Rd index 39e320cbe..1ca5e8f67 100644 --- a/man/ard_dichotomous.Rd +++ b/man/ard_dichotomous.Rd @@ -47,12 +47,12 @@ which returns the largest/last value after a sort.} \item{statistic}{(\code{\link[=syntax]{formula-list-selector}})\cr a named list, a list of formulas, -or a single formula where the list element one or more of \code{c("n", "N", "p")} -(or the RHS of a formula).} +or a single formula where the list element is one or more of \code{c("n", "N", "p", "n_cum", "p_cum")} +(on the RHS of a formula).} -\item{denominator}{(\code{data.frame}, \code{integer})\cr -Specify this \emph{optional} argument to change the denominator, -e.g. the \code{"N"} statistic. Default is \code{NULL}. See below for details.} +\item{denominator}{(\code{string}, \code{data.frame}, \code{integer})\cr +Specify this argument to change the denominator, +e.g. the \code{"N"} statistic. Default is \code{'column'}. See below for details.} \item{fmt_fn}{(\code{\link[=syntax]{formula-list-selector}})\cr a named list, a list of formulas, @@ -88,14 +88,20 @@ In such cases, use the \code{denominator} argument to specify a new definition of \code{"N"}, and subsequently \code{"p"}. The argument expects one of the following inputs: \itemize{ +\item a string: one of \code{"column"}, \code{"row"}, or \code{"cell"}. +\itemize{ +\item \code{"column"}, the default, returns percentages where the sum is equal to +one within the variable after the data frame has been subset with \code{by}/\code{strata}. +\item \code{"row"} gives 'row' percentages where \code{by}/\code{strata} columns are the 'top' +of a cross table, and the variables are the rows. This is well-defined +for a single \code{by} or \code{strata} variable, and care must be taken when there +are more to ensure the results are as you expect. +\item \code{"cell"} gives percentages where the denominator is the number of non-missing +rows in the source data frame. +} \item a data frame.
Any columns in the data frame that overlap with the \code{by}/\code{strata} columns will be used to calculate the new \code{"N"}. \item an integer. This single integer will be used as the new \code{"N"} -\item a string: one of \code{"column"}, \code{"row"}, or \code{"cell"}. \code{"column"} is equivalent -to \code{denominator=NULL}. \code{"row"} gives 'row' percentages where \code{by}/\code{strata} -columns are the 'top' of a cross table, and the variables are the rows. -\code{"cell"} gives percentages where the denominator is the number of non-missing -rows in the source data frame. \item a structured data frame. The data frame will include columns from \code{by}/\code{strata}. The last column must be named \code{"...ard_N..."}. The integers in this column will be used as the updated \code{"N"} in the calculations. diff --git a/man/ard_hierarchical.Rd b/man/ard_hierarchical.Rd index 89c1927b9..9c93b9687 100644 --- a/man/ard_hierarchical.Rd +++ b/man/ard_hierarchical.Rd @@ -47,12 +47,12 @@ specified here appear in results. Default is \code{dplyr::group_vars(data)}.} \item{statistic}{(\code{\link[=syntax]{formula-list-selector}})\cr a named list, a list of formulas, -or a single formula where the list element one or more of \code{c("n", "N", "p")} -(or the RHS of a formula).} +or a single formula where the list element is one or more of \code{c("n", "N", "p", "n_cum", "p_cum")} +(on the RHS of a formula).} -\item{denominator}{(\code{data.frame}, \code{integer})\cr -Specify this \emph{optional} argument to change the denominator, -e.g. the \code{"N"} statistic. Default is \code{NULL}. See below for details.} +\item{denominator}{(\code{string}, \code{data.frame}, \code{integer})\cr +Specify this argument to change the denominator, +e.g. the \code{"N"} statistic. Default is \code{'column'}.
See below for details.} \item{fmt_fn}{(\code{\link[=syntax]{formula-list-selector}})\cr a named list, a list of formulas, @@ -102,14 +102,20 @@ In such cases, use the \code{denominator} argument to specify a new definition of \code{"N"}, and subsequently \code{"p"}. The argument expects one of the following inputs: \itemize{ +\item a string: one of \code{"column"}, \code{"row"}, or \code{"cell"}. +\itemize{ +\item \code{"column"}, the default, returns percentages where the sum is equal to +one within the variable after the data frame has been subset with \code{by}/\code{strata}. +\item \code{"row"} gives 'row' percentages where \code{by}/\code{strata} columns are the 'top' +of a cross table, and the variables are the rows. This is well-defined +for a single \code{by} or \code{strata} variable, and care must be taken when there +are more to ensure the results are as you expect. +\item \code{"cell"} gives percentages where the denominator is the number of non-missing +rows in the source data frame. +} \item a data frame. Any columns in the data frame that overlap with the \code{by}/\code{strata} columns will be used to calculate the new \code{"N"}. \item an integer. This single integer will be used as the new \code{"N"} -\item a string: one of \code{"column"}, \code{"row"}, or \code{"cell"}. \code{"column"} is equivalent -to \code{denominator=NULL}. \code{"row"} gives 'row' percentages where \code{by}/\code{strata} -columns are the 'top' of a cross table, and the variables are the rows. -\code{"cell"} gives percentages where the denominator is the number of non-missing -rows in the source data frame. \item a structured data frame. The data frame will include columns from \code{by}/\code{strata}. The last column must be named \code{"...ard_N..."}. The integers in this column will be used as the updated \code{"N"} in the calculations.
diff --git a/man/ard_stack_hierarchical.Rd b/man/ard_stack_hierarchical.Rd index 006b94570..4dd66c630 100644 --- a/man/ard_stack_hierarchical.Rd +++ b/man/ard_stack_hierarchical.Rd @@ -69,8 +69,8 @@ summary statistics will be returned. Default is \code{everything()}.} \item{statistic}{(\code{\link[=syntax]{formula-list-selector}})\cr a named list, a list of formulas, -or a single formula where the list element one or more of \code{c("n", "N", "p")} -(or the RHS of a formula).} +or a single formula where the list element is one or more of \code{c("n", "N", "p", "n_cum", "p_cum")} +(on the RHS of a formula).} \item{overall}{(scalar \code{logical})\cr logical indicating whether overall statistics should be calculated (i.e. repeat the operations with \code{by=NULL} in \emph{most cases}, see below for details). diff --git a/man/dot-calculate_tabulation_statistics.Rd b/man/dot-calculate_tabulation_statistics.Rd index deb3f2cbd..d452180cd 100644 --- a/man/dot-calculate_tabulation_statistics.Rd +++ b/man/dot-calculate_tabulation_statistics.Rd @@ -32,14 +32,14 @@ columns specified. Arguments may be used in conjunction with one another.} -\item{denominator}{(\code{data.frame}, \code{integer})\cr -Specify this \emph{optional} argument to change the denominator, -e.g. the \code{"N"} statistic. Default is \code{NULL}. See below for details.} +\item{denominator}{(\code{string}, \code{data.frame}, \code{integer})\cr +Specify this argument to change the denominator, +e.g. the \code{"N"} statistic. Default is \code{'column'}.
See below for details.} \item{statistic}{(\code{\link[=syntax]{formula-list-selector}})\cr a named list, a list of formulas, -or a single formula where the list element one or more of \code{c("n", "N", "p")} -(or the RHS of a formula).} +or a single formula where the list element is one or more of \code{c("n", "N", "p", "n_cum", "p_cum")} +(on the RHS of a formula).} } \value{ an ARD data frame of class 'card' diff --git a/man/dot-process_denominator.Rd b/man/dot-process_denominator.Rd index 82f9b2997..444fa5795 100644 --- a/man/dot-process_denominator.Rd +++ b/man/dot-process_denominator.Rd @@ -13,9 +13,9 @@ a data frame} \item{variables}{(\code{\link[dplyr:dplyr_tidy_select]{tidy-select}})\cr columns to include in summaries. Default is \code{everything()}.} -\item{denominator}{(\code{data.frame}, \code{integer})\cr -Specify this \emph{optional} argument to change the denominator, -e.g. the \code{"N"} statistic. Default is \code{NULL}. See below for details.} +\item{denominator}{(\code{string}, \code{data.frame}, \code{integer})\cr +Specify this argument to change the denominator, +e.g. the \code{"N"} statistic. Default is \code{'column'}. See below for details.} \item{by, strata}{(\code{\link[dplyr:dplyr_tidy_select]{tidy-select}})\cr columns to use for grouping or stratifying the table output. diff --git a/tests/testthat/_snaps/ard_categorical.md b/tests/testthat/_snaps/ard_categorical.md index 01ee08197..1b9abe7ff 100644 --- a/tests/testthat/_snaps/ard_categorical.md +++ b/tests/testthat/_snaps/ard_categorical.md @@ -184,3 +184,12 @@ Error in `ard_categorical()`: ! Factors with NA levels are not allowed, which are present in column "am". +# ard_categorical() with cumulative counts messaging + + Code + ard_categorical(ADSL, variables = "AGEGR1", by = SEX, statistic = everything() ~ + c("n", "p", "n_cum", "p_cum"), denominator = NULL) + Condition + Error in `ard_categorical()`: + !
The `denominator` argument must be one of "column" and "row" when cumulative statistics "n_cum" or "p_cum" are specified, which were requested for variable `AGEGR1`. + diff --git a/tests/testthat/test-ard_categorical.R b/tests/testthat/test-ard_categorical.R index 6c30dbdd2..690107c31 100644 --- a/tests/testthat/test-ard_categorical.R +++ b/tests/testthat/test-ard_categorical.R @@ -888,6 +888,229 @@ test_that("ard_categorical() errors with incomplete factor columns", { ) }) +test_that("ard_categorical(denominator='column') with cumulative counts", { + # check cumulative stats work without `by/strata` + expect_silent( + ard <- + ard_categorical( + ADSL, + variables = "AGEGR1", + statistic = everything() ~ c("n", "p", "n_cum", "p_cum") + ) + ) + # test the final cum n matches the nrow() + expect_equal( + ard |> + dplyr::filter(stat_name == "n_cum", variable_level %in% dplyr::last(.unique_and_sorted(ADSL$AGEGR1))) |> + dplyr::pull(stat) |> + unlist(), + nrow(ADSL) + ) + # test the final cum p is 1 + expect_equal( + ard |> + dplyr::filter(stat_name == "p_cum", variable_level %in% dplyr::last(.unique_and_sorted(ADSL$AGEGR1))) |> + dplyr::pull(stat) |> + unlist(), + 1 + ) + # check the cum n is correct + expect_equal( + ard |> + dplyr::filter(stat_name %in% "n_cum") |> + dplyr::select(variable_level, stat) |> + deframe(), + table(ADSL$AGEGR1) |> + cumsum() |> + as.list() + ) + # check the cum p is correct + expect_equal( + ard |> + dplyr::filter(stat_name %in% "p_cum") |> + dplyr::select(variable_level, stat) |> + deframe(), + table(ADSL$AGEGR1) |> + prop.table() |> + cumsum() |> + as.list() + ) + + # check cumulative stats work with `by` + expect_silent( + ard <- + ard_categorical( + ADSL, + variables = "AGEGR1", + by = ARM, + statistic = everything() ~ c("n", "p", "n_cum", "p_cum") + ) + ) + # check the cum n is correct + expect_equal( + ard |> + dplyr::filter(stat_name %in% "n_cum", group1_level == "Placebo") |> + dplyr::select(variable_level, stat) |> + deframe(), + 
table(ADSL$AGEGR1[ADSL$ARM == "Placebo"]) |> + cumsum() |> + as.list() + ) + # check the cum p is correct + expect_equal( + ard |> + dplyr::filter(stat_name %in% "p_cum", group1_level == "Placebo") |> + dplyr::select(variable_level, stat) |> + deframe(), + table(ADSL$AGEGR1[ADSL$ARM == "Placebo"]) |> + prop.table() |> + cumsum() |> + as.list() + ) + + # check with by & strata + expect_silent( + ard <- + ard_categorical( + ADSL, + variables = "AGEGR1", + by = ARM, + strata = SEX, + statistic = everything() ~ c("n", "p", "n_cum", "p_cum") + ) + ) + # check the cum n is correct + expect_equal( + ard |> + dplyr::filter(stat_name %in% "n_cum", group1_level == "Placebo", group2_level == "F") |> + dplyr::select(variable_level, stat) |> + deframe(), + table(ADSL$AGEGR1[ADSL$ARM == "Placebo" & ADSL$SEX == "F"]) |> + cumsum() |> + as.list() + ) + # check the cum p is correct + expect_equal( + ard |> + dplyr::filter(stat_name %in% "p_cum", group1_level == "Placebo", group2_level == "F") |> + dplyr::select(variable_level, stat) |> + deframe(), + table(ADSL$AGEGR1[ADSL$ARM == "Placebo" & ADSL$SEX == "F"]) |> + prop.table() |> + cumsum() |> + as.list() + ) + + # function works when only `n_cum` requested + expect_equal( + ard_categorical( + ADSL, + variables = "AGEGR1", + statistic = everything() ~ "n_cum" + ), + ard_categorical( + ADSL, + variables = "AGEGR1", + statistic = everything() ~ c("n", "p", "n_cum", "p_cum") + ) |> + dplyr::filter(stat_name == "n_cum") + ) + # function works when only `p_cum` requested + expect_equal( + ard_categorical( + ADSL, + variables = "AGEGR1", + statistic = everything() ~ "p_cum" + ), + ard_categorical( + ADSL, + variables = "AGEGR1", + statistic = everything() ~ c("n", "p", "n_cum", "p_cum") + ) |> + dplyr::filter(stat_name == "p_cum") + ) +}) + +test_that("ard_categorical(denominator='row') with cumulative counts", { + # check cumulative stats work without `by/strata` + expect_silent( + ard <- + ard_categorical( + ADSL, + variables = 
"AGEGR1", + statistic = everything() ~ c("n", "p", "n_cum", "p_cum"), + denominator = "row" + ) + ) + # when no by, the n and n_cum should be the same + expect_true( + ard |> + dplyr::filter(stat_name %in% c("n", "n_cum")) |> + dplyr::mutate( + .by = all_ard_variables(), + check_equal = unlist(stat) == unlist(stat)[1] + ) |> + dplyr::pull(check_equal) |> + unique() + ) + # when no by, the p and p_cum should be the same and equal to 1 + expect_equal( + ard |> + dplyr::filter(stat_name %in% c("p", "p_cum")) |> + dplyr::pull(stat) |> + unlist() |> + unique(), + 1 + ) + + # check cumulative stats work with `by` + expect_silent( + ard <- + ard_categorical( + ADSL, + variables = "AGEGR1", + by = SEX, + statistic = everything() ~ c("n", "p", "n_cum", "p_cum"), + denominator = "row" + ) + ) + # check row n_cum + expect_equal( + ard |> + dplyr::filter(variable_level %in% "<65", stat_name == "n_cum") |> + dplyr::select(group1_level, stat) |> + deframe(), + table(ADSL$SEX[ADSL$AGEGR1 == "<65"]) |> + cumsum() |> + as.list() + ) + # check row p_cum + expect_equal( + ard |> + dplyr::filter(variable_level %in% "<65", stat_name == "p_cum") |> + dplyr::select(group1_level, stat) |> + deframe(), + table(ADSL$SEX[ADSL$AGEGR1 == "<65"]) |> + prop.table() |> + cumsum() |> + as.list() + ) +}) + +test_that("ard_categorical() with cumulative counts messaging", { + # cumulative counts/percents only available when `denominator=c('column', 'row')` + expect_snapshot( + error = TRUE, + ard_categorical( + ADSL, + variables = "AGEGR1", + by = SEX, + statistic = everything() ~ c("n", "p", "n_cum", "p_cum"), + denominator = NULL + ) + ) +}) + test_that("ard_categorical() ordering for multiple strata", { adae_mini <- ADAE |> dplyr::select(USUBJID, TRTA, AESOC, AEDECOD) |>