From 0f04bc5db3a565d0c914a23d52da37c2f87ec71f Mon Sep 17 00:00:00 2001 From: clifmckee Date: Mon, 6 Jan 2025 23:56:37 -0500 Subject: [PATCH 1/5] Address #589 --- modules/Data_Summarization/Data_Summarization.Rmd | 6 +++--- .../lab/Data_Summarization_Lab.Rmd | 12 ++++++------ .../lab/Data_Summarization_Lab_Key.Rmd | 12 ++++++------ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/modules/Data_Summarization/Data_Summarization.Rmd b/modules/Data_Summarization/Data_Summarization.Rmd index 8679900a6..8152a08cd 100644 --- a/modules/Data_Summarization/Data_Summarization.Rmd +++ b/modules/Data_Summarization/Data_Summarization.Rmd @@ -220,8 +220,8 @@ You can also do more elaborate summaries across different groups of data using ` ```{r, eval = FALSE} # General format - Not the code! {data to use} %>% - summarize({summary column name} = {operator(source column)}, - {summary column name} = {operator(source column)}) + summarize({summary column name} = {function(source column)}, + {summary column name} = {function(source column)}) ``` @@ -234,7 +234,7 @@ You can also do more elaborate summaries across different groups of data using ` ```{r, eval = FALSE} # General format - Not the code! {data to use} %>% - summarize({summary column name} = {operator(source column)}) + summarize({summary column name} = {function(source column)}) ``` diff --git a/modules/Data_Summarization/lab/Data_Summarization_Lab.Rmd b/modules/Data_Summarization/lab/Data_Summarization_Lab.Rmd index 1fe8cdb91..c88f3e044 100644 --- a/modules/Data_Summarization/lab/Data_Summarization_Lab.Rmd +++ b/modules/Data_Summarization/lab/Data_Summarization_Lab.Rmd @@ -26,7 +26,7 @@ bike <- read_csv(file = "http://jhudatascience.org/intro_to_r/data/Bike_Lanes.cs ### 1.1 -How many bike "lanes" are currently in Baltimore? You can assume each observation/row is a different bike "lane". (hint: how do you get the number of rows of a data set? You can use `dim()` or `nrow()` or another function). 
+How many streets with designated bike lanes are currently in Baltimore? You can assume each observation/row is a different street with one or more bike lanes. (Hint: how do you get the number of rows of a data set? You can use `dim()` or `nrow()` or another function). ```{r 1.1response} @@ -47,7 +47,7 @@ Summarize the data to get the `max` of `length` using the `summarize` function. ``` # General format DATA_TIBBLE %>% - summarize(SUMMARY_COLUMN_NAME = OPERATOR(SOURCE_COLUMN)) + summarize(SUMMARY_COLUMN_NAME = FUNCTION(SOURCE_COLUMN)) ``` ```{r 1.3response} @@ -61,8 +61,8 @@ Modify your code from 1.3 to add the `min` of `length` using the `summarize` fun ``` # General format DATA_TIBBLE %>% - summarize(SUMMARY_COLUMN_NAME = OPERATOR(SOURCE_COLUMN), - SUMMARY_COLUMN_NAME = OPERATOR(SOURCE_COLUMN) + summarize(SUMMARY_COLUMN_NAME = FUNCTION(SOURCE_COLUMN), + SUMMARY_COLUMN_NAME = FUNCTION(SOURCE_COLUMN) ) ``` @@ -80,8 +80,8 @@ Summarize the `bike` data to get the mean of `length` and `dateInstalled`. Make ``` # General format DATA_TIBBLE %>% - summarize(SUMMARY_COLUMN_NAME = OPERATOR(SOURCE_COLUMN, na.rm = TRUE), - SUMMARY_COLUMN_NAME = OPERATOR(SOURCE_COLUMN, na.rm = TRUE) + summarize(SUMMARY_COLUMN_NAME = FUNCTION(SOURCE_COLUMN, na.rm = TRUE), + SUMMARY_COLUMN_NAME = FUNCTION(SOURCE_COLUMN, na.rm = TRUE) ) ``` diff --git a/modules/Data_Summarization/lab/Data_Summarization_Lab_Key.Rmd b/modules/Data_Summarization/lab/Data_Summarization_Lab_Key.Rmd index 3f337d869..0fd5b3022 100644 --- a/modules/Data_Summarization/lab/Data_Summarization_Lab_Key.Rmd +++ b/modules/Data_Summarization/lab/Data_Summarization_Lab_Key.Rmd @@ -26,7 +26,7 @@ bike <- read_csv(file = "http://jhudatascience.org/intro_to_r/data/Bike_Lanes.cs ### 1.1 -How many bike "lanes" are currently in Baltimore? You can assume each observation/row is a different bike "lane". (hint: how do you get the number of rows of a data set? You can use `dim()` or `nrow()` or another function). 
+How many streets with designated bike lanes are currently in Baltimore? You can assume each observation/row is a different street with one or more bike lanes. (Hint: how do you get the number of rows of a data set? You can use `dim()` or `nrow()` or another function). ```{r 1.1response} nrow(bike) @@ -54,7 +54,7 @@ Summarize the data to get the `max` of `length` using the `summarize` function. ``` # General format DATA_TIBBLE %>% - summarize(SUMMARY_COLUMN_NAME = OPERATOR(SOURCE_COLUMN)) + summarize(SUMMARY_COLUMN_NAME = FUNCTION(SOURCE_COLUMN)) ``` ```{r 1.3response} @@ -70,8 +70,8 @@ Modify your code from 1.3 to add the `min` of `length` using the `summarize` fun ``` # General format DATA_TIBBLE %>% - summarize(SUMMARY_COLUMN_NAME = OPERATOR(SOURCE_COLUMN), - SUMMARY_COLUMN_NAME = OPERATOR(SOURCE_COLUMN) + summarize(SUMMARY_COLUMN_NAME = FUNCTION(SOURCE_COLUMN), + SUMMARY_COLUMN_NAME = FUNCTION(SOURCE_COLUMN) ) ``` @@ -92,8 +92,8 @@ Summarize the `bike` data to get the mean of `length` and `dateInstalled`. Make ``` # General format DATA_TIBBLE %>% - summarize(SUMMARY_COLUMN_NAME = OPERATOR(SOURCE_COLUMN, na.rm = TRUE), - SUMMARY_COLUMN_NAME = OPERATOR(SOURCE_COLUMN, na.rm = TRUE) + summarize(SUMMARY_COLUMN_NAME = FUNCTION(SOURCE_COLUMN, na.rm = TRUE), + SUMMARY_COLUMN_NAME = FUNCTION(SOURCE_COLUMN, na.rm = TRUE) ) ``` From 48d8d4b3d45c368a273f67976995426ff32897d0 Mon Sep 17 00:00:00 2001 From: clifmckee Date: Tue, 7 Jan 2025 13:29:49 -0500 Subject: [PATCH 2/5] Update Data_Summarization.Rmd --- .../Data_Summarization/Data_Summarization.Rmd | 127 +++++++++--------- 1 file changed, 66 insertions(+), 61 deletions(-) diff --git a/modules/Data_Summarization/Data_Summarization.Rmd b/modules/Data_Summarization/Data_Summarization.Rmd index 8152a08cd..424d2d0ac 100644 --- a/modules/Data_Summarization/Data_Summarization.Rmd +++ b/modules/Data_Summarization/Data_Summarization.Rmd @@ -99,7 +99,9 @@ sum(z) ## Some examples -We can use the `mtcars` built-in dataset. 
The `head` command displays the first rows of an object: +We can use the `mtcars` built-in dataset. "The data was extracted from the 1974 Motor Trend US magazine, and comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles (1973-74 models)." + +The `head` command displays the first rows of an object: ```{r} head(mtcars) @@ -112,23 +114,10 @@ A nice and readable way to chain together multiple R functions. Changes `f(x, y)` to `x %>% f(y)`. -```{r eval=FALSE} -# Going to work -get_dressed(me, - pack_lunch( - check_pockets( - wallet = TRUE, phone = TRUE, keys = TRUE), - items = c("sandwich", "chips", "apple"), lunchbox = TRUE), - pants = TRUE, shirt = TRUE, footwear = "sandals") - -# Going to work, the tidy way -me %>% - get_dressed(pants = TRUE, shirt = TRUE, footwear = "sandals") %>% - pack_lunch(items = c("sandwich", "chips", "apple"), lunchbox = TRUE) %>% - check_pockets(wallet = TRUE, phone = TRUE, keys = TRUE) +```{r, out.width = "50%", echo = FALSE, align = "center"} +knitr::include_graphics("../../images/lol/morning_1.png") ``` - ## Statistical summarization the "tidy" way ```{r} @@ -141,7 +130,7 @@ mtcars %>% pull(wt) %>% quantile(probs = 0.6) ## Behavior of `pull()` function -`pull()` converts a single data column into a vector. This allows you to run summary functions on these data. Once you have "pulled" the data column out, you don't have to name it again in any piped summary functions. +`pull()` converts a single data column into a vector. This allows you to run summary functions on these data. Once you have "pulled" the data column out, you don't have to name it again in any piped summary functions. ```{r} cars_wt <- mtcars %>% pull(wt) @@ -157,18 +146,29 @@ mtcars %>% pull(wt) %>% range(wt) # Incorrect mtcars %>% pull(wt) %>% range() # Correct ``` +## GUT CHECK + +What kind of object do we need to run summary functions like `mean()`? + +A. A vector of numbers + +B. A vector of characters + +C. 
A dataset # Summarization on tibbles (data frames) -## TB Incidence +## TB incidence Let's read in a `tibble` of values from TB incidence. +"Tuberculosis incidence, all forms (per 100,000 population per year), for the period 1990-2007 across 208 countries/territories." + ```{r} tb <- read_csv("https://jhudatascience.org/intro_to_r/data/tb.csv") ``` -## TB Incidence +## TB incidence Check out the data: @@ -177,7 +177,7 @@ head(tb) ``` -## TB Incidence +## TB incidence Check out the data: @@ -291,10 +291,11 @@ summary(tb) ## Summary & Lab Part 1 -- summary stats (`mean()`) work with `pull()` +- `pull()` creates a *vector* - don't forget the `na.rm = TRUE` argument! - `summary(x)`: quantile information - `summarize`: creates a summary table of columns of interest +- summary stats (`mean()`) work with vectors or with `summarize()` 🏠 [Class Website](https://jhudatascience.org/intro_to_r/) @@ -306,6 +307,10 @@ summary(tb) Here we will be using the Youth Tobacco Survey data: http://jhudatascience.org/intro_to_r/data/Youth_Tobacco_Survey_YTS_Data.csv +"The YTS was developed to provide states with comprehensive data on both middle school and high school students regarding tobacco use, exposure to environmental tobacco smoke, smoking cessation, school curriculum, minors' ability to purchase or otherwise obtain tobacco products, knowledge and attitudes about tobacco, and familiarity with pro-tobacco and anti-tobacco media messages." + +* Check out the data at: https://catalog.data.gov/dataset/youth-tobacco-survey-yts-data + ```{r} yts <- read_csv("http://jhudatascience.org/intro_to_r/data/Youth_Tobacco_Survey_YTS_Data.csv") head(yts) @@ -324,7 +329,7 @@ yts %>% ## How many `distinct()` values? -`n_distinct()` tells you the number of unique elements. _Must pull the column first!_ +`n_distinct()` tells you the number of unique elements. 
It needs a vector so you _must pull the column first!_ ```{r} yts %>% @@ -338,7 +343,7 @@ options(max.print = 1000) ``` -## `dplyr`: `count` +## Use `count()` to return the row count per category Use `count` to return a frequency table of unique elements of a data.frame. @@ -347,31 +352,33 @@ yts %>% count(LocationDesc) ``` -## `dplyr`: `count` - -Multiple columns listed further subdivides the count. +## Listing multiple columns further subdivides the `count()` ```{r, message = FALSE} yts %>% count(LocationDesc, TopicDesc) ``` +**Note:** `count()` includes NAs -## `dplyr`: `count` - -Multiple columns listed further subdivides the count. +## GUT CHECK -```{r, message = FALSE} -yts %>% count(LocationDesc, TopicDesc) -``` +The `count()` function can help us tally: -
+A. Sample size -**Note:** `count()` includes NAs +B. Rows per category +C. How many categories # Grouping -## Perform Operations By Groups: dplyr +## Goal + +We want to find the average frequency that youth use tobacco products in the dataset. + +_How do we do this?_ + +## Perform operations by groups: dplyr `group_by` allows you group the data set by variables/columns you specify: @@ -381,7 +388,7 @@ yts ``` -## Perform Operations By Groups: dplyr +## Perform operations by groups: dplyr `group_by` allows you group the data set by variables/columns you specify: @@ -400,7 +407,7 @@ yts_grouped %>% summarize(avg_percent = mean(Data_Value, na.rm = TRUE)) ``` -## Use the `pipe` to string these together! +## Do it in one step: use `%>%` to string these together! Pipe `yts` into `group_by`, then pipe that into `summarize`: @@ -474,8 +481,8 @@ yts %>% `count()` and `n()` can give very similar information. ```{r} -mtcars %>% count(cyl) -mtcars %>% group_by(cyl) %>% summarize(n()) # n() typically used with summarize +yts %>% count(YEAR) +yts %>% group_by(YEAR) %>% summarize(n = n()) # n() typically used with summarize ``` @@ -487,7 +494,6 @@ mtcars %>% group_by(cyl) %>% summarize(n()) # n() typically used with summarize These functions require a column as a vector using `pull()`. ```{r, message = FALSE} -yts <- read_csv("http://jhudatascience.org/intro_to_r/data/Youth_Tobacco_Survey_YTS_Data.csv") yts_loc <- yts %>% pull(LocationDesc) # pull() to make a vector yts_loc %>% unique() # similar to distinct() ``` @@ -500,27 +506,6 @@ These functions require a column as a vector using `pull()`. yts_loc %>% unique() %>% length() # similar to n_distinct() ``` -## * New! * Many dplyr functions now have a `.by=` argument - -Pipe `yts` into `group_by`, then pipe that into `summarize`: - -```{r eval = FALSE} -yts %>% - group_by(Response) %>% - summarize(avg_percent = mean(Data_Value, na.rm = TRUE), - max_percent = max(Data_Value, na.rm = TRUE)) -``` - -is the same as.. 
- -```{r eval = FALSE} -yts %>% - summarize(avg_percent = mean(Data_Value, na.rm = TRUE), - max_percent = max(Data_Value, na.rm = TRUE), - .by = Response) -``` - - ## `summary()` vs. `summarize()` * `summary()` (base R) gives statistics table on a dataset. @@ -592,3 +577,23 @@ tb %>% tb %>% summarize(across(starts_with("year"), ~mean(.x, na.rm = TRUE))) ``` + +## * New! * Many dplyr functions now have a `.by=` argument + +Pipe `yts` into `group_by`, then pipe that into `summarize`: + +```{r eval = FALSE} +yts %>% + group_by(Response) %>% + summarize(avg_percent = mean(Data_Value, na.rm = TRUE), + max_percent = max(Data_Value, na.rm = TRUE)) +``` + +is the same as.. + +```{r eval = FALSE} +yts %>% + summarize(avg_percent = mean(Data_Value, na.rm = TRUE), + max_percent = max(Data_Value, na.rm = TRUE), + .by = Response) +``` From f0c983d2bc07a59f6e7c2405cc9fbdc293bd916c Mon Sep 17 00:00:00 2001 From: clifmckee Date: Tue, 7 Jan 2025 18:07:00 -0500 Subject: [PATCH 3/5] Update Data_Summarization.Rmd --- .../Data_Summarization/Data_Summarization.Rmd | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/modules/Data_Summarization/Data_Summarization.Rmd b/modules/Data_Summarization/Data_Summarization.Rmd index 424d2d0ac..29259778f 100644 --- a/modules/Data_Summarization/Data_Summarization.Rmd +++ b/modules/Data_Summarization/Data_Summarization.Rmd @@ -8,11 +8,8 @@ output: ```{r, echo = FALSE, message=FALSE, error = FALSE} -library(knitr) -opts_chunk$set(comment = "", message = FALSE) -suppressWarnings({library(dplyr)}) -library(readr) -library(tidyverse) +knitr::opts_chunk$set(comment = "", message = FALSE) +suppressWarnings(library(tidyverse)) ```