diff --git a/% b/% new file mode 100644 index 0000000..e69de29 diff --git a/inst/Import and Filter TOC from LEEF-2.html b/inst/Import and Filter TOC from LEEF-2.html index f74d1af..918707a 100644 --- a/inst/Import and Filter TOC from LEEF-2.html +++ b/inst/Import and Filter TOC from LEEF-2.html @@ -3047,6 +3047,7 @@

Table of contents

  • Plot the measured concentration per bottle of all samples @@ -7429,15 +7430,110 @@

    Calculate TOC

    +
    +

    Remove all rows where conc is NA

    +

    We have some NA values in the calculated conc values. These are removed.

    +
    +
    +Code +
    toc %>%
    +  filter(is.na(conc)) %>%
    +  group_by(filename) %>%
    +  summarise(n = n()) %>% 
    +  collect() %>%
    +  knitr::kable()
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    filenamen
    L2_20221107Anf2
    L2_20221130A48
    L2_20230123A4
    L2_20230130A4
    L2_20230203A2
    L2_20230503A28
    L2_20230505A2
    L2_20230522A4
    L2_20230605B50
    L2_20230626A4
    +
    +
    +

    These can be filtered out

    +
    +
    +Code +
    before <- nrow(toc)
    +toc <- toc %>%
    +  filter(!is.na(conc))
    +after <- nrow(toc)
    +cat("Before : ", before, "\n")
    +
    +
    +
    Before :  14108 
    +
    +
    +Code +
    cat("Removed: ", before - after, "\n")
    +
    +
    +
    Removed:  148 
    +
    +
    +Code +
    cat("After  : ", after, "\n")
    +
    +
    +
    After  :  13960 
    +
    +
    +

    Plot after re-calculation of the TOC values

    Code -
    toc %>%   
    -  ggplot(aes(x=conc)) + 
    -  stat_density(bw = 0.1, na.rm = TRUE) +
    -  facet_grid(rows = vars(inj_type), scales = "free_y")
    +
    toc %>%   
    +  ggplot(aes(x=conc)) + 
    +  stat_density(bw = 0.1, na.rm = TRUE) +
    +  facet_grid(rows = vars(inj_type), scales = "free_y")

    @@ -7450,10 +7546,10 @@

    Code -
    toc %>%
    -  ggplot(aes(x=conc)) + 
    -  stat_density(bw = 0.1, na.rm = TRUE) +
    -  facet_grid(rows = vars(bottle), cols = vars(inj_type), scales = "free")
    +
    toc %>%
    +  ggplot(aes(x=conc)) + 
    +  stat_density(bw = 0.1, na.rm = TRUE) +
    +  facet_grid(rows = vars(bottle), cols = vars(inj_type), scales = "free")

    @@ -7464,12 +7560,12 @@

    TC

    Code -
    toc %>%
    -  filter(inj_type == "TC") %>%
    -  ggplot(aes(x=conc)) + 
    -  geom_vline(xintercept = 5, col = "red") +
    -  stat_density(bw = 0.1, na.rm = TRUE) +
    -  facet_wrap(~bottle, ncol = 1, scales = "free_y")
    +
    toc %>%
    +  filter(inj_type == "TC") %>%
    +  ggplot(aes(x=conc)) + 
    +  geom_vline(xintercept = 5, col = "red") +
    +  stat_density(bw = 0.1, na.rm = TRUE) +
    +  facet_wrap(~bottle, ncol = 1, scales = "free_y")

    @@ -7481,12 +7577,12 @@

    IC

    Code -
    toc %>%
    -  filter(inj_type == "IC") %>%
    -  ggplot(aes(x=conc)) + 
    -  geom_vline(xintercept = 0.3, col = "red") +
    -  stat_density(bw = 0.01, na.rm = TRUE) +
    -  facet_wrap(~bottle, ncol = 1, scales = "free_y")
    +
    toc %>%
    +  filter(inj_type == "IC") %>%
    +  ggplot(aes(x=conc)) + 
    +  geom_vline(xintercept = 0.3, col = "red") +
    +  stat_density(bw = 0.01, na.rm = TRUE) +
    +  facet_wrap(~bottle, ncol = 1, scales = "free_y")

    @@ -7498,12 +7594,12 @@

    TOC

    Code -
    toc %>%
    -  filter(inj_type == "TOC") %>%
    -  ggplot(aes(x=conc)) + 
    -  geom_vline(xintercept = 4.7, col = "red") +
    -  stat_density(bw = 0.1, na.rm = TRUE) +
    -  facet_wrap(~bottle, ncol = 1, scales = "free_y")
    +
    toc %>%
    +  filter(inj_type == "TOC") %>%
    +  ggplot(aes(x=conc)) + 
    +  geom_vline(xintercept = 4.7, col = "red") +
    +  stat_density(bw = 0.1, na.rm = TRUE) +
    +  facet_wrap(~bottle, ncol = 1, scales = "free_y")

    @@ -7515,11 +7611,11 @@

    TN

    Code -
    toc %>%
    -  filter(inj_type == "TN") %>%
    -  ggplot(aes(x=conc)) +
    -  stat_density(bw = 0.1, na.rm = TRUE) +
    -  facet_wrap(~bottle, ncol = 1, scales = "free_y")
    +
    toc %>%
    +  filter(inj_type == "TN") %>%
    +  ggplot(aes(x=conc)) +
    +  stat_density(bw = 0.1, na.rm = TRUE) +
    +  facet_wrap(~bottle, ncol = 1, scales = "free_y")

    @@ -7533,18 +7629,18 @@

    Add to database

    Code -
    toc_fn <- file.path(params$root_folder, "toc.rds")
    -
    -options(RRDarrow = params$arrow)
    -
    -saveRDS(toc, toc_fn)
    -
    -dir.create(params$arrow, recursive = TRUE, showWarnings = FALSE)
    -parquet_add_toc(
    -    fn = toc_fn,
    -    path_to_parquet_root_dir = params$arrow,
    -    rename = FALSE
    -)
    +
    toc_fn <- file.path(params$root_folder, "toc.rds")
    +
    +options(RRDarrow = params$parquet)
    +
    +saveRDS(toc, toc_fn)
    +
    +dir.create(params$parquet, recursive = TRUE, showWarnings = FALSE)
    +parquet_add_toc(
    +    fn = toc_fn,
    +    path_to_parquet_root_dir = params$parquet,
    +    rename = FALSE
    +)
    
    @@ -7554,35 +7650,35 @@ 

    Add to database

    Writing data...
    -
    ! Be careful, path_to_parquet should be a file name, using : /Volumes/LEEF/0.TOC/LEEF-2/arrow/toc/toc.parquet
    +
    ! Be careful, path_to_parquet should be a file name, using : /Volumes/LEEF/0.TOC/LEEF-2/parquet/toc/toc.parquet
    Writing data...
    -✔ Data are available in parquet file under /Volumes/LEEF/0.TOC/LEEF-2/arrow/toc/toc.parquet
    +✔ Data are available in parquet file under /Volumes/LEEF/0.TOC/LEEF-2/parquet/toc/toc.parquet
     Writing data...
    Code -
    ## Add experimenal design
    -
    -object <- read.csv(file.path(params$root_folder, "experimental_design.csv"))
    -
    -path_to_parquet <- file.path(params$arrow, "experimental_design", "")
    -dir.create(path_to_parquet, recursive = TRUE, showWarnings = FALSE)
    -object_to_parquet(object = object, path_to_parquet = path_to_parquet)
    +
    ## Add experimental design
    +
    +object <- read.csv(file.path(params$root_folder, "experimental_design.csv"))
    +
    +path_to_parquet <- file.path(params$parquet, "experimental_design", "")
    +dir.create(path_to_parquet, recursive = TRUE, showWarnings = FALSE)
    +object_to_parquet(object = object, path_to_parquet = path_to_parquet)
    Writing data...
    -! Be careful, path_to_parquet should be a file name, using : /Volumes/LEEF/0.TOC/LEEF-2/arrow/experimental_design//experimental_design.parquet
    +! Be careful, path_to_parquet should be a file name, using : /Volumes/LEEF/0.TOC/LEEF-2/parquet/experimental_design//experimental_design.parquet
     Writing data...
    -✔ Data are available in parquet file under /Volumes/LEEF/0.TOC/LEEF-2/arrow/experimental_design//experimental_design.parquet
    +✔ Data are available in parquet file under /Volumes/LEEF/0.TOC/LEEF-2/parquet/experimental_design//experimental_design.parquet
     Writing data...
    Code -
    toc_original <- toc
    -
    -toc$id <- 1:nrow(toc)
    +
    toc_original <- toc
    +
    +toc$id <- 1:nrow(toc)

    @@ -7592,39 +7688,33 @@

    Plot after filtering
    Code -
    if (params$LEEF == "LEEF_1") {
    -  p1 <- plot_tocs_per_bottle_per_timestamp(db =  params$db, c("TC", "TOC"))
    -  p2 <- plot_tocs_per_bottle_per_timestamp(db =  params$db, c("IC"))
    -  p3 <- plot_tocs_per_bottle_per_timestamp(db =  params$db, c("TN"))
    -} else {
    -  p1 <- LEEF_2_plot_tocs_per_bottle_per_timestamp(db = params$arrow, c("TC", "TOC"), arrow = TRUE)
    -  p2 <- LEEF_2_plot_tocs_per_bottle_per_timestamp(db = params$arrow, c("IC"), arrow = TRUE)
    -  p3 <- LEEF_2_plot_tocs_per_bottle_per_timestamp(db = params$arrow, c("TN"), arrow = TRUE)
    -}
    -p1
    +
    if (params$LEEF == "LEEF_1") {
    +  p1 <- plot_tocs_per_bottle_per_timestamp(db =  params$db, c("TC", "TOC"))
    +  p2 <- plot_tocs_per_bottle_per_timestamp(db =  params$db, c("IC"))
    +  p3 <- plot_tocs_per_bottle_per_timestamp(db =  params$db, c("TN"))
    +} else {
    +  p1 <- LEEF_2_plot_tocs_per_bottle_per_timestamp(db = params$parquet, c("TC", "TOC"), arrow = TRUE)
    +  p2 <- LEEF_2_plot_tocs_per_bottle_per_timestamp(db = params$parquet, c("IC"), arrow = TRUE)
    +  p3 <- LEEF_2_plot_tocs_per_bottle_per_timestamp(db = params$parquet, c("TN"), arrow = TRUE)
    +}
    +p1
    -
    -
    Warning: Removed 1 row containing missing values (`geom_line()`).
    -
    -

    +

    Code -
    p2
    +
    p2
    -
    -
    Warning: Removed 1 row containing missing values (`geom_line()`).
    -
    -

    +

    Code -
    p3
    +
    p3
    -

    +

    @@ -7634,18 +7724,18 @@

    Code -
    dupl <- toc %>%
    -  group_by(timestamp, bottle, inj_type) %>%
    -  filter(!is.na(bottle)) %>%
    -  summarise(fn_1 = min(filename), fn_2 = max(filename), count = n(), min_conc = min(conc), max_conc = max(conc)) %>%
    -  filter(count > 1 & count < 10) %>%  
    -  mutate(diff = max_conc - min_conc) %>%
    -  arrange(inj_type, diff)
    -dupl %>% 
    -  group_by(fn_1, fn_2, timestamp) %>%
    -  summarise() %>%
    -  arrange(timestamp) %>%
    -  knitr::kable()
    +
    dupl <- toc %>%
    +  group_by(timestamp, bottle, inj_type) %>%
    +  filter(!is.na(bottle)) %>%
    +  summarise(fn_1 = min(filename), fn_2 = max(filename), count = n(), min_conc = min(conc), max_conc = max(conc)) %>%
    +  filter(count > 1 & count < 10) %>%  
    +  mutate(diff = max_conc - min_conc) %>%
    +  arrange(inj_type, diff)
    +dupl %>% 
    +  group_by(fn_1, fn_2, timestamp) %>%
    +  summarise() %>%
    +  arrange(timestamp) %>%
    +  knitr::kable()
    @@ -7658,50 +7748,30 @@

    -

    - - - - - - - - - - - - - - - + - + - + - + - - - - -
    L2_20221130AL2_20221130B20221130
    L2_20230123AL2_20230123B20230123
    L2_20230130AL2_20230130B20230130
    L2_20230203A L2_20230203B 20230203
    L2_20230213A L2_20230213C 20230213
    L2_20230503B! L2_20230505A 20230505
    L2_20230503B! L2_20230505B 20230505
    L2_20230605B L2_20230605C 20230605
    L2_20230623AL2_20230630A20230630
    @@ -7710,25 +7780,33 @@

    Code -
    dat <- arrow_read_toc(db =  params$arrow) %>% 
    -  collect()
    -if (nrow(dat) > 0){
    -dat$id <- 1:nrow(dat)
    -ids <- dat %>% 
    -  filter(!is.na(bottle)) %>%
    -  group_by(timestamp, bottle, type) %>%
    -  summarize(min(id), max(id), n = n()) %>%
    -  filter(n > 1)
    -ids <- c(ids$`min(id)`, ids$`max(id)`) |>
    -  unique()
    -x <- dat %>% 
    -  filter(id %in% ids) %>%
    -  group_by(day, timestamp, bottle, type) %>%
    -  summarise(timestamp, bottle, type, mic = min(concentration), mac = max(concentration)) %>%
    -  mutate(mdiff = mac - mic)
    -}
    +
    dat <- arrow_read_toc(db =  params$parquet) %>% 
    +  collect()
    +if (nrow(dat) > 0){
    +dat$id <- 1:nrow(dat)
    +ids <- dat %>% 
    +  filter(!is.na(bottle)) %>%
    +  group_by(timestamp, bottle, type) %>%
    +  summarize(min(id), max(id), n = n()) %>%
    +  filter(n > 1)
    +ids <- c(ids$`min(id)`, ids$`max(id)`) |>
    +  unique()
    +x <- dat %>% 
    +  filter(id %in% ids) %>%
    +  group_by(day, timestamp, bottle, type) %>%
    +  summarise(timestamp, bottle, type, mic = min(concentration), mac = max(concentration)) %>%
    +  mutate(mdiff = mac - mic)
    +}
    +
    Warning: There were 2 warnings in `summarise()`.
    +The first warning was:
    +ℹ In argument: `mic = min(concentration)`.
    +Caused by warning in `min()`:
    +! no non-missing arguments to min; returning Inf
    +ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
    +
    +
    Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
     dplyr 1.1.0.
     ℹ Please use `reframe()` instead.
    @@ -7741,23 +7819,17 @@ 

    TN Duplicates

    Code -
    if (nrow(x) > 0){
    -  pl <- x %>%
    -    filter(type == "TN") %>%
    -    ggplot2::ggplot(ggplot2::aes(x = mic, y = mac, colour = type)) +
    -    ggplot2::geom_point() +
    -    ggplot2::xlab("Smaler concentration Value") +
    -    ggplot2::ylab("Larger concentration Value")
    -  
    -  ggMarginal(pl, type="histogram")
    -}
    +
    if (nrow(x) > 0){
    +  pl <- x %>%
    +    filter(type == "TN") %>%
    +    ggplot2::ggplot(ggplot2::aes(x = mic, y = mac, colour = type)) +
    +    ggplot2::geom_point() +
    +    ggplot2::xlab("Smaler concentration Value") +
    +    ggplot2::ylab("Larger concentration Value")
    +  
    +  ggMarginal(pl, type="histogram")
    +}
    -
    -
    Warning: Removed 52 rows containing missing values (`geom_point()`).
    -
    -
    -

    -
    @@ -7765,23 +7837,17 @@

    TC Duplicates

    Code -
    if (nrow(x) > 0){
    -  pl <- x %>%
    -    filter(type == "TC") %>%
    -    ggplot2::ggplot(ggplot2::aes(x = mic, y = mac, colour = type)) +
    -    ggplot2::geom_point() +
    -    ggplot2::xlab("Smaler concentration Value") +
    -    ggplot2::ylab("Larger concentration Value")
    -  
    -  ggMarginal(pl, type="histogram")
    -}
    +
    if (nrow(x) > 0){
    +  pl <- x %>%
    +    filter(type == "TC") %>%
    +    ggplot2::ggplot(ggplot2::aes(x = mic, y = mac, colour = type)) +
    +    ggplot2::geom_point() +
    +    ggplot2::xlab("Smaler concentration Value") +
    +    ggplot2::ylab("Larger concentration Value")
    +  
    +  ggMarginal(pl, type="histogram")
    +}
    -
    -
    Warning: Removed 52 rows containing missing values (`geom_point()`).
    -
    -
    -

    -
    @@ -7789,23 +7855,17 @@

    IC Duplicates

    Code -
    if (nrow(x) > 0){
    -  pl <- x %>%
    -    filter(type == "IC") %>%
    -    ggplot2::ggplot(ggplot2::aes(x = mic, y = mac, colour = type)) +
    -    ggplot2::geom_point() +
    -    ggplot2::xlab("Smaler concentration Value") +
    -    ggplot2::ylab("Larger concentration Value")
    -  
    -  ggMarginal(pl, type="histogram")
    -}
    +
    if (nrow(x) > 0){
    +  pl <- x %>%
    +    filter(type == "IC") %>%
    +    ggplot2::ggplot(ggplot2::aes(x = mic, y = mac, colour = type)) +
    +    ggplot2::geom_point() +
    +    ggplot2::xlab("Smaler concentration Value") +
    +    ggplot2::ylab("Larger concentration Value")
    +  
    +  ggMarginal(pl, type="histogram")
    +}
    -
    -
    Warning: Removed 56 rows containing missing values (`geom_point()`).
    -
    -
    -

    -
    @@ -7813,23 +7873,17 @@

    TOC Duplicates

    Code -
    if (nrow(x) > 0){
    -  pl <- x %>%
    -    filter(type == "TOC") %>%
    -    ggplot2::ggplot(ggplot2::aes(x = mic, y = mac, colour = type)) +
    -    ggplot2::geom_point() +
    -    ggplot2::xlab("Smaler concentration Value") +
    -    ggplot2::ylab("Larger concentration Value")
    -  
    -  ggMarginal(pl, type="histogram")
    -}
    +
    if (nrow(x) > 0){
    +  pl <- x %>%
    +    filter(type == "TOC") %>%
    +    ggplot2::ggplot(ggplot2::aes(x = mic, y = mac, colour = type)) +
    +    ggplot2::geom_point() +
    +    ggplot2::xlab("Smaler concentration Value") +
    +    ggplot2::ylab("Larger concentration Value")
    +  
    +  ggMarginal(pl, type="histogram")
    +}
    -
    -
    Warning: Removed 56 rows containing missing values (`geom_point()`).
    -
    -
    -

    -
    @@ -7838,16 +7892,16 @@

    Final diagnostic report
    Code -
    options(knitr.duplicate.label = "allow")
    -try(
    -  report_diagnostic(
    -    db = params$db,  
    -    template = params$LEEF,
    -    suffix = "TOC_added", 
    -    format = "html",
    -    lastDays = 7
    -  )
    -)
    +
    options(knitr.duplicate.label = "allow")
    +try(
    +  report_diagnostic(
    +    db = params$db,  
    +    template = params$LEEF,
    +    suffix = "TOC_added", 
    +    format = "html",
    +    lastDays = 7
    +  )
    +)
    diff --git a/inst/Import and filter TOC LEEF-2.qmd b/inst/Import and filter TOC LEEF-2.qmd index d588002..25997b8 100644 --- a/inst/Import and filter TOC LEEF-2.qmd +++ b/inst/Import and filter TOC LEEF-2.qmd @@ -14,7 +14,7 @@ execute: cache: false params: root_folder: "/Volumes/LEEF/0.TOC/LEEF-2/" - arrow: "/Volumes/LEEF/0.TOC/LEEF-2/arrow" + parquet: "/Volumes/LEEF/0.TOC/LEEF-2/parquet" LEEF: LEEF_2 min_TC: 10 min_IC: 0.75 @@ -439,6 +439,8 @@ for (i in 1:nrow(toc)){ ``` + + ### Calculate TOC Now we re-calculate the TOC values. @@ -461,6 +463,28 @@ toc <- toc %>% ``` +### Remove all where `conc == NA` +We have some `NA` values in the calculated `conc` values. These are removed. +```{r} +toc %>% + filter(is.na(conc)) %>% + group_by(filename) %>% + summarise(n = n()) %>% + collect() %>% + knitr::kable() +``` + +These can be filtered out + +```{r} +before <- nrow(toc) +toc <- toc %>% + filter(!is.na(conc)) +after <- nrow(toc) +cat("Before : ", before, "\n") +cat("Removed: ", before - after, "\n") +cat("After : ", after, "\n") +``` ### Plot after re-calculation of the TOC values @@ -540,14 +564,14 @@ The toc is now added to the database toc_fn <- file.path(params$root_folder, "toc.rds") -options(RRDarrow = params$arrow) +options(RRDarrow = params$parquet) saveRDS(toc, toc_fn) -dir.create(params$arrow, recursive = TRUE, showWarnings = FALSE) +dir.create(params$parquet, recursive = TRUE, showWarnings = FALSE) parquet_add_toc( fn = toc_fn, - path_to_parquet_root_dir = params$arrow, + path_to_parquet_root_dir = params$parquet, rename = FALSE ) @@ -555,7 +579,7 @@ parquet_add_toc( object <- read.csv(file.path(params$root_folder, "experimental_design.csv")) -path_to_parquet <- file.path(params$arrow, "experimental_design", "") +path_to_parquet <- file.path(params$parquet, "experimental_design", "") dir.create(path_to_parquet, recursive = TRUE, showWarnings = FALSE) object_to_parquet(object = object, path_to_parquet = path_to_parquet) @@ -577,9 +601,9 @@ if (params$LEEF 
== "LEEF_1") { p2 <- plot_tocs_per_bottle_per_timestamp(db = params$db, c("IC")) p3 <- plot_tocs_per_bottle_per_timestamp(db = params$db, c("TN")) } else { - p1 <- LEEF_2_plot_tocs_per_bottle_per_timestamp(db = params$arrow, c("TC", "TOC"), arrow = TRUE) - p2 <- LEEF_2_plot_tocs_per_bottle_per_timestamp(db = params$arrow, c("IC"), arrow = TRUE) - p3 <- LEEF_2_plot_tocs_per_bottle_per_timestamp(db = params$arrow, c("TN"), arrow = TRUE) + p1 <- LEEF_2_plot_tocs_per_bottle_per_timestamp(db = params$parquet, c("TC", "TOC"), arrow = TRUE) + p2 <- LEEF_2_plot_tocs_per_bottle_per_timestamp(db = params$parquet, c("IC"), arrow = TRUE) + p3 <- LEEF_2_plot_tocs_per_bottle_per_timestamp(db = params$parquet, c("TN"), arrow = TRUE) } p1 p2 @@ -609,7 +633,7 @@ dupl %>% And some plots of the duplicate concentration values only ```{r} -dat <- arrow_read_toc(db = params$arrow) %>% +dat <- arrow_read_toc(db = params$parquet) %>% collect() if (nrow(dat) > 0){ dat$id <- 1:nrow(dat)