Skip to content

Commit

Permalink
Merge pull request #226 from RECETOX/fix_eic_splitting
Browse files Browse the repository at this point in the history
fixed eic splitting and updated tests
  • Loading branch information
hechth authored Sep 16, 2024
2 parents 169b929 + a393e64 commit 6e3ac2e
Show file tree
Hide file tree
Showing 15 changed files with 182 additions and 134 deletions.
18 changes: 17 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,27 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [dev] - unreleased
## [0.13.3] - 2024-09-16
### Changed
- fixed not working eic splitting code [#226](https://github.com/RECETOX/recetox-aplcms/pull/226)

## [0.13.2] - 2024-07-30
### Changed
- Moved import order in the hope it will fix the import bugs by @hechth in [#224](https://github.com/RECETOX/recetox-aplcms/pull/224)


## [0.13.0] - 2024-07-29
### Added
- added tests for mzdata and mzxml formats [#216](https://github.com/RECETOX/recetox-aplcms/pull/216)

### Changed
- feature.alignment function refactored by @KristinaGomoryova in [#221](https://github.com/RECETOX/recetox-aplcms/pull/221)
- remove_noise function updated by @KristinaGomoryova in [#219](https://github.com/RECETOX/recetox-aplcms/pull/219)
- Updated correct time to use precomputed clusters by @zargham-ahmad in [#220](https://github.com/RECETOX/recetox-aplcms/pull/220)
- Fix tests by @hechth in [#222](https://github.com/RECETOX/recetox-aplcms/pull/222)

### Removed
- removed outdated files by @hechth in [#223](https://github.com/RECETOX/recetox-aplcms/pull/223)

## [0.12.0] - 2023-07-10
### Changed
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: recetox.aplcms
Title: Adaptive processing of LC-MS data
Version: 0.13.2
Version: 0.13.3
Date: 2024-07-30
Authors@R: c(
person("Tianwei", "Yu", email = "[email protected]", role = "aut"),
Expand Down
43 changes: 18 additions & 25 deletions R/remove_noise.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,24 +92,17 @@ remove_noise <- function(filename,

run.sel <- raw.prof$height.rec[which(raw.prof$height.rec[, 2] >= raw.prof$min.count.run * min_pres & raw.prof$height.rec[, 3] > baseline_correct), 1]

newprof <- newprof[newprof[, 4] %in% run.sel, ]

if (grouping_threshold < Inf) {
sorted_newprof <- newprof[order(newprof[,2]),]
new_grps <- cumsum(c(0, diff(sorted_newprof[,2])) > grouping_threshold)
sorted_newprof <- cbind(sorted_newprof, new_grps, deparse.level = 0)

sorted_newprof_df <- tibble::as_tibble(sorted_newprof)

newprof <- as.matrix(sorted_newprof_df |>
dplyr::group_by(V4, V5) |>
dplyr::mutate(cluster = cur_group_id()) |>
dplyr::ungroup() |>
dplyr::arrange(cluster) |>
dplyr::select(-V4, -V5)
)
colnames(newprof) <- NULL
}
newprof <- as.data.frame(newprof[newprof[, 4] %in% run.sel, ])
colnames(newprof) <- c("mz", "rt", "intensity", "group_number")

newprof <- tibble::tibble(newprof |>
dplyr::group_by(group_number) |>
dplyr::arrange_at("rt") |>
dplyr::mutate(subset_group_number = cumsum(c(0, abs(diff(rt)) > grouping_threshold))) |>
dplyr::group_by(group_number, subset_group_number) |>
dplyr::mutate(grps = cur_group_id()) |>
dplyr::ungroup() |>
dplyr::select(mz, rt, intensity, grps))

new.prof <- run_filter(
newprof,
Expand All @@ -128,12 +121,12 @@ remove_noise <- function(filename,
)
}

new_rec_tibble <- tibble::tibble(
mz = new.prof$new_rec[, 1],
rt = new.prof$new_rec[, 2],
intensity = new.prof$new_rec[, 3],
group_number = new.prof$new_rec[, 4]
)
# new_rec_tibble <- tibble::tibble(
# mz = new.prof$new_rec[, 1],
# rt = new.prof$new_rec[, 2],
# intensity = new.prof$new_rec[, 3],
# group_number = new.prof$new_rec[, 4]
# )

return(new_rec_tibble)
return(new.prof)
}
100 changes: 52 additions & 48 deletions R/run_filter.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
#' @return unique_grp.
#' @export
compute_uniq_grp <- function(profile, min_count_run, min_pres) {
grps <- profile
ttt <- table(grps)
ttt <- table(profile)
ttt <- ttt[ttt >= max(min_count_run * min_pres, 2)]
unique_grp <- as.numeric(names(ttt))
return(unique_grp)
Expand Down Expand Up @@ -52,7 +51,7 @@ label_val_to_keep <- function(min_run, timeline, min_pres, this_times, times) {

# filtering based on the kernel regression estimate
this_smooth <- predict_smoothed_rt(min_run, this_timeline)
if (max(this_smooth) >= min_pres) {
if (max(this_smooth, na.rm = TRUE) >= min_pres) {
measured_points <- good_points <- timeline
measured_points[this_times] <- 1

Expand Down Expand Up @@ -82,63 +81,68 @@ label_val_to_keep <- function(min_run, timeline, min_pres, this_times, times) {
run_filter <- function(newprof,
min_pres,
min_run) {
newprof <- tibble::tibble(mz = newprof[, 1], rt = newprof[, 2], intensi = newprof[, 3], grps = newprof[, 4])

# ordering retention time values
labels <- newprof$rt
times <- unique(labels)
times <- times[order(times)]
# labels <- newprof$rt
# times <- unique(labels)
# times <- times[order(times)]

for (i in 1:length(times)) {
labels[which(newprof$rt == times[i])] <- i # now labels is the index of time points
}
# for (i in 1:length(times)) {
# labels[which(newprof$rt == times[i])] <- i # now labels is the index of time points
# }

newprof$rt <- labels
# newprof$rt <- labels
newprof <- dplyr::arrange_at(newprof, "rt")
times <- unique(newprof$rt)

# calculates the minimun number of rt points to be considered a peak
min_count_run <- min_run * length(times) / (max(times) - min(times))
min_run <- round(min_count_run)
scan_rate <- 1.0 / abs(median(diff(times)))
min_count_run <- round(min_pres * min_run * scan_rate)

# computes unique groups
uniq_grp <- compute_uniq_grp(newprof$grps, min_count_run, min_pres)

# ordered by mz and grps data that are inside unigrps
newprof <- dplyr::filter(newprof, grps %in% uniq_grp) |> dplyr::arrange(grps, mz)

# computes break points i.e. indices of mass differences greater than min_mz_tol
breaks <- compute_breaks_3(newprof$grps)

# init counters for loop
new_rec <- newprof * 0
rec_pointer <- 1
timeline <- rep(0, length(times))
for (m in 2:length(breaks))
{
this_prof <- dplyr::slice(newprof, (breaks[m - 1] + 1):breaks[m]) |> dplyr::arrange_at("rt")

to_keep <- label_val_to_keep(
min_run,
timeline,
min_pres,
this_prof$rt,
times
)

# operation over selected indices
if (sum(to_keep) > 0) {
this_sel <- which(to_keep == 1)
this_new <- dplyr::slice(this_prof, this_sel)
r_new <- nrow(this_new)
new_rec[rec_pointer:(rec_pointer + r_new - 1), ] <- this_new
rec_pointer <- rec_pointer + r_new
}
}

new_rec <- dplyr::slice(new_rec, 1:(rec_pointer - 1))
new_rec[, 2] <- times[new_rec[, 2]]

results <- new("list")
results$new_rec <- new_rec
results <- dplyr::group_by(newprof, grps) |>
dplyr::filter(n() >= min_count_run && abs(span(rt)) >= min_run) |>
dplyr::ungroup() |>
dplyr::rename(group_number = grps)

# # computes break points i.e. indices of mass differences greater than min_mz_tol
# breaks <- compute_breaks_3(newprof$grps)

# # init counters for loop
# new_rec <- newprof * 0
# rec_pointer <- 1
# timeline <- rep(0, length(times))
# for (m in 2:length(breaks))
# {
# this_prof <- dplyr::slice(newprof, (breaks[m - 1] + 1):breaks[m]) |> dplyr::arrange_at("rt")

# to_keep <- label_val_to_keep(
# min_run,
# timeline,
# min_pres,
# this_prof$rt,
# times
# )
# browser()
# # operation over selected indices
# if (sum(to_keep) > 0) {
# this_sel <- which(to_keep == 1)
# this_new <- dplyr::slice(this_prof, this_sel)
# r_new <- nrow(this_new)
# new_rec[rec_pointer:(rec_pointer + r_new - 1), ] <- this_new
# rec_pointer <- rec_pointer + r_new
# }
# }

# new_rec <- dplyr::slice(new_rec, 1:(rec_pointer - 1))
# new_rec[, 2] <- times[new_rec[, 2]]

# results <- new("list")
# results$new_rec <- new_rec

return(results)
}
5 changes: 1 addition & 4 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,18 +110,15 @@ concatenate_feature_tables <- function(features, sample_names) {
}

#' @export
load_aligned_features <- function(metadata_file, intensities_file, rt_file, tol_file) {
load_aligned_features <- function(metadata_file, intensities_file, rt_file) {
metadata <- arrow::read_parquet(metadata_file)
intensities <- arrow::read_parquet(intensities_file)
rt <- arrow::read_parquet(rt_file)
tolerances <- arrow::read_parquet(tol_file)

result <- list()
result$metadata <- as_tibble(metadata)
result$intensity <- as_tibble(intensities)
result$rt <- as_tibble(rt)
result$mz_tol_relative <- tolerances$mz_tolerance
result$rt_tol_relative <- tolerances$rt_tolerance
return(result)
}

Expand Down
17 changes: 12 additions & 5 deletions tests/testthat/test-adjust-time.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ patrick::with_parameters_test_that(
testdata <- file.path("..", "testdata")

extracted <- read_parquet_files(files, "clusters", "_extracted_clusters.parquet")
template_features <- compute_template(extracted)
actual <- compute_template(extracted)

expected <- file.path(testdata, "template", "RCX_shortened.parquet")
expected <- arrow::read_parquet(expected)

expect_equal(template_features, expected)
expected_path <- file.path(testdata, "template", "RCX_shortened.parquet")
expected <- arrow::read_parquet(expected_path)
expect_equal(actual, expected)
},
patrick::cases(
RCX_shortened = list(
Expand All @@ -32,6 +32,10 @@ patrick::with_parameters_test_that(
correct_time(x, template_features)
})

# for(i in 1:3) {
# arrow::write_parquet(corrected[[i]], file.path(testdata, "adjusted", paste0(files[i], ".parquet")))
# }

expected <- read_parquet_files(files, "adjusted", ".parquet")
expect_equal(corrected, expected, tolerance = 0.01)
},
Expand All @@ -56,5 +60,8 @@ test_that("correct_time_v2 is close to correct time", {
expected <- correct_time(clustered_table, template_features)
actual <- correct_time_v2(clustered_table, template_features)

actual <- dplyr::arrange_at(actual, c("cluster", "mz", "area", "rt"))
expected <- dplyr::arrange_at(expected, c("cluster", "mz", "area", "rt"))

expect_equal(actual, expected, tolerance = 0.02)
})
15 changes: 9 additions & 6 deletions tests/testthat/test-compute_clusters.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,18 @@ patrick::with_parameters_test_that(
sample_names = files
)

# for(i in 1:3) {
# arrow::write_parquet(actual$feature_tables[[i]], file.path(testdata, "clusters", paste0(files[i], "_", input ,"_clusters.parquet")))
# }
expected <- read_parquet_files(files, "clusters", paste0("_", input, "_clusters.parquet"))


for(i in seq_along(files)) {
expect_equal(actual$feature_tables[[i]], expected[[i]])
}

expect_equal(actual$mz_tol_relative, expected_mz_tol_relative)
expect_equal(actual$rt_tol_relative, expected_rt_tol_relative)
expect_equal(actual$mz_tol_relative, expected_mz_tol_relative, tolerance=0.1)
expect_equal(actual$rt_tol_relative, expected_rt_tol_relative, tolerance=0.1)

},
patrick::cases(
Expand All @@ -32,16 +35,16 @@ patrick::with_parameters_test_that(
input = "extracted",
mz_max_diff = 10 * 1e-05,
mz_tol_absolute = 0.01,
expected_mz_tol_relative = 6.85676325338646e-06,
expected_rt_tol_relative = 3.61858118506494
expected_mz_tol_relative = 6.849039e-06,
expected_rt_tol_relative = 2.894385
),
RCX_shortened_adjusted = list(
files = c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened"),
input = "adjusted",
mz_max_diff = 10 * 1e-05,
mz_tol_absolute = 0.01,
expected_mz_tol_relative = 6.85676325338646e-06,
expected_rt_tol_relative = 2.17918873407775
expected_mz_tol_relative = 6.856763e-06,
expected_rt_tol_relative = 1.93185408267324
)
)
)
Expand Down
1 change: 0 additions & 1 deletion tests/testthat/test-extract_features.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ patrick::with_parameters_test_that(
})

expected <- read_parquet_files(expected_files, "extracted", ".parquet")

expect_equal(actual, expected, tolerance = 0.02)
},
patrick::cases(
Expand Down
18 changes: 5 additions & 13 deletions tests/testthat/test-feature-align.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,11 @@ patrick::with_parameters_test_that(
res$mz_tol_relative,
get_num_workers()
)

aligned_actual$mz_tol_relative <- res$mz_tol_relative
aligned_actual$rt_tol_relative <- res$rt_tol_relative


aligned_expected <- load_aligned_features(
file.path(testdata, "aligned", "metadata_table.parquet"),
file.path(testdata, "aligned", "intensity_table.parquet"),
file.path(testdata, "aligned", "rt_table.parquet"),
file.path(testdata, "aligned", "tolerances.parquet")
file.path(testdata, "aligned", "rt_table.parquet")
)

expect_equal(aligned_actual, aligned_expected)
Expand Down Expand Up @@ -75,21 +71,17 @@ patrick::with_parameters_test_that(
aligned_expected <- load_aligned_features(
file.path(testdata, "aligned", "metadata_table.parquet"),
file.path(testdata, "aligned", "intensity_table.parquet"),
file.path(testdata, "aligned", "rt_table.parquet"),
file.path(testdata, "aligned", "tolerances.parquet")
file.path(testdata, "aligned", "rt_table.parquet")
)

aligned_expected["mz_tol_relative"] <- NULL
aligned_expected["rt_tol_relative"] <- NULL

expect_equal(aligned_actual, aligned_expected)
},
patrick::cases(
RCX_shortened = list(
files = c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened"),
min_occurrence = 2,
mz_tol_relative = 6.85676325338646e-06,
rt_tol_relative = 2.17918873407775
mz_tol_relative = 6.84903911826453e-06,
rt_tol_relative = 1.93185408267324
)
)
)
2 changes: 1 addition & 1 deletion tests/testthat/test-find_mz_tolerance.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ test_that("mz tolerance is found", {
do.plot = FALSE
)

expect_equal(mz_tol_relative, 0.01664097, tolerance=0.001)
expect_equal(mz_tol_relative, 0.0135994, tolerance=0.001)
})
Loading

0 comments on commit 6e3ac2e

Please sign in to comment.