Merge pull request #221 from KristinaGomoryova/alignment

feature.alignment function refactored
RECETOX · Jul 29, 2024 · 8261d75 · 8261d75
2 parents 978102d + ef2f003
commit 8261d75
Show file tree

Hide file tree

Showing 25 changed files with 486 additions and 151 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -9,7 +9,7 @@ Description: This is a customized fork of the original work from Tianwei Yu.
         It takes the adaptive processing of LC/MS metabolomics data further
         with focus on high resolution MS for both LC and GC applications.
 Depends: R (>= 3.50), MASS, mzR, splines, doParallel, foreach,
-        snow, dplyr, tidyr, stringr, tibble, tools, arrow
+        snow, dplyr, tidyr, stringr, tibble, tools, arrow, plyr
 biocViews: Technology, MassSpectrometry
 License: GPL-2
 LazyLoad: yes

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,11 +1,15 @@
 # Generated by roxygen2: do not edit by hand
 
 export(adaptive.bin)
+export(add_feature_ids)
 export(adjust.time)
 export(aggregate_by_rt)
+export(as_feature_sample_table)
 export(bigauss.esti)
 export(bigauss.esti.EM)
 export(bigauss.mix)
+export(check_files)
+export(clean_data_matrix)
 export(comb)
 export(compute_boundaries)
 export(compute_bounds)
@@ -16,6 +20,7 @@ export(compute_clusters)
 export(compute_clusters_simple)
 export(compute_comb)
 export(compute_corrected_features)
+export(compute_corrected_features_v2)
 export(compute_curr_rec_with_enough_peaks)
 export(compute_delta_rt)
 export(compute_densities)
@@ -40,10 +45,14 @@ export(compute_template)
 export(compute_template_adjusted_rt)
 export(compute_uniq_grp)
 export(correct_time)
+export(correct_time_v2)
 export(count_peaks)
 export(create_aligned_feature_table)
+export(create_features_from_cluster)
+export(create_intensity_row)
+export(create_metadata)
 export(create_output)
-export(create_rows)
+export(create_rt_row)
 export(draw_rt_correction_plot)
 export(draw_rt_normal_peaks)
 export(duplicate.row.remove)
@@ -59,6 +68,7 @@ export(get_features_in_rt_range)
 export(get_mzrange_bound_indices)
 export(get_num_workers)
 export(get_rt_region_indices)
+export(get_sample_name)
 export(get_single_occurrence_mask)
 export(get_times_to_use)
 export(hybrid)
@@ -90,8 +100,6 @@ export(remove_noise)
 export(rev_cum_sum)
 export(rm.ridge)
 export(run_filter)
-export(select_mz)
-export(select_rt)
 export(semi.sup)
 export(solve_a)
 export(solve_sigma)

diff --git a/R/adjust.time.R b/R/adjust.time.R
@@ -2,6 +2,10 @@
 NULL
 #> NULL
 
+#' Combine template and sample features
+#' @param template_features Tibble Template feature table (mz, rt, cluster, sample_id).
+#' @param features Tibble Sample feature table (mz, rt, cluster, sample_id).
+#' @return Tibble Combined feature table (rbind).
 #' @export
 compute_comb <- function(template_features, features) {
   combined <- dplyr::bind_rows(
@@ -11,6 +15,12 @@ compute_comb <- function(template_features, features) {
   return(combined)
 }
 
+#' Select features to use for retention time alignment
+#' @description This function selects features present in both the sample
+#' feature table and template feature table given they have the same cluster,
+#' are adjacent in the combined table.
+#' @param combined Tibble Table with (mz, rt, cluster, sample_id).
+#' @return List of bool Returns list of bools with TRUE at each index where this condition is met.
 #' @export
 compute_sel <- function(combined) {
   l <- nrow(combined)
@@ -19,6 +29,11 @@ compute_sel <- function(combined) {
   return(sel)
 }
 
+#' Create two column table with paired sample and template retention times.
+#' @param combined Tibble Table with features from sample and template.
+#' @param sel list of bools List of bools indiciating which features to pair.
+#' See 'compute_sel'.
+#' @param j string Template sample_id.
 #' @export
 compute_template_adjusted_rt <- function(combined, sel, j) {
   all_features <- cbind(combined$rt[sel], combined$rt[sel + 1])
@@ -34,6 +49,13 @@ compute_template_adjusted_rt <- function(combined, sel, j) {
   return(all_features)
 }
 
+#' Correct the rt in feature table based on paired feature rts and differences.
+#' @description This is a newer implementation based on dplyr which might be more efficient than the other function.
+#' @param features Tibble The feature table for which to correct rts.
+#' @param template_rt List of floats Template retention times for the paired features.
+#' @param delta_rt List of floats Differences between the paired rts.
+#' @return Tibble A table with corrected retention times.
+#' @export
 compute_corrected_features_v2 <- function(features, template_rt, delta_rt) {
   features <- features |> dplyr::arrange_at(c("rt", "mz"))
   idx <- dplyr::between(features$rt, min(template_rt), max(template_rt))
@@ -59,29 +81,34 @@ compute_corrected_features_v2 <- function(features, template_rt, delta_rt) {
   return(features |> dplyr::arrange_at(c("mz", "rt")))
 }
 
+#' Correct the rt in feature table based on paired feature rts and differences.
+#' @param features Tibble The feature table for which to correct rts.
+#' @param template_rt List of floats Template retention times for the paired features.
+#' @param delta_rt List of floats Differences between the paired rts.
+#' @return Tibble A table with corrected retention times.
 #' @export
-compute_corrected_features <- function(features, delta_rt, avg_time) {
+compute_corrected_features <- function(features, template_rt, delta_rt) {
   features <- features |> dplyr::arrange_at(c("rt", "mz"))
 
   corrected <- features$rt
   original <- features$rt
 
-  idx <- dplyr::between(original, min(delta_rt), max(delta_rt))
+  idx <- dplyr::between(original, min(template_rt), max(template_rt))
   to_correct <- original[idx]
   this.smooth <- ksmooth(
+    template_rt,
     delta_rt,
-    avg_time,
     kernel = "normal",
-    bandwidth = (max(delta_rt) - min(delta_rt)) / 5,
+    bandwidth = (max(template_rt) - min(template_rt)) / 5,
     x.points = to_correct
   )
 
   corrected[idx] <- this.smooth$y + to_correct
   lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
   upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])
 
-  idx_lower <- original < min(delta_rt)
-  idx_upper <- original > max(delta_rt)
+  idx_lower <- original < min(template_rt)
+  idx_upper <- original > max(template_rt)
 
   corrected[idx_lower] <- corrected[idx_lower] + lower_bound_adjustment
   corrected[idx_upper] <- corrected[idx_upper] + upper_bound_adjustment
@@ -91,6 +118,10 @@ compute_corrected_features <- function(features, delta_rt, avg_time) {
   return(features)
 }
 
+#' Fill missing values based on original retention times.
+#' @param orig.features Non-corrected feature table.
+#' @param this.features Feature table with eventual missing values.
+#' @return Tibble Feature table with filles values.
 #' @export
 fill_missing_values <- function(orig.feature, this.feature) {
   missing_values <- which(is.na(this.feature$rt))
@@ -104,6 +135,10 @@ fill_missing_values <- function(orig.feature, this.feature) {
   return(this.feature)
 }
 
+#' Function to perform retention time correction
+#' @param this.feature Tibble Feature table for which to correct rt.
+#' @param template_features Tibble Template feature table to use for correction.
+#' @return Tibble this.feature table with corrected rt values.
 #' @export
 correct_time <- function(this.feature, template_features) {
     orig.features <- this.feature
@@ -137,6 +172,10 @@ correct_time <- function(this.feature, template_features) {
   return(tibble::as_tibble(this.feature, column_name = c("mz", "rt", "sd1", "sd2", "area", "sample_id", "cluster")))
 }
 
+#' Select the template feature table.
+#' @description The current implementation selects the table with the most features as the template.
+#' @param extracted_features List of tables Tables from which to select the template.
+#' @return Tibble Template feature table.
 #' @export
 compute_template <- function(extracted_features) {
   num.ftrs <- sapply(extracted_features, nrow)
@@ -149,6 +188,14 @@ compute_template <- function(extracted_features) {
   return(tibble::as_tibble(template_features))
 }
 
+#' Rewritten version of 'correct_time'
+#' @description This function uses dplyr to do the same as
+#' 'correct_time', just with less code. Most functions used in the original
+#' function are replaced with simple data transformations.
+#' @param features Tibble Table with features to correct.
+#' @param template Tibble Template feature table to use for correction.
+#' @return Tibble Corrected feature table.
+#' @export
 correct_time_v2 <- function(features, template) {
   if (unique(features$sample_id) == unique(template$sample_id))
     return(tibble::as_tibble(features))