From bdd320dfcfc71cd9f4dede01c1ba758578970ed4 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 2 Nov 2023 12:09:53 -0500 Subject: [PATCH 1/3] use map.collect for rayon too improve performance by 6% --- Cargo.lock | 84 +++++++++++++++++------------- webgestalt_lib/Cargo.toml | 5 +- webgestalt_lib/README.md | 6 +-- webgestalt_lib/src/methods/gsea.rs | 28 +++++----- 4 files changed, 69 insertions(+), 54 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8063393..ea0ffde 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,14 +4,15 @@ version = 3 [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "91429305e9f0a25f6205c5b8e0d2db09e0708a7a6df0f42212bb56c32c8ac97a" dependencies = [ "cfg-if", "getrandom", "once_cell", "version_check", + "zerocopy", ] [[package]] @@ -111,9 +112,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.4.6" +version = "4.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d04704f56c2cde07f43e8e2c154b43f216dc5c92fc98ada720177362f953b956" +checksum = "ac495e00dcec98c83465d5ad66c5c4fabd652fd6686e7c6269b117e729a6f17b" dependencies = [ "clap_builder", "clap_derive", @@ -121,9 +122,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.4.6" +version = "4.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e231faeaca65ebd1ea3c737966bf858971cd38c3849107aa3ea7de90a804e45" +checksum = "c77ed9a32a62e6ca27175d00d29d05ca32e396ea1eb5fb01d8256b669cec7663" dependencies = [ "anstream", "anstyle", @@ -133,21 +134,21 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.4.2" +version = "4.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0862016ff20d69b84ef8247369fabf5c008a7417002411897d40ee1f4532b873" +checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] name = "clap_lex" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd7cc57abe963c6d3b9d8be5b06ba7c8957a930305ca90304f24ef040aa6f961" +checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" [[package]] name = "colorchoice" @@ -267,15 +268,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.148" +version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" [[package]] name = "libm" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "matrixmultiply" @@ -363,9 +364,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", "libm", @@ -410,9 +411,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.67" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" +checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" dependencies = [ "unicode-ident", ] @@ -492,12 +493,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "ryu" version = "1.0.15" @@ -521,22 +516,22 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "serde" -version = "1.0.188" +version = "1.0.190" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" +checksum = "91d3c334ca1ee894a2c6f6ad698fe8c435b76d504b13d436f0685d648d6d96f7" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.188" +version = "1.0.190" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" +checksum = "67c5609f394e5c2bd7fc51efda478004ea80ef42fee983d5c67a65e34f32c0e3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -594,9 +589,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.37" +version = "2.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" +checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" dependencies = [ "proc-macro2", "quote", @@ -653,16 +648,15 @@ dependencies = [ "pretty_assertions", "rand", "rayon", - "rustc-hash", "serde", "statrs", ] [[package]] name = "wide" -version = "0.7.12" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebecebefc38ff1860b4bc47550bbfa63af5746061cf0d29fcd7fa63171602598" +checksum = "c68938b57b33da363195412cfc5fc37c9ed49aa9cfe2156fde64b8d2c9498242" dependencies = [ "bytemuck", "safe_arch", @@ -761,3 +755,23 @@ name = "yansi" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" + +[[package]] +name = "zerocopy" +version = "0.7.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686b7e407015242119c33dab17b8f61ba6843534de936d94368856528eae4dcc" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "020f3dfe25dfc38dfea49ce62d5d45ecdd7f0d8a724fa63eb36b6eba4ec76806" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.38", +] diff --git a/webgestalt_lib/Cargo.toml b/webgestalt_lib/Cargo.toml index 3791929..960a0bb 100644 --- a/webgestalt_lib/Cargo.toml +++ b/webgestalt_lib/Cargo.toml @@ -9,12 +9,11 @@ rust-version = "1.63.0" [dependencies] csv = "1.3.0" -serde = { version = "1.0.188", features = ["std", "derive"] } +serde = { version = "1.0.190", features = ["std", "derive"] } rand = { version = "0.8.5", features = ["small_rng"] } rayon = "1.8.0" -rustc-hash = "1.1.0" statrs = "0.16.0" -ahash = "0.8.3" +ahash = "0.8.6" [dev-dependencies] pretty_assertions = "1.4.0" diff --git a/webgestalt_lib/README.md b/webgestalt_lib/README.md index abfebd7..af446f4 100644 --- a/webgestalt_lib/README.md +++ b/webgestalt_lib/README.md @@ -11,7 +11,7 @@ Supported methods include: ## Installation -To use webgestalt_lib in your Rust project, add the following line to your `Cargo.toml`. +To use webgestalt_lib in your Rust project, add the following line to your `Cargo.toml`. ```toml webgestalt_lib = {git = "https://github.com/bzhanglab/webgestalt_rust.git"} @@ -23,8 +23,8 @@ If you are just interested in running an analysis, rather than develop new tools 1. Fast and correct implementations of enrichment methods 2. Full compatibility with the WebGestaltR package - - The R package provides the most reporting functionality, and project was initially created to only assist the R package with the computation aspects + - The R package provides the most reporting functionality, and project was initially created to only assist the R package with the computation aspects 3. Fast compilation times - - Every package install has to build the library from scratch, so the lower number of dependencies, the better + - Every package install has to build the library from scratch, so the lower number of dependencies, the better This crate does not provide any data formatting, or charts to display the results of the analysis. This work has already been done by the [R package](https://github.com/bzhanglab/webgestaltr), and a limited implementation is provided by the Rust CLI. The focus for this library is purely computational. diff --git a/webgestalt_lib/src/methods/gsea.rs b/webgestalt_lib/src/methods/gsea.rs index f013404..c7965f3 100644 --- a/webgestalt_lib/src/methods/gsea.rs +++ b/webgestalt_lib/src/methods/gsea.rs @@ -161,19 +161,21 @@ fn analyte_set_p( inverse_nr, false, ); - let mut es_iter = Vec::new(); // Not parallelized because locking is expensive - (0..permutations).for_each(|i| { - // get es for the permutations - let (p_es, _, _) = enrichment_score( - &has_analyte, - &new_ranks, - &permutations_vec[i], - inverse_size_dif, - inverse_nr, - true, - ); - es_iter.push(p_es); - }); + let es_iter: Vec = (0..permutations) + .into_par_iter() + .map(|i| { + // get es for the permutations + let (p_es, _, _) = enrichment_score( + &has_analyte, + &new_ranks, + &permutations_vec[i], + inverse_size_dif, + inverse_nr, + true, + ); + p_es + }) + .collect(); let side: Vec<&f64> = if real_es >= 0_f64 { // get side of distribution for p value es_iter.iter().filter(|x| *x >= &0_f64).collect() From 2868dc680c2c6978d01e08246d2e7682de4f6c11 Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 2 Nov 2023 12:13:44 -0500 Subject: [PATCH 2/3] use map collect for ora --- webgestalt_lib/src/methods/ora.rs | 43 ++++++++++++++++--------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/webgestalt_lib/src/methods/ora.rs b/webgestalt_lib/src/methods/ora.rs index d539db8..f80fe66 100644 --- a/webgestalt_lib/src/methods/ora.rs +++ b/webgestalt_lib/src/methods/ora.rs @@ -69,29 +69,30 @@ pub fn get_ora( ) -> Vec { let m: i64 = reference.len() as i64; let n: i64 = interest_list.len() as i64; - let res = Arc::new(Mutex::new(Vec::new())); - gmt.par_iter().for_each(|i| { - let mut j: i64 = 0; - let mut enriched_parts: AHashSet = AHashSet::default(); - let mut k: i64 = 0; - for analyte in i.parts.iter() { - if interest_list.contains(analyte) { - k += 1; - enriched_parts.insert(analyte.to_owned()); + let partials: Vec = gmt + .par_iter() + .map(|i| { + let mut j: i64 = 0; + let mut enriched_parts: AHashSet = AHashSet::default(); + let mut k: i64 = 0; + for analyte in i.parts.iter() { + if interest_list.contains(analyte) { + k += 1; + enriched_parts.insert(analyte.to_owned()); + } + if reference.contains(analyte) { + j += 1; + } } - if reference.contains(analyte) { - j += 1; + let p = if k == 0 { 1.0 } else { ora_p(m, j, n, k) }; + PartialORAResult { + set: i.id.clone(), + p, + overlap: k, + expected: j as f64 * n as f64 / m as f64, } - } - let p = if k == 0 { 1.0 } else { ora_p(m, j, n, k) }; - res.lock().unwrap().push(PartialORAResult { - set: i.id.clone(), - p, - overlap: k, - expected: j as f64 * n as f64 / m as f64, - }); - }); - let partials = res.lock().unwrap(); + }) + .collect(); let p_vals: Vec = partials.iter().map(|x| x.p).collect(); let fdrs: Vec = stat::adjust(&p_vals, config.fdr_method); let mut final_res = Vec::new(); From a16b4c5a71c73e0fcba2ccc203ba5dd4657a039f Mon Sep 17 00:00:00 2001 From: John Elizarraras Date: Thu, 2 Nov 2023 13:21:27 -0500 Subject: [PATCH 3/3] 16% speed improvement and rename GSEAResult to PartialGSEAResult FullGSEAResult -> GSEAResult remove overlap --- webgestalt_lib/src/methods/gsea.rs | 96 +++++++++++------------- webgestalt_lib/src/methods/multiomics.rs | 12 +-- 2 files changed, 49 insertions(+), 59 deletions(-) diff --git a/webgestalt_lib/src/methods/gsea.rs b/webgestalt_lib/src/methods/gsea.rs index c7965f3..ff36f92 100644 --- a/webgestalt_lib/src/methods/gsea.rs +++ b/webgestalt_lib/src/methods/gsea.rs @@ -36,20 +36,20 @@ pub struct RankListItem { pub rank: f64, } -struct GSEAResult { +struct PartialGSEAResult { // TODO: Look at adding enrichment and normalized enrichment score set: String, p: f64, es: f64, nes: f64, - overlap: i32, leading_edge: i32, running_sum: Vec, + nes_iter: Vec, } -impl GSEAResult { - pub fn add_fdr(&self, fdr: f64) -> FullGSEAResult { - FullGSEAResult { +impl PartialGSEAResult { + pub fn add_fdr(&self, fdr: f64) -> GSEAResult { + GSEAResult { set: self.set.clone(), p: self.p, fdr, @@ -62,7 +62,7 @@ impl GSEAResult { } #[derive(Clone)] -pub struct FullGSEAResult { +pub struct GSEAResult { /// The set name pub set: String, /// The statistical p-value @@ -113,7 +113,7 @@ fn analyte_set_p( p: f64, permutations_vec: &Vec>, config: &GSEAConfig, -) -> (GSEAResult, Vec) { +) -> PartialGSEAResult { let permutations = permutations_vec.len(); let analyte_set = AHashSet::from_iter(item.parts.iter()); let mut n_r: f64 = 0.0; @@ -128,19 +128,16 @@ fn analyte_set_p( } } if overlap < config.min_overlap || overlap > config.max_overlap { - ( - GSEAResult { - // No GSEA needed - set: item.id.clone(), - p: 1.0, - nes: 0.0, - es: 0.0, - overlap, - leading_edge: 0, - running_sum: Vec::new(), - }, - Vec::new(), - ) + PartialGSEAResult { + // No GSEA needed + set: item.id.clone(), + p: 1.0, + nes: 0.0, + es: 0.0, + leading_edge: 0, + running_sum: Vec::new(), + nes_iter: Vec::new(), + } } else { let inverse_nr = 1.0 / n_r; // Invert n_r for the enrichment score let original_order = (0..analyte_count).collect::>(); // get regular order @@ -215,18 +212,15 @@ fn analyte_set_p( } else { -real_es / down_avg }; - ( - GSEAResult { - set: item.id.clone(), - p, - nes: norm_es, - es: real_es, - overlap, - leading_edge: max_hits, - running_sum, - }, - nes_es, - ) + PartialGSEAResult { + set: item.id.clone(), + p, + nes: norm_es, + es: real_es, + leading_edge: max_hits, + running_sum, + nes_iter: nes_es, + } } } @@ -295,29 +289,25 @@ pub fn gsea( gmt: Vec, config: GSEAConfig, provided_permutations: Option>>, -) -> Vec { +) -> Vec { println!("Starting GSEA Calculation."); analyte_list.sort_by(|a, b| b.rank.partial_cmp(&a.rank).unwrap()); // sort list let (analytes, ranks) = RankListItem::to_vecs(analyte_list.clone()); // seperate into vectors let permutations: Vec> = provided_permutations.unwrap_or(make_permutations(config.permutations, analytes.len())); - let all_nes = Arc::new(Mutex::new(Vec::new())); - let set_nes = Arc::new(Mutex::new(Vec::new())); - let all_res = Arc::new(Mutex::new(Vec::new())); - gmt.par_iter().for_each(|analyte_set| { - // parallelized scoring of all sets - let (y, nes_iter) = - analyte_set_p(&analytes, &ranks, analyte_set, 1.0, &permutations, &config); - if y.overlap >= config.min_overlap && y.overlap <= config.max_overlap { - all_nes.lock().unwrap().extend(nes_iter); - set_nes.lock().unwrap().push(y.nes); - all_res.lock().unwrap().push(y); - } - }); - let null_distribution = all_nes.lock().unwrap(); - let observed_distribution = set_nes.lock().unwrap(); - let partial_results: std::sync::MutexGuard<'_, Vec> = all_res.lock().unwrap(); - let mut final_gsea: Vec = Vec::new(); + let partial_results: Vec = gmt + .par_iter() + .map(|analyte_set| { + // parallelized scoring of all sets + analyte_set_p(&analytes, &ranks, analyte_set, 1.0, &permutations, &config) + }) + .collect(); + let null_distribution: Vec = partial_results + .iter() + .flat_map(|x| x.nes_iter.clone()) + .collect(); + let observed_distribution: Vec = partial_results.iter().map(|x| x.nes).collect(); + let mut final_gsea: Vec = Vec::new(); let postive_top_side = null_distribution .par_iter() .filter(|&x| x >= &0_f64) @@ -334,9 +324,9 @@ pub fn gsea( .par_iter() .filter(|&x| x < &0_f64) .collect(); - for i in 0..partial_results.len() { + for item in &partial_results { // get all FDR values - let nes = partial_results[i].nes; + let nes = item.nes; let top_side: &Vec<&f64> = if nes > 0_f64 { // positive null distribution &postive_top_side @@ -368,7 +358,7 @@ pub fn gsea( .filter(|&x| x.abs() >= nes_abs) .count() as f64; let fdr: f64 = (top_val * bottom_len) / (bottom_val * top_len); // get FDR value - final_gsea.push(partial_results[i].add_fdr(fdr)); + final_gsea.push(item.add_fdr(fdr)); } final_gsea } diff --git a/webgestalt_lib/src/methods/multiomics.rs b/webgestalt_lib/src/methods/multiomics.rs index 2fb40c4..bd700e2 100644 --- a/webgestalt_lib/src/methods/multiomics.rs +++ b/webgestalt_lib/src/methods/multiomics.rs @@ -2,7 +2,7 @@ use ahash::{AHashMap, AHashSet}; use statrs::distribution::{Continuous, ContinuousCDF, Normal}; use super::{ - gsea::{FullGSEAResult, GSEAConfig, RankListItem}, + gsea::{GSEAConfig, GSEAResult, RankListItem}, ora::{get_ora, ORAConfig, ORAResult}, }; use crate::{methods::gsea::gsea, readers::utils::Item}; @@ -61,10 +61,10 @@ pub enum NormalizationMethod { /// /// Returns a [`Vec>`] containing the results of each analysis. If the method was not meta-analysis, then the outer vector will only have one element. /// If the method was meta-analysis, then the first element will be the results of the meta-analysis, and the rest of the elements will be the results of each analysis run individually. -pub fn multiomic_gsea(jobs: Vec, method: MultiOmicsMethod) -> Vec> { +pub fn multiomic_gsea(jobs: Vec, method: MultiOmicsMethod) -> Vec> { if let MultiOmicsMethod::Meta(meta_method) = method { let mut phash: AHashMap> = AHashMap::default(); - let mut results: Vec> = Vec::new(); + let mut results: Vec> = Vec::new(); for job in jobs { let res = gsea(job.rank_list, job.gmt, job.config, None); for row in res.iter() { @@ -73,12 +73,12 @@ pub fn multiomic_gsea(jobs: Vec, method: MultiOmicsMethod) -> Vec = Vec::new(); + let mut final_result: Vec = Vec::new(); match meta_method { MetaAnalysisMethod::Stouffer => { let normal = Normal::new(0.0, 1.0).unwrap(); for set in phash.keys() { - final_result.push(FullGSEAResult { + final_result.push(GSEAResult { set: set.clone(), p: stouffer_with_normal(&phash[set], &normal), fdr: 0.0, @@ -91,7 +91,7 @@ pub fn multiomic_gsea(jobs: Vec, method: MultiOmicsMethod) -> Vec { for set in phash.keys() { - final_result.push(FullGSEAResult { + final_result.push(GSEAResult { set: set.clone(), p: fisher(&phash[set]), fdr: 0.0,