Replies: 2 comments
-
extern crate ndarray_linalg;
extern crate netlib_src;
use criterion::{Benchmark, Criterion, black_box, criterion_group, criterion_main};
use ndarray::{arr1, Array1};
use nalgebra::Vector3;
use rand::random;
fn benchmark(c: &mut Criterion) {
c.bench(
"dot",
Benchmark::new("ndarray", |b| {
let b1: Array1<f64> = Array1::from_vec(vec![random(), random(), random()]);
let b2: Array1<f64> = Array1::from_vec(vec![random(), random(), random()]);
b.iter(|| black_box(b1.dot(&b2)));
})
.with_function("nalgebra", |b| {
let b1 = Vector3::<f64>::new(random(), random(), random());
let b2 = Vector3::<f64>::new(random(), random(), random());
b.iter(|| black_box(b1.dot(&b2)));
})
);
}
criterion_group!(benches, benchmark);
criterion_main!(benches); I ran the benchmark, and got the following result dot/ndarray time: [7.3657 ns 7.4323 ns 7.5013 ns]
change: [-1.9526% -0.4306% +1.1328%] (p = 0.60 > 0.05)
No change in performance detected.
Found 4 outliers among 100 measurements (4.00%)
3 (3.00%) high mild
1 (1.00%) high severe
dot/nalgebra time: [684.51 ps 692.21 ps 700.52 ps]
change: [+0.8324% +2.5543% +4.2500%] (p = 0.00 < 0.05)
Change within noise threshold.
Found 4 outliers among 100 measurements (4.00%)
2 (2.00%) high mild
2 (2.00%) high severe After I made all the relevant (for this particular case) dot/ndarray time: [1.9718 ns 1.9925 ns 2.0154 ns]
change: [-73.509% -73.050% -72.590%] (p = 0.00 < 0.05)
Performance has improved.
Found 5 outliers among 100 measurements (5.00%)
4 (4.00%) high mild
1 (1.00%) high severe
dot/nalgebra time: [695.66 ps 702.03 ps 708.21 ps]
change: [-2.1110% +0.1197% +2.5369%] (p = 0.93 > 0.05)
No change in performance detected.
Found 7 outliers among 100 measurements (7.00%)
3 (3.00%) high mild
4 (4.00%) high severe Once I even get 890ps or so by more aggressively commenting out the code and leaving only logic required for Anyway here is a patch if you'd like to reproduce the results diff --git a/src/linalg/impl_linalg.rs b/src/linalg/impl_linalg.rs
index a40f386..7e06e13 100644
--- a/src/linalg/impl_linalg.rs
+++ b/src/linalg/impl_linalg.rs
@@ -56,6 +56,7 @@ where
/// **Panics** if the array shapes are incompatible.<br>
/// *Note:* If enabled, uses blas `dot` for elements of `f32, f64` when memory
/// layout allows.
+ #[inline(always)]
pub fn dot<Rhs>(&self, rhs: &Rhs) -> <Self as Dot<Rhs>>::Output
where
Self: Dot<Rhs>,
@@ -63,6 +64,7 @@ where
Dot::dot(self, rhs)
}
+ #[inline(always)]
fn dot_generic<S2>(&self, rhs: &ArrayBase<S2, Ix1>) -> A
where
S2: Data<Elem = A>,
@@ -72,6 +74,31 @@ where
assert!(self.len() == rhs.len());
if let Some(self_s) = self.as_slice() {
if let Some(rhs_s) = rhs.as_slice() {
+ let vector_len = self_s.len();
+ if vector_len == 2 {
+ return unsafe {
+ let a = *self_s.get_unchecked(0) * *rhs_s.get_unchecked(0);
+ let b = *self_s.get_unchecked(1) * *rhs_s.get_unchecked(1);
+ a + b
+ };
+ } else if vector_len == 3 {
+ return unsafe {
+ let a = *self_s.get_unchecked(0) * *rhs_s.get_unchecked(0);
+ let b = *self_s.get_unchecked(1) * *rhs_s.get_unchecked(1);
+ let c = *self_s.get_unchecked(2) * *rhs_s.get_unchecked(2);
+ a + b + c
+ };
+ } else if vector_len == 4 {
+ return unsafe {
+ let mut a = *self_s.get_unchecked(0) * *rhs_s.get_unchecked(0);
+ let mut b = *self_s.get_unchecked(1) * *rhs_s.get_unchecked(1);
+ let c = *self_s.get_unchecked(2) * *rhs_s.get_unchecked(2);
+ let d = *self_s.get_unchecked(3) * *rhs_s.get_unchecked(3);
+ a = a + c;
+ b = b + d;
+ a + b
+ };
+ }
return numeric_util::unrolled_dot(self_s, rhs_s);
}
}
@@ -94,6 +121,7 @@ where
}
#[cfg(feature = "blas")]
+ #[inline(always)]
fn dot_impl<S2>(&self, rhs: &ArrayBase<S2, Ix1>) -> A
where
S2: Data<Elem = A>,
@@ -182,6 +210,7 @@ where
/// **Panics** if the arrays are not of the same length.<br>
/// *Note:* If enabled, uses blas `dot` for elements of `f32, f64` when memory
/// layout allows.
+ #[inline(always)]
fn dot(&self, rhs: &ArrayBase<S2, Ix1>) -> A {
self.dot_impl(rhs)
} |
Beta Was this translation helpful? Give feedback.
-
Hi Rolf, While of course it's welcome to improve upon this — it should not be done at the expense of our core competency which is handling big arrays. Do you have an array of 100 elements or 10000 elements? Then it's our game. |
Beta Was this translation helpful? Give feedback.
-
I made a comparison between ndarray and nalgebra dot products of 3d-Vectors. The performance difference is very large whith ndarray being 12 times slower than nalgebra.
Please find the code below. Did I do anything wrong? I would like to create a python module that calls rust functions and as far as I have seen this is easily possible with ndarray but not nalgebra.
Thanks in advance
Rolf
output:
sum1 = 3599999.999718683 dt1 = 114209136
sum2 = 3599999.999718683 dt2 = 8604722
main.rs:
#![allow(non_snake_case)]
#![allow(unused)]
#[macro_use]
extern crate ndarray;
extern crate nalgebra;
use ndarray::prelude::*;
use ndarray::{arr1, arr2, Array1, Array2};
use nalgebra::Vector3;
use std::time::{Duration, Instant};
fn main() {
let n = 10000000;
let mut sum1 = 0.0;
let b1 = arr1(&[1.0, 0.2, 0.3]);
let b2 = arr1(&[0.1, 1.0, 0.2]);
let t1 = Instant::now();
for i in 0..n {
sum1 += b1.dot(&b2);
}
let dt1 = t1.elapsed().as_nanos();
println!("sum1 = {} dt1 = {}", sum1, dt1);
}
Cargo.toml:
[package]
name = "dottest"
version = "0.1.0"
authors = ["ropewe56"]
edition = "2018"
[dependencies]
ndarray = "0.12.1"
nalgebra = "0.18.0"
Beta Was this translation helpful? Give feedback.
All reactions