diff --git a/Cargo.toml b/Cargo.toml index 919e1ad..6e1a064 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "hdt" -version = "0.1.0" +version = "0.1.1" repository = "https://github.com/konradhoeffner/hdt" authors = ["Tim Baccaert ", "Konrad Höffner"] license = "MIT" @@ -20,7 +20,7 @@ langtag = "^0.3.2" ntriple = "^0.1.1" rsdict = { version = "0.0.6", features = ["simd"] } sophia = { version = "0.8.0-alpha.0", optional = true } -sucds = "0.6.0" +sucds = "0.7.0" thiserror = "1.0.37" log = "0.4" mownstr = "0.2.0" diff --git a/README.md b/README.md index 41adc27..8047348 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ Add the following to Cargo.toml: ```toml [dependencies] -hdt = "0.0.13-alpha.0" +hdt = "0.1.1" ``` Nightly is required: @@ -70,9 +70,11 @@ If you don't want to pull in the Sophia dependency, you can exclude the adapter: ```toml [dependencies] -hdt = { version = "0.0.13-alpha.0", default-features = false } +hdt = { version = "0.1.1", default-features = false } ``` +There is also a runnable example [in the examples folder](https://github.com/KonradHoeffner/hdt/tree/main/examples), which you can run with `cargo run --example query`. + ## API Documentation See [docs.rs/latest/hdt](https://docs.rs/hdt) or generate for yourself with `cargo doc --no-deps` without disabling default features. diff --git a/src/triples.rs b/src/triples.rs index 6ad04c5..7043d56 100644 --- a/src/triples.rs +++ b/src/triples.rs @@ -8,7 +8,7 @@ use std::convert::TryFrom; use std::fmt; use std::io; use std::io::BufRead; -use sucds::{CompactVector, Searial, WaveletMatrix, WaveletMatrixBuilder}; +use sucds::{bit_vectors::Rank9Sel, char_sequences::WaveletMatrix, int_vectors::CompactVector, Serializable}; mod subject_iter; pub use subject_iter::SubjectIter; @@ -102,7 +102,7 @@ pub struct TriplesBitmap { /// Index for object-based access. Points to the predicate layer.
pub op_index: OpIndex, /// wavelet matrix for predicate-based access - pub wavelet_y: WaveletMatrix, + pub wavelet_y: WaveletMatrix<Rank9Sel>, } impl fmt::Debug for TriplesBitmap { @@ -155,7 +155,7 @@ impl TriplesBitmap { while low < high { let mid = (low + high) / 2; - match self.wavelet_y.get(mid).cmp(&element) { + match self.wavelet_y.access(mid).unwrap().cmp(&element) { Ordering::Less => low = mid + 1, Ordering::Greater => high = mid, Ordering::Equal => return Some(mid), @@ -169,15 +169,17 @@ impl TriplesBitmap { self.bin_search_y(property_id, self.find_y(subject_id), self.last_y(subject_id) + 1) } - fn build_wavelet(mut sequence: Sequence) -> WaveletMatrix { + fn build_wavelet(mut sequence: Sequence) -> WaveletMatrix<Rank9Sel> { debug!("Building wavelet matrix..."); - let mut wavelet_builder = WaveletMatrixBuilder::with_width(sequence.bits_per_entry); + let mut builder = + CompactVector::new(sequence.bits_per_entry).expect("Failed to create wavelet matrix builder"); + // possible refactor of Sequence to use sucds CompactVector, then builder can be removed for x in &sequence { - wavelet_builder.push(x); + builder.push_int(x).unwrap(); } assert!(sequence.crc_handle.take().unwrap().join().unwrap(), "wavelet source CRC check failed."); drop(sequence); - let wavelet = wavelet_builder.build().expect("Error building the wavelet matrix.
Aborting."); debug!("built wavelet matrix with length {}", wavelet.len()); wavelet } @@ -235,22 +237,23 @@ impl TriplesBitmap { } // reduce memory consumption of index by using adjacency list let mut bitmap_index_dict = RsDict::new(); - let mut cv = CompactVector::with_capacity(entries, sucds::util::needed_bits(entries)); + let mut cv = CompactVector::with_capacity(entries, sucds::utils::needed_bits(entries)) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; let wavelet_y = wavelet_thread.join().unwrap(); /* let get_p = |pos_z: u32| { let pos_y = bitmap_z.dict.rank(pos_z.to_owned() as u64, true); - wavelet_y.get(pos_y as usize) as Id + wavelet_y.access(pos_y as usize).unwrap() as Id }; */ for mut indices in indicess { let mut first = true; // sort by predicate - indices.sort_by_cached_key(|pos_y| wavelet_y.get(*pos_y as usize)); + indices.sort_by_cached_key(|pos_y| wavelet_y.access(*pos_y as usize).unwrap()); for index in indices { bitmap_index_dict.push(first); first = false; - cv.push(index as usize); + cv.push_int(index as usize).unwrap(); } } let bitmap_index = Bitmap { dict: bitmap_index_dict }; diff --git a/src/triples/object_iter.rs b/src/triples/object_iter.rs index df068b9..62f2a8b 100644 --- a/src/triples/object_iter.rs +++ b/src/triples/object_iter.rs @@ -1,6 +1,7 @@ use crate::triples::Id; use crate::triples::TripleId; use crate::triples::TriplesBitmap; +use sucds::int_vectors::Access; // see "Exchange and Consumption of Huge RDF Data" by Martinez et al. 
2012 // https://link.springer.com/chapter/10.1007/978-3-642-30284-8_36 @@ -33,8 +34,8 @@ impl<'a> Iterator for ObjectIter<'a> { if self.pos_index > self.max_index { return None; } - let pos_y = self.triples.op_index.sequence.get(self.pos_index); - let y = self.triples.wavelet_y.get(pos_y) as Id; + let pos_y = self.triples.op_index.sequence.access(self.pos_index).unwrap(); + let y = self.triples.wavelet_y.access(pos_y).unwrap() as Id; let x = self.triples.bitmap_y.dict.rank(pos_y as u64, true) as Id + 1; self.pos_index += 1; Some(TripleId::new(x, y, self.o)) diff --git a/src/triples/predicate_iter.rs b/src/triples/predicate_iter.rs index 3858fe7..31618fd 100644 --- a/src/triples/predicate_iter.rs +++ b/src/triples/predicate_iter.rs @@ -18,7 +18,7 @@ impl<'a> PredicateIter<'a> { /// Panics if the object does not exist. pub fn new(triples: &'a TriplesBitmap, p: Id) -> Self { assert!(p != 0, "object 0 does not exist, cant iterate"); - let occs = triples.wavelet_y.rank(triples.wavelet_y.len(), p as usize); + let occs = triples.wavelet_y.rank(triples.wavelet_y.len(), p as usize).unwrap(); //println!("the predicate {} is used by {} subjects in the index", p, occs); PredicateIter { triples, p, i: 0, pos_z: 0, os: 0, s: 0, occs } } @@ -32,7 +32,7 @@ impl<'a> Iterator for PredicateIter<'a> { } if self.os == 0 { // Algorithm 1 findSubj from Martinez et al. 
2012 ****** - let pos_y = self.triples.wavelet_y.select(self.i, self.p as usize) as u64; + let pos_y = self.triples.wavelet_y.select(self.i, self.p as usize).unwrap() as u64; self.s = self.triples.bitmap_y.dict.rank(pos_y, true) as Id + 1; // ***************************************************** // SP can have multiple O diff --git a/src/triples/predicate_object_iter.rs b/src/triples/predicate_object_iter.rs index 9deb0ea..32848f6 100644 --- a/src/triples/predicate_object_iter.rs +++ b/src/triples/predicate_object_iter.rs @@ -1,6 +1,7 @@ use crate::triples::Id; use crate::triples::TriplesBitmap; use std::cmp::Ordering; +use sucds::int_vectors::Access; // see filterPredSubj in "Exchange and Consumption of Huge RDF Data" by Martinez et al. 2012 // https://link.springer.com/chapter/10.1007/978-3-642-30284-8_36 @@ -21,8 +22,8 @@ impl<'a> PredicateObjectIter<'a> { let mut low = triples.op_index.find(o); let mut high = triples.op_index.last(o); let get_y = |pos_index| { - let pos_y = triples.op_index.sequence.get(pos_index) as u64; - triples.wavelet_y.get(pos_y as usize) as Id + let pos_y = triples.op_index.sequence.access(pos_index).unwrap() as u64; + triples.wavelet_y.access(pos_y as usize).unwrap() as Id }; // Binary search with a twist: // Each value may occur multiple times, so we search for the left and right borders. 
@@ -68,7 +69,7 @@ impl<'a> Iterator for PredicateObjectIter<'a> { if self.pos_index > self.max_index { return None; } - let pos_y = self.triples.op_index.sequence.get(self.pos_index) as u64; + let pos_y = self.triples.op_index.sequence.access(self.pos_index).unwrap() as u64; //let y = self.triples.wavelet_y.get(pos_y as usize) as Id; //println!(" op p {y}"); let s = self.triples.bitmap_y.dict.rank(pos_y, true) as Id + 1; diff --git a/src/triples/subject_iter.rs b/src/triples/subject_iter.rs index 4428956..742cddd 100644 --- a/src/triples/subject_iter.rs +++ b/src/triples/subject_iter.rs @@ -17,7 +17,7 @@ pub struct SubjectIter<'a> { impl<'a> SubjectIter<'a> { /// Create an iterator over all triples. - pub const fn new(triples: &'a TriplesBitmap) -> Self { + pub fn new(triples: &'a TriplesBitmap) -> Self { SubjectIter { triples, x: 1, // was 0 in the old code but it should start at 1 @@ -114,7 +114,7 @@ impl<'a> Iterator for SubjectIter<'a> { return None; } - let y = self.triples.wavelet_y.get(self.pos_y) as Id; + let y = self.triples.wavelet_y.access(self.pos_y).unwrap() as Id; if self.search_z > 0 { self.pos_y += 1;