Skip to content

Commit

Permalink
Update sucds from 0.6.0 to 0.7.0 and refactor code accordingly. Relea…
Browse files Browse the repository at this point in the history
…se 0.1.1.
  • Loading branch information
KonradHoeffner committed Apr 27, 2023
1 parent 815dbd4 commit ef4d425
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 24 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "hdt"
version = "0.1.0"
version = "0.1.1"
repository = "https://github.com/konradhoeffner/hdt"
authors = ["Tim Baccaert <[email protected]>", "Konrad Höffner"]
license = "MIT"
Expand All @@ -20,7 +20,7 @@ langtag = "^0.3.2"
ntriple = "^0.1.1"
rsdict = { version = "0.0.6", features = ["simd"] }
sophia = { version = "0.8.0-alpha.0", optional = true }
sucds = "0.6.0"
sucds = "0.7.0"
thiserror = "1.0.37"
log = "0.4"
mownstr = "0.2.0"
Expand Down
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ Add the following to Cargo.toml:

```toml
[dependencies]
hdt = "0.0.13-alpha.0"
hdt = "0.1.1"
```

Nightly is required:
Expand Down Expand Up @@ -70,9 +70,11 @@ If you don't want to pull in the Sophia dependency, you can exclude the adapter:

```toml
[dependencies]
hdt = { version = "0.0.13-alpha.0", default-features = false }
hdt = { version = "0.1.1", default-features = false }
```

There is also a runnable example [in the examples folder](https://github.com/KonradHoeffner/hdt/tree/main/examples), which you can run with `cargo run --example query`.

## API Documentation

See [docs.rs/latest/hdt](https://docs.rs/hdt) or generate for yourself with `cargo doc --no-deps` without disabling default features.
Expand Down
25 changes: 14 additions & 11 deletions src/triples.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use std::convert::TryFrom;
use std::fmt;
use std::io;
use std::io::BufRead;
use sucds::{CompactVector, Searial, WaveletMatrix, WaveletMatrixBuilder};
use sucds::{bit_vectors::Rank9Sel, char_sequences::WaveletMatrix, int_vectors::CompactVector, Serializable};

mod subject_iter;
pub use subject_iter::SubjectIter;
Expand Down Expand Up @@ -102,7 +102,7 @@ pub struct TriplesBitmap {
/// Index for object-based access. Points to the predicate layer.
pub op_index: OpIndex,
/// wavelet matrix for predicate-based access
pub wavelet_y: WaveletMatrix,
pub wavelet_y: WaveletMatrix<Rank9Sel>,
}

impl fmt::Debug for TriplesBitmap {
Expand Down Expand Up @@ -155,7 +155,7 @@ impl TriplesBitmap {

while low < high {
let mid = (low + high) / 2;
match self.wavelet_y.get(mid).cmp(&element) {
match self.wavelet_y.access(mid).unwrap().cmp(&element) {
Ordering::Less => low = mid + 1,
Ordering::Greater => high = mid,
Ordering::Equal => return Some(mid),
Expand All @@ -169,15 +169,17 @@ impl TriplesBitmap {
self.bin_search_y(property_id, self.find_y(subject_id), self.last_y(subject_id) + 1)
}

fn build_wavelet(mut sequence: Sequence) -> WaveletMatrix {
fn build_wavelet(mut sequence: Sequence) -> WaveletMatrix<Rank9Sel> {
debug!("Building wavelet matrix...");
let mut wavelet_builder = WaveletMatrixBuilder::with_width(sequence.bits_per_entry);
let mut builder =
CompactVector::new(sequence.bits_per_entry).expect("Failed to create wavelet matrix builder");
// possible refactor of Sequence to use sucds CompactVector, then builder can be removed
for x in &sequence {
wavelet_builder.push(x);
builder.push_int(x).unwrap();
}
assert!(sequence.crc_handle.take().unwrap().join().unwrap(), "wavelet source CRC check failed.");
drop(sequence);
let wavelet = wavelet_builder.build().expect("Error building the wavelet matrix. Aborting.");
let wavelet = WaveletMatrix::new(builder).expect("Error building the wavelet matrix. Aborting.");
debug!("built wavelet matrix with length {}", wavelet.len());
wavelet
}
Expand Down Expand Up @@ -235,22 +237,23 @@ impl TriplesBitmap {
}
// reduce memory consumption of index by using adjacency list
let mut bitmap_index_dict = RsDict::new();
let mut cv = CompactVector::with_capacity(entries, sucds::util::needed_bits(entries));
let mut cv = CompactVector::with_capacity(entries, sucds::utils::needed_bits(entries))
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
let wavelet_y = wavelet_thread.join().unwrap();
/*
let get_p = |pos_z: u32| {
let pos_y = bitmap_z.dict.rank(pos_z.to_owned() as u64, true);
wavelet_y.get(pos_y as usize) as Id
wavelet_y.access(pos_y as usize).unwrap() as Id
};
*/
for mut indices in indicess {
let mut first = true;
// sort by predicate
indices.sort_by_cached_key(|pos_y| wavelet_y.get(*pos_y as usize));
indices.sort_by_cached_key(|pos_y| wavelet_y.access(*pos_y as usize).unwrap());
for index in indices {
bitmap_index_dict.push(first);
first = false;
cv.push(index as usize);
cv.push_int(index as usize).unwrap();
}
}
let bitmap_index = Bitmap { dict: bitmap_index_dict };
Expand Down
5 changes: 3 additions & 2 deletions src/triples/object_iter.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use crate::triples::Id;
use crate::triples::TripleId;
use crate::triples::TriplesBitmap;
use sucds::int_vectors::Access;

// see "Exchange and Consumption of Huge RDF Data" by Martinez et al. 2012
// https://link.springer.com/chapter/10.1007/978-3-642-30284-8_36
Expand Down Expand Up @@ -33,8 +34,8 @@ impl<'a> Iterator for ObjectIter<'a> {
if self.pos_index > self.max_index {
return None;
}
let pos_y = self.triples.op_index.sequence.get(self.pos_index);
let y = self.triples.wavelet_y.get(pos_y) as Id;
let pos_y = self.triples.op_index.sequence.access(self.pos_index).unwrap();
let y = self.triples.wavelet_y.access(pos_y).unwrap() as Id;
let x = self.triples.bitmap_y.dict.rank(pos_y as u64, true) as Id + 1;
self.pos_index += 1;
Some(TripleId::new(x, y, self.o))
Expand Down
4 changes: 2 additions & 2 deletions src/triples/predicate_iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ impl<'a> PredicateIter<'a> {
/// Panics if the predicate does not exist.
pub fn new(triples: &'a TriplesBitmap, p: Id) -> Self {
assert!(p != 0, "object 0 does not exist, cant iterate");
let occs = triples.wavelet_y.rank(triples.wavelet_y.len(), p as usize);
let occs = triples.wavelet_y.rank(triples.wavelet_y.len(), p as usize).unwrap();
//println!("the predicate {} is used by {} subjects in the index", p, occs);
PredicateIter { triples, p, i: 0, pos_z: 0, os: 0, s: 0, occs }
}
Expand All @@ -32,7 +32,7 @@ impl<'a> Iterator for PredicateIter<'a> {
}
if self.os == 0 {
// Algorithm 1 findSubj from Martinez et al. 2012 ******
let pos_y = self.triples.wavelet_y.select(self.i, self.p as usize) as u64;
let pos_y = self.triples.wavelet_y.select(self.i, self.p as usize).unwrap() as u64;
self.s = self.triples.bitmap_y.dict.rank(pos_y, true) as Id + 1;
// *****************************************************
// SP can have multiple O
Expand Down
7 changes: 4 additions & 3 deletions src/triples/predicate_object_iter.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use crate::triples::Id;
use crate::triples::TriplesBitmap;
use std::cmp::Ordering;
use sucds::int_vectors::Access;

// see filterPredSubj in "Exchange and Consumption of Huge RDF Data" by Martinez et al. 2012
// https://link.springer.com/chapter/10.1007/978-3-642-30284-8_36
Expand All @@ -21,8 +22,8 @@ impl<'a> PredicateObjectIter<'a> {
let mut low = triples.op_index.find(o);
let mut high = triples.op_index.last(o);
let get_y = |pos_index| {
let pos_y = triples.op_index.sequence.get(pos_index) as u64;
triples.wavelet_y.get(pos_y as usize) as Id
let pos_y = triples.op_index.sequence.access(pos_index).unwrap() as u64;
triples.wavelet_y.access(pos_y as usize).unwrap() as Id
};
// Binary search with a twist:
// Each value may occur multiple times, so we search for the left and right borders.
Expand Down Expand Up @@ -68,7 +69,7 @@ impl<'a> Iterator for PredicateObjectIter<'a> {
if self.pos_index > self.max_index {
return None;
}
let pos_y = self.triples.op_index.sequence.get(self.pos_index) as u64;
let pos_y = self.triples.op_index.sequence.access(self.pos_index).unwrap() as u64;
//let y = self.triples.wavelet_y.get(pos_y as usize) as Id;
//println!(" op p {y}");
let s = self.triples.bitmap_y.dict.rank(pos_y, true) as Id + 1;
Expand Down
4 changes: 2 additions & 2 deletions src/triples/subject_iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ pub struct SubjectIter<'a> {

impl<'a> SubjectIter<'a> {
/// Create an iterator over all triples.
pub const fn new(triples: &'a TriplesBitmap) -> Self {
pub fn new(triples: &'a TriplesBitmap) -> Self {
SubjectIter {
triples,
x: 1, // was 0 in the old code but it should start at 1
Expand Down Expand Up @@ -114,7 +114,7 @@ impl<'a> Iterator for SubjectIter<'a> {
return None;
}

let y = self.triples.wavelet_y.get(self.pos_y) as Id;
let y = self.triples.wavelet_y.access(self.pos_y).unwrap() as Id;

if self.search_z > 0 {
self.pos_y += 1;
Expand Down

0 comments on commit ef4d425

Please sign in to comment.