Skip to content

Commit

Permalink
fix: skip special tokens in the example
Browse files: browse the repository at this point in the history
  • Loading branch information
ChieloNewctle committed Apr 25, 2024
1 parent 09090e1 commit 1a6d18b
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 1 deletion.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ See [`examples/rand-infer.rs`](examples/rand-infer.rs).
echo '"def helloworl"' | cargo run --example rand-infer
```

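Special tokens in the input (such as `<unk>`) are skipped; the search starts right after the last one:

```sh
echo '"def hellowor<unk>l"' | cargo run --example rand-infer
```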

## TODOs

- [ ] Python bindings
Expand Down
29 changes: 28 additions & 1 deletion examples/rand-infer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,15 @@ fn build_vocab<T: AsRef<Tokenizer>>(tokenizer: T) -> Result<Vec<Vec<u8>>> {
match parse_byte_repr(token) {
Ok(byte) => token_bytes[id as usize].push(byte),
Err(_) => {
if tokenizer
.get_added_vocabulary()
.get_added_tokens_decoder()
.contains_key(&id)
{
// ignore special tokens
continue;
}

let decoded = tokenizer
.decode(&[dummy_token_id, id, dummy_token_id], false)
.map_err(|e| eyre!(e))?;
Expand Down Expand Up @@ -167,6 +176,24 @@ async fn main_body() -> Result<()> {
.decode(tokenized.get_ids(), false)
.map_err(|e| eyre!(e))?;

let offset = tokenized
.get_ids()
.iter()
.filter_map(|&id| {
tokenizer
.get_added_vocabulary()
.get_added_tokens_decoder()
.get(&id)
})
.last()
.and_then(|special_token| {
println!("{special_token:?}");
text.rfind(&special_token.content)
.map(|pos| pos + special_token.content.len())
})
.unwrap_or(0);
println!("search from pos {offset}\n");

let Some((tree, mut req)) = SearchTree::new(
automaton.clone(),
|end_pos| async {
Expand All @@ -178,7 +205,7 @@ async fn main_body() -> Result<()> {
Ok::<_, tokenizers::Error>(res)
},
text.as_str(),
0,
offset,
)
.await
.map_err(|e| eyre!(e))?
Expand Down

0 comments on commit 1a6d18b

Please sign in to comment.