Skip to content

Commit

Permalink
Merge pull request #1 from matthiasthomas/main
Browse files Browse the repository at this point in the history
feat: add support for gpt4o
  • Loading branch information
hupe1980 authored May 18, 2024
2 parents ab2365a + 0e2b676 commit 275e4ff
Show file tree
Hide file tree
Showing 7 changed files with 200,070 additions and 3 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# ✂️ go-tiktoken
![Build Status](https://github.com/hupe1980/go-tiktoken/workflows/build/badge.svg)
![Build Status](https://github.com/hupe1980/go-tiktoken/workflows/build/badge.svg)
[![Go Reference](https://pkg.go.dev/badge/github.com/hupe1980/go-tiktoken.svg)](https://pkg.go.dev/github.com/hupe1980/go-tiktoken)
> OpenAI's [tiktoken](https://github.com/openai/tiktoken) tokenizer written in Go. The vocabularies are embedded and do not need to be downloaded at runtime.
Expand Down Expand Up @@ -43,12 +43,13 @@ Tokens: [Hello World]
For more example usage, see [_examples](./_examples).

## Supported Encodings
- ✅ o200k_base
- ✅ cl100k_base
- ✅ p50k_base
- ✅ p50k_edit
- ✅ r50k_base
- ✅ gpt2
- ✅ claude
- ✅ gpt2
- ✅ claude

## License
[MIT](LICENCE)
2 changes: 2 additions & 0 deletions encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ func NewEncodingByName(encoding string) (*Encoding, error) {
)

switch encoding {
case O200kBase:
codec, err = NewO200KBase()
case CL100kBase:
codec, err = NewCL100kBase()
case P50kBase:
Expand Down
28 changes: 28 additions & 0 deletions encoding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,34 @@ func TestCL100kEncoding(t *testing.T) {
})
}

// TestO200kEncoding exercises the o200k_base encoding obtained through
// NewEncodingByName: ordinary encoding, encoding with special tokens allowed,
// decoding, and rejection of a disallowed special token.
func TestO200kEncoding(t *testing.T) {
	encoding, err := NewEncodingByName(O200kBase)
	// Stop here when construction fails: the subtests below all call methods
	// on encoding and would panic instead of reporting the real error.
	if !assert.NoError(t, err) {
		return
	}

	t.Run("default", func(t *testing.T) {
		text := "hello world"
		ids, _ := encoding.EncodeOrdinary(text)
		// Token order is significant, so compare the sequences exactly rather
		// than as unordered collections.
		assert.Equal(t, []uint{24912, 2375}, ids)
	})

	t.Run("special token", func(t *testing.T) {
		text := "hello <|endoftext|>"
		ids, _, err := encoding.Encode(text, []string{"all"}, nil)
		assert.NoError(t, err)
		// 199999 is the id registered for the end-of-text special token.
		assert.Equal(t, []uint{24912, 220, 199999}, ids)
	})

	t.Run("decode", func(t *testing.T) {
		assert.Equal(t, "hello world", string(encoding.Decode([]uint{24912, 2375})))
	})

	t.Run("not allowed", func(t *testing.T) {
		text := "hello <|endoftext|>"
		// The special token is listed as disallowed, so encoding must fail.
		_, _, err := encoding.Encode(text, nil, []string{"<|endoftext|>"})
		assert.Error(t, err)
	})
}

func TestSpecialTokenRegex(t *testing.T) {
testCases := []struct {
name string
Expand Down
29 changes: 29 additions & 0 deletions o200k_base.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package tiktoken

import (
_ "embed"
"strings"
)

// o200kBase holds the contents of resource/o200k_base.tiktoken, embedded into
// the binary at build time by the go:embed directive below.
//
//go:embed resource/o200k_base.tiktoken
var o200kBase string

// NewO200KBase returns the Codec for the "o200k_base" encoding.
//
// The mergeable BPE ranks are parsed from the vocabulary embedded in o200kBase
// via ConvertToMergeableBPERanks. If parsing fails, that error is returned
// unchanged together with a nil Codec. The resulting Codec registers two
// special tokens: EndOfText (199999) and EndOfPrompt (200018).
func NewO200KBase() (*Codec, error) {
	// Pattern string stored in Codec.PatStr; reproduced verbatim.
	const splitPattern = `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`

	vocabulary := strings.NewReader(o200kBase)

	mergeable, parseErr := ConvertToMergeableBPERanks(vocabulary)
	if parseErr != nil {
		return nil, parseErr
	}

	specials := map[string]uint{
		EndOfText:   199999,
		EndOfPrompt: 200018,
	}

	codec := &Codec{
		Name:           "o200k_base",
		PatStr:         splitPattern,
		MergeableRanks: mergeable,
		SpecialTokens:  specials,
	}

	return codec, nil
}
Loading

0 comments on commit 275e4ff

Please sign in to comment.