Merge pull request #1 from matthiasthomas/main

feat: add support for gpt4o
hupe1980 · May 18, 2024 · 275e4ff · 275e4ff
2 parents ab2365a + 0e2b676
commit 275e4ff
Show file tree

Hide file tree

Showing 7 changed files with 200,070 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 # ✂️ go-tiktoken
-![Build Status](https://github.com/hupe1980/go-tiktoken/workflows/build/badge.svg) 
+![Build Status](https://github.com/hupe1980/go-tiktoken/workflows/build/badge.svg)
 [![Go Reference](https://pkg.go.dev/badge/github.com/hupe1980/go-tiktoken.svg)](https://pkg.go.dev/github.com/hupe1980/go-tiktoken)
 > OpenAI's [tiktoken](https://github.com/openai/tiktoken) tokenizer written in Go. The vocabularies are embedded and do not need to be downloaded at runtime.
 
@@ -43,12 +43,13 @@ Tokens: [Hello  World]
 For more example usage, see [_examples](./_examples).
 
 ## Supported Encodings
+- ✅ o200k_base
 - ✅ cl100k_base
 - ✅ p50k_base
 - ✅ p50k_edit
 - ✅ r50k_base
-- ✅ gpt2 
-- ✅ claude 
+- ✅ gpt2
+- ✅ claude
 
 ## License
 [MIT](LICENCE)
diff --git a/encoding.go b/encoding.go
@@ -24,6 +24,8 @@ func NewEncodingByName(encoding string) (*Encoding, error) {
 	)
 
 	switch encoding {
+	case O200kBase:
+		codec, err = NewO200KBase()
 	case CL100kBase:
 		codec, err = NewCL100kBase()
 	case P50kBase:

diff --git a/encoding_test.go b/encoding_test.go
@@ -63,6 +63,34 @@ func TestCL100kEncoding(t *testing.T) {
 	})
 }
 
+func TestO200kEncoding(t *testing.T) { // Exercises the encoding returned by NewEncodingByName(O200kBase): ordinary encode, special-token encode, decode, and a rejected special token.
+	encoding, err := NewEncodingByName(O200kBase)
+	assert.NoError(t, err) // NOTE(review): assert (unlike require) does not stop the test, so the subtests below still run if construction failed.
+
+	t.Run("default", func(t *testing.T) {
+		text := "hello world"
+		ids, _ := encoding.EncodeOrdinary(text) // second return value is discarded; presumably the token strings — confirm against EncodeOrdinary.
+		assert.ElementsMatch(t, []uint{24912, 2375}, ids) // NOTE(review): ElementsMatch ignores element order; assert.Equal would also pin the token order.
+	})
+
+	t.Run("special token", func(t *testing.T) {
+		text := "hello <|endoftext|>"
+		ids, _, err := encoding.Encode(text, []string{"all"}, nil) // presumably "all" allows every special token — confirm against Encode.
+		assert.NoError(t, err)
+		assert.ElementsMatch(t, []uint{24912, 220, 199999}, ids) // 199999 is the EndOfText ID registered by NewO200KBase.
+	})
+
+	t.Run("decode", func(t *testing.T) {
+		assert.Equal(t, "hello world", string(encoding.Decode([]uint{24912, 2375}))) // decodes the same IDs the "default" subtest expects from encoding.
+	})
+
+	t.Run("not allowed", func(t *testing.T) {
+		text := "hello <|endoftext|>"
+		_, _, err := encoding.Encode(text, nil, []string{"<|endoftext|>"}) // presumably the third argument lists disallowed special tokens — confirm against Encode's signature.
+		assert.Error(t, err)
+	})
+}
+
 func TestSpecialTokenRegex(t *testing.T) {
 	testCases := []struct {
 		name                 string

diff --git a/o200k_base.go b/o200k_base.go
@@ -0,0 +1,29 @@
+package tiktoken
+
+import (
+	_ "embed"
+	"strings"
+)
+
+//go:embed resource/o200k_base.tiktoken
+var o200kBase string
+
+// NewO200KBase returns the Codec for the o200k_base encoding, with merge ranks parsed from the embedded o200kBase vocabulary.
+// If ConvertToMergeableBPERanks fails, its error is returned unchanged together with a nil Codec.
+// NOTE(review): PatStr contains the lookahead (?!\S), which Go's standard regexp package does not support; presumably Codec compiles it with another engine — confirm.
+func NewO200KBase() (*Codec, error) {
+	ranks, err := ConvertToMergeableBPERanks(strings.NewReader(o200kBase))
+	if err != nil {
+		return nil, err
+	}
+
+	return &Codec{
+		Name:           "o200k_base",
+		PatStr:         `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+		MergeableRanks: ranks,
+		SpecialTokens: map[string]uint{ // NOTE(review): IDs presumably mirror the upstream tiktoken o200k_base definition — verify before changing.
+			EndOfText:   199999,
+			EndOfPrompt: 200018,
+		},
+	}, nil
+}