From 0b9b43bede9abd3e62859a18d6e8d28771cf2119 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Tue, 7 May 2024 08:11:21 -0400
Subject: [PATCH 01/29] Create the basic class for holding metrics

---
 guidance/models/_guidance_metrics.py | 6 ++++++
 guidance/models/_model.py            | 2 ++
 2 files changed, 8 insertions(+)
 create mode 100644 guidance/models/_guidance_metrics.py

diff --git a/guidance/models/_guidance_metrics.py b/guidance/models/_guidance_metrics.py
new file mode 100644
index 000000000..ce79bf8c8
--- /dev/null
+++ b/guidance/models/_guidance_metrics.py
@@ -0,0 +1,6 @@
+from pydantic import BaseModel, NonNegativeInt
+
+
+class GuidanceMetrics(BaseModel):
+    prompt_tokens: NonNegativeInt = 0
+    generated_tokens: NonNegativeInt = 0
diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index 9213d2f92..c454ce8fe 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -52,6 +52,8 @@
 
 from .. import _serialization_pb2
 
+from ._guidance_metrics import GuidanceMetrics
+
 if TYPE_CHECKING:
     from ..library._block import ContextBlock

From 6faf8db034ee75d7848fadd410af593cec9b7b63 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Tue, 7 May 2024 08:17:28 -0400
Subject: [PATCH 02/29] Put in, along with a very basic test

---
 guidance/models/_model.py | 1 +
 tests/library/test_gen.py | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index c454ce8fe..3ba86aa8c 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -922,6 +922,7 @@ def __init__(self, engine, echo=True, **kwargs):
         self._last_event_stream = (
             0  # used to track the last event streaming call to enable throttling
         )
+        self.metrics = GuidanceMetrics()
 
     @property
     def active_role_end(self):
diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py
index d89806b12..b0a7f6a80 100644
--- a/tests/library/test_gen.py
+++ b/tests/library/test_gen.py
@@ -85,11 +85,13 @@ def test_unicode(selected_model):
     # fmt: on
 
 
-def test_unicode2(selected_model):
+def test_unicode2(selected_model: models.Model):
     lm = selected_model
     prompt = "Janet’s ducks lay 16 eggs per day"
     lm += prompt + gen(max_tokens=10)
-    assert True
+    assert lm.metrics.prompt_tokens > 0
+    assert lm.metrics.generated_tokens > 0
+    assert lm.metrics.generated_tokens <= 10
 
 
 def test_gsm8k():

From bbbec17e344f5b209f91434d2964ca73665a6ba3 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Tue, 7 May 2024 08:26:47 -0400
Subject: [PATCH 03/29] Another test to watch the metrics

---
 tests/models/common_chat_testing.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/models/common_chat_testing.py b/tests/models/common_chat_testing.py
index 99c45e860..03d2281e9 100644
--- a/tests/models/common_chat_testing.py
+++ b/tests/models/common_chat_testing.py
@@ -16,6 +16,9 @@ def smoke_chat(lm: models.Chat, has_system_role: bool = True):
     print(str(lm))
     assert len(lm["text"]) > 0
     assert str(lm).endswith("Pick a number: <|im_end|>")
+    assert lm.metrics.prompt_tokens > 0
+    assert lm.metrics.generated_tokens > 0
+    assert lm.metrics.generated_tokens <= 10

From 25f42bf76e5e5ab32a76f6a86f34451e798fa0d2 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Tue, 7 May 2024 10:42:22 -0400
Subject: [PATCH 04/29] Getting things close to working.....
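
(Illustrative sketch only, not part of the diff below: how the `metrics`
attribute added in patch 02 is meant to be read after a generation call.
The transformers-backed model and the "gpt2" name are just example
assumptions; any Model subclass works the same way.)

    from guidance import gen, models

    lm = models.Transformers("gpt2")
    lm += "Janet's ducks lay " + gen(max_tokens=5)
    # both counters live on the Model object at this point in the series
    print(lm.metrics.prompt_tokens, lm.metrics.generated_tokens)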
---
 guidance/models/_model.py                     | 17 ++++++++++++++---
 guidance/models/transformers/_transformers.py |  4 ++++
 tests/library/test_gen.py                     | 15 ++++++++++++++-
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index 3ba86aa8c..929b3f519 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -9,7 +9,7 @@
 
 from pprint import pprint
 
-from typing import Dict, TYPE_CHECKING
+from typing import Dict, Tuple, TYPE_CHECKING
 
 import numpy as np
 
@@ -131,6 +131,7 @@ class EngineCallResponse:
     capture_groups: dict
     capture_group_log_probs: dict
     new_token_count: int
+    metrics: GuidanceMetrics
 
     def __init__(
         self,
@@ -204,6 +205,9 @@ def __init__(self, tokenizer, compute_log_probs=False):
         )
         self._token_trie.match = True
         self._token_trie.match_version = 0
+        # Any time get_logits is called, it should update this
+        # This does add to the list of "Thread Unsafety"
+        self.metrics = GuidanceMetrics()
 
     def start(self, parser, grammar, ensure_bos_token=True):
         """Start processing parser state executed through the grammar.
@@ -846,11 +850,11 @@ def _cleanup_tokens(self, token_ids, token_byte_positions):
 
         return token_ids, token_byte_positions
 
-    def get_logits(self, token_ids, forced_bytes, current_temp):
+    def get_logits(self, token_ids, forced_bytes, current_temp) -> Tuple[np.ndarray, GuidanceMetrics]:
         """A fake method designed to be overriden by subclasses."""
 
         # pretend to extend the KV cache and update the log probs
-        return np.randn(len(self.tokenizer.tokens))
+        return np.randn(len(self.tokenizer.tokens)), GuidanceMetrics()
 
     def _report_failed_match(self, prompt):
         """Note that this can be overridden by subclasses that have more likely reasons than a bug in the token set (like remote models)."""
@@ -1367,6 +1371,9 @@ def _run_stateless(self, stateless_function, temperature=0.0, top_p=1.0, n=1):
         # we will return a new extended version of ourselves, which we track as `lm`
         lm = self
 
+        # Prepare our metrics update. This is part of our Thread Unsafety programme
+        metrics_before = lm.engine.metrics.model_copy(deep=True)
+
         # single generation
         if n == 1:
             generated_value = ""
@@ -1448,6 +1455,10 @@ def _run_stateless(self, stateless_function, temperature=0.0, top_p=1.0, n=1):
 
         unreplace_model_variables(replacements)
 
+        # Now update our metrics while maintaining Thread Unsafety
+        lm.metrics.prompt_tokens += (self.engine.metrics.prompt_tokens - metrics_before.prompt_tokens)
+        lm.metrics.generated_tokens += (self.engine.metrics.generated_tokens - metrics_before.generated_tokens)
+
         logger.debug("finish Model._run_stateless")
 
         return lm
diff --git a/guidance/models/transformers/_transformers.py b/guidance/models/transformers/_transformers.py
index e570614e5..2939aa1b1 100644
--- a/guidance/models/transformers/_transformers.py
+++ b/guidance/models/transformers/_transformers.py
@@ -266,6 +266,10 @@ def get_logits(self, token_ids, forced_bytes, current_temp):
             model_out.logits[0, -1, : len(self.tokenizer.tokens)].cpu().numpy()
         )
 
+        # Update metrics
+        self.metrics.prompt_tokens += len(new_token_ids)
+        self.metrics.generated_tokens += 1
+
         return self._cached_logits
 
 
diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py
index b0a7f6a80..38db4ef9c 100644
--- a/tests/library/test_gen.py
+++ b/tests/library/test_gen.py
@@ -73,6 +73,18 @@ def test_stop_quote(selected_model):
     assert not lm["title"].endswith('"')
 
 
+def test_metrics_smoke(selected_model):
+    lm = selected_model
+
+    lm += "abc"
+    lm += gen("first", max_tokens=1)
+    assert lm.metrics.generated_tokens == 1
+
+    lm += "efg"
+    lm += gen("second", max_tokens=1)
+    assert lm.metrics.generated_tokens == 2
+
+
 def test_unicode(selected_model):
     # black makes this test ugly -- easier to read with fmt: off
     # fmt: off
@@ -89,9 +101,10 @@ def test_unicode2(selected_model: models.Model):
     lm = selected_model
     prompt = "Janet’s ducks lay 16 eggs per day"
     lm += prompt + gen(max_tokens=10)
+    print(f"Output: {str(lm)}")
     assert lm.metrics.prompt_tokens > 0
     assert lm.metrics.generated_tokens > 0
-    assert lm.metrics.generated_tokens <= 10
+    assert lm.metrics.generated_tokens <= 10 + 1
 
 
 def test_gsm8k():

From 230f782624c9b9f1a1205a976cf2e3e0edad1784 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Tue, 7 May 2024 10:45:32 -0400
Subject: [PATCH 05/29] Remove minor hangover

---
 guidance/models/_model.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index 929b3f519..272a7deeb 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -9,7 +9,7 @@
 
 from pprint import pprint
 
-from typing import Dict, Tuple, TYPE_CHECKING
+from typing import Dict, TYPE_CHECKING
 
 import numpy as np
 
@@ -741,7 +741,7 @@ def __call__(self, parser, grammar, ensure_bos_token=True):
         self.start(parser, grammar, ensure_bos_token)
 
         # TODO: remove this after the next release. This verifies that calling Rust works.
- assert("def" == engine_start("abc", "def", 1)) + assert "def" == engine_start("abc", "def", 1) logits = None while True: @@ -850,11 +850,11 @@ def _cleanup_tokens(self, token_ids, token_byte_positions): return token_ids, token_byte_positions - def get_logits(self, token_ids, forced_bytes, current_temp) -> Tuple[np.ndarray, GuidanceMetrics]: + def get_logits(self, token_ids, forced_bytes, current_temp) -> np.ndarray: """A fake method designed to be overriden by subclasses.""" # pretend to extend the KV cache and update the log probs - return np.randn(len(self.tokenizer.tokens)), GuidanceMetrics() + return np.randn(len(self.tokenizer.tokens)) def _report_failed_match(self, prompt): """Note that this can be overridden by subclasses that have more likely reasons than a bug in the token set (like remote models).""" @@ -1456,8 +1456,12 @@ def _run_stateless(self, stateless_function, temperature=0.0, top_p=1.0, n=1): unreplace_model_variables(replacements) # Now update our metrics while maintaining Thread Unsafety - lm.metrics.prompt_tokens += (self.engine.metrics.prompt_tokens - metrics_before.prompt_tokens) - lm.metrics.generated_tokens += (self.engine.metrics.generated_tokens - metrics_before.generated_tokens) + lm.metrics.prompt_tokens += ( + self.engine.metrics.prompt_tokens - metrics_before.prompt_tokens + ) + lm.metrics.generated_tokens += ( + self.engine.metrics.generated_tokens - metrics_before.generated_tokens + ) logger.debug("finish Model._run_stateless") From bdd80e7375e28e3c73c23bbae5e3dc98769a7e67 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Tue, 7 May 2024 10:46:09 -0400 Subject: [PATCH 06/29] Another oversight --- guidance/models/_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/guidance/models/_model.py b/guidance/models/_model.py index 272a7deeb..f773bd880 100644 --- a/guidance/models/_model.py +++ b/guidance/models/_model.py @@ -131,7 +131,6 @@ class EngineCallResponse: capture_groups: dict capture_group_log_probs: dict new_token_count: int - metrics: GuidanceMetrics def __init__( self, From 392a4794137a75d13ab65c35116579262cf4ab37 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Tue, 7 May 2024 10:47:19 -0400 Subject: [PATCH 07/29] Add a comment --- guidance/models/_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/guidance/models/_model.py b/guidance/models/_model.py index f773bd880..3862ced03 100644 --- a/guidance/models/_model.py +++ b/guidance/models/_model.py @@ -925,6 +925,8 @@ def __init__(self, engine, echo=True, **kwargs): self._last_event_stream = ( 0 # used to track the last event streaming call to enable throttling ) + + # Metrics for the model self.metrics = GuidanceMetrics() @property From 76d533e45e4812fa8c358afd9f8ee81c28a107a5 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Tue, 7 May 2024 11:05:00 -0400 Subject: [PATCH 08/29] Need to be able to reset the metrics on the Model --- guidance/models/_model.py | 3 +++ tests/library/test_gen.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/guidance/models/_model.py b/guidance/models/_model.py index 3862ced03..2c8f5bbd5 100644 --- a/guidance/models/_model.py +++ b/guidance/models/_model.py @@ -929,6 +929,9 @@ def __init__(self, engine, echo=True, **kwargs): # Metrics for the model self.metrics = GuidanceMetrics() + def reset_metrics(self): + self.metrics = GuidanceMetrics() + @property def active_role_end(self): """The default end patterns we should use for `gen` calls. 
diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py
index 38db4ef9c..6b8af0c6f 100644
--- a/tests/library/test_gen.py
+++ b/tests/library/test_gen.py
@@ -75,6 +75,7 @@ def test_stop_quote(selected_model):
 
 def test_metrics_smoke(selected_model):
     lm = selected_model
+    lm.reset_metrics()
 
     lm += "abc"
     lm += gen("first", max_tokens=1)
@@ -99,6 +100,7 @@ def test_unicode(selected_model):
 
 def test_unicode2(selected_model: models.Model):
     lm = selected_model
+    lm.reset_metrics()
     prompt = "Janet’s ducks lay 16 eggs per day"
     lm += prompt + gen(max_tokens=10)

From 34881c97c62263cdebe7e19cb76a46dec91aad16 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 8 May 2024 08:13:42 -0400
Subject: [PATCH 09/29] Thinking about another metric

---
 guidance/models/_model.py | 6 ++++++
 tests/library/test_gen.py | 6 +++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index 2c8f5bbd5..2ab3e413c 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -932,6 +932,12 @@ def __init__(self, engine, echo=True, **kwargs):
     def reset_metrics(self):
         self.metrics = GuidanceMetrics()
 
+    @property
+    def current_token_count(self)->int:
+        current_string = str(self)
+        current_tokens = self.engine.tokenizer(current_string)
+        return len(current_tokens)
+
     @property
     def active_role_end(self):
         """The default end patterns we should use for `gen` calls.
diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py
index 6b8af0c6f..af8153064 100644
--- a/tests/library/test_gen.py
+++ b/tests/library/test_gen.py
@@ -73,7 +73,7 @@ def test_stop_quote(selected_model):
     assert not lm["title"].endswith('"')
 
 
-def test_metrics_smoke(selected_model):
+def test_metrics_smoke(selected_model: models.Model):
     lm = selected_model
     lm.reset_metrics()
 
@@ -85,6 +85,10 @@ def test_metrics_smoke(selected_model):
     lm += gen("second", max_tokens=1)
     assert lm.metrics.generated_tokens == 2
 
+    assert lm.current_token_count >= (
+        lm.metrics.prompt_tokens + lm.metrics.generated_tokens
+    )
+
 
 def test_unicode(selected_model):
     # black makes this test ugly -- easier to read with fmt: off

From 96de164f1b465665365cb0ceb68ac4d630eb7f78 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 8 May 2024 09:01:59 -0400
Subject: [PATCH 10/29] Figure out how to call tokeniser

---
 guidance/models/transformers/_transformers.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/guidance/models/transformers/_transformers.py b/guidance/models/transformers/_transformers.py
index 2939aa1b1..589c461a4 100644
--- a/guidance/models/transformers/_transformers.py
+++ b/guidance/models/transformers/_transformers.py
@@ -121,7 +121,10 @@ def _tokenizer(self, model, **kwargs):
             ), "You must give a model name when you provide a tokenizer object!"
 
         return tokenizer
-
+    
+    def __call__(self, byte_string):
+        tokenisation = self._orig_tokenizer(byte_string)
+        return tokenisation['input_ids']
 
 class TransformersEngine(Engine):
     def __init__(self, model, tokenizer, compute_log_probs, **kwargs):

From 2fa6521e745a2fcde8b0b2bc264681ec63022805 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 8 May 2024 09:03:09 -0400
Subject: [PATCH 11/29] Reformat

---
 guidance/models/_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index 2ab3e413c..dffc1b912 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -933,7 +933,7 @@ def reset_metrics(self):
         self.metrics = GuidanceMetrics()
 
     @property
-    def current_token_count(self)->int:
+    def current_token_count(self) -> int:
         current_string = str(self)
         current_tokens = self.engine.tokenizer(current_string)
         return len(current_tokens)

From c3a0c6b8cf876dee5d4b2a7da4c0b36bd385ac52 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 8 May 2024 09:08:04 -0400
Subject: [PATCH 12/29] Do some renaming

---
 ...idance_metrics.py => _guidance_engine_metrics.py} |  2 +-
 guidance/models/_model.py                            | 12 ++++++------
 tests/library/test_gen.py                            | 12 ++++++------
 tests/models/common_chat_testing.py                  |  6 +++---
 4 files changed, 16 insertions(+), 16 deletions(-)
 rename guidance/models/{_guidance_metrics.py => _guidance_engine_metrics.py} (76%)

diff --git a/guidance/models/_guidance_metrics.py b/guidance/models/_guidance_engine_metrics.py
similarity index 76%
rename from guidance/models/_guidance_metrics.py
rename to guidance/models/_guidance_engine_metrics.py
index ce79bf8c8..29f58d31f 100644
--- a/guidance/models/_guidance_metrics.py
+++ b/guidance/models/_guidance_engine_metrics.py
@@ -1,6 +1,6 @@
 from pydantic import BaseModel, NonNegativeInt
 
 
-class GuidanceMetrics(BaseModel):
+class GuidanceEngineMetrics(BaseModel):
     prompt_tokens: NonNegativeInt = 0
     generated_tokens: NonNegativeInt = 0
diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index dffc1b912..5a936915e 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -52,7 +52,7 @@
 
 from .. import _serialization_pb2
 
-from ._guidance_metrics import GuidanceMetrics
+from ._guidance_engine_metrics import GuidanceEngineMetrics
 
 if TYPE_CHECKING:
     from ..library._block import ContextBlock
@@ -206,7 +206,7 @@ def __init__(self, tokenizer, compute_log_probs=False):
         self._token_trie.match_version = 0
         # Any time get_logits is called, it should update this
         # This does add to the list of "Thread Unsafety"
-        self.metrics = GuidanceMetrics()
+        self.metrics = GuidanceEngineMetrics()
 
     def start(self, parser, grammar, ensure_bos_token=True):
         """Start processing parser state executed through the grammar.
@@ -927,10 +927,10 @@ def __init__(self, engine, echo=True, **kwargs):
         )
 
         # Metrics for the model
-        self.metrics = GuidanceMetrics()
+        self.engine_metrics = GuidanceEngineMetrics()
 
     def reset_metrics(self):
-        self.metrics = GuidanceMetrics()
+        self.engine_metrics = GuidanceEngineMetrics()
 
     @property
     def current_token_count(self) -> int:
@@ -1466,10 +1466,10 @@ def _run_stateless(self, stateless_function, temperature=0.0, top_p=1.0, n=1):
         unreplace_model_variables(replacements)
 
         # Now update our metrics while maintaining Thread Unsafety
-        lm.metrics.prompt_tokens += (
+        lm.engine_metrics.prompt_tokens += (
             self.engine.metrics.prompt_tokens - metrics_before.prompt_tokens
         )
-        lm.metrics.generated_tokens += (
+        lm.engine_metrics.generated_tokens += (
             self.engine.metrics.generated_tokens - metrics_before.generated_tokens
         )
 
diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py
index af8153064..cac39630a 100644
--- a/tests/library/test_gen.py
+++ b/tests/library/test_gen.py
@@ -79,14 +79,14 @@ def test_metrics_smoke(selected_model: models.Model):
 
     lm += "abc"
     lm += gen("first", max_tokens=1)
-    assert lm.metrics.generated_tokens == 1
+    assert lm.engine_metrics.generated_tokens == 1
 
     lm += "efg"
     lm += gen("second", max_tokens=1)
-    assert lm.metrics.generated_tokens == 2
+    assert lm.engine_metrics.generated_tokens == 2
 
     assert lm.current_token_count >= (
-        lm.metrics.prompt_tokens + lm.metrics.generated_tokens
+        lm.engine_metrics.prompt_tokens + lm.engine_metrics.generated_tokens
     )
 
diff --git a/tests/models/common_chat_testing.py b/tests/models/common_chat_testing.py
index 03d2281e9..e7f1d5c34 100644
--- a/tests/models/common_chat_testing.py
+++ b/tests/models/common_chat_testing.py
@@ -16,9 +16,9 @@ def smoke_chat(lm: models.Chat, has_system_role: bool = True):
     print(str(lm))
     assert len(lm["text"]) > 0
     assert str(lm).endswith("Pick a number: <|im_end|>")
-    assert lm.metrics.prompt_tokens > 0
-    assert lm.metrics.generated_tokens > 0
-    assert lm.metrics.generated_tokens <= 10
+    assert lm.engine_metrics.prompt_tokens > 0
+    assert lm.engine_metrics.generated_tokens > 0
+    assert lm.engine_metrics.generated_tokens <= 10

From ff46ec177e126ab4ae5b1d7c8e43c483f2850171 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 8 May 2024 09:34:58 -0400
Subject: [PATCH 13/29] Some more output

---
 tests/library/test_gen.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py
index cac39630a..4d0e4a5d7 100644
--- a/tests/library/test_gen.py
+++ b/tests/library/test_gen.py
@@ -107,10 +107,15 @@ def test_unicode2(selected_model: models.Model):
     lm.reset_metrics()
     prompt = "Janet’s ducks lay 16 eggs per day"
     lm += prompt + gen(max_tokens=10)
+    print(f"{prompt=}")
+    print(f"Prompt tokens: {len(lm.engine.tokenizer(prompt))}")
+    print(f"{lm.engine_metrics=}")
+    print(f"{lm.current_token_count=}")
+    print(f"{lm.token_count=}")
     print(f"Output: {str(lm)}")
     assert lm.engine_metrics.prompt_tokens > 0
     assert lm.engine_metrics.generated_tokens > 0
-    assert lm.engine_metrics.generated_tokens <= 10 + 1
+    assert lm.engine_metrics.generated_tokens <= 10

From 2faa583090a63061ebd52b450f1dee4524da05e4 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 8 May 2024 12:08:29 -0400
Subject: [PATCH 14/29] Trying to count forced tokens

---
 guidance/models/_guidance_engine_metrics.py |  1 +
 guidance/models/_model.py                   |  4 ++++
 tests/library/test_gen.py                   | 14 +++++++++++++-
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/guidance/models/_guidance_engine_metrics.py b/guidance/models/_guidance_engine_metrics.py
index 29f58d31f..354a6cf38 100644
--- a/guidance/models/_guidance_engine_metrics.py
+++ b/guidance/models/_guidance_engine_metrics.py
@@ -4,3 +4,4 @@
 class GuidanceEngineMetrics(BaseModel):
     prompt_tokens: NonNegativeInt = 0
     generated_tokens: NonNegativeInt = 0
+    forced_tokens: NonNegativeInt = 0
diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index 5a936915e..e7828de99 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -687,6 +687,7 @@ def next(self, logits):
                 self._sampled_token = self.tokenizer.tokens[self._sampled_token_ind]
                 self._new_bytes_prob = 1.0
                 self._was_forced = True
+                self.metrics.forced_tokens += 1
 
             # we are at the end of the grammar
             elif next_byte_mask_sum == 0:
@@ -1472,6 +1473,9 @@ def _run_stateless(self, stateless_function, temperature=0.0, top_p=1.0, n=1):
         lm.engine_metrics.generated_tokens += (
             self.engine.metrics.generated_tokens - metrics_before.generated_tokens
         )
+        lm.engine_metrics.forced_tokens += (
+            self.engine.metrics.forced_tokens - metrics_before.forced_tokens
+        )
 
         logger.debug("finish Model._run_stateless")
 
diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py
index 4d0e4a5d7..3098324d6 100644
--- a/tests/library/test_gen.py
+++ b/tests/library/test_gen.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from guidance import gen, models
+from guidance import gen, models, select
 
 
 def test_basic():
@@ -78,17 +78,29 @@ def test_metrics_smoke(selected_model: models.Model):
     lm.reset_metrics()
 
     lm += "abc"
+    print(f"{lm.engine_metrics=}")
     lm += gen("first", max_tokens=1)
+    print(f"{lm.engine_metrics=}")
     assert lm.engine_metrics.generated_tokens == 1
 
     lm += "efg"
     lm += gen("second", max_tokens=1)
+    print(f"{lm.engine_metrics=}")
     assert lm.engine_metrics.generated_tokens == 2
 
     assert lm.current_token_count >= (
         lm.engine_metrics.prompt_tokens + lm.engine_metrics.generated_tokens
     )
 
+
+def test_metrics_select(selected_model: models.Model):
+    lm = selected_model
+    lm.reset_metrics()
+
+    lm += "This is a great day to "
+    lm += select(["ride a bike", "row a boat", "go for a swim"])
+    print(f"lm={str(lm)}")
+    print(f"{lm.engine_metrics=}")
+    assert False
+
 def test_unicode(selected_model):
     # black makes this test ugly -- easier to read with fmt: off
     # fmt: off

From f8de7c82e7ddcf41f867dca7b3565ec631d91f18 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 8 May 2024 12:27:32 -0400
Subject: [PATCH 15/29] Try following things through

---
 tests/library/test_gen.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py
index 3098324d6..2e691ce03 100644
--- a/tests/library/test_gen.py
+++ b/tests/library/test_gen.py
@@ -92,6 +92,7 @@ def test_metrics_smoke(selected_model: models.Model):
         lm.engine_metrics.prompt_tokens + lm.engine_metrics.generated_tokens
     )
 
+
 def test_metrics_select(selected_model: models.Model):
     lm = selected_model
     lm.reset_metrics()
@@ -100,8 +101,13 @@ def test_metrics_select(selected_model: models.Model):
     lm += select(["ride a bike", "row a boat", "go for a swim"])
     print(f"lm={str(lm)}")
     print(f"{lm.engine_metrics=}")
+    lm += " and afterwards "
+    lm += select(["walk to town", "walk to a show"])
+    print(f"lm={str(lm)}")
+    print(f"{lm.engine_metrics=}")
     assert False
+
 
 def test_unicode(selected_model):
     # black makes this test ugly -- easier to read with fmt: off

From 822f8e17a95b17d96d28727acb9487055fb77b4c Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 8 May 2024 14:00:57 -0400
Subject: [PATCH 16/29] I don't think I need these bits

---
 guidance/models/_model.py                     | 25 ++++++-------------
 guidance/models/transformers/_transformers.py |  4 ----
 2 files changed, 7 insertions(+), 22 deletions(-)

diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index e7828de99..95ce44a2e 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -204,9 +204,6 @@ def __init__(self, tokenizer, compute_log_probs=False):
         )
         self._token_trie.match = True
         self._token_trie.match_version = 0
-        # Any time get_logits is called, it should update this
-        # This does add to the list of "Thread Unsafety"
-        self.metrics = GuidanceEngineMetrics()
 
     def start(self, parser, grammar, ensure_bos_token=True):
         """Start processing parser state executed through the grammar.
@@ -687,7 +684,6 @@ def next(self, logits):
                 self._sampled_token = self.tokenizer.tokens[self._sampled_token_ind]
                 self._new_bytes_prob = 1.0
                 self._was_forced = True
-                self.metrics.forced_tokens += 1
 
             # we are at the end of the grammar
             elif next_byte_mask_sum == 0:
@@ -758,6 +754,8 @@ def __call__(self, parser, grammar, ensure_bos_token=True):
                     response_new_token_count,
                 ) = response_state
 
+                print(f"{response_is_generated=} {response_new_token_count=} {response_new_bytes=}")
+
                 yield EngineCallResponse(
                     new_bytes=response_new_bytes,
                     is_generated=response_is_generated,
@@ -1382,9 +1380,6 @@ def _run_stateless(self, stateless_function, temperature=0.0, top_p=1.0, n=1):
         # we will return a new extended version of ourselves, which we track as `lm`
         lm = self
 
-        # Prepare our metrics update. This is part of our Thread Unsafety programme
-        metrics_before = lm.engine.metrics.model_copy(deep=True)
-
         # single generation
         if n == 1:
             generated_value = ""
@@ -1398,6 +1393,11 @@ def _run_stateless(self, stateless_function, temperature=0.0, top_p=1.0, n=1):
             #     if not self.engine.compute_log_probs:
             #         chunk.new_bytes_prob = 1.0
 
+            if chunk.is_generated:
+                self.engine_metrics.generated_tokens += chunk.new_token_count
+            else:
+                self.engine_metrics.forced_tokens += chunk.new_token_count
+
             # convert the bytes to a string (delaying if we don't yet have a valid unicode string)
             lm.token_count += chunk.new_token_count
             chunk.new_bytes = delayed_bytes + chunk.new_bytes
@@ -1466,17 +1466,6 @@ def _run_stateless(self, stateless_function, temperature=0.0, top_p=1.0, n=1):
 
         unreplace_model_variables(replacements)
 
-        # Now update our metrics while maintaining Thread Unsafety
-        lm.engine_metrics.prompt_tokens += (
-            self.engine.metrics.prompt_tokens - metrics_before.prompt_tokens
-        )
-        lm.engine_metrics.generated_tokens += (
-            self.engine.metrics.generated_tokens - metrics_before.generated_tokens
-        )
-        lm.engine_metrics.forced_tokens += (
-            self.engine.metrics.forced_tokens - metrics_before.forced_tokens
-        )
-
         logger.debug("finish Model._run_stateless")
 
         return lm
diff --git a/guidance/models/transformers/_transformers.py b/guidance/models/transformers/_transformers.py
index 589c461a4..4de21be5b 100644
--- a/guidance/models/transformers/_transformers.py
+++ b/guidance/models/transformers/_transformers.py
@@ -269,10 +269,6 @@ def get_logits(self, token_ids, forced_bytes, current_temp):
             model_out.logits[0, -1, : len(self.tokenizer.tokens)].cpu().numpy()
         )
 
-        # Update metrics
-        self.metrics.prompt_tokens += len(new_token_ids)
-        self.metrics.generated_tokens += 1
-
         return self._cached_logits

From 5c500517cd3f5430d154fe04faab6550fd1c0db4 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 8 May 2024 14:24:30 -0400
Subject: [PATCH 17/29] Tweak where stats are grabbed

---
 guidance/models/_guidance_engine_metrics.py | 1 -
 guidance/models/_model.py                   | 4 ++--
 tests/library/test_gen.py                   | 7 ++++++-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/guidance/models/_guidance_engine_metrics.py b/guidance/models/_guidance_engine_metrics.py
index 354a6cf38..65d7395b7 100644
--- a/guidance/models/_guidance_engine_metrics.py
+++ b/guidance/models/_guidance_engine_metrics.py
@@ -2,6 +2,5 @@
 
 
 class GuidanceEngineMetrics(BaseModel):
-    prompt_tokens: NonNegativeInt = 0
     generated_tokens: NonNegativeInt = 0
     forced_tokens: NonNegativeInt = 0
diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index 95ce44a2e..762b95eac 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -539,18 +539,18 @@ def next(self, logits):
                     # self._captured_log_prob_data.update(new_captured_log_prob_data)
                     # yield out, self._is_generated, self._new_bytes_prob, self._captured_data, self._captured_log_prob_data, self._token_count - self._last_token_count # note that we don't capture groups until a complete parse right now...
+                    self._token_count += 1  # note we only update this for tokens that emit non-hidden content
                     response_state = (
                         out,
                         is_generated,
                         self._new_bytes_prob if self.compute_log_probs else 1.0,
                         self._captured_data,
                         self._captured_log_prob_data,
-                        self._token_count - self._last_token_count,
+                        self._token_count - self._last_token_count + 1,
                     )
 
                     self._last_token_count = self._token_count
                     self._hidden_count = 0
-                    self._token_count += 1  # note we only update this for tokens that emit non-hidden content
 
                 else:
                     self._hidden_count -= len(new_bytes)
diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py
index 2e691ce03..f7b1e8c8a 100644
--- a/tests/library/test_gen.py
+++ b/tests/library/test_gen.py
@@ -101,11 +101,16 @@ def test_metrics_select(selected_model: models.Model):
     lm += select(["ride a bike", "row a boat", "go for a swim"])
     print(f"lm={str(lm)}")
     print(f"{lm.engine_metrics=}")
+    assert lm.engine_metrics.forced_tokens > 0
+    assert lm.engine_metrics.generated_tokens > 0
+    assert lm.engine_metrics.forced_tokens > lm.engine_metrics.generated_tokens
+    prev_stats = lm.engine_metrics.copy()
     lm += " and afterwards "
     lm += select(["walk to town", "walk to a show"])
     print(f"lm={str(lm)}")
     print(f"{lm.engine_metrics=}")
-    assert False
+    assert lm.engine_metrics.forced_tokens > prev_stats.forced_tokens
+    assert lm.engine_metrics.generated_tokens > prev_stats.generated_tokens

From 67e21c6ea7b9023d8c31b77ea4a103fee06120b6 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 8 May 2024 14:33:02 -0400
Subject: [PATCH 18/29] Tidy up tests

---
 tests/library/test_gen.py | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py
index f7b1e8c8a..5a1a5096d 100644
--- a/tests/library/test_gen.py
+++ b/tests/library/test_gen.py
@@ -77,20 +77,25 @@ def test_metrics_smoke(selected_model: models.Model):
     lm = selected_model
     lm.reset_metrics()
 
-    lm += "abc"
+    lm += "abcd"
     print(f"{lm.engine_metrics=}")
     lm += gen("first", max_tokens=1)
     print(f"{lm.engine_metrics=}")
-    assert lm.engine_metrics.generated_tokens == 1
+    # Can't be sure of exact count due to token healing
+    assert (
+        lm.engine_metrics.generated_tokens == 1
+        or lm.engine_metrics.generated_tokens == 2
+    )
+    assert lm.engine_metrics.forced_tokens == 0
 
-    lm += "efg"
+    lm += "fg"
     lm += gen("second", max_tokens=1)
-    print(f"{lm.engine_metrics=}")
-    assert lm.engine_metrics.generated_tokens == 2
-
-    assert lm.current_token_count >= (
-        lm.engine_metrics.prompt_tokens + lm.engine_metrics.generated_tokens
+    # Again, trouble with healing
+    assert (
+        lm.engine_metrics.generated_tokens >= 2
+        and lm.engine_metrics.generated_tokens <= 4
     )
+    assert lm.engine_metrics.forced_tokens == 0
 
 
 def test_metrics_select(selected_model: models.Model):
@@ -130,15 +135,11 @@ def test_unicode2(selected_model: models.Model):
     lm.reset_metrics()
     prompt = "Janet’s ducks lay 16 eggs per day"
     lm += prompt + gen(max_tokens=10)
-    print(f"{prompt=}")
-    print(f"Prompt tokens: {len(lm.engine.tokenizer(prompt))}")
-    print(f"{lm.engine_metrics=}")
-    print(f"{lm.current_token_count=}")
-    print(f"{lm.token_count=}")
-    print(f"Output: {str(lm)}")
-    assert lm.engine_metrics.prompt_tokens > 0
-    assert lm.engine_metrics.generated_tokens > 0
-    assert lm.engine_metrics.generated_tokens <= 10
+    assert (
+        lm.engine_metrics.generated_tokens == 10
+        or lm.engine_metrics.generated_tokens == 11
+    )
+    assert lm.engine_metrics.forced_tokens == 0
 
 
 def test_gsm8k():

From bcc269f6e06e82552dd539c7a1e3c4b511185a3a Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 8 May 2024 14:33:09 -0400
Subject: [PATCH 19/29] Remove extra

---
 guidance/models/_model.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index 762b95eac..03713bba7 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -754,7 +754,9 @@ def __call__(self, parser, grammar, ensure_bos_token=True):
                     response_new_token_count,
                 ) = response_state
 
-                print(f"{response_is_generated=} {response_new_token_count=} {response_new_bytes=}")
+                print(
+                    f"{response_is_generated=} {response_new_token_count=} {response_new_bytes=}"
+                )
 
                 yield EngineCallResponse(
                     new_bytes=response_new_bytes,
@@ -931,12 +933,6 @@ def __init__(self, engine, echo=True, **kwargs):
     def reset_metrics(self):
         self.engine_metrics = GuidanceEngineMetrics()
 
-    @property
-    def current_token_count(self) -> int:
-        current_string = str(self)
-        current_tokens = self.engine.tokenizer(current_string)
-        return len(current_tokens)
-
     @property
     def active_role_end(self):
         """The default end patterns we should use for `gen` calls.

From b728b0f458f3217174f652e968e403c64d36e82d Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 8 May 2024 15:01:58 -0400
Subject: [PATCH 20/29] Try to figure out if syntax makes a difference

---
 guidance/models/_model.py |  2 +-
 tests/library/test_gen.py | 28 ++++++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index 03713bba7..f8d41732c 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -546,7 +546,7 @@ def next(self, logits):
                         self._new_bytes_prob if self.compute_log_probs else 1.0,
                         self._captured_data,
                         self._captured_log_prob_data,
-                        self._token_count - self._last_token_count + 1,
+                        self._token_count - self._last_token_count,
                     )
 
diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py
index 5a1a5096d..1e1a0855b 100644
--- a/tests/library/test_gen.py
+++ b/tests/library/test_gen.py
@@ -74,7 +74,7 @@ def test_stop_quote(selected_model):
 
 
 def test_metrics_smoke(selected_model: models.Model):
-    lm = selected_model
+    lm = selected_model.copy()
     lm.reset_metrics()
 
     lm += "abcd"
@@ -99,7 +99,7 @@ def test_metrics_smoke(selected_model: models.Model):
 
 
 def test_metrics_select(selected_model: models.Model):
-    lm = selected_model
+    lm = selected_model.copy()
     lm.reset_metrics()
 
     lm += "This is a great day to "
@@ -118,6 +118,30 @@ def test_metrics_select(selected_model: models.Model):
     assert lm.engine_metrics.generated_tokens > prev_stats.generated_tokens
 
 
+def test_metrics_alt_expressions(selected_model: models.Model):
+    lm = selected_model.copy()
+    lm2 = selected_model.copy()
+    lm.reset_metrics()
+    lm2.reset_metrics()
+
+    prompt = "abcdefg"
+
+    lm += prompt + gen(max_tokens=10)
+    print(f"\nlm={str(lm)}")
+    print(f"{lm.engine_metrics=}\n")
+
+    lm2 += prompt
+    lm2 += gen(max_tokens=10)
+    print(f"\nlm2={str(lm2)}")
+    print(f"{lm2.engine_metrics=}\n")
+
+    assert str(lm) == str(lm2)
+    assert lm.engine_metrics.generated_tokens == 10
+    assert lm2.engine_metrics.generated_tokens == 10
+    assert lm.engine_metrics.forced_tokens == 0
+    assert lm2.engine_metrics.forced_tokens == 0
+
+
 def test_unicode(selected_model):
     # black makes this test ugly -- easier to read with fmt: off
     # fmt: off

From 9f330c31919ba05e9d20e078d75ac342f9d198da Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Wed, 8 May 2024 16:57:37 -0400
Subject: [PATCH 21/29] Latest attempt to get consistent token results

---
 guidance/models/_guidance_engine_metrics.py   |  1 +
 guidance/models/_model.py                     | 13 ++++++++++++-
 guidance/models/transformers/_transformers.py |  2 +-
 tests/library/test_gen.py                     |  7 +++++--
 4 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/guidance/models/_guidance_engine_metrics.py b/guidance/models/_guidance_engine_metrics.py
index 65d7395b7..3a773543c 100644
--- a/guidance/models/_guidance_engine_metrics.py
+++ b/guidance/models/_guidance_engine_metrics.py
@@ -4,3 +4,4 @@
 class GuidanceEngineMetrics(BaseModel):
     generated_tokens: NonNegativeInt = 0
     forced_tokens: NonNegativeInt = 0
+    model_input_tokens: NonNegativeInt = 0
diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index f8d41732c..c6228ce9b 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -131,6 +131,7 @@ class EngineCallResponse:
     capture_groups: dict
     capture_group_log_probs: dict
     new_token_count: int
+    last_model_token_count: int
 
     def __init__(
         self,
@@ -140,6 +141,7 @@ def __init__(
         capture_groups,
         capture_group_log_probs,
         new_token_count,
+        last_model_token_count,
     ):
         self.new_bytes = new_bytes
         self.is_generated = is_generated
@@ -147,6 +149,7 @@ def __init__(
         self.capture_groups = capture_groups
         self.capture_group_log_probs = capture_group_log_probs
         self.new_token_count = new_token_count
+        self.last_model_token_count = last_model_token_count
 
     def _to_proto(self):
         """Converts an EngineCallResponse object to its Protobuf representation.
@@ -739,6 +742,7 @@ def __call__(self, parser, grammar, ensure_bos_token=True):
 
         # TODO: remove this after the next release. This verifies that calling Rust works.
assert "def" == engine_start("abc", "def", 1) + last_model_token_count = 0 logits = None while True: is_done, logits_state, response_state = self.next(logits) @@ -765,13 +769,19 @@ def __call__(self, parser, grammar, ensure_bos_token=True): capture_groups=response_capture_groups, capture_group_log_probs=response_capture_group_log_probs, new_token_count=response_new_token_count, + last_model_token_count=last_model_token_count, ) + last_model_token_count = 0 if logits_state is not None: token_ids, forced_bytes, current_temp = logits_state - logits = self.get_logits(token_ids, forced_bytes, current_temp) + logits, model_token_count = self.get_logits( + token_ids, forced_bytes, current_temp + ) + last_model_token_count = model_token_count if is_done: + assert last_model_token_count == 0, "Unyielded input tokens" break def _tokenize_prefix(self, byte_string): @@ -1393,6 +1403,7 @@ def _run_stateless(self, stateless_function, temperature=0.0, top_p=1.0, n=1): self.engine_metrics.generated_tokens += chunk.new_token_count else: self.engine_metrics.forced_tokens += chunk.new_token_count + self.engine_metrics.model_input_tokens += chunk.last_model_token_count # convert the bytes to a string (delaying if we don't yet have a valid unicode string) lm.token_count += chunk.new_token_count diff --git a/guidance/models/transformers/_transformers.py b/guidance/models/transformers/_transformers.py index 4de21be5b..63e8fe4a9 100644 --- a/guidance/models/transformers/_transformers.py +++ b/guidance/models/transformers/_transformers.py @@ -269,7 +269,7 @@ def get_logits(self, token_ids, forced_bytes, current_temp): model_out.logits[0, -1, : len(self.tokenizer.tokens)].cpu().numpy() ) - return self._cached_logits + return self._cached_logits, len(new_token_ids) class Transformers(Model): diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py index 1e1a0855b..577762460 100644 --- a/tests/library/test_gen.py +++ b/tests/library/test_gen.py @@ -138,8 +138,11 @@ def test_metrics_alt_expressions(selected_model: models.Model): assert str(lm) == str(lm2) assert lm.engine_metrics.generated_tokens == 10 assert lm2.engine_metrics.generated_tokens == 10 - assert lm.engine_metrics.forced_tokens == 0 - assert lm2.engine_metrics.forced_tokens == 0 + + assert ( + lm.engine_metrics.forced_tokens + lm.engine_metrics.model_input_tokens + == lm2.engine_metrics.forced_tokens + lm2.engine_metrics.model_input_tokens + ) def test_unicode(selected_model): From 216a5de6f326801544dc171d7d855a936e65d6d6 Mon Sep 17 00:00:00 2001 From: "Richard Edgar (Microsoft)" Date: Thu, 9 May 2024 06:20:00 -0400 Subject: [PATCH 22/29] Rethink the metrics --- guidance/models/_guidance_engine_metrics.py | 3 +- guidance/models/_model.py | 42 ++++--------------- guidance/models/transformers/_transformers.py | 2 + 3 files changed, 12 insertions(+), 35 deletions(-) diff --git a/guidance/models/_guidance_engine_metrics.py b/guidance/models/_guidance_engine_metrics.py index 3a773543c..f9688f988 100644 --- a/guidance/models/_guidance_engine_metrics.py +++ b/guidance/models/_guidance_engine_metrics.py @@ -2,6 +2,5 @@ class GuidanceEngineMetrics(BaseModel): - generated_tokens: NonNegativeInt = 0 - forced_tokens: NonNegativeInt = 0 model_input_tokens: NonNegativeInt = 0 + model_output_tokens: NonNegativeInt = 0 diff --git a/guidance/models/_model.py b/guidance/models/_model.py index c6228ce9b..95e9fe995 100644 --- a/guidance/models/_model.py +++ b/guidance/models/_model.py @@ -36,6 +36,8 @@ "Failed to load guidance.cpp, falling back to Python mirror 
implementations..." ) from .. import _cpp as cpp + +from ._guidance_engine_metrics import GuidanceEngineMetrics from .._rust.guidancerust import engine_start from .._utils import softmax, CaptureEvents from .._parser import EarleyCommitParser, Parser @@ -52,8 +54,6 @@ from .. import _serialization_pb2 -from ._guidance_engine_metrics import GuidanceEngineMetrics - if TYPE_CHECKING: from ..library._block import ContextBlock @@ -131,7 +131,6 @@ class EngineCallResponse: capture_groups: dict capture_group_log_probs: dict new_token_count: int - last_model_token_count: int def __init__( self, @@ -141,7 +140,6 @@ def __init__( capture_groups, capture_group_log_probs, new_token_count, - last_model_token_count, ): self.new_bytes = new_bytes self.is_generated = is_generated @@ -149,7 +147,6 @@ def __init__( self.capture_groups = capture_groups self.capture_group_log_probs = capture_group_log_probs self.new_token_count = new_token_count - self.last_model_token_count = last_model_token_count def _to_proto(self): """Converts an EngineCallResponse object to its Protobuf representation. @@ -208,6 +205,8 @@ def __init__(self, tokenizer, compute_log_probs=False): self._token_trie.match = True self._token_trie.match_version = 0 + self.metrics = GuidanceEngineMetrics() + def start(self, parser, grammar, ensure_bos_token=True): """Start processing parser state executed through the grammar. @@ -542,7 +541,6 @@ def next(self, logits): # self._captured_log_prob_data.update(new_captured_log_prob_data) # yield out, self._is_generated, self._new_bytes_prob, self._captured_data, self._captured_log_prob_data, self._token_count - self._last_token_count # note that we don't capture groups until a complete parse right now... - self._token_count += 1 # note we only update this for tokens that emit non-hidden content response_state = ( out, is_generated, @@ -554,6 +552,7 @@ def next(self, logits): self._last_token_count = self._token_count self._hidden_count = 0 + self._token_count += 1 # note we only update this for tokens that emit non-hidden content else: self._hidden_count -= len(new_bytes) @@ -740,9 +739,8 @@ def __call__(self, parser, grammar, ensure_bos_token=True): self.start(parser, grammar, ensure_bos_token) # TODO: remove this after the next release. This verifies that calling Rust works. 
- assert "def" == engine_start("abc", "def", 1) + assert("def" == engine_start("abc", "def", 1)) - last_model_token_count = 0 logits = None while True: is_done, logits_state, response_state = self.next(logits) @@ -758,10 +756,6 @@ def __call__(self, parser, grammar, ensure_bos_token=True): response_new_token_count, ) = response_state - print( - f"{response_is_generated=} {response_new_token_count=} {response_new_bytes=}" - ) - yield EngineCallResponse( new_bytes=response_new_bytes, is_generated=response_is_generated, @@ -769,19 +763,13 @@ def __call__(self, parser, grammar, ensure_bos_token=True): capture_groups=response_capture_groups, capture_group_log_probs=response_capture_group_log_probs, new_token_count=response_new_token_count, - last_model_token_count=last_model_token_count, ) - last_model_token_count = 0 if logits_state is not None: token_ids, forced_bytes, current_temp = logits_state - logits, model_token_count = self.get_logits( - token_ids, forced_bytes, current_temp - ) - last_model_token_count = model_token_count + logits = self.get_logits(token_ids, forced_bytes, current_temp) if is_done: - assert last_model_token_count == 0, "Unyielded input tokens" break def _tokenize_prefix(self, byte_string): @@ -860,7 +848,7 @@ def _cleanup_tokens(self, token_ids, token_byte_positions): return token_ids, token_byte_positions - def get_logits(self, token_ids, forced_bytes, current_temp) -> np.ndarray: + def get_logits(self, token_ids, forced_bytes, current_temp): """A fake method designed to be overriden by subclasses.""" # pretend to extend the KV cache and update the log probs @@ -937,12 +925,6 @@ def __init__(self, engine, echo=True, **kwargs): 0 # used to track the last event streaming call to enable throttling ) - # Metrics for the model - self.engine_metrics = GuidanceEngineMetrics() - - def reset_metrics(self): - self.engine_metrics = GuidanceEngineMetrics() - @property def active_role_end(self): """The default end patterns we should use for `gen` calls. 
@@ -1399,12 +1381,6 @@ def _run_stateless(self, stateless_function, temperature=0.0, top_p=1.0, n=1):
             #     if not self.engine.compute_log_probs:
             #         chunk.new_bytes_prob = 1.0
 
-            if chunk.is_generated:
-                self.engine_metrics.generated_tokens += chunk.new_token_count
-            else:
-                self.engine_metrics.forced_tokens += chunk.new_token_count
-            self.engine_metrics.model_input_tokens += chunk.last_model_token_count
-
             # convert the bytes to a string (delaying if we don't yet have a valid unicode string)
             lm.token_count += chunk.new_token_count
             chunk.new_bytes = delayed_bytes + chunk.new_bytes
@@ -1654,4 +1630,4 @@ def _check_dominated(node, parser, match_version, next_byte_mask):
         parser.pos = curr_pos
         if not child_dominate:
             return False
-    return True
+    return True
\ No newline at end of file
diff --git a/guidance/models/transformers/_transformers.py b/guidance/models/transformers/_transformers.py
index 63e8fe4a9..8c55246ef 100644
--- a/guidance/models/transformers/_transformers.py
+++ b/guidance/models/transformers/_transformers.py
@@ -268,6 +268,8 @@ def get_logits(self, token_ids, forced_bytes, current_temp):
         self._cached_logits = (
             model_out.logits[0, -1, : len(self.tokenizer.tokens)].cpu().numpy()
         )
+        self.metrics.model_input_tokens += len(new_token_ids)
+        self.metrics.model_output_tokens += 1
 
         return self._cached_logits, len(new_token_ids)

From a083f1bbd6fce886ba3765fb797e5ba26c6c7b3c Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Thu, 9 May 2024 06:28:07 -0400
Subject: [PATCH 23/29] Add a reset method

---
 guidance/models/_model.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/guidance/models/_model.py b/guidance/models/_model.py
index 95e9fe995..fdabaf80e 100644
--- a/guidance/models/_model.py
+++ b/guidance/models/_model.py
@@ -207,6 +207,9 @@ def __init__(self, tokenizer, compute_log_probs=False):
 
         self.metrics = GuidanceEngineMetrics()
 
+    def reset_metrics(self):
+        self.metrics = GuidanceEngineMetrics()
+
     def start(self, parser, grammar, ensure_bos_token=True):
         """Start processing parser state executed through the grammar.

From 66a3b05a90e35ee467fd3a7fa4fdf1691d97bcc9 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Thu, 9 May 2024 06:31:18 -0400
Subject: [PATCH 24/29] Undo another change

---
 guidance/models/transformers/_transformers.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/guidance/models/transformers/_transformers.py b/guidance/models/transformers/_transformers.py
index 8c55246ef..30ddd5c49 100644
--- a/guidance/models/transformers/_transformers.py
+++ b/guidance/models/transformers/_transformers.py
@@ -121,10 +121,11 @@ def _tokenizer(self, model, **kwargs):
             ), "You must give a model name when you provide a tokenizer object!"
 
         return tokenizer
-    
+
     def __call__(self, byte_string):
         tokenisation = self._orig_tokenizer(byte_string)
-        return tokenisation['input_ids']
+        return tokenisation["input_ids"]
+
 
 class TransformersEngine(Engine):
     def __init__(self, model, tokenizer, compute_log_probs, **kwargs):
@@ -271,7 +272,7 @@ def get_logits(self, token_ids, forced_bytes, current_temp):
         self.metrics.model_input_tokens += len(new_token_ids)
         self.metrics.model_output_tokens += 1
 
-        return self._cached_logits, len(new_token_ids)
+        return self._cached_logits

From 6af11bada25021a16df1d4c0e0264f7414adbf59 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Thu, 9 May 2024 06:34:37 -0400
Subject: [PATCH 25/29] Fix tests

---
 tests/library/test_gen.py | 94 +++++++++++++++------------------------
 1 file changed, 37 insertions(+), 57 deletions(-)

diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py
index 577762460..4e5361c01 100644
--- a/tests/library/test_gen.py
+++ b/tests/library/test_gen.py
@@ -74,75 +74,53 @@ def test_stop_quote(selected_model):
 
 
 def test_metrics_smoke(selected_model: models.Model):
-    lm = selected_model.copy()
-    lm.reset_metrics()
+    lm = selected_model
+    lm.engine.reset_metrics()
 
     lm += "abcd"
-    print(f"{lm.engine_metrics=}")
+    print(f"{lm.engine.metrics=}")
     lm += gen("first", max_tokens=1)
-    print(f"{lm.engine_metrics=}")
+    print(f"{lm.engine.metrics=}")
     # Can't be sure of exact count due to token healing
     assert (
-        lm.engine_metrics.generated_tokens == 1
-        or lm.engine_metrics.generated_tokens == 2
+        lm.engine.metrics.model_output_tokens == 1
+        or lm.engine.metrics.model_output_tokens == 2
     )
-    assert lm.engine_metrics.forced_tokens == 0
+    assert lm.engine.metrics.model_input_tokens > 1
 
     lm += "fg"
     lm += gen("second", max_tokens=1)
     # Again, trouble with healing
     assert (
-        lm.engine_metrics.generated_tokens >= 2
-        and lm.engine_metrics.generated_tokens <= 4
+        lm.engine.metrics.model_output_tokens == 1
+        or lm.engine.metrics.model_output_tokens == 2
+    )
+    assert (
+        lm.engine.metrics.model_output_tokens >= 2
+        or lm.engine.metrics.model_output_tokens <= 4
     )
-    assert lm.engine_metrics.forced_tokens == 0
 
 
 def test_metrics_select(selected_model: models.Model):
-    lm = selected_model.copy()
-    lm.reset_metrics()
-
-    lm += "This is a great day to "
-    lm += select(["ride a bike", "row a boat", "go for a swim"])
-    print(f"lm={str(lm)}")
-    print(f"{lm.engine_metrics=}")
-    assert lm.engine_metrics.forced_tokens > 0
-    assert lm.engine_metrics.generated_tokens > 0
-    assert lm.engine_metrics.forced_tokens > lm.engine_metrics.generated_tokens
-    prev_stats = lm.engine_metrics.copy()
-    lm += " and afterwards "
-    lm += select(["walk to town", "walk to a show"])
-    print(f"lm={str(lm)}")
-    print(f"{lm.engine_metrics=}")
-    assert lm.engine_metrics.forced_tokens > prev_stats.forced_tokens
-    assert lm.engine_metrics.generated_tokens > prev_stats.generated_tokens
-
-
-def test_metrics_alt_expressions(selected_model: models.Model):
-    lm = selected_model.copy()
-    lm2 = selected_model.copy()
-    lm.reset_metrics()
-    lm2.reset_metrics()
-
-    prompt = "abcdefg"
-
-    lm += prompt + gen(max_tokens=10)
-    print(f"\nlm={str(lm)}")
-    print(f"{lm.engine_metrics=}\n")
-
-    lm2 += prompt
-    lm2 += gen(max_tokens=10)
-    print(f"\nlm2={str(lm2)}")
-    print(f"{lm2.engine_metrics=}\n")
-
-    assert str(lm) == str(lm2)
-    assert lm.engine_metrics.generated_tokens == 10
-    assert lm2.engine_metrics.generated_tokens == 10
-
-    assert (
-        lm.engine_metrics.forced_tokens + lm.engine_metrics.model_input_tokens
-        == lm2.engine_metrics.forced_tokens + lm2.engine_metrics.model_input_tokens
+    lm = selected_model
+    lm.engine.reset_metrics()
+
+    lm += "I will "
+    lm += select(
+        [
+            "ride a bicycle down the road",
+            "row in a boat along the river",
+            "go for a swim in the ocean",
+        ]
     )
+    print(f"lm={str(lm)}")
+    print(f"{lm.engine.metrics=}")
+    assert lm.engine.metrics.model_input_tokens > 1
+    assert lm.engine.metrics.model_output_tokens > 0
+    # Guidance should be able to force the generation after only a couple of tokens
+    # so even though the options are long, relatively few output tokens should be
+    # needed
+    assert lm.engine.metrics.model_input_tokens > lm.engine.metrics.model_output_tokens
 
 
 def test_unicode(selected_model):
@@ -159,14 +137,16 @@ def test_unicode(selected_model):
 
 def test_unicode2(selected_model: models.Model):
     lm = selected_model
-    lm.reset_metrics()
+    lm.engine.reset_metrics()
     prompt = "Janet’s ducks lay 16 eggs per day"
     lm += prompt + gen(max_tokens=10)
+    assert lm.engine.metrics.model_input_tokens > 1
+    # Due to token healing, we can't be sure of the
+    # precise output count
     assert (
-        lm.engine_metrics.generated_tokens == 10
-        or lm.engine_metrics.generated_tokens == 11
+        lm.engine.metrics.model_output_tokens == 10
+        or lm.engine.metrics.model_output_tokens == 11
     )

From b25381e6100e34c8018f38988f43cb9ee643109e Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Thu, 9 May 2024 06:40:56 -0400
Subject: [PATCH 26/29] Better name

---
 guidance/models/_guidance_engine_metrics.py   |  4 ++--
 guidance/models/transformers/_transformers.py |  4 ++--
 tests/library/test_gen.py                     | 26 +++++++++----------
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/guidance/models/_guidance_engine_metrics.py b/guidance/models/_guidance_engine_metrics.py
index f9688f988..cc2c36cfb 100644
--- a/guidance/models/_guidance_engine_metrics.py
+++ b/guidance/models/_guidance_engine_metrics.py
@@ -2,5 +2,5 @@
 
 
 class GuidanceEngineMetrics(BaseModel):
-    model_input_tokens: NonNegativeInt = 0
-    model_output_tokens: NonNegativeInt = 0
+    engine_input_tokens: NonNegativeInt = 0
+    engine_output_tokens: NonNegativeInt = 0
diff --git a/guidance/models/transformers/_transformers.py b/guidance/models/transformers/_transformers.py
index 30ddd5c49..5f395db61 100644
--- a/guidance/models/transformers/_transformers.py
+++ b/guidance/models/transformers/_transformers.py
@@ -269,8 +269,8 @@ def get_logits(self, token_ids, forced_bytes, current_temp):
         self._cached_logits = (
             model_out.logits[0, -1, : len(self.tokenizer.tokens)].cpu().numpy()
         )
-        self.metrics.model_input_tokens += len(new_token_ids)
-        self.metrics.model_output_tokens += 1
+        self.metrics.engine_input_tokens += len(new_token_ids)
+        self.metrics.engine_output_tokens += 1
 
         return self._cached_logits
diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py
index 4e5361c01..379fd65fc 100644
--- a/tests/library/test_gen.py
+++ b/tests/library/test_gen.py
@@ -83,21 +83,21 @@ def test_metrics_smoke(selected_model: models.Model):
     print(f"{lm.engine.metrics=}")
     # Can't be sure of exact count due to token healing
     assert (
-        lm.engine.metrics.model_output_tokens == 1
-        or lm.engine.metrics.model_output_tokens == 2
+        lm.engine.metrics.engine_output_tokens == 1
+        or lm.engine.metrics.engine_output_tokens == 2
     )
-    assert lm.engine.metrics.model_input_tokens > 1
+    assert lm.engine.metrics.engine_input_tokens > 1
 
     lm += "fg"
     lm += gen("second", max_tokens=1)
     # Again, trouble with healing
     assert (
-        lm.engine.metrics.model_output_tokens == 1
-        or lm.engine.metrics.model_output_tokens == 2
+        lm.engine.metrics.engine_output_tokens == 1
+        or lm.engine.metrics.engine_output_tokens == 2
     )
     assert (
-        lm.engine.metrics.model_output_tokens >= 2
-        or lm.engine.metrics.model_output_tokens <= 4
+        lm.engine.metrics.engine_output_tokens >= 2
+        or lm.engine.metrics.engine_output_tokens <= 4
     )
@@ -115,12 +115,12 @@ def test_metrics_select(selected_model: models.Model):
     )
     print(f"lm={str(lm)}")
     print(f"{lm.engine.metrics=}")
-    assert lm.engine.metrics.model_input_tokens > 1
-    assert lm.engine.metrics.model_output_tokens > 0
+    assert lm.engine.metrics.engine_input_tokens > 1
+    assert lm.engine.metrics.engine_output_tokens > 0
     # Guidance should be able to force the generation after only a couple of tokens
     # so even though the options are long, relatively few output tokens should be
     # needed
-    assert lm.engine.metrics.model_input_tokens > lm.engine.metrics.model_output_tokens
+    assert lm.engine.metrics.engine_input_tokens > lm.engine.metrics.engine_output_tokens
 
 
 def test_unicode(selected_model):
@@ -140,12 +140,12 @@ def test_unicode2(selected_model: models.Model):
     lm.engine.reset_metrics()
     prompt = "Janet’s ducks lay 16 eggs per day"
     lm += prompt + gen(max_tokens=10)
-    assert lm.engine.metrics.model_input_tokens > 1
+    assert lm.engine.metrics.engine_input_tokens > 1
     # Due to token healing, we can't be sure of the
     # precise output count
     assert (
-        lm.engine.metrics.model_output_tokens == 10
-        or lm.engine.metrics.model_output_tokens == 11
+        lm.engine.metrics.engine_output_tokens == 10
+        or lm.engine.metrics.engine_output_tokens == 11
     )

From 268d4a0d71ea6d4424c080d495bd4729704284f1 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Thu, 9 May 2024 16:03:06 -0400
Subject: [PATCH 27/29] Don't have things for common_chat_testing yet

---
 tests/models/common_chat_testing.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/models/common_chat_testing.py b/tests/models/common_chat_testing.py
index e7f1d5c34..99c45e860 100644
--- a/tests/models/common_chat_testing.py
+++ b/tests/models/common_chat_testing.py
@@ -16,9 +16,6 @@ def smoke_chat(lm: models.Chat, has_system_role: bool = True):
     print(str(lm))
     assert len(lm["text"]) > 0
     assert str(lm).endswith("Pick a number: <|im_end|>")
-    assert lm.engine_metrics.prompt_tokens > 0
-    assert lm.engine_metrics.generated_tokens > 0
-    assert lm.engine_metrics.generated_tokens <= 10
 
 
 def longer_chat_1(lm: models.Chat, has_system_role: bool = True):

From 4d851b07690f6568bb6071637449ef9e4157bed6 Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)"
Date: Thu, 9 May 2024 16:04:44 -0400
Subject: [PATCH 28/29] Hook metrics into llamacpp

---
 guidance/models/llama_cpp/_llama_cpp.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/guidance/models/llama_cpp/_llama_cpp.py b/guidance/models/llama_cpp/_llama_cpp.py
index dba6f0738..34231aae1 100644
--- a/guidance/models/llama_cpp/_llama_cpp.py
+++ b/guidance/models/llama_cpp/_llama_cpp.py
@@ -193,9 +193,12 @@ def get_logits(self, token_ids, forced_bytes, current_temp):
                 batch.logits[n_tokens - 1] = True
 
                 ret = llama_cpp.llama_decode(self.model_obj.ctx, batch)
+                self.metrics.engine_input_tokens += n_tokens
                 if ret != 0:
                     raise Exception(f"Call to llama_cpp.llama_decode returned {ret}.")
 
+        self.metrics.engine_output_tokens += 1
+
         # get the logits
         logits = llama_cpp.llama_get_logits(self.model_obj.ctx)
         if llama_cpp.__version__ < "0.2.58":

From d860cb2310a66395a3f3269fd6f84791e6725a2f Mon Sep 17 00:00:00 2001
From: "Richard Edgar (Microsoft)" Date: Thu, 9 May 2024 16:24:03 -0400 Subject: [PATCH 29/29] Fix test --- tests/library/test_gen.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/library/test_gen.py b/tests/library/test_gen.py index 379fd65fc..61b0296fb 100644 --- a/tests/library/test_gen.py +++ b/tests/library/test_gen.py @@ -86,19 +86,17 @@ def test_metrics_smoke(selected_model: models.Model): lm.engine.metrics.engine_output_tokens == 1 or lm.engine.metrics.engine_output_tokens == 2 ) - assert lm.engine.metrics.engine_input_tokens > 1 + assert lm.engine.metrics.engine_input_tokens >= 1 + last_input_tokens = lm.engine.metrics.engine_input_tokens lm += "fg" lm += gen("second", max_tokens=1) # Again, trouble with healing - assert ( - lm.engine.metrics.engine_output_tokens == 1 - or lm.engine.metrics.engine_output_tokens == 2 - ) assert ( lm.engine.metrics.engine_output_tokens >= 2 or lm.engine.metrics.engine_output_tokens <= 4 ) + assert lm.engine.metrics.engine_input_tokens > last_input_tokens def test_metrics_select(selected_model: models.Model): @@ -120,7 +118,9 @@ def test_metrics_select(selected_model: models.Model): # Guidance should be able to force the generation after only a couple of tokens # so even though the options are long, relatively few output tokens should be # needed - assert lm.engine.metrics.engine_input_tokens > lm.engine.metrics.engine_output_tokens + assert ( + lm.engine.metrics.engine_input_tokens > lm.engine.metrics.engine_output_tokens + ) def test_unicode(selected_model):