From 46340aa58b51a0714066a9faeba18c6cb2128f34 Mon Sep 17 00:00:00 2001
From: Harsha Nori
Date: Tue, 7 Jan 2025 12:33:24 -0800
Subject: [PATCH 1/4] Update to version 0.2.0

---
 guidance/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/guidance/__init__.py b/guidance/__init__.py
index 7fec48a57..c8de13295 100644
--- a/guidance/__init__.py
+++ b/guidance/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.2.0rc1"
+__version__ = "0.2.0"
 
 import sys
 import types
From 71f1a6876cfddf68e9676a3d6b6bfec31f282f40 Mon Sep 17 00:00:00 2001
From: JC1DA
Date: Mon, 13 Jan 2025 13:51:05 -0800
Subject: [PATCH 2/4] Exclude llama-cpp-python 0.3.6 in testcases (#1096)

The latest llama-cpp-python release (0.3.6) breaks some of our test cases.

Relevant upstream issue: https://github.com/ggerganov/llama.cpp/issues/11197
(different results are returned for the same prompt at temperature 0).

Temporarily exclude this version until the issue is fixed.

Signed-off-by: Loc Huynh
Co-authored-by: Loc Huynh
---
 .github/workflows/action_gpu_basic_tests.yml   | 2 +-
 .github/workflows/action_plain_basic_tests.yml | 2 +-
 .github/workflows/ci_tests.yml                 | 4 ++--
 .github/workflows/notebook_tests.yml           | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/action_gpu_basic_tests.yml b/.github/workflows/action_gpu_basic_tests.yml
index 97190ec34..5b5a09bad 100644
--- a/.github/workflows/action_gpu_basic_tests.yml
+++ b/.github/workflows/action_gpu_basic_tests.yml
@@ -62,7 +62,7 @@ jobs:
         pip install accelerate
         echo "=============================="
         pip uninstall -y llama-cpp-python
-        CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84"
+        CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6"
     - name: Check GPU available
       run: |
         python -c "import torch; assert torch.cuda.is_available()"
diff --git a/.github/workflows/action_plain_basic_tests.yml b/.github/workflows/action_plain_basic_tests.yml
index 789a53ab9..0a2d0b6c3 100644
--- a/.github/workflows/action_plain_basic_tests.yml
+++ b/.github/workflows/action_plain_basic_tests.yml
@@ -40,7 +40,7 @@ jobs:
         pip install sentencepiece
         echo "=============================="
         pip uninstall -y llama-cpp-python
-        pip install "llama-cpp-python!=0.2.58,!=0.2.79,!=0.2.84"
+        pip install "llama-cpp-python!=0.2.58,!=0.2.79,!=0.2.84,!=0.3.6"
         echo "=============================="
         pip uninstall -y transformers
         pip install "transformers!=4.43.0,!=4.43.1,!=4.43.2,!=4.43.3" # Issue 965
diff --git a/.github/workflows/ci_tests.yml b/.github/workflows/ci_tests.yml
index 1911f99fe..54453442d 100644
--- a/.github/workflows/ci_tests.yml
+++ b/.github/workflows/ci_tests.yml
@@ -57,7 +57,7 @@ jobs:
     - name: GPU pip installs
       run: |
         pip install accelerate
-        CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84"
+        CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6"
     - name: Check GPU available
       run: |
         python -c "import torch; assert torch.cuda.is_available()"
@@ -153,7 +153,7 @@ jobs:
         echo "======================"
         nvcc --version
         echo "======================"
-        CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75"
+        CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.3.6"
     - name: Check GPU available
       run: |
         python -c "import torch; assert torch.cuda.is_available()"
diff --git a/.github/workflows/notebook_tests.yml b/.github/workflows/notebook_tests.yml
index d2b19e622..fd217d321 100644
--- a/.github/workflows/notebook_tests.yml
+++ b/.github/workflows/notebook_tests.yml
@@ -60,7 +60,7 @@ jobs:
     - name: GPU pip installs
       run: |
         pip install accelerate
-        CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84"
+        CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6"
     - name: Check GPU available
       run: |
         python -c "import torch; assert torch.cuda.is_available()"
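
Aside: the exclusion above is enforced purely through pip version specifiers at install time. A minimal sketch of an equivalent test-side guard (hypothetical, not part of this patch; it assumes `llama_cpp` exposes a `__version__` attribute, as recent llama-cpp-python releases do) could look like this:

    import pytest

    try:
        import llama_cpp
        LLAMA_CPP_VERSION = getattr(llama_cpp, "__version__", None)  # assumed attribute
    except ImportError:
        LLAMA_CPP_VERSION = None  # llama-cpp-python not installed at all

    # Skip determinism-sensitive tests on the release known to return different
    # results for the same prompt at temperature 0 (ggerganov/llama.cpp#11197).
    requires_deterministic_llama_cpp = pytest.mark.skipif(
        LLAMA_CPP_VERSION == "0.3.6",
        reason="llama-cpp-python 0.3.6 is non-deterministic at temperature 0",
    )

    @requires_deterministic_llama_cpp
    def test_greedy_generation_is_reproducible():
        ...

Pinning at install time, as the workflows above do, keeps CI environments consistent; a skip marker like this would only soften failures if the broken version were installed anyway.
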
From aaa5f00ff9aa12531264e2ff0d029bf727f634a4 Mon Sep 17 00:00:00 2001
From: Hudson Cooper
Date: Wed, 15 Jan 2025 12:38:28 -0800
Subject: [PATCH 3/4] [Bug] HybridCache not subscriptable (#1047)

Use the transformers `Cache` interface rather than legacy tuples (which are
still supported for backwards compatibility).

- When we overflow the size of a StaticCache or HybridCache, reallocate a
  cache with double the size.
- Other fixed-size caches just raise a warning and drop the cache until we
  adapt the doubling logic to those cache types.
- Use `Cache.crop` when available for backtracking the cache.
- When `Cache.crop` is unavailable, try `Cache.reset` to avoid reallocation,
  finally falling back on deleting the cache.
---
 guidance/models/transformers/_transformers.py | 67 +++++++++++++++++--
 1 file changed, 60 insertions(+), 7 deletions(-)

diff --git a/guidance/models/transformers/_transformers.py b/guidance/models/transformers/_transformers.py
index 19b73a038..93edd531e 100644
--- a/guidance/models/transformers/_transformers.py
+++ b/guidance/models/transformers/_transformers.py
@@ -409,7 +409,7 @@ def __init__(self,
         self.model = model.__class__.__name__
         self.device = self.model_obj.device # otherwise note the current device
 
-        self._past_key_values = None
+        self._past_key_values: Union[transformers_package.Cache, tuple[tuple[torch.Tensor, ...], tuple[torch.Tensor, ...]], None] = None
         self._cached_logits = None
         self._cached_token_ids: list[int] = []
 
@@ -479,13 +479,66 @@ def get_logits(self, token_ids):
 
         # reset the cache length according to that number of positions
         past_key_values = self._past_key_values
-        past_length = past_key_values[0][0].size(-2) if past_key_values is not None else 0
-        if past_length > num_cached:
-            # note we recompute the last token because we don't bother to handle the special case of just computing logits
+        max_cache_shape = None
+        if past_key_values is None:
+            past_length = 0
+        elif isinstance(past_key_values, tuple):
+            past_length = past_key_values[0][0].size(-2)
+        elif isinstance(past_key_values, transformers_package.Cache):
+            # TODO: use model's `cache_position` as this may be deprecated in a future version
+            # https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/cache_utils.py#L64
+            past_length = past_key_values.get_seq_length()
+            # TODO: use `get_max_cache_shape` as `get_max_length` will be deprecated in a future version
+            # (`get_max_cache_shape` is not yet available so we can't use it yet)
+            # https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/cache_utils.py#L67
+            max_cache_shape = past_key_values.get_max_length()
+        else:
+            raise TypeError(f"Unknown type of past_key_values: {type(past_key_values)}")
+
+        if max_cache_shape is not None and len(token_ids) > max_cache_shape:
+            # TODO: this seems to get set to the length of the first sequence we pass for models using
+            # StaticCache or HybridCache. We need to initialize our own cache with a large enough size
+            # if we want to continue generation with the same cache.
+            if isinstance(past_key_values, (transformers_package.StaticCache, transformers_package.HybridCache)):
+                # The __init__ API isn't consistent between different cache types, but there seems to be consistency
+                # between these two types, so we can use the same logic for both.
+                warnings.warn("Cache is too small. Re-initializing cache with larger size.")
+                cache_type = type(past_key_values)
+                config = self.model_obj.config
+                device = self.model_obj.device
+                hf_device_map = getattr(self.model_obj, "hf_device_map", {})
+                # hf_device_map is not always a complete mapping of layers to devices...
+                layer_device_map = {k: hf_device_map.get(k, device) for k in range(config.num_hidden_layers)}
+                self._past_key_values = cache_type(
+                    config=config,
+                    batch_size=past_key_values.batch_size,
+                    # Double the cache size to be safe
+                    max_cache_len=len(token_ids)*2,
+                    dtype=past_key_values.dtype,
+                    layer_device_map=layer_device_map,
+                )
+            else:
+                warnings.warn(f"Cache is too small. Resetting cache (no method implemented to resize cache for type {type(past_key_values)}).")
+                self._past_key_values = None
+            past_length = 0
+        elif past_length > num_cached:
             past_length = max(0, num_cached - 1)
-            self._past_key_values = tuple(
-                tuple(p[..., :past_length, :] for p in v) for v in past_key_values
-            )
+            if isinstance(past_key_values, tuple):
+                self._past_key_values = tuple(
+                    tuple(p[..., :past_length, :] for p in v) for v in past_key_values
+                )
+            else:
+                if hasattr(past_key_values, "crop"):
+                    self._past_key_values.crop(past_length)
+                else:
+                    warnings.warn(f"Cropping unsupported for cache type: {type(self._past_key_values)}. Resetting cache.")
+                    if hasattr(self._past_key_values, "reset"):
+                        # Use built-in reset method if available to avoid constructing/allocating a new cache
+                        self._past_key_values.reset()
+                    else:
+                        self._past_key_values = None
+                    past_length = 0
+
         cache_token_ids[past_length:] = []
 
         # call the model
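
Aside: as a standalone illustration of the backtracking fallback chain this patch implements (crop the cache if supported, otherwise reset it in place, otherwise discard it), a simplified sketch follows. `shrink_cache` is a hypothetical helper, not part of the guidance codebase, and it assumes a transformers version that exports the `Cache` base class at the top level (as the patch's use of `transformers_package.Cache` implies):

    import warnings
    from typing import Optional

    from transformers import Cache


    def shrink_cache(cache: Optional[Cache], keep_length: int) -> Optional[Cache]:
        """Best-effort backtrack of a KV cache to `keep_length` positions.

        Mirrors the fallback order used in the patch: prefer `crop`, then `reset`
        (which keeps the allocation but empties it), and finally drop the cache
        so it gets rebuilt on the next forward pass.
        """
        if cache is None:
            return None
        if hasattr(cache, "crop"):
            cache.crop(keep_length)  # e.g. DynamicCache implements crop()
            return cache
        if hasattr(cache, "reset"):
            warnings.warn(f"Cropping unsupported for {type(cache).__name__}; resetting instead.")
            cache.reset()  # e.g. StaticCache implements reset()
            return cache
        warnings.warn(f"No crop or reset available for {type(cache).__name__}; discarding cache.")
        return None

In the patch itself this logic runs inline on `self._past_key_values`, and `past_length` is zeroed whenever the cache is reset or discarded so the model recomputes from the start.
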
From 87bc0a87da7197f80ecba774200fc7e511448411 Mon Sep 17 00:00:00 2001
From: Hudson Cooper
Date: Thu, 16 Jan 2025 13:35:07 -0800
Subject: [PATCH 4/4] Drop macos-14 runner as it's no longer x86 (#1098)

No need to use the macos-14 runner as it's not actually testing x86 anymore.
Switching to just the macos-latest runner (silicon).
---
 .github/workflows/pypi_upload.yml      |  2 --
 .github/workflows/workflow-pr-gate.yml | 24 ++----------------------
 2 files changed, 2 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/pypi_upload.yml b/.github/workflows/pypi_upload.yml
index e3fa04eac..a1029a08f 100644
--- a/.github/workflows/pypi_upload.yml
+++ b/.github/workflows/pypi_upload.yml
@@ -31,8 +31,6 @@ jobs:
           {
             cibuildwheel --print-build-identifiers --platform linux --archs x86_64 \
             | jq -nRc '{"only": inputs, "os": "ubuntu-latest"}' \
-            && cibuildwheel --print-build-identifiers --platform macos --archs x86_64 \
-            | jq -nRc '{"only": inputs, "os": "macos-14"}' \
             && cibuildwheel --print-build-identifiers --platform macos --archs arm64 \
             | jq -nRc '{"only": inputs, "os": "macos-latest"}' \
             && cibuildwheel --print-build-identifiers --platform windows --archs auto64 \
diff --git a/.github/workflows/workflow-pr-gate.yml b/.github/workflows/workflow-pr-gate.yml
index a1e3cfb70..f1ae74a65 100644
--- a/.github/workflows/workflow-pr-gate.yml
+++ b/.github/workflows/workflow-pr-gate.yml
@@ -14,7 +14,7 @@ jobs:
   bare-install:
     strategy:
       matrix:
-        os: [ubuntu-latest, windows-latest, macos-14]
+        os: [ubuntu-latest, windows-latest, macos-latest]
         python-version: ["3.9", "3.10", "3.11", "3.12"]
     runs-on: ${{ matrix.os }}
     steps:
@@ -113,28 +113,8 @@ jobs:
 # Third Stage ==============================================================
 # Windows and MacOS, plus other GPU Linux tests
 
-
-  basic-tests-mac-x86:
-    needs: end-stage-2
-    strategy:
-      fail-fast: false # Don't cancel all on first failure
-      matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
-        model:
-          - "transformers_gpt2_cpu"
-          - "transformers_phi2_cpu"
-          # - "transformers_mistral_7b_cpu" See Issue 713
-          - "llamacpp_llama2_7b_cpu"
-          - "llamacpp_mistral_7b_cpu"
-          # - "transformers_phi3_mini_4k_instruct_cpu" Gives trouble on MacOS
-          - "llamacpp_phi3_mini_4k_instruct_cpu"
-    uses: ./.github/workflows/action_plain_basic_tests.yml
-    with:
-      os: macos-14
-      python-version: ${{ matrix.python-version }}
-      model: ${{ matrix.model }}
 
-  basic-tests-mac-arm:
+  basic-tests-mac:
     needs: end-stage-2
     strategy:
       fail-fast: false # Don't cancel all on first failure