Merge branch 'main' into hudson-ai-patch-1
hudson-ai authored Jan 16, 2025
2 parents e718564 + 87bc0a8 commit 4e2a861
Showing 8 changed files with 68 additions and 37 deletions.
.github/workflows/action_gpu_basic_tests.yml (1 addition, 1 deletion)
@@ -62,7 +62,7 @@ jobs:
pip install accelerate
echo "=============================="
pip uninstall -y llama-cpp-python
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84"
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6"
- name: Check GPU available
run: |
python -c "import torch; assert torch.cuda.is_available()"
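As an aside, the workflows below all extend the same comma-separated "!=" exclusion list for llama-cpp-python. A minimal sketch (assuming the third-party packaging library; not part of this commit) of how such a specifier behaves:

    # Illustration only: how a comma-separated "!=" exclusion list is interpreted.
    from packaging.specifiers import SpecifierSet

    spec = SpecifierSet("!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6")
    print("0.3.6" in spec)  # False: the newly excluded release is rejected
    print("0.3.7" in spec)  # True: any other release still satisfies the constraint

pip installs the newest release that satisfies the combined constraint, so each added exclusion simply steers CI away from a known-problematic llama-cpp-python build.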
.github/workflows/action_plain_basic_tests.yml (1 addition, 1 deletion)
@@ -40,7 +40,7 @@ jobs:
pip install sentencepiece
echo "=============================="
pip uninstall -y llama-cpp-python
pip install "llama-cpp-python!=0.2.58,!=0.2.79,!=0.2.84"
pip install "llama-cpp-python!=0.2.58,!=0.2.79,!=0.2.84,!=0.3.6"
echo "=============================="
pip uninstall -y transformers
pip install "transformers!=4.43.0,!=4.43.1,!=4.43.2,!=4.43.3" # Issue 965
.github/workflows/ci_tests.yml (2 additions, 2 deletions)
@@ -57,7 +57,7 @@ jobs:
- name: GPU pip installs
run: |
pip install accelerate
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84"
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6"
- name: Check GPU available
run: |
python -c "import torch; assert torch.cuda.is_available()"
@@ -153,7 +153,7 @@ jobs:
echo "======================"
nvcc --version
echo "======================"
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75"
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.3.6"
- name: Check GPU available
run: |
python -c "import torch; assert torch.cuda.is_available()"
.github/workflows/notebook_tests.yml (1 addition, 1 deletion)
@@ -60,7 +60,7 @@ jobs:
- name: GPU pip installs
run: |
pip install accelerate
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84"
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6"
- name: Check GPU available
run: |
python -c "import torch; assert torch.cuda.is_available()"
.github/workflows/pypi_upload.yml (0 additions, 2 deletions)
@@ -31,8 +31,6 @@ jobs:
{
cibuildwheel --print-build-identifiers --platform linux --archs x86_64 \
| jq -nRc '{"only": inputs, "os": "ubuntu-latest"}' \
&& cibuildwheel --print-build-identifiers --platform macos --archs x86_64 \
| jq -nRc '{"only": inputs, "os": "macos-14"}' \
&& cibuildwheel --print-build-identifiers --platform macos --archs arm64 \
| jq -nRc '{"only": inputs, "os": "macos-latest"}' \
&& cibuildwheel --print-build-identifiers --platform windows --archs auto64 \
.github/workflows/workflow-pr-gate.yml (2 additions, 22 deletions)
@@ -14,7 +14,7 @@ jobs:
bare-install:
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-14]
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: ["3.9", "3.10", "3.11", "3.12"]
runs-on: ${{ matrix.os }}
steps:
@@ -113,28 +113,8 @@ jobs:

# Third Stage ==============================================================
# Windows and MacOS, plus other GPU Linux tests

basic-tests-mac-x86:
needs: end-stage-2
strategy:
fail-fast: false # Don't cancel all on first failure
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
model:
- "transformers_gpt2_cpu"
- "transformers_phi2_cpu"
# - "transformers_mistral_7b_cpu" See Issue 713
- "llamacpp_llama2_7b_cpu"
- "llamacpp_mistral_7b_cpu"
# - "transformers_phi3_mini_4k_instruct_cpu" Gives trouble on MacOS
- "llamacpp_phi3_mini_4k_instruct_cpu"
uses: ./.github/workflows/action_plain_basic_tests.yml
with:
os: macos-14
python-version: ${{ matrix.python-version }}
model: ${{ matrix.model }}

basic-tests-mac-arm:
basic-tests-mac:
needs: end-stage-2
strategy:
fail-fast: false # Don't cancel all on first failure
guidance/__init__.py (1 addition, 1 deletion)
@@ -1,4 +1,4 @@
__version__ = "0.2.0rc1"
__version__ = "0.2.0"

import sys
import types
guidance/models/transformers/_transformers.py (60 additions, 7 deletions)
@@ -409,7 +409,7 @@ def __init__(self,
self.model = model.__class__.__name__
self.device = self.model_obj.device # otherwise note the current device

self._past_key_values = None
self._past_key_values: Union[transformers_package.Cache, tuple[tuple[torch.Tensor, ...], tuple[torch.Tensor, ...]], None] = None
self._cached_logits = None
self._cached_token_ids: list[int] = []

@@ -479,13 +479,66 @@ def get_logits(self, token_ids):

# reset the cache length according to that number of positions
past_key_values = self._past_key_values
past_length = past_key_values[0][0].size(-2) if past_key_values is not None else 0
if past_length > num_cached:
# note we recompute the last token because we don't bother to handle the special case of just computing logits
max_cache_shape = None
if past_key_values is None:
past_length = 0
elif isinstance(past_key_values, tuple):
past_length = past_key_values[0][0].size(-2)
elif isinstance(past_key_values, transformers_package.Cache):
# TODO: use model's `cache_position` as this may be deprecated in a future version
# https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/cache_utils.py#L64
past_length = past_key_values.get_seq_length()
# TODO: use `get_max_cache_shape` as `get_max_length` will be deprecated in a future version
# (`get_max_cache_shape` is not yet available so we can't use it yet)
# https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/cache_utils.py#L67
max_cache_shape = past_key_values.get_max_length()
else:
raise TypeError(f"Unknown type of past_key_values: {type(past_key_values)}")

if max_cache_shape is not None and len(token_ids) > max_cache_shape:
# TODO: this seems to get set to the length of the first sequence we pass for models using
# StaticCache or HybridCache. We need to initialize our own cache with a large enough size
# if we want to continue generation with the same cache.
if isinstance(past_key_values, (transformers_package.StaticCache, transformers_package.HybridCache)):
# The __init__ API isn't consistent between different cache types, but there seems to be consistency
# between these two types, so we can use the same logic for both.
warnings.warn("Cache is too small. Re-initializing cache with larger size.")
cache_type = type(past_key_values)
config = self.model_obj.config
device = self.model_obj.device
hf_device_map = getattr(self.model_obj, "hf_device_map", {})
# hf_device_map is not always a complete mapping of layers to devices...
layer_device_map = {k: hf_device_map.get(k, device) for k in range(config.num_hidden_layers)}
self._past_key_values = cache_type(
config=config,
batch_size=past_key_values.batch_size,
# Double the cache size to be safe
max_cache_len=len(token_ids)*2,
dtype=past_key_values.dtype,
layer_device_map=layer_device_map,
)
else:
warnings.warn(f"Cache is too small. Resetting cache (no method implemented to resize cache for type {type(past_key_values)}).")
self._past_key_values = None
past_length = 0
elif past_length > num_cached:
past_length = max(0, num_cached - 1)
self._past_key_values = tuple(
tuple(p[..., :past_length, :] for p in v) for v in past_key_values
)
if isinstance(past_key_values, tuple):
self._past_key_values = tuple(
tuple(p[..., :past_length, :] for p in v) for v in past_key_values
)
else:
if hasattr(past_key_values, "crop"):
self._past_key_values.crop(past_length)
else:
warnings.warn(f"Cropping unsupported for cache type: {type(self._past_key_values)}. Resetting cache.")
if hasattr(self._past_key_values, "reset"):
# Use built-in reset method if available to avoid constructing/allocating a new cache
self._past_key_values.reset()
else:
self._past_key_values = None
past_length = 0

cache_token_ids[past_length:] = []

# call the model
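The bulk of the _transformers.py change teaches get_logits to handle both the legacy tuple-of-tuples KV cache and the newer transformers Cache objects. Below is a minimal standalone sketch of that type dispatch; it is not the guidance implementation, the helper names cached_length and shrink_to are invented for illustration, and it assumes a transformers version that exposes Cache, DynamicCache, get_seq_length(), and crop():

    from typing import Union

    import torch
    from transformers import Cache, DynamicCache

    def cached_length(past_key_values: Union[Cache, tuple, None]) -> int:
        # Number of positions already cached, for either cache format.
        if past_key_values is None:
            return 0
        if isinstance(past_key_values, tuple):
            # Legacy format: a tuple (per layer) of (key, value) tensors shaped [..., seq_len, head_dim].
            return past_key_values[0][0].size(-2)
        if isinstance(past_key_values, Cache):
            return past_key_values.get_seq_length()
        raise TypeError(f"Unknown type of past_key_values: {type(past_key_values)}")

    def shrink_to(past_key_values, new_length: int):
        # Drop cached positions beyond new_length; discard the cache if it cannot be cropped.
        if isinstance(past_key_values, tuple):
            return tuple(tuple(p[..., :new_length, :] for p in kv) for kv in past_key_values)
        if hasattr(past_key_values, "crop"):  # e.g. DynamicCache
            past_key_values.crop(new_length)
            return past_key_values
        return None

    cache = DynamicCache()
    # Populate one layer with 8 cached positions (batch=1, heads=2, head_dim=4).
    cache.update(torch.randn(1, 2, 8, 4), torch.randn(1, 2, 8, 4), layer_idx=0)
    print(cached_length(cache))  # 8
    cache = shrink_to(cache, 5)
    print(cached_length(cache))  # 5

The StaticCache/HybridCache branch in the diff exists because those caches are allocated with a fixed max_cache_len and cannot simply be cropped, hence the warning and the re-initialization with a doubled cache size.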
