diff --git a/.github/workflows/action_gpu_basic_tests.yml b/.github/workflows/action_gpu_basic_tests.yml index 97190ec34..5b5a09bad 100644 --- a/.github/workflows/action_gpu_basic_tests.yml +++ b/.github/workflows/action_gpu_basic_tests.yml @@ -62,7 +62,7 @@ jobs: pip install accelerate echo "==============================" pip uninstall -y llama-cpp-python - CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84" + CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6" - name: Check GPU available run: | python -c "import torch; assert torch.cuda.is_available()" diff --git a/.github/workflows/action_plain_basic_tests.yml b/.github/workflows/action_plain_basic_tests.yml index 789a53ab9..0a2d0b6c3 100644 --- a/.github/workflows/action_plain_basic_tests.yml +++ b/.github/workflows/action_plain_basic_tests.yml @@ -40,7 +40,7 @@ jobs: pip install sentencepiece echo "==============================" pip uninstall -y llama-cpp-python - pip install "llama-cpp-python!=0.2.58,!=0.2.79,!=0.2.84" + pip install "llama-cpp-python!=0.2.58,!=0.2.79,!=0.2.84,!=0.3.6" echo "==============================" pip uninstall -y transformers pip install "transformers!=4.43.0,!=4.43.1,!=4.43.2,!=4.43.3" # Issue 965 diff --git a/.github/workflows/ci_tests.yml b/.github/workflows/ci_tests.yml index 1911f99fe..54453442d 100644 --- a/.github/workflows/ci_tests.yml +++ b/.github/workflows/ci_tests.yml @@ -57,7 +57,7 @@ jobs: - name: GPU pip installs run: | pip install accelerate - CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84" + CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6" - name: Check GPU available run: | python -c "import torch; assert torch.cuda.is_available()" @@ -153,7 +153,7 @@ jobs: echo "======================" nvcc --version echo "======================" - CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75" + CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.3.6" - name: Check GPU available run: | python -c "import torch; assert torch.cuda.is_available()" diff --git a/.github/workflows/notebook_tests.yml b/.github/workflows/notebook_tests.yml index d2b19e622..fd217d321 100644 --- a/.github/workflows/notebook_tests.yml +++ b/.github/workflows/notebook_tests.yml @@ -60,7 +60,7 @@ jobs: - name: GPU pip installs run: | pip install accelerate - CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84" + CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6" - name: Check GPU available run: | python -c "import torch; assert torch.cuda.is_available()" diff --git a/.github/workflows/pypi_upload.yml b/.github/workflows/pypi_upload.yml index e3fa04eac..a1029a08f 100644 --- a/.github/workflows/pypi_upload.yml +++ b/.github/workflows/pypi_upload.yml @@ -31,8 +31,6 @@ jobs: { cibuildwheel --print-build-identifiers --platform linux --archs x86_64 \ | jq -nRc '{"only": inputs, "os": "ubuntu-latest"}' \ - && cibuildwheel --print-build-identifiers --platform macos --archs x86_64 \ - | jq -nRc '{"only": inputs, "os": "macos-14"}' \ && cibuildwheel --print-build-identifiers --platform macos --archs arm64 \ | jq -nRc '{"only": inputs, "os": "macos-latest"}' \ && cibuildwheel --print-build-identifiers --platform windows --archs auto64 \ diff --git a/.github/workflows/workflow-pr-gate.yml b/.github/workflows/workflow-pr-gate.yml index a1e3cfb70..f1ae74a65 100644 --- a/.github/workflows/workflow-pr-gate.yml +++ b/.github/workflows/workflow-pr-gate.yml @@ -14,7 +14,7 @@ jobs: bare-install: strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-14] + os: [ubuntu-latest, windows-latest, macos-latest] python-version: ["3.9", "3.10", "3.11", "3.12"] runs-on: ${{ matrix.os }} steps: @@ -113,28 +113,8 @@ jobs: # Third Stage ============================================================== # Windows and MacOS, plus other GPU Linux tests - - basic-tests-mac-x86: - needs: end-stage-2 - strategy: - fail-fast: false # Don't cancel all on first failure - matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] - model: - - "transformers_gpt2_cpu" - - "transformers_phi2_cpu" - # - "transformers_mistral_7b_cpu" See Issue 713 - - "llamacpp_llama2_7b_cpu" - - "llamacpp_mistral_7b_cpu" - # - "transformers_phi3_mini_4k_instruct_cpu" Gives trouble on MacOS - - "llamacpp_phi3_mini_4k_instruct_cpu" - uses: ./.github/workflows/action_plain_basic_tests.yml - with: - os: macos-14 - python-version: ${{ matrix.python-version }} - model: ${{ matrix.model }} - basic-tests-mac-arm: + basic-tests-mac: needs: end-stage-2 strategy: fail-fast: false # Don't cancel all on first failure diff --git a/guidance/__init__.py b/guidance/__init__.py index 7fec48a57..c8de13295 100644 --- a/guidance/__init__.py +++ b/guidance/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.2.0rc1" +__version__ = "0.2.0" import sys import types diff --git a/guidance/models/transformers/_transformers.py b/guidance/models/transformers/_transformers.py index 19b73a038..93edd531e 100644 --- a/guidance/models/transformers/_transformers.py +++ b/guidance/models/transformers/_transformers.py @@ -409,7 +409,7 @@ def __init__(self, self.model = model.__class__.__name__ self.device = self.model_obj.device # otherwise note the current device - self._past_key_values = None + self._past_key_values: Union[transformers_package.Cache, tuple[tuple[torch.Tensor, ...], tuple[torch.Tensor, ...]], None] = None self._cached_logits = None self._cached_token_ids: list[int] = [] @@ -479,13 +479,66 @@ def get_logits(self, token_ids): # reset the cache length according to that number of positions past_key_values = self._past_key_values - past_length = past_key_values[0][0].size(-2) if past_key_values is not None else 0 - if past_length > num_cached: - # note we recompute the last token because we don't bother to handle the special case of just computing logits + max_cache_shape = None + if past_key_values is None: + past_length = 0 + elif isinstance(past_key_values, tuple): + past_length = past_key_values[0][0].size(-2) + elif isinstance(past_key_values, transformers_package.Cache): + # TODO: use model's `cache_position` as this may be deprecated in a future version + # https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/cache_utils.py#L64 + past_length = past_key_values.get_seq_length() + # TODO: use `get_max_cache_shape` as `get_max_length` will be deprecated in a future version + # (`get_max_cache_shape` is not yet available so we can't use it yet) + # https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/cache_utils.py#L67 + max_cache_shape = past_key_values.get_max_length() + else: + raise TypeError(f"Unknown type of past_key_values: {type(past_key_values)}") + + if max_cache_shape is not None and len(token_ids) > max_cache_shape: + # TODO: this seems to get set to the length of the first sequence we pass for models using + # StaticCache or HybridCache. We need to initialize our own cache with a large enough size + # if we want to continue generation with the same cache. + if isinstance(past_key_values, (transformers_package.StaticCache, transformers_package.HybridCache)): + # The __init__ API isn't consistent between different cache types, but there seems to be consistency + # between these two types, so we can use the same logic for both. + warnings.warn("Cache is too small. Re-initializing cache with larger size.") + cache_type = type(past_key_values) + config = self.model_obj.config + device = self.model_obj.device + hf_device_map = getattr(self.model_obj, "hf_device_map", {}) + # hf_device_map is not always a complete mapping of layers to devices... + layer_device_map = {k: hf_device_map.get(k, device) for k in range(config.num_hidden_layers)} + self._past_key_values = cache_type( + config=config, + batch_size=past_key_values.batch_size, + # Double the cache size to be safe + max_cache_len=len(token_ids)*2, + dtype=past_key_values.dtype, + layer_device_map=layer_device_map, + ) + else: + warnings.warn(f"Cache is too small. Resetting cache (no method implemented to resize cache for type {type(past_key_values)}).") + self._past_key_values = None + past_length = 0 + elif past_length > num_cached: past_length = max(0, num_cached - 1) - self._past_key_values = tuple( - tuple(p[..., :past_length, :] for p in v) for v in past_key_values - ) + if isinstance(past_key_values, tuple): + self._past_key_values = tuple( + tuple(p[..., :past_length, :] for p in v) for v in past_key_values + ) + else: + if hasattr(past_key_values, "crop"): + self._past_key_values.crop(past_length) + else: + warnings.warn(f"Cropping unsupported for cache type: {type(self._past_key_values)}. Resetting cache.") + if hasattr(self._past_key_values, "reset"): + # Use built-in reset method if available to avoid constructing/allocating a new cache + self._past_key_values.reset() + else: + self._past_key_values = None + past_length = 0 + cache_token_ids[past_length:] = [] # call the model