Merge branch 'main' into hudson-ai-patch-1
hudson-ai authored Jan 16, 2025
2 parents e718564 + 87bc0a8 commit 4e2a861
Showing 8 changed files with 68 additions and 37 deletions.
.github/workflows/action_gpu_basic_tests.yml (1 addition, 1 deletion)
@@ -62,7 +62,7 @@ jobs:
pip install accelerate
echo "=============================="
pip uninstall -y llama-cpp-python
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84"
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6"
- name: Check GPU available
run: |
python -c "import torch; assert torch.cuda.is_available()"
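As an aside, the workflows below all extend the same comma-separated "!=" exclusion list for llama-cpp-python. A minimal sketch (assuming the third-party packaging library; not part of this commit) of how such a specifier behaves:

    # Illustration only: how a comma-separated "!=" exclusion list is interpreted.
    from packaging.specifiers import SpecifierSet

    spec = SpecifierSet("!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6")
    print("0.3.6" in spec)  # False: the newly excluded release is rejected
    print("0.3.7" in spec)  # True: any other release still satisfies the constraint

pip installs the newest release that satisfies the combined constraint, so each added exclusion simply steers CI away from a known-problematic llama-cpp-python build.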
.github/workflows/action_plain_basic_tests.yml (1 addition, 1 deletion)
@@ -40,7 +40,7 @@ jobs:
pip install sentencepiece
echo "=============================="
pip uninstall -y llama-cpp-python
pip install "llama-cpp-python!=0.2.58,!=0.2.79,!=0.2.84"
pip install "llama-cpp-python!=0.2.58,!=0.2.79,!=0.2.84,!=0.3.6"
echo "=============================="
pip uninstall -y transformers
pip install "transformers!=4.43.0,!=4.43.1,!=4.43.2,!=4.43.3" # Issue 965
.github/workflows/ci_tests.yml (2 additions, 2 deletions)
@@ -57,7 +57,7 @@ jobs:
- name: GPU pip installs
run: |
pip install accelerate
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84"
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6"
- name: Check GPU available
run: |
python -c "import torch; assert torch.cuda.is_available()"
@@ -153,7 +153,7 @@ jobs:
echo "======================"
nvcc --version
echo "======================"
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75"
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.3.6"
- name: Check GPU available
run: |
python -c "import torch; assert torch.cuda.is_available()"
.github/workflows/notebook_tests.yml (1 addition, 1 deletion)
@@ -60,7 +60,7 @@ jobs:
- name: GPU pip installs
run: |
pip install accelerate
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84"
CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75,!=0.2.84,!=0.3.6"
- name: Check GPU available
run: |
python -c "import torch; assert torch.cuda.is_available()"
.github/workflows/pypi_upload.yml (0 additions, 2 deletions)
@@ -31,8 +31,6 @@ jobs:
{
cibuildwheel --print-build-identifiers --platform linux --archs x86_64 \
| jq -nRc '{"only": inputs, "os": "ubuntu-latest"}' \
&& cibuildwheel --print-build-identifiers --platform macos --archs x86_64 \
| jq -nRc '{"only": inputs, "os": "macos-14"}' \
&& cibuildwheel --print-build-identifiers --platform macos --archs arm64 \
| jq -nRc '{"only": inputs, "os": "macos-latest"}' \
&& cibuildwheel --print-build-identifiers --platform windows --archs auto64 \
.github/workflows/workflow-pr-gate.yml (2 additions, 22 deletions)
@@ -14,7 +14,7 @@ jobs:
bare-install:
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-14]
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: ["3.9", "3.10", "3.11", "3.12"]
runs-on: ${{ matrix.os }}
steps:
@@ -113,28 +113,8 @@ jobs:

# Third Stage ==============================================================
# Windows and MacOS, plus other GPU Linux tests

basic-tests-mac-x86:
needs: end-stage-2
strategy:
fail-fast: false # Don't cancel all on first failure
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
model:
- "transformers_gpt2_cpu"
- "transformers_phi2_cpu"
# - "transformers_mistral_7b_cpu" See Issue 713
- "llamacpp_llama2_7b_cpu"
- "llamacpp_mistral_7b_cpu"
# - "transformers_phi3_mini_4k_instruct_cpu" Gives trouble on MacOS
- "llamacpp_phi3_mini_4k_instruct_cpu"
uses: ./.github/workflows/action_plain_basic_tests.yml
with:
os: macos-14
python-version: ${{ matrix.python-version }}
model: ${{ matrix.model }}

basic-tests-mac-arm:
basic-tests-mac:
needs: end-stage-2
strategy:
fail-fast: false # Don't cancel all on first failure
guidance/__init__.py (1 addition, 1 deletion)
@@ -1,4 +1,4 @@
__version__ = "0.2.0rc1"
__version__ = "0.2.0"

import sys
import types
guidance/models/transformers/_transformers.py (60 additions, 7 deletions)
@@ -409,7 +409,7 @@ def __init__(self,
self.model = model.__class__.__name__
self.device = self.model_obj.device # otherwise note the current device

self._past_key_values = None
self._past_key_values: Union[transformers_package.Cache, tuple[tuple[torch.Tensor, ...], tuple[torch.Tensor, ...]], None] = None
self._cached_logits = None
self._cached_token_ids: list[int] = []

@@ -479,13 +479,66 @@ def get_logits(self, token_ids):

# reset the cache length according to that number of positions
past_key_values = self._past_key_values
past_length = past_key_values[0][0].size(-2) if past_key_values is not None else 0
if past_length > num_cached:
# note we recompute the last token because we don't bother to handle the special case of just computing logits
max_cache_shape = None
if past_key_values is None:
past_length = 0
elif isinstance(past_key_values, tuple):
past_length = past_key_values[0][0].size(-2)
elif isinstance(past_key_values, transformers_package.Cache):
# TODO: use model's `cache_position` as this may be deprecated in a future version
# https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/cache_utils.py#L64
past_length = past_key_values.get_seq_length()
# TODO: use `get_max_cache_shape` as `get_max_length` will be deprecated in a future version
# (`get_max_cache_shape` is not yet available so we can't use it yet)
# https://github.com/huggingface/transformers/blob/70b07d97cf2c5f61fff55700b65528a1b6845cd2/src/transformers/cache_utils.py#L67
max_cache_shape = past_key_values.get_max_length()
else:
raise TypeError(f"Unknown type of past_key_values: {type(past_key_values)}")

if max_cache_shape is not None and len(token_ids) > max_cache_shape:
# TODO: this seems to get set to the length of the first sequence we pass for models using
# StaticCache or HybridCache. We need to initialize our own cache with a large enough size
# if we want to continue generation with the same cache.
if isinstance(past_key_values, (transformers_package.StaticCache, transformers_package.HybridCache)):
# The __init__ API isn't consistent between different cache types, but there seems to be consistency
# between these two types, so we can use the same logic for both.
warnings.warn("Cache is too small. Re-initializing cache with larger size.")
cache_type = type(past_key_values)
config = self.model_obj.config
device = self.model_obj.device
hf_device_map = getattr(self.model_obj, "hf_device_map", {})
# hf_device_map is not always a complete mapping of layers to devices...
layer_device_map = {k: hf_device_map.get(k, device) for k in range(config.num_hidden_layers)}
self._past_key_values = cache_type(
config=config,
batch_size=past_key_values.batch_size,
# Double the cache size to be safe
max_cache_len=len(token_ids)*2,
dtype=past_key_values.dtype,
layer_device_map=layer_device_map,
)
else:
warnings.warn(f"Cache is too small. Resetting cache (no method implemented to resize cache for type {type(past_key_values)}).")
self._past_key_values = None
past_length = 0
elif past_length > num_cached:
past_length = max(0, num_cached - 1)
self._past_key_values = tuple(
tuple(p[..., :past_length, :] for p in v) for v in past_key_values
)
if isinstance(past_key_values, tuple):
self._past_key_values = tuple(
tuple(p[..., :past_length, :] for p in v) for v in past_key_values
)
else:
if hasattr(past_key_values, "crop"):
self._past_key_values.crop(past_length)
else:
warnings.warn(f"Cropping unsupported for cache type: {type(self._past_key_values)}. Resetting cache.")
if hasattr(self._past_key_values, "reset"):
# Use built-in reset method if available to avoid constructing/allocating a new cache
self._past_key_values.reset()
else:
self._past_key_values = None
past_length = 0

cache_token_ids[past_length:] = []

# call the model
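The bulk of the _transformers.py change teaches get_logits to handle both the legacy tuple-of-tuples KV cache and the newer transformers Cache objects. Below is a minimal standalone sketch of that type dispatch; it is not the guidance implementation, the helper names cached_length and shrink_to are invented for illustration, and it assumes a transformers version that exposes Cache, DynamicCache, get_seq_length(), and crop():

    from typing import Union

    import torch
    from transformers import Cache, DynamicCache

    def cached_length(past_key_values: Union[Cache, tuple, None]) -> int:
        # Number of positions already cached, for either cache format.
        if past_key_values is None:
            return 0
        if isinstance(past_key_values, tuple):
            # Legacy format: a tuple (per layer) of (key, value) tensors shaped [..., seq_len, head_dim].
            return past_key_values[0][0].size(-2)
        if isinstance(past_key_values, Cache):
            return past_key_values.get_seq_length()
        raise TypeError(f"Unknown type of past_key_values: {type(past_key_values)}")

    def shrink_to(past_key_values, new_length: int):
        # Drop cached positions beyond new_length; discard the cache if it cannot be cropped.
        if isinstance(past_key_values, tuple):
            return tuple(tuple(p[..., :new_length, :] for p in kv) for kv in past_key_values)
        if hasattr(past_key_values, "crop"):  # e.g. DynamicCache
            past_key_values.crop(new_length)
            return past_key_values
        return None

    cache = DynamicCache()
    # Populate one layer with 8 cached positions (batch=1, heads=2, head_dim=4).
    cache.update(torch.randn(1, 2, 8, 4), torch.randn(1, 2, 8, 4), layer_idx=0)
    print(cached_length(cache))  # 8
    cache = shrink_to(cache, 5)
    print(cached_length(cache))  # 5

The StaticCache/HybridCache branch in the diff exists because those caches are allocated with a fixed max_cache_len and cannot simply be cropped, hence the warning and the re-initialization with a doubled cache size.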
