Skip to content

Commit

Permalink
Add error logging for failed tokenizer loading
Browse files Browse the repository at this point in the history
Signed-off-by: Khaled Sulayman <[email protected]>
  • Loading branch information
khaledsulayman committed Nov 12, 2024
1 parent 2aa3aee commit ae3ab10
Showing 1 changed file with 27 additions and 14 deletions.
41 changes: 27 additions & 14 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from collections import defaultdict
from enum import Enum
from pathlib import Path
from typing import DefaultDict, Iterable, List, Tuple
from typing import DefaultDict, Iterable, List, Optional, Tuple
import json
import logging
import re
Expand Down Expand Up @@ -60,7 +60,7 @@ def __new__(
output_dir: Path,
server_ctx_size=4096,
chunk_word_count=1024,
tokenizer_model_name: str | None = None,
tokenizer_model_name: Optional[str] = None,
):
"""Instantiate the appropriate chunker for the provided document
Expand All @@ -70,7 +70,7 @@ def __new__(
output_dir (Path): directory where artifacts should be stored
server_ctx_size (int): Context window size of server
chunk_word_count (int): Maximum number of words to chunk a document
tokenizer_model_name (str): name of huggingface model to get
tokenizer_model_name (Optional[str]): name of huggingface model to get
tokenizer from
Returns:
TextSplitChunker | ContextAwareChunker: Object of the appropriate
Expand Down Expand Up @@ -187,7 +187,7 @@ def __init__(
filepaths,
output_dir: Path,
chunk_word_count: int,
tokenizer_model_name: str,
tokenizer_model_name: Optional[str],
):
self.document_paths = document_paths
self.filepaths = filepaths
Expand Down Expand Up @@ -290,7 +290,8 @@ def fuse_texts(

return fused_texts

def create_tokenizer(self, model_name: str):
@staticmethod
def create_tokenizer(model_name: Optional[str]):
"""
Create a tokenizer instance from a pre-trained model or a local directory.
Expand All @@ -300,25 +301,37 @@ def create_tokenizer(self, model_name: str):
Returns:
AutoTokenizer: The tokenizer instance.
"""
# Third Party
import ipdb
if model_name is None:
raise TypeError("No model path provided")

ipdb.set_trace()
model_path = Path(model_name)
error_info_message = (
"Please run ilab model download {download_args} and try again"
)
try:
if is_model_safetensors(model_path):
tokenizer = AutoTokenizer.from_pretrained(model_path)
error_info_message = error_info_message.format(
download_args=f"--repository {model_path}"
)
elif is_model_gguf(model_path):
tokenizer = AutoTokenizer.from_pretrained(model_path.parent, gguf_file=model_path.name)
model_dir, model_filename = model_path.parent, model_path.name
tokenizer = AutoTokenizer.from_pretrained(
model_dir, gguf_file=model_filename
)
error_info_message = error_info_message.format(
download_args=f"--repository {model_dir} --filename {model_filename}"
)
else:
raise Exception(f"Received path to invalid model format {model_path}")
logger.info(f"Successfully loaded tokenizer from: {model_path}")
return tokenizer
except Exception as e:

except (OSError, ValueError) as e:

Check warning on line 330 in src/instructlab/sdg/utils/chunkers.py

View workflow job for this annotation

GitHub Actions / pylint

W0719: Raising too general exception: Exception (broad-exception-raised)
logger.error(
f"Failed to load tokenizer as model was not found at {model_path}."
"Please run `ilab model download {model_name} and try again\n"
"{str(e)}"
f"Failed to load tokenizer as model was not found at {model_path}. {error_info_message}"
)
raise
raise e

def get_token_count(self, text, tokenizer):
"""
Expand Down

0 comments on commit ae3ab10

Please sign in to comment.