From ae3ab10e039e5dff103fa09c3096d638a91d1ca3 Mon Sep 17 00:00:00 2001
From: Khaled Sulayman
Date: Mon, 11 Nov 2024 15:57:06 -0500
Subject: [PATCH] Add error logging for failed tokenizer loading

Signed-off-by: Khaled Sulayman
---
 src/instructlab/sdg/utils/chunkers.py | 41 ++++++++++++++++++---------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 393221bf..27c96972 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -3,7 +3,7 @@
 from collections import defaultdict
 from enum import Enum
 from pathlib import Path
-from typing import DefaultDict, Iterable, List, Tuple
+from typing import DefaultDict, Iterable, List, Optional, Tuple
 import json
 import logging
 import re
@@ -60,7 +60,7 @@ def __new__(
         output_dir: Path,
         server_ctx_size=4096,
         chunk_word_count=1024,
-        tokenizer_model_name: str | None = None,
+        tokenizer_model_name: Optional[str] = None,
     ):
         """Insantiate the appropriate chunker for the provided document
@@ -70,7 +70,7 @@
             output_dir (Path): directory where artifacts should be stored
             server_ctx_size (int): Context window size of server
             chunk_word_count (int): Maximum number of words to chunk a document
-            tokenizer_model_name (str): name of huggingface model to get
+            tokenizer_model_name (Optional[str]): name of huggingface model to get
                 tokenizer from
         Returns:
             TextSplitChunker | ContextAwareChunker: Object of the appropriate
@@ -187,7 +187,7 @@ def __init__(
         filepaths,
         output_dir: Path,
         chunk_word_count: int,
-        tokenizer_model_name: str,
+        tokenizer_model_name: Optional[str],
     ):
         self.document_paths = document_paths
         self.filepaths = filepaths
@@ -290,7 +290,8 @@ def fuse_texts(
 
         return fused_texts
 
-    def create_tokenizer(self, model_name: str):
+    @staticmethod
+    def create_tokenizer(model_name: Optional[str]):
         """
         Create a tokenizer instance from a pre-trained model or a local directory.
 
@@ -300,25 +301,37 @@
         Returns:
             AutoTokenizer: The tokenizer instance.
         """
-        # Third Party
-        import ipdb
+        if model_name is None:
+            raise TypeError("No model path provided")
 
-        ipdb.set_trace()
         model_path = Path(model_name)
+        error_info_message = (
+            "Please run `ilab model download {download_args}` and try again"
+        )
         try:
             if is_model_safetensors(model_path):
+                error_info_message = error_info_message.format(
+                    download_args=f"--repository {model_path}"
+                )
                 tokenizer = AutoTokenizer.from_pretrained(model_path)
             elif is_model_gguf(model_path):
-                tokenizer = AutoTokenizer.from_pretrained(model_path.parent, gguf_file=model_path.name)
+                model_dir, model_filename = model_path.parent, model_path.name
+                error_info_message = error_info_message.format(
+                    download_args=f"--repository {model_dir} --filename {model_filename}"
+                )
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_dir, gguf_file=model_filename
+                )
+            else:
+                raise Exception(f"Received path to invalid model format {model_path}")
             logger.info(f"Successfully loaded tokenizer from: {model_path}")
             return tokenizer
-        except Exception as e:
+
+        except (OSError, ValueError) as e:
             logger.error(
-                f"Failed to load tokenizer as model was not found at {model_path}."
-                "Please run `ilab model download {model_name} and try again\n"
-                "{str(e)}"
+                f"Failed to load tokenizer as model was not found at {model_path}. {error_info_message}"
             )
-            raise
+            raise e
 
     def get_token_count(self, text, tokenizer):
         """
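
For context, a minimal usage sketch (not part of the patch) of how the reworked
create_tokenizer behaves after this change. It assumes the method lives on
ContextAwareChunker, as the docstring's return types suggest; the local model
path below is hypothetical.

# Sketch only: assumes create_tokenizer is defined on ContextAwareChunker;
# the model directory path is hypothetical.
from instructlab.sdg.utils.chunkers import ContextAwareChunker

try:
    # Now a @staticmethod, so no chunker instance is needed. Passing None
    # raises TypeError up front instead of failing inside Path(model_name).
    tokenizer = ContextAwareChunker.create_tokenizer(
        "/path/to/models/granite-7b-lab"  # hypothetical local model directory
    )
except (OSError, ValueError):
    # The patched except block has already logged a targeted
    # "ilab model download ..." hint, so callers can simply re-raise.
    raise

Making create_tokenizer a @staticmethod lets callers obtain a tokenizer without
first constructing a chunker, keeping the None check and the download hint in
one place.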