From ae3ab10e039e5dff103fa09c3096d638a91d1ca3 Mon Sep 17 00:00:00 2001
From: Khaled Sulayman
Date: Mon, 11 Nov 2024 15:57:06 -0500
Subject: [PATCH] Add error logging for failed tokenizer loading

Signed-off-by: Khaled Sulayman
---
 src/instructlab/sdg/utils/chunkers.py | 41 ++++++++++++++++++---------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 393221bf..27c96972 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -3,7 +3,7 @@
 from collections import defaultdict
 from enum import Enum
 from pathlib import Path
-from typing import DefaultDict, Iterable, List, Tuple
+from typing import DefaultDict, Iterable, List, Optional, Tuple
 import json
 import logging
 import re
@@ -60,7 +60,7 @@ def __new__(
         output_dir: Path,
         server_ctx_size=4096,
         chunk_word_count=1024,
-        tokenizer_model_name: str | None = None,
+        tokenizer_model_name: Optional[str] = None,
     ):
         """Insantiate the appropriate chunker for the provided document
@@ -70,7 +70,7 @@
             output_dir (Path): directory where artifacts should be stored
             server_ctx_size (int): Context window size of server
             chunk_word_count (int): Maximum number of words to chunk a document
-            tokenizer_model_name (str): name of huggingface model to get
+            tokenizer_model_name (Optional[str]): name of huggingface model to get
                 tokenizer from
         Returns:
             TextSplitChunker | ContextAwareChunker: Object of the appropriate
@@ -187,7 +187,7 @@ def __init__(
         filepaths,
         output_dir: Path,
         chunk_word_count: int,
-        tokenizer_model_name: str,
+        tokenizer_model_name: Optional[str],
     ):
         self.document_paths = document_paths
         self.filepaths = filepaths
@@ -290,7 +290,8 @@ def fuse_texts(
 
         return fused_texts
 
-    def create_tokenizer(self, model_name: str):
+    @staticmethod
+    def create_tokenizer(model_name: Optional[str]):
         """
         Create a tokenizer instance from a pre-trained model or a local directory.
 
@@ -300,25 +301,37 @@
         Returns:
             AutoTokenizer: The tokenizer instance.
         """
-        # Third Party
-        import ipdb
+        if model_name is None:
+            raise TypeError("No model path provided")
 
-        ipdb.set_trace()
         model_path = Path(model_name)
+        error_info_message = (
+            "Please run `ilab model download {download_args}` and try again"
+        )
         try:
             if is_model_safetensors(model_path):
+                error_info_message = error_info_message.format(
+                    download_args=f"--repository {model_path}"
+                )
                 tokenizer = AutoTokenizer.from_pretrained(model_path)
             elif is_model_gguf(model_path):
-                tokenizer = AutoTokenizer.from_pretrained(model_path.parent, gguf_file=model_path.name)
+                model_dir, model_filename = model_path.parent, model_path.name
+                error_info_message = error_info_message.format(
+                    download_args=f"--repository {model_dir} --filename {model_filename}"
+                )
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_dir, gguf_file=model_filename
+                )
+            else:
+                raise Exception(f"Received path to invalid model format {model_path}")
             logger.info(f"Successfully loaded tokenizer from: {model_path}")
             return tokenizer
-        except Exception as e:
+
+        except (OSError, ValueError) as e:
             logger.error(
-                f"Failed to load tokenizer as model was not found at {model_path}."
-                "Please run `ilab model download {model_name} and try again\n"
-                "{str(e)}"
+                f"Failed to load tokenizer as model was not found at {model_path}. {error_info_message}"
             )
-            raise
+            raise e
 
     def get_token_count(self, text, tokenizer):
         """
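
For context, a minimal usage sketch (not part of the patch) of how the reworked
create_tokenizer behaves after this change. It assumes the method lives on
ContextAwareChunker, as the docstring's return types suggest; the local model
path below is hypothetical.

# Sketch only: assumes create_tokenizer is defined on ContextAwareChunker;
# the model directory path is hypothetical.
from instructlab.sdg.utils.chunkers import ContextAwareChunker

try:
    # Now a @staticmethod, so no chunker instance is needed. Passing None
    # raises TypeError up front instead of failing inside Path(model_name).
    tokenizer = ContextAwareChunker.create_tokenizer(
        "/path/to/models/granite-7b-lab"  # hypothetical local model directory
    )
except (OSError, ValueError):
    # The patched except block has already logged a targeted
    # "ilab model download ..." hint, so callers can simply re-raise.
    raise

Making create_tokenizer a @staticmethod lets callers obtain a tokenizer without
first constructing a chunker, keeping the None check and the download hint in
one place.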