diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 60ac956e..02351a8c 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -19,11 +19,13 @@ PdfFormatOption, ) from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline -from instructlab.model.backends.backends import is_model_gguf, is_model_safetensors from langchain_text_splitters import Language, RecursiveCharacterTextSplitter from tabulate import tabulate from transformers import AutoTokenizer +# First Party +from instructlab.sdg.utils.model_formats import is_model_gguf, is_model_safetensors + logger = logging.getLogger(__name__) _DEFAULT_CHUNK_OVERLAP = 100 @@ -320,16 +322,16 @@ def create_tokenizer(model_name: Optional[str]): tokenizer = AutoTokenizer.from_pretrained(model_path) elif is_model_gguf(model_path): + model_dir, model_filename = model_path.parent, model_path.name error_info_message = error_info_message.format( download_args=f"--repository {model_dir} --filename {model_filename}" ) - model_dir, model_filename = model_path.parent, model_path.name tokenizer = AutoTokenizer.from_pretrained( model_dir, gguf_file=model_filename ) else: - raise Exception(f"Received path to invalid model format {model_path}") + raise ValueError(f"Received path to invalid model format {model_path}") logger.info(f"Successfully loaded tokenizer from: {model_path}") return tokenizer diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index e707c2a9..a6f9b381 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -417,7 +417,6 @@ def _knowledge_leaf_node_to_samples( document_output_dir, model_name, ): - import ipdb; ipdb.set_trace() chunker = DocumentChunker( leaf_node=leaf_node, taxonomy_path=taxonomy_path, diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py index d51fb173..281c6afe 100644 --- a/tests/test_chunkers.py +++ b/tests/test_chunkers.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # Standard -import os from pathlib import Path +import os import tempfile # Third Party @@ -21,6 +21,7 @@ TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata") + @pytest.fixture def documents_dir(): return Path(TEST_DATA_DIR) / "sample_documents" @@ -93,6 +94,8 @@ def test_chunker_factory_empty_filetype(documents_dir): output_dir=temp_dir, tokenizer_model_name=tokenizer_model_name, ) + + def test_create_tokenizer(tokenizer_model_name): ContextAwareChunker.create_tokenizer(tokenizer_model_name) @@ -100,11 +103,12 @@ def test_create_tokenizer(tokenizer_model_name): @pytest.mark.parametrize( "model_name", [ - os.path.join(TEST_DATA_DIR, "models/invalid_gguf.gguf"), - os.path.join(TEST_DATA_DIR, "models/invalid_safetensors_dir/"), - os.path.join(TEST_DATA_DIR, "bad_path)"), - ] + "models/invalid_gguf.gguf", + "models/invalid_safetensors_dir/", + "bad_path", + ], ) def test_invalid_tokenizer(model_name): - with pytest.raises(Exception): - ContextAwareChunker.create_tokenizer(model_name) + model_path = os.path.join(TEST_DATA_DIR, model_name) + with pytest.raises(ValueError): + ContextAwareChunker.create_tokenizer(model_path)