From 42ac6729ba33834a5a337e37a589da0c460c4b5f Mon Sep 17 00:00:00 2001 From: Inah Jeon Date: Wed, 8 Jan 2025 21:02:51 +0900 Subject: [PATCH] [SAAS-511] Update Model Names and Remove Deprecated Models (#40) * rename solar models * remove deprecated solar docvision model * remove deprecated layout analysis model * change version --- libs/upstage/README.md | 8 +- libs/upstage/langchain_upstage/__init__.py | 4 - libs/upstage/langchain_upstage/chat_models.py | 11 +- .../langchain_upstage/layout_analysis.py | 259 ----------- .../layout_analysis_parsers.py | 425 ------------------ .../tools/groundedness_check.py | 2 +- libs/upstage/pyproject.toml | 2 +- .../test_chat_models_standard.py | 2 +- .../integration_tests/test_layout_analysis.py | 104 ----- .../tests/unit_tests/test_chat_models.py | 8 +- .../unit_tests/test_chat_models_standard.py | 2 +- libs/upstage/tests/unit_tests/test_imports.py | 2 - .../tests/unit_tests/test_layout_analysis.py | 247 ---------- 13 files changed, 17 insertions(+), 1059 deletions(-) delete mode 100644 libs/upstage/langchain_upstage/layout_analysis.py delete mode 100644 libs/upstage/langchain_upstage/layout_analysis_parsers.py delete mode 100644 libs/upstage/tests/integration_tests/test_layout_analysis.py delete mode 100644 libs/upstage/tests/unit_tests/test_layout_analysis.py diff --git a/libs/upstage/README.md b/libs/upstage/README.md index 8f95ced..4dcd7a4 100644 --- a/libs/upstage/README.md +++ b/libs/upstage/README.md @@ -24,17 +24,17 @@ See a [usage example](https://python.langchain.com/docs/integrations/text_embedd Use `solar-embedding-1-large` model for embeddings. Do not add suffixes such as `-query` or `-passage` to the model name. `UpstageEmbeddings` will automatically add the suffixes based on the method called. -## Layout Analysis Loader +## Document Parse Loader See a [usage example](https://python.langchain.com/v0.1/docs/integrations/document_loaders/upstage/) -The `use_ocr` option determines whether OCR will be used for text extraction from documents. If this option is not specified, the default policy of the [Upstage Layout Analysis API](https://developers.upstage.ai/docs/apis/layout-analysis#request-body) service will be applied. When `use_ocr` is set to `True`, OCR is utilized to extract text. In the case of PDF documents, this involves converting the PDF into images before performing OCR. Conversely, if `use_ocr` is set to `False` for PDF documents, the text information embedded within the PDF is used directly. However, if the input document is not a PDF, such as an image, setting `use_ocr` to `False` will result in an error. +The `use_ocr` option determines whether OCR will be used for text extraction from documents. If this option is not specified, the default policy of the [Upstage Document Parse API](https://console.upstage.ai/docs/capabilities/document-parse#request) service will be applied. When `use_ocr` is set to `True`, OCR is utilized to extract text. In the case of PDF documents, this involves converting the PDF into images before performing OCR. Conversely, if `use_ocr` is set to `False` for PDF documents, the text information embedded within the PDF is used directly. However, if the input document is not a PDF, such as an image, setting `use_ocr` to `False` will result in an error. 
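For instance, here is a minimal sketch of forcing OCR-based extraction, assuming `UpstageDocumentParseLoader` accepts the `use_ocr` keyword described above (the file path is a placeholder):

```python
from langchain_upstage import UpstageDocumentParseLoader

# Requires the UPSTAGE_API_KEY environment variable to be set.
# Placeholder path; point this at a real PDF on your machine.
pdf_path = "/PATH/TO/YOUR/FILE.pdf"

# use_ocr=True renders PDF pages to images and extracts text with OCR;
# use_ocr=False reads the embedded PDF text layer directly and raises an
# error for non-PDF inputs such as images.
ocr_loader = UpstageDocumentParseLoader(pdf_path, split="page", use_ocr=True)

docs = ocr_loader.load()  # or ocr_loader.lazy_load() for page-by-page loading
```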
```python -from langchain_upstage import UpstageLayoutAnalysisLoader +from langchain_upstage import UpstageDocumentParseLoader file_path = "/PATH/TO/YOUR/FILE.image" -layzer = UpstageLayoutAnalysisLoader(file_path, split="page") +layzer = UpstageDocumentParseLoader(file_path, split="page") # For improved memory efficiency, consider using the lazy_load method to load documents page by page. docs = layzer.load() # or layzer.lazy_load() diff --git a/libs/upstage/langchain_upstage/__init__.py b/libs/upstage/langchain_upstage/__init__.py index 8e94219..b91aeb3 100644 --- a/libs/upstage/langchain_upstage/__init__.py +++ b/libs/upstage/langchain_upstage/__init__.py @@ -2,8 +2,6 @@ from langchain_upstage.document_parse import UpstageDocumentParseLoader from langchain_upstage.document_parse_parsers import UpstageDocumentParseParser from langchain_upstage.embeddings import UpstageEmbeddings -from langchain_upstage.layout_analysis import UpstageLayoutAnalysisLoader -from langchain_upstage.layout_analysis_parsers import UpstageLayoutAnalysisParser from langchain_upstage.tools.groundedness_check import ( GroundednessCheck, UpstageGroundednessCheck, @@ -12,10 +10,8 @@ __all__ = [ "ChatUpstage", "UpstageEmbeddings", - "UpstageLayoutAnalysisLoader", "UpstageDocumentParseLoader", "UpstageDocumentParseParser", - "UpstageLayoutAnalysisParser", "UpstageGroundednessCheck", "GroundednessCheck", ] diff --git a/libs/upstage/langchain_upstage/chat_models.py b/libs/upstage/langchain_upstage/chat_models.py index 5ad1a45..0f108e2 100644 --- a/libs/upstage/langchain_upstage/chat_models.py +++ b/libs/upstage/langchain_upstage/chat_models.py @@ -56,8 +56,7 @@ DOC_PARSING_MODEL = ["solar-pro"] SOLAR_TOKENIZERS = { "solar-pro": "upstage/solar-pro-tokenizer", - "solar-1-mini-chat": "upstage/solar-1-mini-tokenizer", - "solar-docvision": "upstage/solar-docvision-preview-tokenizer", + "solar-mini": "upstage/solar-1-mini-tokenizer", } @@ -105,7 +104,7 @@ def _get_ls_params( params["ls_provider"] = "upstage" return params - model_name: str = Field(default="solar-1-mini-chat", alias="model") + model_name: str = Field(default="solar-mini", alias="model") """Model name to use.""" upstage_api_key: SecretStr = Field( default_factory=secret_from_env( @@ -384,7 +383,7 @@ class AnswerWithJustification(BaseModel): justification: str - llm = ChatUpstage(model="solar-1-mini-chat", temperature=0) + llm = ChatUpstage(model="solar-mini", temperature=0) structured_llm = llm.with_structured_output(AnswerWithJustification) structured_llm.invoke( @@ -410,7 +409,7 @@ class AnswerWithJustification(BaseModel): justification: str - llm = ChatUpstage(model="solar-1-mini-chat", temperature=0) + llm = ChatUpstage(model="solar-mini", temperature=0) structured_llm = llm.with_structured_output( AnswerWithJustification, include_raw=True ) @@ -440,7 +439,7 @@ class AnswerWithJustification(BaseModel): dict_schema = convert_to_openai_tool(AnswerWithJustification) - llm = ChatUpstage(model="solar-1-mini-chat", temperature=0) + llm = ChatUpstage(model="solar-mini", temperature=0) structured_llm = llm.with_structured_output(dict_schema) structured_llm.invoke( diff --git a/libs/upstage/langchain_upstage/layout_analysis.py b/libs/upstage/langchain_upstage/layout_analysis.py deleted file mode 100644 index 2484c8b..0000000 --- a/libs/upstage/langchain_upstage/layout_analysis.py +++ /dev/null @@ -1,259 +0,0 @@ -import os -import warnings -from pathlib import Path -from typing import Any, Dict, Iterator, List, Literal, Optional, Union - -from 
langchain_core.document_loaders import BaseLoader, Blob -from langchain_core.documents import Document - -from .layout_analysis_parsers import UpstageLayoutAnalysisParser - -DEFAULT_PAGE_BATCH_SIZE = 10 - -OutputType = Literal["text", "html"] -SplitType = Literal["none", "element", "page"] - - -def validate_api_key(api_key: str) -> None: - """ - Validates the provided API key. - - Args: - api_key (str): The API key to be validated. - - Raises: - ValueError: If the API key is empty or None. - - Returns: - None - """ - if not api_key: - raise ValueError("API Key is required for Upstage Document Loader") - - -def validate_file_path(file_path: Union[str, Path, List[str], List[Path]]) -> None: - """ - Validates if a file exists at the given file path. - - Args: - file_path (Union[str, Path, List[str], List[Path]): The file path(s) to be - validated. - - Raises: - FileNotFoundError: If the file or any of the files in the list do not exist. - """ - if isinstance(file_path, list): - for path in file_path: - validate_file_path(path) - return - if not os.path.exists(file_path): - raise FileNotFoundError(f"File not found: {file_path}") - - -def get_from_param_or_env( - key: str, - param: Optional[str] = None, - env_key: Optional[str] = None, - default: Optional[str] = None, -) -> str: - """Get a value from a param or an environment variable.""" - if param is not None: - return param - elif env_key and env_key in os.environ and os.environ[env_key]: - return os.environ[env_key] - elif default is not None: - return default - else: - raise ValueError( - f"Did not find {key}, please add an environment variable" - f" `{env_key}` which contains it, or pass" - f" `{key}` as a named parameter." - ) - - -class UpstageLayoutAnalysisLoader(BaseLoader): - """Upstage Layout Analysis. - - To use, you should have the environment variable `UPSTAGE_API_KEY` - set with your API key or pass it as a named parameter to the constructor. - - Example: - .. code-block:: python - - from langchain_upstage import UpstageLayoutAnalysis - - file_path = "/PATH/TO/YOUR/FILE.pdf" - loader = UpstageLayoutAnalysis( - file_path, split="page", output_type="text" - ) - """ - - def __init__( - self, - file_path: Union[str, Path, List[str], List[Path]], - output_type: Union[OutputType, dict] = "html", - split: SplitType = "none", - api_key: Optional[str] = None, - use_ocr: Optional[bool] = None, - exclude: list = ["header", "footer"], - ): - """ - Initializes an instance of the Upstage document loader. - - Args: - file_path (Union[str, Path, List[str], List[Path]): The path to the document - to be loaded. - output_type (Union[OutputType, dict], optional): The type of output to be - generated by the parser. - Defaults to "html". - split (SplitType, optional): The type of splitting to be applied. - Defaults to "none" (no splitting). - api_key (str, optional): The API key for accessing the Upstage API. - Defaults to None, in which case it will be - fetched from the environment variable - `UPSTAGE_API_KEY`. - use_ocr (bool, optional): Extract text from images in the document using - OCR. If the value is True, OCR is used to extract - text from an image. If the value is False, text is - extracted from a PDF. (An error will occur if the - value is False and the input is NOT in PDF format) - The default value is None, and the default - behavior will be performed based on the API's - policy if no value is specified. Please check https://developers.upstage.ai/docs/apis/layout-analysis#request-body. 
- exclude (list, optional): Exclude specific elements from - the output. - Defaults to ["header", "footer"]. - """ - self.file_path = file_path - self.output_type = output_type - self.split = split - if deprecated_key := os.environ.get("UPSTAGE_DOCUMENT_AI_API_KEY"): - warnings.warn( - "UPSTAGE_DOCUMENT_AI_API_KEY is deprecated." - "Please use UPSTAGE_API_KEY instead." - ) - warnings.warn( - "UpstageLayoutAnalysisLoader is deprecated." - "Please use langchain_upstage.document_parse.UpstageDocumentParseLoader" - " instead." - ) - - self.api_key = get_from_param_or_env( - "UPSTAGE_API_KEY", api_key, "UPSTAGE_API_KEY", deprecated_key - ) - self.use_ocr = use_ocr - self.exclude = exclude - - validate_file_path(self.file_path) - validate_api_key(self.api_key) - - def load(self) -> List[Document]: - """ - Loads and parses the document using the UpstageLayoutAnalysisParser. - - Returns: - A list of Document objects representing the parsed layout analysis. - """ - - if isinstance(self.file_path, list): - result = [] - - for file_path in self.file_path: - blob = Blob.from_path(file_path) - - parser = UpstageLayoutAnalysisParser( - self.api_key, - split=self.split, - output_type=self.output_type, - use_ocr=self.use_ocr, - exclude=self.exclude, - ) - result.extend(list(parser.lazy_parse(blob, is_batch=True))) - - return result - - else: - blob = Blob.from_path(self.file_path) - - parser = UpstageLayoutAnalysisParser( - self.api_key, - split=self.split, - output_type=self.output_type, - use_ocr=self.use_ocr, - exclude=self.exclude, - ) - return list(parser.lazy_parse(blob, is_batch=True)) - - def lazy_load(self) -> Iterator[Document]: - """ - Lazily loads and parses the document using the UpstageLayoutAnalysisParser. - - Returns: - An iterator of Document objects representing the parsed layout analysis. - """ - - if isinstance(self.file_path, list): - for file_path in self.file_path: - blob = Blob.from_path(file_path) - - parser = UpstageLayoutAnalysisParser( - self.api_key, - split=self.split, - output_type=self.output_type, - use_ocr=self.use_ocr, - exclude=self.exclude, - ) - yield from parser.lazy_parse(blob, is_batch=True) - else: - blob = Blob.from_path(self.file_path) - - parser = UpstageLayoutAnalysisParser( - self.api_key, - split=self.split, - output_type=self.output_type, - use_ocr=self.use_ocr, - exclude=self.exclude, - ) - yield from parser.lazy_parse(blob) - - def merge_and_split( - self, documents: List[Document], splitter: Optional[object] = None - ) -> List[Document]: - """ - Merges the page content and metadata of multiple documents into a single - document, or splits the documents using a custom splitter. - - Args: - documents (list): A list of Document objects to be merged and split. - splitter (object, optional): An optional splitter object that implements the - `split_documents` method. If provided, the documents will be split using - this splitter. Defaults to None, in which case the documents are merged. - - Returns: - list: A list of Document objects. If no splitter is provided, a single - Document object is returned with the merged content and combined metadata. - If a splitter is provided, the documents are split and a list of Document - objects is returned. - - Raises: - AssertionError: If a splitter is provided but it does not implement the - `split_documents` method. 
- """ - if splitter is None: - merged_content = " ".join([doc.page_content for doc in documents]) - - metadatas: Dict[str, Any] = dict() - for _meta in [doc.metadata for doc in documents]: - for key, value in _meta.items(): - if key in metadatas: - metadatas[key].append(value) - else: - metadatas[key] = [value] - - return [Document(page_content=merged_content, metadata=metadatas)] - else: - assert hasattr( - splitter, "split_documents" - ), "splitter must implement split_documents method" - - return splitter.split_documents(documents) diff --git a/libs/upstage/langchain_upstage/layout_analysis_parsers.py b/libs/upstage/langchain_upstage/layout_analysis_parsers.py deleted file mode 100644 index 41f3ffe..0000000 --- a/libs/upstage/langchain_upstage/layout_analysis_parsers.py +++ /dev/null @@ -1,425 +0,0 @@ -import io -import json -import logging -import os -import warnings -from typing import Dict, Iterator, List, Literal, Optional, Union - -import requests -from langchain_core.document_loaders import BaseBlobParser, Blob -from langchain_core.documents import Document -from pypdf import PdfReader, PdfWriter -from pypdf.errors import PdfReadError - -# Disable logging for PyPDF -logger = logging.getLogger("pypdf") -logger.setLevel(logging.ERROR) - -LAYOUT_ANALYSIS_URL = "https://api.upstage.ai/v1/document-ai/layout-analysis" - -DEFAULT_NUMBER_OF_PAGE = 10 - -OutputType = Literal["text", "html"] -SplitType = Literal["none", "element", "page"] - - -def validate_api_key(api_key: str) -> None: - """ - Validates the provided API key. - - Args: - api_key (str): The API key to be validated. - - Raises: - ValueError: If the API key is empty or None. - - Returns: - None - """ - if not api_key: - raise ValueError("API Key is required for Upstage Document Loader") - - -def validate_file_path(file_path: str) -> None: - """ - Validates if a file exists at the given file path. - - Args: - file_path (str): The path to the file. - - Raises: - FileNotFoundError: If the file does not exist at the given file path. - """ - if not os.path.exists(file_path): - raise FileNotFoundError(f"File not found: {file_path}") - - -def parse_output(data: dict, output_type: Union[OutputType, dict]) -> str: - """ - Parse the output data based on the specified output type. - - Args: - data (dict): The data to be parsed. - output_type (Union[OutputType, dict]): The output type to parse the element data - into. - - Returns: - str: The parsed output. - - Raises: - ValueError: If the output type is invalid. - """ - if isinstance(output_type, dict): - if data["category"] in output_type: - return data[output_type[data["category"]]] - else: - return data["text"] - elif isinstance(output_type, str): - if output_type == "text": - return data["text"] - elif output_type == "html": - return data["html"] - else: - raise ValueError(f"Invalid output type: {output_type}") - else: - raise ValueError(f"Invalid output type: {output_type}") - - -def get_from_param_or_env( - key: str, - param: Optional[str] = None, - env_key: Optional[str] = None, - default: Optional[str] = None, -) -> str: - """Get a value from a param or an environment variable.""" - if param is not None: - return param - elif env_key and env_key in os.environ and os.environ[env_key]: - return os.environ[env_key] - elif default is not None: - return default - else: - raise ValueError( - f"Did not find {key}, please add an environment variable" - f" `{env_key}` which contains it, or pass" - f" `{key}` as a named parameter." 
- ) - - -class UpstageLayoutAnalysisParser(BaseBlobParser): - """Upstage Layout Analysis Parser. - - To use, you should have the environment variable `UPSTAGE_API_KEY` - set with your API key or pass it as a named parameter to the constructor. - - Example: - .. code-block:: python - - from langchain_upstage import UpstageLayoutAnalysisParser - - loader = UpstageLayoutAnalysisParser(split="page", output_type="text") - """ - - def __init__( - self, - api_key: Optional[str] = None, - output_type: Union[OutputType, dict] = "html", - split: SplitType = "none", - use_ocr: Optional[bool] = None, - exclude: list = [], - ): - """ - Initializes an instance of the Upstage class. - - Args: - api_key (str, optional): The API key for accessing the Upstage API. - Defaults to None, in which case it will be - fetched from the environment variable - `UPSTAGE_API_KEY`. - output_type (Union[OutputType, dict], optional): The type of output to be - generated by the parser. - Defaults to "html". - split (SplitType, optional): The type of splitting to be applied. - Defaults to "none" (no splitting). - use_ocr (bool, optional): Extract text from images in the document using - OCR. If the value is True, OCR is used to extract - text from an image. If the value is False, text is - extracted from a PDF. (An error will occur if the - value is False and the input is NOT in PDF format) - The default value is None, and the default - behavior will be performed based on the API's - policy if no value is specified. Please check https://developers.upstage.ai/docs/apis/layout-analysis#request-body. - exclude (list, optional): Exclude specific elements from the output. - Defaults to [] (all included). - """ - if deprecated_key := os.environ.get("UPSTAGE_DOCUMENT_AI_API_KEY"): - warnings.warn( - "UPSTAGE_DOCUMENT_AI_API_KEY is deprecated." - "Please use UPSTAGE_API_KEY instead." - ) - warnings.warn( - "UpstageLayoutAnalysisParser is deprecated." - "Please use" - " langchain_upstage.document_parse_parsers.UpstageDocumentParseParser" - " instead." - ) - - self.api_key = get_from_param_or_env( - "UPSTAGE_API_KEY", api_key, "UPSTAGE_API_KEY", deprecated_key - ) - - self.output_type = output_type - self.split = split - self.use_ocr = use_ocr - self.exclude = exclude - - validate_api_key(self.api_key) - - def _get_response(self, files: Dict) -> List: - """ - Sends a POST request to the API endpoint with the provided files and - returns the response. - - Args: - files (dict): A dictionary containing the files to be sent in the request. - - Returns: - dict: The JSON response from the API. - - Raises: - ValueError: If there is an error in the API call. 
- """ - try: - headers = {"Authorization": f"Bearer {self.api_key}"} - if self.use_ocr is not None: - options = {"ocr": self.use_ocr} - response = requests.post( - LAYOUT_ANALYSIS_URL, headers=headers, files=files, data=options - ) - else: - response = requests.post( - LAYOUT_ANALYSIS_URL, headers=headers, files=files - ) - response.raise_for_status() - - result = response.json().get("elements", []) - - elements = [ - element for element in result if element["category"] not in self.exclude - ] - - return elements - - except requests.RequestException as req_err: - # Handle any request-related exceptions - raise ValueError(f"Failed to send request: {req_err}") - except json.JSONDecodeError as json_err: - # Handle JSON decode errors - raise ValueError(f"Failed to decode JSON response: {json_err}") - except Exception as err: - # Handle any other exceptions - raise ValueError(f"An error occurred: {err}") - - return [] - - def _split_and_request( - self, - full_docs: PdfReader, - start_page: int, - num_pages: int = DEFAULT_NUMBER_OF_PAGE, - ) -> List: - """ - Splits the full pdf document into partial pages and sends a request to the - server. - - Args: - full_docs (PdfReader): The full document to be split and requested. - start_page (int): The starting page number for splitting the document. - num_pages (int, optional): The number of pages to split the document - into. - Defaults to DEFAULT_NUMBER_OF_PAGE. - - Returns: - response: The response from the server. - """ - merger = PdfWriter() - merger.append( - full_docs, - pages=(start_page, min(start_page + num_pages, full_docs.get_num_pages())), - ) - - with io.BytesIO() as buffer: - merger.write(buffer) - buffer.seek(0) - response = self._get_response({"document": buffer}) - - return response - - def _element_document(self, elements: Dict, start_page: int = 0) -> Document: - """ - Converts an elements into a Document object. - - Args: - elements (Dict) : The elements to convert. - start_page (int): The starting page number for splitting the document. - This number starts from zero. - - Returns: - A list containing a single Document object. - - """ - return Document( - page_content=(parse_output(elements, self.output_type)), - metadata={ - "page": elements["page"] + start_page, - "id": elements["id"], - "bounding_box": json.dumps(elements["bounding_box"]), - "category": elements["category"], - }, - ) - - def _page_document(self, elements: List, start_page: int = 0) -> List[Document]: - """ - Combines elements with the same page number into a single Document object. - - Args: - elements (List): A list of elements containing page numbers. - start_page (int): The starting page number for splitting the document. - This number starts from zero. - - Returns: - List[Document]: A list of Document objects, each representing a page - with its content and metadata. - """ - _docs = [] - pages = sorted(set(map(lambda x: x["page"], elements))) - - page_group = [ - [element for element in elements if element["page"] == x] for x in pages - ] - - for group in page_group: - page_content = " ".join( - [parse_output(element, self.output_type) for element in group] - ) - - _docs.append( - Document( - page_content=page_content, - metadata={ - "page": group[0]["page"] + start_page, - }, - ) - ) - - return _docs - - def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]: - """ - Lazily parses a document and yields Document objects based on the specified - split type. - - Args: - blob (Blob): The input document blob to parse. 
- is_batch (bool, optional): Whether to parse the document in batches. - Defaults to False (single page parsing) - - Yields: - Document: The parsed document object. - - Raises: - ValueError: If an invalid split type is provided. - - """ - - if is_batch: - num_pages = DEFAULT_NUMBER_OF_PAGE - else: - num_pages = 1 - - try: - full_docs = PdfReader(str(blob.path)) - number_of_pages = full_docs.get_num_pages() - is_pdf = True - except PdfReadError: - number_of_pages = 1 - is_pdf = False - except Exception as e: - raise ValueError(f"Failed to read PDF file: {e}") - - if self.split == "none": - if is_pdf: - result = "" - start_page = 0 - num_pages = DEFAULT_NUMBER_OF_PAGE - for _ in range(number_of_pages): - if start_page >= number_of_pages: - break - - elements = self._split_and_request(full_docs, start_page, num_pages) - for element in elements: - result += parse_output(element, self.output_type) - - start_page += num_pages - - else: - if not blob.path: - raise ValueError("Blob path is required for non-PDF files.") - - result = "" - with open(blob.path, "rb") as f: - elements = self._get_response({"document": f}) - - for element in elements: - result += parse_output(element, self.output_type) - - yield Document( - page_content=result, - metadata={ - "total_pages": number_of_pages, - }, - ) - - elif self.split == "element": - if is_pdf: - start_page = 0 - for _ in range(number_of_pages): - if start_page >= number_of_pages: - break - - elements = self._split_and_request(full_docs, start_page, num_pages) - for element in elements: - yield self._element_document(element, start_page) - - start_page += num_pages - - else: - if not blob.path: - raise ValueError("Blob path is required for non-PDF files.") - with open(blob.path, "rb") as f: - elements = self._get_response({"document": f}) - - for element in elements: - yield self._element_document(element) - - elif self.split == "page": - if is_pdf: - start_page = 0 - for _ in range(number_of_pages): - if start_page >= number_of_pages: - break - - elements = self._split_and_request(full_docs, start_page, num_pages) - yield from self._page_document(elements, start_page) - - start_page += num_pages - else: - if not blob.path: - raise ValueError("Blob path is required for non-PDF files.") - with open(blob.path, "rb") as f: - elements = self._get_response({"document": f}) - - yield from self._page_document(elements) - - else: - raise ValueError(f"Invalid split type: {self.split}") diff --git a/libs/upstage/langchain_upstage/tools/groundedness_check.py b/libs/upstage/langchain_upstage/tools/groundedness_check.py index 325e024..40b9840 100644 --- a/libs/upstage/langchain_upstage/tools/groundedness_check.py +++ b/libs/upstage/langchain_upstage/tools/groundedness_check.py @@ -72,7 +72,7 @@ def __init__(self, **kwargs: Any) -> None: raise ValueError("UPSTAGE_API_KEY must be set or passed") api_wrapper = ChatUpstage( - model="solar-1-mini-answer-verification", + model="groundedness-check", api_key=upstage_api_key.get_secret_value(), ) super().__init__(upstage_api_key=upstage_api_key, api_wrapper=api_wrapper) diff --git a/libs/upstage/pyproject.toml b/libs/upstage/pyproject.toml index b315ac6..0b4a36d 100644 --- a/libs/upstage/pyproject.toml +++ b/libs/upstage/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langchain-upstage" -version = "0.4.0" +version = "0.5.0rc0" description = "An integration package connecting Upstage and LangChain" authors = [] readme = "README.md" diff --git a/libs/upstage/tests/integration_tests/test_chat_models_standard.py 
b/libs/upstage/tests/integration_tests/test_chat_models_standard.py index 6337c7b..7af3e9f 100644 --- a/libs/upstage/tests/integration_tests/test_chat_models_standard.py +++ b/libs/upstage/tests/integration_tests/test_chat_models_standard.py @@ -17,7 +17,7 @@ def chat_model_class(self) -> Type[BaseChatModel]: @property def chat_model_params(self) -> dict: return { - "model": "solar-1-mini-chat", + "model": "solar-mini", } @pytest.mark.xfail(reason="Not implemented.") diff --git a/libs/upstage/tests/integration_tests/test_layout_analysis.py b/libs/upstage/tests/integration_tests/test_layout_analysis.py deleted file mode 100644 index 07cfb8c..0000000 --- a/libs/upstage/tests/integration_tests/test_layout_analysis.py +++ /dev/null @@ -1,104 +0,0 @@ -"""Test Upstage layout analysis.""" - -from pathlib import Path -from typing import List, get_args - -from langchain_upstage import UpstageLayoutAnalysisLoader -from langchain_upstage.layout_analysis import OutputType, SplitType - -EXAMPLE_PDF_PATH = Path(__file__).parent.parent / "examples/solar.pdf" - - -def test_layout_analysis_param() -> None: - """Test layout analysis document loader initialization.""" - - for output_type in get_args(OutputType): - for split in get_args(SplitType): - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - output_type=output_type, - split=split, - ) - assert loader.output_type == output_type - assert loader.split == split - assert loader.file_path == EXAMPLE_PDF_PATH - assert loader.exclude == ["header", "footer"] - - excludes: List[List[str]] = [[], ["header"], ["header", "footer"]] - for exclude in excludes: - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - output_type=output_type, - split=split, - exclude=exclude, - ) - assert loader.output_type == output_type - assert loader.split == split - assert loader.file_path == EXAMPLE_PDF_PATH - assert loader.exclude == exclude - - -def test_file_not_found_error() -> None: - """Test layout analysis error handling.""" - - try: - UpstageLayoutAnalysisLoader( - file_path="./NOT_EXISTING_FILE.pdf", - ) - assert False - except FileNotFoundError: - assert True - - -def test_none_split() -> None: - """Test layout analysis with no split.""" - - for output_type in get_args(OutputType): - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - output_type=output_type, - split="none", - ) - documents = loader.load() - - assert len(documents) == 1 - assert documents[0].page_content is not None - assert documents[0].metadata["total_pages"] == 1 - - -def test_element_split() -> None: - """Test layout analysis with element split.""" - - for output_type in get_args(OutputType): - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - output_type=output_type, - split="element", - ) - documents = loader.load() - - assert len(documents) == 13 - for document in documents: - assert document.page_content is not None - assert document.metadata["page"] == 1 - assert document.metadata["id"] is not None - assert document.metadata["bounding_box"] is not None - assert isinstance(document.metadata["bounding_box"], str) - assert document.metadata["category"] is not None - - -def test_page_split() -> None: - """Test layout analysis with page split.""" - - for output_type in get_args(OutputType): - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - output_type=output_type, - split="page", - ) - documents = loader.load() - - assert len(documents) == 1 - for document in documents: - assert document.page_content is 
not None - assert document.metadata["page"] == 1 diff --git a/libs/upstage/tests/unit_tests/test_chat_models.py b/libs/upstage/tests/unit_tests/test_chat_models.py index 4a8690c..213aaed 100644 --- a/libs/upstage/tests/unit_tests/test_chat_models.py +++ b/libs/upstage/tests/unit_tests/test_chat_models.py @@ -112,7 +112,7 @@ def mock_completion() -> dict: "id": "chatcmpl-7fcZavknQda3SQ", "object": "chat.completion", "created": 1689989000, - "model": "solar-1-mini-chat", + "model": "solar-mini", "choices": [ { "index": 0, @@ -248,12 +248,12 @@ def test_upstage_invoke_name(mock_completion: dict) -> None: def test_upstage_tokenizer() -> None: - llm = ChatUpstage(model="solar-1-mini-chat") + llm = ChatUpstage(model="solar-mini") llm._get_tokenizer() def test_upstage_tokenizer_get_num_tokens() -> None: - llm = ChatUpstage(model="solar-1-mini-chat") + llm = ChatUpstage(model="solar-mini") num_tokens = llm.get_num_tokens_from_messages([HumanMessage(content="Hello World")]) assert num_tokens == 12 @@ -279,4 +279,4 @@ def test_chat_upstage_extra_kwargs() -> None: # Test that "model" cannot be specified in kwargs with pytest.raises(ValueError): - ChatUpstage(model_kwargs={"model": "solar-1-mini-chat"}) + ChatUpstage(model_kwargs={"model": "solar-mini"}) diff --git a/libs/upstage/tests/unit_tests/test_chat_models_standard.py b/libs/upstage/tests/unit_tests/test_chat_models_standard.py index 89b7ece..038742d 100644 --- a/libs/upstage/tests/unit_tests/test_chat_models_standard.py +++ b/libs/upstage/tests/unit_tests/test_chat_models_standard.py @@ -16,7 +16,7 @@ def chat_model_class(self) -> Type[BaseChatModel]: @property def chat_model_params(self) -> dict: return { - "model": "solar-1-mini-chat", + "model": "solar-mini", } @property diff --git a/libs/upstage/tests/unit_tests/test_imports.py b/libs/upstage/tests/unit_tests/test_imports.py index 1d49335..a66d422 100644 --- a/libs/upstage/tests/unit_tests/test_imports.py +++ b/libs/upstage/tests/unit_tests/test_imports.py @@ -3,10 +3,8 @@ EXPECTED_ALL = [ "ChatUpstage", "UpstageEmbeddings", - "UpstageLayoutAnalysisLoader", "UpstageDocumentParseLoader", "UpstageDocumentParseParser", - "UpstageLayoutAnalysisParser", "UpstageGroundednessCheck", "GroundednessCheck", ] diff --git a/libs/upstage/tests/unit_tests/test_layout_analysis.py b/libs/upstage/tests/unit_tests/test_layout_analysis.py deleted file mode 100644 index a122b85..0000000 --- a/libs/upstage/tests/unit_tests/test_layout_analysis.py +++ /dev/null @@ -1,247 +0,0 @@ -import json -from pathlib import Path -from typing import Any, Dict, get_args -from unittest import TestCase -from unittest.mock import MagicMock, Mock, patch - -import requests - -from langchain_upstage import UpstageLayoutAnalysisLoader -from langchain_upstage.layout_analysis import OutputType, SplitType - -MOCK_RESPONSE_JSON: Dict[str, Any] = { - "api": "1.0", - "billed_pages": 1, - "elements": [ - { - "bounding_box": [ - {"x": 74, "y": 906}, - {"x": 148, "y": 906}, - {"x": 148, "y": 2338}, - {"x": 74, "y": 2338}, - ], - "category": "header", - "html": "
<header>arXiv:2103.15348v2</header>",
-            "id": 0,
-            "page": 1,
-            "text": "arXiv:2103.15348v2",
-        },
-        {
-            "bounding_box": [
-                {"x": 654, "y": 474},
-                {"x": 1912, "y": 474},
-                {"x": 1912, "y": 614},
-                {"x": 654, "y": 614},
-            ],
-            "category": "paragraph",
-            "html": "<p>LayoutParser Toolkit</p>",
-            "id": 1,
-            "page": 1,
-            "text": "LayoutParser Toolkit",
-        },
-    ],
-    "html": "<header>arXiv:2103.15348v2</header>"
-    + "<p>LayoutParser Toolkit</p>
", - "mimetype": "multipart/form-data", - "model": "layout-analyzer-0.1.0", - "text": "arXiv:2103.15348v2LayoutParser Toolkit", -} - -EXAMPLE_PDF_PATH = Path(__file__).parent.parent / "examples/solar.pdf" - - -def test_initialization() -> None: - """Test layout analysis document loader initialization.""" - UpstageLayoutAnalysisLoader(file_path=EXAMPLE_PDF_PATH, api_key="bar") - - -def test_layout_analysis_param() -> None: - for output_type in get_args(OutputType): - for split in get_args(SplitType): - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - api_key="bar", - output_type=output_type, - split=split, - exclude=[], - ) - assert loader.output_type == output_type - assert loader.split == split - assert loader.api_key == "bar" - assert loader.file_path == EXAMPLE_PDF_PATH - - -@patch("requests.post") -def test_none_split_text_output(mock_post: Mock) -> None: - mock_post.return_value = MagicMock( - status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON) - ) - - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - output_type="text", - split="none", - api_key="valid_api_key", - exclude=[], - ) - documents = loader.load() - - assert len(documents) == 1 - assert documents[0].page_content == MOCK_RESPONSE_JSON["text"] - assert documents[0].metadata["total_pages"] == 1 - - -@patch("requests.post") -def test_element_split_text_output(mock_post: Mock) -> None: - mock_post.return_value = MagicMock( - status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON) - ) - - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - output_type="text", - split="element", - api_key="valid_api_key", - exclude=[], - ) - documents = loader.load() - - assert len(documents) == 2 - - for i, document in enumerate(documents): - assert document.page_content == MOCK_RESPONSE_JSON["elements"][i]["text"] - assert document.metadata["page"] == MOCK_RESPONSE_JSON["elements"][i]["page"] - assert document.metadata["id"] == MOCK_RESPONSE_JSON["elements"][i]["id"] - assert document.metadata["bounding_box"] == json.dumps( - MOCK_RESPONSE_JSON["elements"][i]["bounding_box"] - ) - - -@patch("requests.post") -def test_page_split_text_output(mock_post: Mock) -> None: - mock_post.return_value = MagicMock( - status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON) - ) - - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - output_type="text", - split="page", - api_key="valid_api_key", - exclude=[], - ) - documents = loader.load() - - assert len(documents) == 1 - - for i, document in enumerate(documents): - assert document.metadata["page"] == MOCK_RESPONSE_JSON["elements"][i]["page"] - - -@patch("requests.post") -def test_none_split_html_output(mock_post: Mock) -> None: - mock_post.return_value = MagicMock( - status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON) - ) - - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - output_type="html", - split="none", - api_key="valid_api_key", - exclude=[], - ) - documents = loader.load() - - assert len(documents) == 1 - assert documents[0].page_content == MOCK_RESPONSE_JSON["html"] - assert documents[0].metadata["total_pages"] == 1 - - -@patch("requests.post") -def test_element_split_html_output(mock_post: Mock) -> None: - mock_post.return_value = MagicMock( - status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON) - ) - - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - output_type="html", - split="element", - api_key="valid_api_key", - 
exclude=[], - ) - documents = loader.load() - - assert len(documents) == 2 - - for i, document in enumerate(documents): - assert document.page_content == MOCK_RESPONSE_JSON["elements"][i]["html"] - assert document.metadata["page"] == MOCK_RESPONSE_JSON["elements"][i]["page"] - assert document.metadata["id"] == MOCK_RESPONSE_JSON["elements"][i]["id"] - assert document.metadata["bounding_box"] == json.dumps( - MOCK_RESPONSE_JSON["elements"][i]["bounding_box"] - ) - - -@patch("requests.post") -def test_page_split_html_output(mock_post: Mock) -> None: - mock_post.return_value = MagicMock( - status_code=200, json=MagicMock(return_value=MOCK_RESPONSE_JSON) - ) - - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - output_type="html", - split="page", - api_key="valid_api_key", - exclude=[], - ) - documents = loader.load() - - assert len(documents) == 1 - - for i, document in enumerate(documents): - assert document.metadata["page"] == MOCK_RESPONSE_JSON["elements"][i]["page"] - - -@patch("requests.post") -def test_request_exception(mock_post: Mock) -> None: - mock_post.side_effect = requests.RequestException("Mocked request exception") - - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - output_type="html", - split="page", - api_key="valid_api_key", - exclude=[], - ) - - with TestCase.assertRaises(TestCase(), ValueError) as context: - loader.load() - - assert "Failed to send request: Mocked request exception" == str(context.exception) - - -@patch("requests.post") -def test_json_decode_error(mock_post: Mock) -> None: - mock_response = Mock() - mock_response.status_code = 200 - mock_response.json.side_effect = json.JSONDecodeError("Expecting value", "", 0) - mock_post.return_value = mock_response - - loader = UpstageLayoutAnalysisLoader( - file_path=EXAMPLE_PDF_PATH, - output_type="html", - split="page", - api_key="valid_api_key", - exclude=[], - ) - - with TestCase.assertRaises(TestCase(), ValueError) as context: - loader.load() - - assert ( - "Failed to decode JSON response: Expecting value: line 1 column 1 (char 0)" - == str(context.exception) - )