From 48df2d01d487ec573a68a315e7feb7ee87532d5c Mon Sep 17 00:00:00 2001 From: Danylo Boiko Date: Wed, 4 Dec 2024 11:59:06 +0200 Subject: [PATCH 01/12] Initial commit --- validation/check_dataset.py | 25 +++++++++++++++++++++++++ validation/requirements.txt | 1 + 2 files changed, 26 insertions(+) create mode 100644 validation/check_dataset.py create mode 100644 validation/requirements.txt diff --git a/validation/check_dataset.py b/validation/check_dataset.py new file mode 100644 index 0000000..4102f41 --- /dev/null +++ b/validation/check_dataset.py @@ -0,0 +1,25 @@ +# Cohere For AI Community, Danylo Boiko, 2024 + +from argparse import ArgumentParser + +from rich.console import Console + + +class DatasetValidator: + def __init__(self, json_path: str, language_code: str) -> None: + self.json_path = json_path + self.language_code = language_code.lower() + self.console = Console() + + def validate(self): + pass + + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument("--json_path", type=str, required=True, help="Path to the JSON file to be validated") + parser.add_argument("--language_code", type=str, required=True, help="The language code for the dataset") + args = parser.parse_args() + + validator = DatasetValidator(args.json_path, args.language_code) + validator.validate() diff --git a/validation/requirements.txt b/validation/requirements.txt new file mode 100644 index 0000000..c94be38 --- /dev/null +++ b/validation/requirements.txt @@ -0,0 +1 @@ +rich \ No newline at end of file From 34341d1590b2a49b5773917c0c39f2666d423e95 Mon Sep 17 00:00:00 2001 From: Danylo Boiko Date: Wed, 4 Dec 2024 12:01:54 +0200 Subject: [PATCH 02/12] Rename folder --- {validation => validator}/check_dataset.py | 0 {validation => validator}/requirements.txt | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {validation => validator}/check_dataset.py (100%) rename {validation => validator}/requirements.txt (100%) diff --git a/validation/check_dataset.py b/validator/check_dataset.py similarity index 100% rename from validation/check_dataset.py rename to validator/check_dataset.py diff --git a/validation/requirements.txt b/validator/requirements.txt similarity index 100% rename from validation/requirements.txt rename to validator/requirements.txt From 2e0fe63f27bf3c1476abf291742ea145ed0be8c2 Mon Sep 17 00:00:00 2001 From: Danylo Boiko Date: Fri, 6 Dec 2024 23:11:16 +0200 Subject: [PATCH 03/12] Add generic validation pipeline --- validator/check_dataset.py | 86 ++++++++++++++++++++++++++++++++++---- 1 file changed, 77 insertions(+), 9 deletions(-) diff --git a/validator/check_dataset.py b/validator/check_dataset.py index 4102f41..ff6f1d5 100644 --- a/validator/check_dataset.py +++ b/validator/check_dataset.py @@ -1,25 +1,93 @@ # Cohere For AI Community, Danylo Boiko, 2024 -from argparse import ArgumentParser +import json +import argparse from rich.console import Console +from rich.panel import Panel +from rich.syntax import Syntax +from rich.text import Text +from rich.tree import Tree + + +class ValidationError: + def __init__(self, entity_index: int, message: str) -> None: + self.entity_index = entity_index + self.message = message class DatasetValidator: - def __init__(self, json_path: str, language_code: str) -> None: - self.json_path = json_path - self.language_code = language_code.lower() - self.console = Console() + def __init__(self, json_file: str, language_code: str) -> None: + self.json_file: str = json_file + self.json_entries: list[dict] = [] + self.language_code: str = language_code.lower() + self.console: Console = Console() + self.errors: list[ValidationError] = [] + + def validate(self) -> None: + self.console.print("Starting validation...", style="bold green") + self.console.print(f"JSON file: {self.json_file}", style="cyan") + self.console.print(f"Language code: {self.language_code}", style="cyan") + + if not self._load_json(): + return + + self._validate_entries() + + self._print_validation_report() + + def _load_json(self) -> bool: + try: + with open(self.json_file, "r", encoding="utf-8") as file: + entries = json.load(file) + + if not isinstance(entries, list): + raise ValueError(f"The file must contain a JSON array (list of entries)") + + self.json_entries = entries + + return True + except Exception as error: + self.console.print(f"Error loading file {self.json_file}: {error}", style="red") - def validate(self): + return False + + def _validate_entries(self) -> None: pass + def _print_validation_report(self) -> None: + if len(self.errors) == 0: + return self.console.print("Congratulations, the JSON file is valid!", style="green") + + self.console.print("The following errors were found, fix them and try again:", style="red") + + for error in self.errors: + self.console.print(Panel(self._create_error_tree(error), expand=False, border_style="red")) + + def _create_error_tree(self, error: ValidationError) -> Tree: + entry = self.json_entries[error.entity_index] + + tree = Tree(f"Error in entry with index {error.entity_index}", style="red") + tree.add(Text(error.message, style="yellow")) + + question_node = tree.add("Question") + question_node.add(Syntax(entry.get("question", "[N/A]"), "text", word_wrap=True)) + + options_node = tree.add("Options") + for option_num, option_value in enumerate(entry.get("options", []), 1): + options_node.add(f"{option_num}. {option_value}") + + answer_node = tree.add("Answer") + answer_node.add(str(entry.get("answer", "[N/A]"))) + + return tree + if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument("--json_path", type=str, required=True, help="Path to the JSON file to be validated") + parser = argparse.ArgumentParser() + parser.add_argument("--json_file", type=str, required=True, help="Path to the JSON file to be validated") parser.add_argument("--language_code", type=str, required=True, help="The language code for the dataset") args = parser.parse_args() - validator = DatasetValidator(args.json_path, args.language_code) + validator = DatasetValidator(args.json_file, args.language_code) validator.validate() From 21c037fed601e5c544e2d1b04e27c52755eeefb0 Mon Sep 17 00:00:00 2001 From: Danylo Boiko Date: Sun, 8 Dec 2024 15:28:09 +0200 Subject: [PATCH 04/12] Add data validation --- validator/check_dataset.py | 129 +++++++++++++++++++++++++++++++++---- validator/requirements.txt | 3 +- 2 files changed, 117 insertions(+), 15 deletions(-) diff --git a/validator/check_dataset.py b/validator/check_dataset.py index ff6f1d5..0a3b49b 100644 --- a/validator/check_dataset.py +++ b/validator/check_dataset.py @@ -3,6 +3,10 @@ import json import argparse +from typing import Union, Literal, Optional + +from pydantic import BaseModel, ValidationError, field_validator +from pydantic_core.core_schema import ValidationInfo from rich.console import Console from rich.panel import Panel from rich.syntax import Syntax @@ -10,11 +14,100 @@ from rich.tree import Tree -class ValidationError: - def __init__(self, entity_index: int, message: str) -> None: - self.entity_index = entity_index +class EntrySchema(BaseModel): + language: str + country: str + file_name: str + source: str + license: str + level: str + category_en: str + category_original_lang: str + original_question_num: Union[int, str] + question: str + options: list[str] + answer: int + image_png: Optional[str] + image_information: Literal["useful", "essential"] + image_type: Literal["symbols", "figures", "graph", "table", "text"] + parallel_question_num: Optional[Union[int, str]] + + @staticmethod + def _validate_string(value: str) -> str: + if len(value.strip()) == 0: + raise ValueError("Value cannot be empty or whitespace") + + if value.startswith(" ") or value.endswith(" "): + raise ValueError("Value cannot have leading or trailing spaces") + + return value + + @staticmethod + def _validate_uniqueness(values: list) -> list: + if len(set(values)) != len(values): + raise ValueError("All values must be unique") + + return values + + @staticmethod + def _validate_length(values: list, expected_length) -> list: + length = len(values) + + if length != expected_length: + raise ValueError(f"Expected {expected_length} values, but got {length}") + + return values + + @field_validator("language") + def validate_language(cls, language: str, config: ValidationInfo) -> str: + expected_language = config.context.get("language") + + if language != expected_language: + raise ValueError(f"Expected '{expected_language}', but got '{language}'") + + return cls._validate_string(language) + + @field_validator("options") + def validate_options(cls, options: list[str]) -> list[str]: + for option in options: + cls._validate_string(option) + + cls._validate_uniqueness(options) + + return cls._validate_length(options, 4) + + @field_validator("answer") + def validate_answer(cls, answer: int, config: ValidationInfo) -> int: + options_count = len(config.data.get("options", [])) + + if options_count != 0 and answer not in range(options_count): + raise ValueError(f"Expected value from 0 to {options_count - 1}, but got {answer}") + + return answer + + @field_validator( + "country", "file_name", "source", "license", "level", "category_en", "category_original_lang", + "original_question_num", "question", "image_png", "parallel_question_num" + ) + def validate_string_fields(cls, value: Optional[str]) -> Optional[str]: + return cls._validate_string(value) if isinstance(value, str) else value + + class Config: + extra = "forbid" + + +class EntryError: + def __init__(self, index: int, location: tuple | None, message: str) -> None: + self.index = index + self.location = location self.message = message + def __str__(self) -> str: + if self.location: + return f"Location: {str(self.location).strip("(,)")}, error: {self.message.lower()}" + + return self.message + class DatasetValidator: def __init__(self, json_file: str, language_code: str) -> None: @@ -22,10 +115,10 @@ def __init__(self, json_file: str, language_code: str) -> None: self.json_entries: list[dict] = [] self.language_code: str = language_code.lower() self.console: Console = Console() - self.errors: list[ValidationError] = [] + self.errors: list[EntryError] = [] def validate(self) -> None: - self.console.print("Starting validation...", style="bold green") + self.console.print("Starting validation...", style="green") self.console.print(f"JSON file: {self.json_file}", style="cyan") self.console.print(f"Language code: {self.language_code}", style="cyan") @@ -47,13 +140,21 @@ def _load_json(self) -> bool: self.json_entries = entries return True - except Exception as error: - self.console.print(f"Error loading file {self.json_file}: {error}", style="red") + except Exception as e: + self.console.print(f"Error loading file {self.json_file}: {e}", style="red") return False def _validate_entries(self) -> None: - pass + for index, entry in enumerate(self.json_entries): + try: + EntrySchema.model_validate(entry, context={ + "language": self.language_code + }) + except ValidationError as e: + self.errors.extend([ + EntryError(index, error.get("loc", None), error.get("msg")) for error in e.errors() + ]) def _print_validation_report(self) -> None: if len(self.errors) == 0: @@ -64,21 +165,21 @@ def _print_validation_report(self) -> None: for error in self.errors: self.console.print(Panel(self._create_error_tree(error), expand=False, border_style="red")) - def _create_error_tree(self, error: ValidationError) -> Tree: - entry = self.json_entries[error.entity_index] + def _create_error_tree(self, error: EntryError) -> Tree: + entry = self.json_entries[error.index] - tree = Tree(f"Error in entry with index {error.entity_index}", style="red") - tree.add(Text(error.message, style="yellow")) + tree = Tree(f"Error in entry with index {error.index}", style="red") + tree.add(Text(str(error), style="yellow")) question_node = tree.add("Question") - question_node.add(Syntax(entry.get("question", "[N/A]"), "text", word_wrap=True)) + question_node.add(Syntax(entry.get("question", "N/A"), "text", word_wrap=True)) options_node = tree.add("Options") for option_num, option_value in enumerate(entry.get("options", []), 1): options_node.add(f"{option_num}. {option_value}") answer_node = tree.add("Answer") - answer_node.add(str(entry.get("answer", "[N/A]"))) + answer_node.add(str(entry.get("answer", "N/A"))) return tree diff --git a/validator/requirements.txt b/validator/requirements.txt index c94be38..6376362 100644 --- a/validator/requirements.txt +++ b/validator/requirements.txt @@ -1 +1,2 @@ -rich \ No newline at end of file +rich +pydantic \ No newline at end of file From c55e83168b600a75d4517e193db5d418e0e2c4e1 Mon Sep 17 00:00:00 2001 From: Danylo Boiko Date: Sun, 8 Dec 2024 15:45:16 +0200 Subject: [PATCH 05/12] Add minor refactoring --- validator/check_dataset.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/validator/check_dataset.py b/validator/check_dataset.py index 0a3b49b..89ffbf5 100644 --- a/validator/check_dataset.py +++ b/validator/check_dataset.py @@ -14,6 +14,9 @@ from rich.tree import Tree +EXPECTED_OPTIONS_COUNT = 4 + + class EntrySchema(BaseModel): language: str country: str @@ -34,7 +37,7 @@ class EntrySchema(BaseModel): @staticmethod def _validate_string(value: str) -> str: - if len(value.strip()) == 0: + if not value.strip(): raise ValueError("Value cannot be empty or whitespace") if value.startswith(" ") or value.endswith(" "): @@ -43,24 +46,22 @@ def _validate_string(value: str) -> str: return value @staticmethod - def _validate_uniqueness(values: list) -> list: + def _validate_list_uniqueness(values: list) -> list: if len(set(values)) != len(values): raise ValueError("All values must be unique") return values @staticmethod - def _validate_length(values: list, expected_length) -> list: - length = len(values) - - if length != expected_length: - raise ValueError(f"Expected {expected_length} values, but got {length}") + def _validate_list_length(values: list, expected_length: int) -> list: + if len(values) != expected_length: + raise ValueError(f"Expected {expected_length} values, but got {len(values)}") return values @field_validator("language") def validate_language(cls, language: str, config: ValidationInfo) -> str: - expected_language = config.context.get("language") + expected_language = config.context.get("expected_language") if language != expected_language: raise ValueError(f"Expected '{expected_language}', but got '{language}'") @@ -72,15 +73,15 @@ def validate_options(cls, options: list[str]) -> list[str]: for option in options: cls._validate_string(option) - cls._validate_uniqueness(options) + cls._validate_list_uniqueness(options) - return cls._validate_length(options, 4) + return cls._validate_list_length(options, EXPECTED_OPTIONS_COUNT) @field_validator("answer") def validate_answer(cls, answer: int, config: ValidationInfo) -> int: options_count = len(config.data.get("options", [])) - if options_count != 0 and answer not in range(options_count): + if options_count > 0 and not (0 <= answer < options_count): raise ValueError(f"Expected value from 0 to {options_count - 1}, but got {answer}") return answer @@ -149,7 +150,7 @@ def _validate_entries(self) -> None: for index, entry in enumerate(self.json_entries): try: EntrySchema.model_validate(entry, context={ - "language": self.language_code + "expected_language": self.language_code }) except ValidationError as e: self.errors.extend([ From c4ef8311c82b8457e598b3be74d18f528ea05962 Mon Sep 17 00:00:00 2001 From: Danylo Boiko Date: Sun, 8 Dec 2024 15:49:28 +0200 Subject: [PATCH 06/12] Inline validator methods --- validator/check_dataset.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/validator/check_dataset.py b/validator/check_dataset.py index 89ffbf5..a7693e2 100644 --- a/validator/check_dataset.py +++ b/validator/check_dataset.py @@ -45,20 +45,6 @@ def _validate_string(value: str) -> str: return value - @staticmethod - def _validate_list_uniqueness(values: list) -> list: - if len(set(values)) != len(values): - raise ValueError("All values must be unique") - - return values - - @staticmethod - def _validate_list_length(values: list, expected_length: int) -> list: - if len(values) != expected_length: - raise ValueError(f"Expected {expected_length} values, but got {len(values)}") - - return values - @field_validator("language") def validate_language(cls, language: str, config: ValidationInfo) -> str: expected_language = config.context.get("expected_language") @@ -73,9 +59,13 @@ def validate_options(cls, options: list[str]) -> list[str]: for option in options: cls._validate_string(option) - cls._validate_list_uniqueness(options) + if len(set(options)) != len(options): + raise ValueError("All values must be unique") + + if len(options) != EXPECTED_OPTIONS_COUNT: + raise ValueError(f"Expected {EXPECTED_OPTIONS_COUNT} values, but got {len(options)}") - return cls._validate_list_length(options, EXPECTED_OPTIONS_COUNT) + return options @field_validator("answer") def validate_answer(cls, answer: int, config: ValidationInfo) -> int: From f8deb9e49d81afa2d2a76b73025ff5a85ccac539 Mon Sep 17 00:00:00 2001 From: Danylo Boiko Date: Sun, 8 Dec 2024 15:51:42 +0200 Subject: [PATCH 07/12] Add minor refactoring --- validator/check_dataset.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/validator/check_dataset.py b/validator/check_dataset.py index a7693e2..39a1398 100644 --- a/validator/check_dataset.py +++ b/validator/check_dataset.py @@ -129,12 +129,10 @@ def _load_json(self) -> bool: raise ValueError(f"The file must contain a JSON array (list of entries)") self.json_entries = entries - return True except Exception as e: self.console.print(f"Error loading file {self.json_file}: {e}", style="red") - - return False + return False def _validate_entries(self) -> None: for index, entry in enumerate(self.json_entries): From 2c531bd011911bcbf358a79ad637e36f3d1945be Mon Sep 17 00:00:00 2001 From: Danylo Boiko Date: Sun, 8 Dec 2024 15:53:49 +0200 Subject: [PATCH 08/12] Make error location optional --- validator/check_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/check_dataset.py b/validator/check_dataset.py index 39a1398..97146f3 100644 --- a/validator/check_dataset.py +++ b/validator/check_dataset.py @@ -88,7 +88,7 @@ class Config: class EntryError: - def __init__(self, index: int, location: tuple | None, message: str) -> None: + def __init__(self, index: int, location: Optional[tuple], message: str) -> None: self.index = index self.location = location self.message = message From a5d7ff7d812544984caf8fa7d945687d367bf1da Mon Sep 17 00:00:00 2001 From: Danylo Boiko Date: Sun, 8 Dec 2024 16:04:26 +0200 Subject: [PATCH 09/12] Improve code style --- validator/check_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/validator/check_dataset.py b/validator/check_dataset.py index 97146f3..a2e7bfe 100644 --- a/validator/check_dataset.py +++ b/validator/check_dataset.py @@ -117,7 +117,6 @@ def validate(self) -> None: return self._validate_entries() - self._print_validation_report() def _load_json(self) -> bool: From 6098f53410802a9f52040fc33899420cb45acde6 Mon Sep 17 00:00:00 2001 From: Danylo Boiko Date: Mon, 9 Dec 2024 14:04:08 +0200 Subject: [PATCH 10/12] Add validation for image data --- validator/check_dataset.py | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/validator/check_dataset.py b/validator/check_dataset.py index a2e7bfe..f23ee9d 100644 --- a/validator/check_dataset.py +++ b/validator/check_dataset.py @@ -5,7 +5,7 @@ from typing import Union, Literal, Optional -from pydantic import BaseModel, ValidationError, field_validator +from pydantic import BaseModel, ValidationError, field_validator, model_validator from pydantic_core.core_schema import ValidationInfo from rich.console import Console from rich.panel import Panel @@ -31,9 +31,9 @@ class EntrySchema(BaseModel): options: list[str] answer: int image_png: Optional[str] - image_information: Literal["useful", "essential"] - image_type: Literal["symbols", "figures", "graph", "table", "text"] - parallel_question_num: Optional[Union[int, str]] + image_information: Optional[Literal["useful", "essential"]] + image_type: Optional[Literal["graph", "table", "diagram", "scientific formula", "text", "figure", "map", "photo"]] + parallel_question_id: Optional[tuple[str, int]] @staticmethod def _validate_string(value: str) -> str: @@ -76,13 +76,31 @@ def validate_answer(cls, answer: int, config: ValidationInfo) -> int: return answer + @field_validator("parallel_question_id") + def validate_parallel_question_id(cls, parallel_question_id: Optional[tuple[str, int]]) -> Optional[tuple[str, int]]: + if isinstance(parallel_question_id, tuple) and isinstance(parallel_question_id[0], str): + cls._validate_string(parallel_question_id[0]) + + return parallel_question_id + @field_validator( "country", "file_name", "source", "license", "level", "category_en", "category_original_lang", - "original_question_num", "question", "image_png", "parallel_question_num" + "original_question_num", "question", "image_png" ) def validate_string_fields(cls, value: Optional[str]) -> Optional[str]: return cls._validate_string(value) if isinstance(value, str) else value + @model_validator(mode="after") + def validate_image_data(cls, model: "EntrySchema") -> "EntrySchema": + image_data = [model.image_png, model.image_information, model.image_type] + + if any(image_data) and not all(image_data): + raise ValueError( + "All fields related to image data (prefixed with 'image_') must be specified if any one of them is specified" + ) + + return model + class Config: extra = "forbid" @@ -94,10 +112,13 @@ def __init__(self, index: int, location: Optional[tuple], message: str) -> None: self.message = message def __str__(self) -> str: + message = self.message.removeprefix("Value error, ") + if self.location: - return f"Location: {str(self.location).strip("(,)")}, error: {self.message.lower()}" + location = str(self.location).strip("(,)") + return f"Location: {location}, error: {message.lower()}" - return self.message + return message class DatasetValidator: @@ -125,7 +146,7 @@ def _load_json(self) -> bool: entries = json.load(file) if not isinstance(entries, list): - raise ValueError(f"The file must contain a JSON array (list of entries)") + raise ValueError("The file must contain a JSON array (list of entries)") self.json_entries = entries return True From 0774dec9f3f9e2a26dd2ccded06dd4fe32b144a2 Mon Sep 17 00:00:00 2001 From: Danylo Boiko Date: Mon, 9 Dec 2024 16:29:14 +0200 Subject: [PATCH 11/12] Allow n-choice questions --- validator/check_dataset.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/validator/check_dataset.py b/validator/check_dataset.py index f23ee9d..f7ce211 100644 --- a/validator/check_dataset.py +++ b/validator/check_dataset.py @@ -14,9 +14,6 @@ from rich.tree import Tree -EXPECTED_OPTIONS_COUNT = 4 - - class EntrySchema(BaseModel): language: str country: str @@ -59,12 +56,12 @@ def validate_options(cls, options: list[str]) -> list[str]: for option in options: cls._validate_string(option) + if len(options) < 2: + raise ValueError(f"Expected at least 2 options, but got {len(options)}") + if len(set(options)) != len(options): raise ValueError("All values must be unique") - if len(options) != EXPECTED_OPTIONS_COUNT: - raise ValueError(f"Expected {EXPECTED_OPTIONS_COUNT} values, but got {len(options)}") - return options @field_validator("answer") From 46cae54b806f0a635cb25d3bb37c58c400bda6c1 Mon Sep 17 00:00:00 2001 From: Danylo Boiko Date: Mon, 9 Dec 2024 17:02:05 +0200 Subject: [PATCH 12/12] Format strip call --- validator/check_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/check_dataset.py b/validator/check_dataset.py index f7ce211..e5356c3 100644 --- a/validator/check_dataset.py +++ b/validator/check_dataset.py @@ -112,7 +112,7 @@ def __str__(self) -> str: message = self.message.removeprefix("Value error, ") if self.location: - location = str(self.location).strip("(,)") + location = str(self.location).strip(",()") return f"Location: {location}, error: {message.lower()}" return message