From dc5a08083bec00bd06e54138b581516f18c19f76 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Tue, 29 Oct 2024 14:15:03 -0700 Subject: [PATCH 1/2] [JSON] Raise exceptions when "sibling" keywords are unhandled (#1063) When an applicator has sibling keywords, JSON schema mandates that both sets of keywords are respected. This PR ensures that we raise early exceptions wherever this "schema intersection" behavior isn't explicitly handled. --- guidance/library/_json.py | 64 ++++++++++++--- tests/unit/library/test_json.py | 133 +++++++++++++++++++++++++++++--- 2 files changed, 173 insertions(+), 24 deletions(-) diff --git a/guidance/library/_json.py b/guidance/library/_json.py index 4f8c98ce0..93e0247e9 100644 --- a/guidance/library/_json.py +++ b/guidance/library/_json.py @@ -165,17 +165,17 @@ class ObjectKeywords(str, Enum): JSONType.OBJECT: ObjectKeywords, } -DEFS_KEYS = {"$defs", "definitions"} - IGNORED_KEYS = { "$anchor", + "$defs", "$schema", "$id", "id", "$comment", "title", - "description", "default", + "definitions", + "description", "examples", } @@ -188,7 +188,7 @@ class ObjectKeywords(str, Enum): IGNORED_KEYS.add("discriminator") WHITESPACE = {b" ", b"\t", b"\n", b"\r"} -VALID_KEYS = set(Keyword) | IGNORED_KEYS | DEFS_KEYS | set(NumberKeywords) | set(StringKeywords) | set(ArrayKeywords) | set(ObjectKeywords) +VALID_KEYS = set(Keyword) | set(NumberKeywords) | set(StringKeywords) | set(ArrayKeywords) | set(ObjectKeywords) | IGNORED_KEYS FORMAT_PATTERNS: dict[str, Optional[str]] = { # https://json-schema.org/understanding-json-schema/reference/string#built-in-formats @@ -398,6 +398,11 @@ def validate_json_node_keys(node: Mapping[str, Any]): ) +def get_sibling_keys(node: Mapping[str, Any], key: str) -> set[str]: + # Get the set of functional (non-ignored) keys that are siblings of the given key + return set(node.keys()) & VALID_KEYS - set(IGNORED_KEYS) - {key} + + class GenJson: item_separator = ", " key_separator = ": " @@ -723,7 +728,20 @@ def const( lm, *, value: Union[None, bool, int, float, str, Mapping, Sequence], + instance_type: Optional[Union[str, Sequence[str]]] = None, + enum: Optional[Sequence[Union[None, bool, int, float, str, Mapping, Sequence]]] = None, ): + schema_to_validate_against: dict[str, Any] = {} + if instance_type is not None: + schema_to_validate_against["type"] = instance_type + if enum is not None: + schema_to_validate_against["enum"] = enum + if schema_to_validate_against: + # Raise a validation error if the value doesn't match the type + jsonschema.validate( + instance=value, + schema=schema_to_validate_against, + ) # Base case if isinstance(value, (type(None), bool, int, float, str)): return lm + json_dumps(value) @@ -756,14 +774,18 @@ def enum( self, lm, *, - options: Sequence[Mapping[str, Any]] + options: Sequence[Union[None, bool, int, float, str, Mapping, Sequence]], + instance_type: Optional[Union[str, Sequence[str]]] = None, ): - # TODO: can we support a whitespace-flexible version of this? all_opts: list[GrammarFunction] = [] - for opt in options: - all_opts.append( - self.const(value=opt) - ) + for instance in options: + try: + grm = self.const(value=instance, instance_type=instance_type) + except jsonschema.ValidationError: + continue + all_opts.append(grm) + if not all_opts: + raise ValueError(f"No valid options found for enum with type {instance_type!r}: {options}") return lm + select(options=all_opts) @@ -811,15 +833,24 @@ def json( validate_json_node_keys(json_schema) if Keyword.ANYOF in json_schema: + sibling_keys = get_sibling_keys(json_schema, Keyword.ANYOF) + if sibling_keys: + raise NotImplementedError(f"anyOf with sibling keys is not yet supported. Got {sibling_keys}") return lm + self.anyOf(anyof_list=json_schema[Keyword.ANYOF]) if Keyword.ALLOF in json_schema: + sibling_keys = get_sibling_keys(json_schema, Keyword.ALLOF) + if sibling_keys: + raise NotImplementedError(f"allOf with sibling keys is not yet supported. Got {sibling_keys}") allof_list = json_schema[Keyword.ALLOF] if len(allof_list) != 1: raise ValueError("Only support allOf with exactly one item") return lm + self.json(json_schema=allof_list[0]) if Keyword.ONEOF in json_schema: + sibling_keys = get_sibling_keys(json_schema, Keyword.ONEOF) + if sibling_keys: + raise NotImplementedError(f"oneOf with sibling keys is not yet supported. Got {sibling_keys}") oneof_list = json_schema[Keyword.ONEOF] if len(oneof_list) == 1: return lm + self.json(json_schema=oneof_list[0]) @@ -827,13 +858,22 @@ def json( return lm + self.anyOf(anyof_list=oneof_list) if Keyword.REF in json_schema: + sibling_keys = get_sibling_keys(json_schema, Keyword.REF) + if sibling_keys: + raise NotImplementedError(f"$ref with sibling keys is not yet supported. Got {sibling_keys}") return lm + self.ref(reference=json_schema[Keyword.REF]) if Keyword.CONST in json_schema: - return lm + self.const(value=json_schema[Keyword.CONST]) + sibling_keys = get_sibling_keys(json_schema, Keyword.CONST) - {Keyword.TYPE, Keyword.ENUM} + if sibling_keys: + raise NotImplementedError(f"const with sibling keys is not yet supported. Got {sibling_keys}") + return lm + self.const(value=json_schema[Keyword.CONST], instance_type=json_schema.get(Keyword.TYPE, None), enum=json_schema.get(Keyword.ENUM, None)) if Keyword.ENUM in json_schema: - return lm + self.enum(options=json_schema[Keyword.ENUM]) + sibling_keys = get_sibling_keys(json_schema, Keyword.ENUM) - {Keyword.TYPE} + if sibling_keys: + raise NotImplementedError(f"enum with sibling keys is not yet supported. Got {sibling_keys}") + return lm + self.enum(options=json_schema[Keyword.ENUM], instance_type=json_schema.get(Keyword.TYPE, None)) if Keyword.TYPE in json_schema: target_types = cast(Union[str, Sequence[str]], json_schema[Keyword.TYPE]) diff --git a/tests/unit/library/test_json.py b/tests/unit/library/test_json.py index cad0d6742..2deea6eda 100644 --- a/tests/unit/library/test_json.py +++ b/tests/unit/library/test_json.py @@ -1265,14 +1265,12 @@ def test_nested_refs(self, test_object, valid): # ref valid, maxItems valid ({"foo": []}, True), # ref valid, maxItems invalid - pytest.param( - *({"foo": [1, 2, 3]}, False), - marks=pytest.mark.xfail(reason="sibling keywords to ref are not yet supported"), - ), + ({"foo": [1, 2, 3]}, False), # ref invalid ({"foo": "string"}, False), ], ) + @pytest.mark.xfail(reason="sibling keywords to ref are not yet supported") def test_ref_applies_alongside_sibling_keywords(self, test_object, valid): schema = { "$schema": "https://json-schema.org/draft/2020-12/schema", @@ -1559,12 +1557,10 @@ def test_naive_replacement_of_ref_with_its_destination_is_not_correct( # invalid on outer field ({"foo": {"bar": "a"}, "bar": 1}, False), # valid on both fields - pytest.param( - *({"foo": {"bar": "a"}, "bar": "a"}, True), - marks=pytest.mark.xfail(reason="refs with sibling keywords are not yet supported; foo here is being seen as an additionalProperty before bar"), - ), + ({"foo": {"bar": "a"}, "bar": "a"}, True), ], ) + @pytest.mark.xfail(reason="refs with sibling keywords are not yet supported") def test_refs_with_relative_uris_and_defs(self, test_object, valid): schema = { "$schema": "https://json-schema.org/draft/2020-12/schema", @@ -1594,12 +1590,10 @@ def test_refs_with_relative_uris_and_defs(self, test_object, valid): # invalid on outer field ({"foo": {"bar": "a"}, "bar": 1}, False), # valid on both fields - pytest.param( - *({"foo": {"bar": "a"}, "bar": "a"}, True), - marks=pytest.mark.xfail(reason="refs with sibling keywords are not yet supported; foo here is being seen as an additionalProperty before bar"), - ), + ({"foo": {"bar": "a"}, "bar": "a"}, True), ], ) + @pytest.mark.xfail(reason="refs with sibling keywords are not yet supported") def test_relative_refs_with_absolute_uris_and_defs(self, test_object, valid): schema = { "$schema": "https://json-schema.org/draft/2020-12/schema", @@ -2354,6 +2348,60 @@ def test_bad_prefix_enum(self, bad_obj, good_bytes, failure_byte, allowed_bytes) schema_obj=schema_obj, ) + @pytest.mark.parametrize( + "obj, valid", + [ + (1, True), + (2, False), + ("2", False), + ("1", False), + (True, False), + ] + ) + def test_typed_enum_single_type(self, obj, valid): + schema_obj = { + "enum": [1, "2", True], + "type": "integer" + } + if valid: + validate(instance=obj, schema=schema_obj) + generate_and_check(obj, schema_obj) + else: + with pytest.raises(ValidationError): + validate(instance=obj, schema=schema_obj) + check_match_failure(bad_string=json_dumps(obj), schema_obj=schema_obj) + + @pytest.mark.parametrize( + "obj, valid", + [ + (1, True), + (2, False), + ("2", True), + ("1", False), + (True, False), + ] + ) + def test_typed_enum_multiple_types(self, obj, valid): + schema_obj = { + "enum": [1, "2", True], + "type": ["integer", "string"] + } + if valid: + validate(instance=obj, schema=schema_obj) + generate_and_check(obj, schema_obj) + else: + with pytest.raises(ValidationError): + validate(instance=obj, schema=schema_obj) + check_match_failure(bad_string=json_dumps(obj), schema_obj=schema_obj) + + def test_invalid_typed_enum(self): + schema_obj = { + "enum": [1, "2"], + "type": "boolean" + } + with pytest.raises(ValueError) as ve: + gen_json(schema=schema_obj) + assert ve.value.args[0] == "No valid options found for enum with type 'boolean': [1, '2']" class TestConst: def test_constant_int(self): @@ -2413,6 +2461,67 @@ def test_constant_precedence(self): schema_obj=schema_obj, ) + def test_valid_typed_const(self): + schema_obj = { + "const": 1, + "type": "integer" + } + target_obj = 1 + validate(instance=target_obj, schema=schema_obj) + generate_and_check(target_obj, schema_obj) + + def test_invalid_typed_const(self): + schema_obj = { + "const": 1, + "type": "boolean" + } + with pytest.raises(ValidationError): + gen_json(schema=schema_obj) + + def test_valid_enum_const(self): + schema_obj = { + "const": 1, + "enum": [1, 2, 3] + } + target_obj = 1 + validate(instance=target_obj, schema=schema_obj) + generate_and_check(target_obj, schema_obj) + + def test_invalid_enum_const(self): + schema_obj = { + "const": 1, + "enum": [2, 3] + } + with pytest.raises(ValidationError): + gen_json(schema=schema_obj) + + def test_valid_typed_enum_const(self): + schema_obj = { + "const": 1, + "enum": [1, "2", 3], + "type": "integer" + } + target_obj = 1 + validate(instance=target_obj, schema=schema_obj) + generate_and_check(target_obj, schema_obj) + + @pytest.mark.parametrize( + "const", + [ + "2", # right enum, wrong type + 2, # wrong enum, right type + "3", # wrong enum, wrong type + ] + ) + def test_invalid_typed_enum_const(self, const): + schema_obj = { + "const": const, + "enum": [1, "2", 3], + "type": "integer" + } + with pytest.raises(ValidationError): + gen_json(schema=schema_obj) + class TestAdditionalProperties: From c8c6a1192dcd2435615250e642b0e3a15ad68622 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Tue, 29 Oct 2024 14:16:44 -0700 Subject: [PATCH 2/2] [Feature] Allow json-loads-able strings to be passed as schema (#1028) JSON-load any passed schema string --- guidance/library/_json.py | 8 ++++++-- tests/unit/library/test_json.py | 27 ++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/guidance/library/_json.py b/guidance/library/_json.py index 93e0247e9..0ca544726 100644 --- a/guidance/library/_json.py +++ b/guidance/library/_json.py @@ -1,4 +1,4 @@ -from json import dumps as json_dumps +from json import dumps as json_dumps, loads as json_loads from enum import Enum import math from typing import ( @@ -957,6 +957,7 @@ def json( *, schema: Union[ None, + str, JSONSchema, Type["pydantic.BaseModel"], "pydantic.TypeAdapter", @@ -1005,6 +1006,7 @@ def json( schema : Union[None, Mapping[str, Any], Type[pydantic.BaseModel], pydantic.TypeAdapter] One of: - None, in which case any valid JSON will be generated + - A string representing a JSON schema which will be parsed using ``json.loads()`` - A JSON schema object. This is a JSON schema string which has been passed to ``json.loads()`` - A subclass of ``pydantic.BaseModel`` - An instance of ``pydantic.TypeAdapter`` @@ -1018,7 +1020,9 @@ def json( # Default schema is empty, "anything goes" schema # TODO: consider default being `{"type": "object"}` schema = {} - elif isinstance(schema, (Mapping, bool)): + elif isinstance(schema, (Mapping, bool, str)): + if isinstance(schema, str): + schema = cast(JSONSchema, json_loads(schema)) # Raises jsonschema.exceptions.SchemaError or ValueError # if schema is not valid jsonschema.validators.Draft202012Validator.check_schema(schema) diff --git a/tests/unit/library/test_json.py b/tests/unit/library/test_json.py index 2deea6eda..d320fdb4f 100644 --- a/tests/unit/library/test_json.py +++ b/tests/unit/library/test_json.py @@ -1,15 +1,15 @@ import json from functools import partial -from typing import Any, Dict, Set, Union, Optional +from typing import Any, Set, Union, Optional import pytest from jsonschema import validate, ValidationError -from json import dumps as json_dumps +from json import dumps as json_dumps, loads as json_loads from guidance import json as gen_json from guidance import models -from guidance.library._json import IGNORED_KEYS +from guidance.library._json import IGNORED_KEYS, JSONSchema from ...utils import check_match_failure as _check_match_failure from ...utils import check_run_with_temperature @@ -17,8 +17,11 @@ def generate_and_check( - target_obj: Any, schema_obj, desired_temperature: Optional[float] = None + target_obj: Any, schema_obj: Union[str, JSONSchema], desired_temperature: Optional[float] = None ): + if isinstance(schema_obj, str): + schema_obj = json_loads(schema_obj) + # Sanity check what we're being asked validate(instance=target_obj, schema=schema_obj) prepared_json = json_dumps(target_obj) @@ -46,7 +49,7 @@ def check_match_failure( good_bytes: Optional[bytes] = None, failure_byte: Optional[bytes] = None, allowed_bytes: Optional[Set[bytes]] = None, - schema_obj: Dict[str, Any], + schema_obj: Union[str, JSONSchema], ): grammar = gen_json(schema=schema_obj) @@ -3270,3 +3273,17 @@ def test_whitespace_flexibility(self, indent, separators, schema, obj): assert grammar.match(prepared_json, raise_exceptions=True) is not None model = models.Mock(f"{prepared_json}".encode()) assert str(model + grammar) == prepared_json + + +class TestStringSchema: + def test_good(self): + schema = """{"type": "object", "properties": {"a": {"type": "string"}}}""" + target_obj = {"a": "hello"} + generate_and_check(target_obj, schema) + + def test_bad(self): + schema = """{"type": "object", "properties": {"a": {"type": "string"}}}""" + check_match_failure( + bad_string='{"a": 42}', + schema_obj=schema, + )