From 35d3be2ab3879c63f890e83feb7224be77f3b34b Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Thu, 24 Oct 2024 12:14:41 -0700 Subject: [PATCH 1/8] clean up ignored keys --- guidance/library/_json.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/guidance/library/_json.py b/guidance/library/_json.py index 4f8c98ce0..0f941b177 100644 --- a/guidance/library/_json.py +++ b/guidance/library/_json.py @@ -165,17 +165,17 @@ class ObjectKeywords(str, Enum): JSONType.OBJECT: ObjectKeywords, } -DEFS_KEYS = {"$defs", "definitions"} - IGNORED_KEYS = { "$anchor", + "$defs", "$schema", "$id", "id", "$comment", "title", - "description", "default", + "definitions", + "description", "examples", } @@ -188,7 +188,7 @@ class ObjectKeywords(str, Enum): IGNORED_KEYS.add("discriminator") WHITESPACE = {b" ", b"\t", b"\n", b"\r"} -VALID_KEYS = set(Keyword) | IGNORED_KEYS | DEFS_KEYS | set(NumberKeywords) | set(StringKeywords) | set(ArrayKeywords) | set(ObjectKeywords) +VALID_KEYS = set(Keyword) | set(NumberKeywords) | set(StringKeywords) | set(ArrayKeywords) | set(ObjectKeywords) | IGNORED_KEYS FORMAT_PATTERNS: dict[str, Optional[str]] = { # https://json-schema.org/understanding-json-schema/reference/string#built-in-formats From a9a405b057b8832634d0d04c137bbdebf6e33451 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Thu, 24 Oct 2024 12:16:41 -0700 Subject: [PATCH 2/8] refuse to build grammar if there are unhandled sibling keys --- guidance/library/_json.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/guidance/library/_json.py b/guidance/library/_json.py index 0f941b177..b3e72ba2f 100644 --- a/guidance/library/_json.py +++ b/guidance/library/_json.py @@ -398,6 +398,11 @@ def validate_json_node_keys(node: Mapping[str, Any]): ) +def get_sibling_keys(node: Mapping[str, Any], key: str) -> set[str]: + # Get the set of functional (non-ignored) keys that are siblings of the given key + return set(node.keys()) & VALID_KEYS - 
set(IGNORED_KEYS) - {key} + + class GenJson: item_separator = ", " key_separator = ": " @@ -811,15 +816,24 @@ def json( validate_json_node_keys(json_schema) if Keyword.ANYOF in json_schema: + sibling_keys = get_sibling_keys(json_schema, Keyword.ANYOF) + if sibling_keys: + raise NotImplementedError(f"anyOf with sibling keys is not yet supported. Got {sibling_keys}") return lm + self.anyOf(anyof_list=json_schema[Keyword.ANYOF]) if Keyword.ALLOF in json_schema: + sibling_keys = get_sibling_keys(json_schema, Keyword.ALLOF) + if sibling_keys: + raise NotImplementedError(f"allOf with sibling keys is not yet supported. Got {sibling_keys}") allof_list = json_schema[Keyword.ALLOF] if len(allof_list) != 1: raise ValueError("Only support allOf with exactly one item") return lm + self.json(json_schema=allof_list[0]) if Keyword.ONEOF in json_schema: + sibling_keys = get_sibling_keys(json_schema, Keyword.ONEOF) + if sibling_keys: + raise NotImplementedError(f"oneOf with sibling keys is not yet supported. Got {sibling_keys}") oneof_list = json_schema[Keyword.ONEOF] if len(oneof_list) == 1: return lm + self.json(json_schema=oneof_list[0]) @@ -827,12 +841,21 @@ def json( return lm + self.anyOf(anyof_list=oneof_list) if Keyword.REF in json_schema: + sibling_keys = get_sibling_keys(json_schema, Keyword.REF) + if sibling_keys: + raise NotImplementedError(f"$ref with sibling keys is not yet supported. Got {sibling_keys}") return lm + self.ref(reference=json_schema[Keyword.REF]) if Keyword.CONST in json_schema: + sibling_keys = get_sibling_keys(json_schema, Keyword.CONST) + if sibling_keys: + raise NotImplementedError(f"const with sibling keys is not yet supported. Got {sibling_keys}") return lm + self.const(value=json_schema[Keyword.CONST]) if Keyword.ENUM in json_schema: + sibling_keys = get_sibling_keys(json_schema, Keyword.ENUM) + if sibling_keys: + raise NotImplementedError(f"enum with sibling keys is not yet supported. 
Got {sibling_keys}") return lm + self.enum(options=json_schema[Keyword.ENUM]) if Keyword.TYPE in json_schema: From 606352eff463313cb73885d67528cd21d9f74709 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Thu, 24 Oct 2024 12:31:19 -0700 Subject: [PATCH 3/8] xfail refs tests that have sibling keywords --- tests/unit/library/test_json.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/tests/unit/library/test_json.py b/tests/unit/library/test_json.py index cad0d6742..f0ac40114 100644 --- a/tests/unit/library/test_json.py +++ b/tests/unit/library/test_json.py @@ -1265,14 +1265,12 @@ def test_nested_refs(self, test_object, valid): # ref valid, maxItems valid ({"foo": []}, True), # ref valid, maxItems invalid - pytest.param( - *({"foo": [1, 2, 3]}, False), - marks=pytest.mark.xfail(reason="sibling keywords to ref are not yet supported"), - ), + ({"foo": [1, 2, 3]}, False), # ref invalid ({"foo": "string"}, False), ], ) + @pytest.mark.xfail(reason="sibling keywords to ref are not yet supported") def test_ref_applies_alongside_sibling_keywords(self, test_object, valid): schema = { "$schema": "https://json-schema.org/draft/2020-12/schema", @@ -1559,12 +1557,10 @@ def test_naive_replacement_of_ref_with_its_destination_is_not_correct( # invalid on outer field ({"foo": {"bar": "a"}, "bar": 1}, False), # valid on both fields - pytest.param( - *({"foo": {"bar": "a"}, "bar": "a"}, True), - marks=pytest.mark.xfail(reason="refs with sibling keywords are not yet supported; foo here is being seen as an additionalProperty before bar"), - ), + ({"foo": {"bar": "a"}, "bar": "a"}, True), ], ) + @pytest.mark.xfail(reason="refs with sibling keywords are not yet supported") def test_refs_with_relative_uris_and_defs(self, test_object, valid): schema = { "$schema": "https://json-schema.org/draft/2020-12/schema", @@ -1594,12 +1590,10 @@ def test_refs_with_relative_uris_and_defs(self, test_object, valid): # invalid on outer field ({"foo": {"bar": "a"}, 
"bar": 1}, False), # valid on both fields - pytest.param( - *({"foo": {"bar": "a"}, "bar": "a"}, True), - marks=pytest.mark.xfail(reason="refs with sibling keywords are not yet supported; foo here is being seen as an additionalProperty before bar"), - ), + ({"foo": {"bar": "a"}, "bar": "a"}, True), ], ) + @pytest.mark.xfail(reason="refs with sibling keywords are not yet supported") def test_relative_refs_with_absolute_uris_and_defs(self, test_object, valid): schema = { "$schema": "https://json-schema.org/draft/2020-12/schema", From 0f152bcdf186a01ee36ae83f72050390d5f08762 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Thu, 24 Oct 2024 12:38:40 -0700 Subject: [PATCH 4/8] handle sibling 'type' for const and enum --- guidance/library/_json.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/guidance/library/_json.py b/guidance/library/_json.py index b3e72ba2f..cf3124b66 100644 --- a/guidance/library/_json.py +++ b/guidance/library/_json.py @@ -728,7 +728,14 @@ def const( lm, *, value: Union[None, bool, int, float, str, Mapping, Sequence], + instance_type: Optional[Union[str, Sequence[str]]] = None, ): + if instance_type is not None: + # Raise a validation error if the value doesn't match the type + jsonschema.validate( + instance=value, + schema={"type": instance_type}, + ) # Base case if isinstance(value, (type(None), bool, int, float, str)): return lm + json_dumps(value) @@ -761,14 +768,18 @@ def enum( self, lm, *, - options: Sequence[Mapping[str, Any]] + options: Sequence[Mapping[str, Any]], + instance_type: Optional[Union[str, Sequence[str]]] = None, ): - # TODO: can we support a whitespace-flexible version of this? 
all_opts: list[GrammarFunction] = [] - for opt in options: - all_opts.append( - self.const(value=opt) - ) + for instance in options: + try: + grm = self.const(value=instance, instance_type=instance_type) + except jsonschema.ValidationError: + continue + all_opts.append(grm) + if not all_opts: + raise ValueError(f"No valid options found for enum with type {instance_type!r}: {options}") return lm + select(options=all_opts) @@ -847,16 +858,16 @@ def json( return lm + self.ref(reference=json_schema[Keyword.REF]) if Keyword.CONST in json_schema: - sibling_keys = get_sibling_keys(json_schema, Keyword.CONST) + sibling_keys = get_sibling_keys(json_schema, Keyword.CONST) - {Keyword.TYPE} if sibling_keys: raise NotImplementedError(f"const with sibling keys is not yet supported. Got {sibling_keys}") - return lm + self.const(value=json_schema[Keyword.CONST]) + return lm + self.const(value=json_schema[Keyword.CONST], instance_type=json_schema.get(Keyword.TYPE, None)) if Keyword.ENUM in json_schema: - sibling_keys = get_sibling_keys(json_schema, Keyword.ENUM) + sibling_keys = get_sibling_keys(json_schema, Keyword.ENUM) - {Keyword.TYPE} if sibling_keys: raise NotImplementedError(f"enum with sibling keys is not yet supported. 
Got {sibling_keys}") - return lm + self.enum(options=json_schema[Keyword.ENUM]) + return lm + self.enum(options=json_schema[Keyword.ENUM], instance_type=json_schema.get(Keyword.TYPE, None)) if Keyword.TYPE in json_schema: target_types = cast(Union[str, Sequence[str]], json_schema[Keyword.TYPE]) From c5abc25d1b39855fc494bcca9d17b236f9d22a76 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Thu, 24 Oct 2024 13:13:15 -0700 Subject: [PATCH 5/8] handle sibling 'enum' for const (make pydantic discriminated union tests pass) --- guidance/library/_json.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/guidance/library/_json.py b/guidance/library/_json.py index cf3124b66..93e0247e9 100644 --- a/guidance/library/_json.py +++ b/guidance/library/_json.py @@ -729,12 +729,18 @@ def const( *, value: Union[None, bool, int, float, str, Mapping, Sequence], instance_type: Optional[Union[str, Sequence[str]]] = None, + enum: Optional[Sequence[Union[None, bool, int, float, str, Mapping, Sequence]]] = None, ): + schema_to_validate_against: dict[str, Any] = {} if instance_type is not None: + schema_to_validate_against["type"] = instance_type + if enum is not None: + schema_to_validate_against["enum"] = enum + if schema_to_validate_against: # Raise a validation error if the value doesn't match the type jsonschema.validate( instance=value, - schema={"type": instance_type}, + schema=schema_to_validate_against, ) # Base case if isinstance(value, (type(None), bool, int, float, str)): @@ -768,7 +774,7 @@ def enum( self, lm, *, - options: Sequence[Mapping[str, Any]], + options: Sequence[Union[None, bool, int, float, str, Mapping, Sequence]], instance_type: Optional[Union[str, Sequence[str]]] = None, ): all_opts: list[GrammarFunction] = [] @@ -858,10 +864,10 @@ def json( return lm + self.ref(reference=json_schema[Keyword.REF]) if Keyword.CONST in json_schema: - sibling_keys = get_sibling_keys(json_schema, Keyword.CONST) - {Keyword.TYPE} + sibling_keys = 
get_sibling_keys(json_schema, Keyword.CONST) - {Keyword.TYPE, Keyword.ENUM} if sibling_keys: raise NotImplementedError(f"const with sibling keys is not yet supported. Got {sibling_keys}") - return lm + self.const(value=json_schema[Keyword.CONST], instance_type=json_schema.get(Keyword.TYPE, None)) + return lm + self.const(value=json_schema[Keyword.CONST], instance_type=json_schema.get(Keyword.TYPE, None), enum=json_schema.get(Keyword.ENUM, None)) if Keyword.ENUM in json_schema: sibling_keys = get_sibling_keys(json_schema, Keyword.ENUM) - {Keyword.TYPE} From cd656c751bd4d273b47e6324f286b0e00bf85605 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Thu, 24 Oct 2024 13:30:29 -0700 Subject: [PATCH 6/8] add tests for type-narrowed enums --- tests/unit/library/test_json.py | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tests/unit/library/test_json.py b/tests/unit/library/test_json.py index f0ac40114..3502cb27e 100644 --- a/tests/unit/library/test_json.py +++ b/tests/unit/library/test_json.py @@ -2348,6 +2348,51 @@ def test_bad_prefix_enum(self, bad_obj, good_bytes, failure_byte, allowed_bytes) schema_obj=schema_obj, ) + @pytest.mark.parametrize( + "obj, valid", + [ + (1, True), + (2, False), + ("2", False), + ("1", False), + (True, False), + ] + ) + def test_typed_enum_single_type(self, obj, valid): + schema_obj = { + "enum": [1, "2", True], + "type": "integer" + } + if valid: + validate(instance=obj, schema=schema_obj) + generate_and_check(obj, schema_obj) + else: + with pytest.raises(ValidationError): + validate(instance=obj, schema=schema_obj) + check_match_failure(bad_string=json_dumps(obj), schema_obj=schema_obj) + + @pytest.mark.parametrize( + "obj, valid", + [ + (1, True), + (2, False), + ("2", True), + ("1", False), + (True, False), + ] + ) + def test_typed_enum_multiple_types(self, obj, valid): + schema_obj = { + "enum": [1, "2", True], + "type": ["integer", "string"] + } + if valid: + validate(instance=obj, 
schema=schema_obj) + generate_and_check(obj, schema_obj) + else: + with pytest.raises(ValidationError): + validate(instance=obj, schema=schema_obj) + check_match_failure(bad_string=json_dumps(obj), schema_obj=schema_obj) class TestConst: def test_constant_int(self): From 431e02e4db72f22cdfc01ca6b012437799719e80 Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Thu, 24 Oct 2024 13:35:17 -0700 Subject: [PATCH 7/8] add test to ensure we raise when enum/type are inconsistent --- tests/unit/library/test_json.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/unit/library/test_json.py b/tests/unit/library/test_json.py index 3502cb27e..611e91075 100644 --- a/tests/unit/library/test_json.py +++ b/tests/unit/library/test_json.py @@ -2394,6 +2394,15 @@ def test_typed_enum_multiple_types(self, obj, valid): validate(instance=obj, schema=schema_obj) check_match_failure(bad_string=json_dumps(obj), schema_obj=schema_obj) + def test_invalid_typed_enum(self): + schema_obj = { + "enum": [1, "2"], + "type": "boolean" + } + with pytest.raises(ValueError) as ve: + gen_json(schema=schema_obj) + assert ve.value.args[0] == "No valid options found for enum with type 'boolean': [1, '2']" + class TestConst: def test_constant_int(self): # First sanity check what we're setting up From 8487e8bdb0126d66e29870c04a275af460db62bf Mon Sep 17 00:00:00 2001 From: Hudson Cooper Date: Thu, 24 Oct 2024 13:44:11 -0700 Subject: [PATCH 8/8] add test to ensure we raise when const is inconsistent with enum/type --- tests/unit/library/test_json.py | 61 +++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/unit/library/test_json.py b/tests/unit/library/test_json.py index 611e91075..2deea6eda 100644 --- a/tests/unit/library/test_json.py +++ b/tests/unit/library/test_json.py @@ -2461,6 +2461,67 @@ def test_constant_precedence(self): schema_obj=schema_obj, ) + def test_valid_typed_const(self): + schema_obj = { + "const": 1, + "type": "integer" + } + target_obj = 1 
+ validate(instance=target_obj, schema=schema_obj) + generate_and_check(target_obj, schema_obj) + + def test_invalid_typed_const(self): + schema_obj = { + "const": 1, + "type": "boolean" + } + with pytest.raises(ValidationError): + gen_json(schema=schema_obj) + + def test_valid_enum_const(self): + schema_obj = { + "const": 1, + "enum": [1, 2, 3] + } + target_obj = 1 + validate(instance=target_obj, schema=schema_obj) + generate_and_check(target_obj, schema_obj) + + def test_invalid_enum_const(self): + schema_obj = { + "const": 1, + "enum": [2, 3] + } + with pytest.raises(ValidationError): + gen_json(schema=schema_obj) + + def test_valid_typed_enum_const(self): + schema_obj = { + "const": 1, + "enum": [1, "2", 3], + "type": "integer" + } + target_obj = 1 + validate(instance=target_obj, schema=schema_obj) + generate_and_check(target_obj, schema_obj) + + @pytest.mark.parametrize( + "const", + [ + "2", # right enum, wrong type + 2, # wrong enum, right type + "3", # wrong enum, wrong type + ] + ) + def test_invalid_typed_enum_const(self, const): + schema_obj = { + "const": const, + "enum": [1, "2", 3], + "type": "integer" + } + with pytest.raises(ValidationError): + gen_json(schema=schema_obj) + class TestAdditionalProperties: