Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bug: "Phu Thai or Phuu Thai" repeated for pht-Thai-TH #17

Open
mcdurdin opened this issue May 7, 2024 · 5 comments
Open

bug: "Phu Thai or Phuu Thai" repeated for pht-Thai-TH #17

mcdurdin opened this issue May 7, 2024 · 5 comments

Comments

@mcdurdin
Copy link

mcdurdin commented May 7, 2024

    {
        "full": "pht-Thai-TH",
        "iana": [ "Phu Thai" ],
        "iso639_3": "pht",
        "latnnames": [ "Phu Thai or Phuu Thai", "Phu Thai or Phuu Thai" ],
                       ^^^^^^^^^^^^^^^^^^^^^^^  ^^^^^^^^^^^^^^^^^^^^^^^
        "localnames": [ "ผู้ไท", "ภูไท" ],
        "name": "Phu Thai",
        "names": [ "Phutai", "Phuu Thai", "Poutai", "Putai", "Puthai", "Puthay" ],
        "region": "TH",
        "regionname": "Thailand",
        "regions": [ "LA", "US", "VN" ],
        "script": "Thai",
        "sldr": false,
        "tag": "pht",
        "tags": [ "pht-TH", "pht-Thai" ],
        "windows": "pht-Thai"
    },

Presume this should be:

        "latnnames": [ "Phu Thai", "Phuu Thai" ],
@mcdurdin
Copy link
Author

mcdurdin commented May 7, 2024

This is caught with an updated schema:

{
  "$schema": "http://json-schema.org/schema#",
  "$id": "https://github.com/silnrsi/langtags/raw/master/source/langtags_schema.json",
  "$ref": "#/definitions/langtags",
  "definitions": {
    "langtags": {
      "type": "array",
      "items": { "oneOf": [
        {"$ref": "#/definitions/langtag"},
        {"$ref": "#/definitions/_globalvar"},
        {"$ref": "#/definitions/_phonvar"},
        {"$ref": "#/definitions/_version"},
        {"$ref": "#/definitions/_conformance"}
      ] },
      "additionalItems": false
    },

    "langtag": {
      "type": "object",
      "properties": {
        "tag": {
          "$ref": "#/definitions/bcp47"
        },
        "full": {
          "type": "string"
        },
        "tags": {
          "type": "array",
          "items": { "$ref": "#/definitions/bcp47" },
          "additionalItems": false,
          "uniqueItems": true
        },
        "variants": {
          "type": "array",
          "items": { "$ref": "#/definitions/bcp47_variant" },
          "additionalItems": false,
          "uniqueItems": true
        },
        "iso639_3": {
          "$ref": "#/definitions/iso639_3"
        },
        "region": {
          "$ref": "#/definitions/iso3166_1"
        },
        "regions": {
          "type": "array",
          "items": { "$ref": "#/definitions/iso3166_1" },
          "additionalItems": false,
          "uniqueItems": true
        },
        "regionname": {
          "type": "string"
        },
        "iana": {
          "type": "array",
          "items": { "type": "string" },
          "uniqueItems": true
        },
        "name": {
          "type": "string"
        },
        "names": {
          "type": "array",
          "items": { "type": "string" },
          "uniqueItems": true
        },
        "localname": {
          "type": "string"
        },
        "sldr": {
          "type": "boolean"
        },
        "nophonvars": {
          "type": "boolean"
        },
        "obsolete": {
          "type": "boolean"
        },
        "script": {
          "$ref": "#/definitions/iso15924"
        },
        "localnames": {
          "type": "array",
          "items": { "type": "string" },
          "uniqueItems": true
        },
        "latnnames": {
          "type": "array",
          "items": { "type": "string" },
          "uniqueItems": true
        },
        "suppress": {
          "type": "boolean"
        },
        "unwritten": {
          "type": "boolean"
        },
        "windows": {
          "$ref": "#/definitions/bcp47"
        },
        "rod": { "type": "string"},
        "macrolang": {
          "$ref": "#/definitions/bcp47"
        }
      },
      "required": ["full"],
      "additionalProperties": false
    },

    "_globalvar": {
      "type": "object",
      "properties": {
        "tag": {
          "type": "string",
          "const": "_globalvar"
        },
        "variants": {
          "type": "array",
          "items": { "$ref": "#/definitions/bcp47_variant" },
          "additionalItems": false
        }
      },
      "required": ["tag", "variants"],
      "additionalProperties": false
    },

    "_phonvar": {
      "type": "object",
      "properties": {
        "tag": {
          "type": "string",
          "const": "_phonvar"
        },
        "variants": {
          "type": "array",
          "items": { "$ref": "#/definitions/bcp47_variant" },
          "additionalItems": false
        }
      },
      "required": ["tag", "variants"],
      "additionalProperties": false
    },

    "_version": {
      "type": "object",
      "properties": {
        "tag": {
          "type": "string",
          "const": "_version"
        },
        "api": {
          "type": "string",
          "pattern": "^\\d+\\.\\d+(\\.\\d+)?$"
        },
        "date": {
          "type": "string",
          "pattern": "^\\d+-\\d+-\\d+$"
        }
      },
      "required": ["tag", "api", "date"],
      "additionalProperties": false
    },

    "_conformance": {
      "type": "object",
      "properties": {
        "tag": {
          "type": "string",
          "const": "_conformance"
        },
        "regions": {
          "type": "array",
          "items": { "$ref": "#/definitions/iso3166_1"}
        },
        "scripts": {
          "type": "array",
          "items": { "$ref": "#/definitions/iso15924"}
        }
      },
      "required": ["regions", "scripts"],
      "additionalProperties": false
    },

    "bcp47": {
      "type": "string",
      "pattern": "^(((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)|(brv-(Thai|TH)-x-(dongluang|khongchiem|sakonnakon)|cek-(Latn-)?(MM-)?x-asangkhongso|cek-(Latn-)?(MM-)?x-khawngtuu|dao-(Latn-)?(MM-)?x-khengdaai|1901|1996|dgl-(Copt-)?(SD-)?x-oldnubian|ers-(Zzzz-)?(CN-)?x-ersushaba|fia-(Copt-)?(SD-)?x-oldnubian|mnc-(Mong-)?(CN-)?x-oldmanchu|nst-(Latn-)?(MM-)?x-moshanghawa|onw-(Copt-)?(SD-)?x-oldnubian|sgn-(Zxxx-)?MY-(Zxxx-)?MM|sgn-MY-Zxxx|sgn-Zxxx-MY-mm|tew-(Latn-)?(US-)?x-santaclara|tzo-(Latn-)?(MX-)?x-sanandres|tzo-(Latn-)?(MX-)?x-zinacantan|xnz-(Copt-)?(EG-)?x-oldnubian))|((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+))$"
    },

    "bcp47_variant": {
      "description": "A single variant subtag: either a digit followed by 3-8 alphanumerics, or a letter followed by 4-8 alphanumerics. The alternation is grouped so that both anchors apply to both branches.",
      "type": "string",
      "pattern": "^([0-9][a-zA-Z0-9]{3,8}|[a-zA-Z][a-zA-Z0-9]{4,8})$"
    },

    "iso639_3": {
      "type": "string",
      "pattern": "^[a-z]{3}$"
    },

    "iso3166_1": {
      "description": "A region code: exactly two uppercase ASCII letters or exactly three digits. The alternation is grouped so that both anchors apply to both branches.",
      "type": "string",
      "pattern": "^([A-Z]{2}|\\d{3})$"
    },

    "iso15924": {
      "type": "string",
      "pattern": "^[A-Z]([a-z]{3})$"
    }
  }
}

@mcdurdin mcdurdin changed the title "Phu Thai or Phuu Thai" repeated for pht-Thai-TH bug: "Phu Thai or Phuu Thai" repeated for pht-Thai-TH May 7, 2024
ermshiperete added a commit to ermshiperete/langtags that referenced this issue May 7, 2024
This change updates the schema as suggested in silnrsi#17 to detect duplicates
in tags. It also adds a unit test to detect duplicates that appear in
multiple entries, as suggested in silnrsi#16.

This change does not update the data and thus will cause failing tests.
ermshiperete added a commit to keymanapp/keyman that referenced this issue May 7, 2024
…rsion

The `langtags.json` version downloadable from
https://github.com/silnrsi/langtags/ has several duplicated tags due to
the following bugs:
- silnrsi/langtags#15
- silnrsi/langtags#17

This change removes the duplicates from `langtags.json`, and also removes
the `tpo-Zzzz-LA` tag according to the newer `langtags.json` in the
https://github.com/silnrsi/langtags repo. See also
silnrsi/langtags#16.
ermshiperete added a commit to keymanapp/keyman that referenced this issue May 7, 2024
…rsion

The `langtags.json` version downloadable from
https://github.com/silnrsi/langtags/ has several duplicated tags due to
the following bugs:
- silnrsi/langtags#15
- silnrsi/langtags#17

This change removes the duplicates from `langtags.json`, and also removes
the `tpo-Zzzz-LA` tag according to the newer `langtags.json` in the
https://github.com/silnrsi/langtags repo. See also
silnrsi/langtags#16.
@mhosken
Copy link
Contributor

mhosken commented May 7, 2024

The latnnames field is in 1:1 correspondence with the localnames field, i.e. each entry is meant to be a transcription/transliteration of its corresponding entry in the localnames field. Thus it is valid for the latnnames list to have repeated entries. It is not valid for the localnames field to have repeated entries.

@mhosken
Copy link
Contributor

mhosken commented May 7, 2024

Given the Thai involved, the latnname is valid if unhelpful, given we have no way of distinguishing two different p letters that are in the same tone class.

@mhosken
Copy link
Contributor

mhosken commented May 7, 2024

If you want to modify the json schema accordingly (e.g. check the same number of entries in localnames as latnnames), then I'll update the schema test.

@mcdurdin
Copy link
Author

mcdurdin commented May 8, 2024

If you want to modify the json schema accordingly (e.g. check the same number of entries in localnames as latnnames), then I'll update the schema test.

I don't think it is possible in JSON Schema to compare two entries like that. But I have updated the schema to remove the latnnames uniqueness requirement:

 {
  "$schema": "http://json-schema.org/schema#",
  "$id": "https://github.com/silnrsi/langtags/raw/master/source/langtags_schema.json",
  "$ref": "#/definitions/langtags",
  "definitions": {
    "langtags": {
      "type": "array",
      "items": { "oneOf": [
        {"$ref": "#/definitions/langtag"},
        {"$ref": "#/definitions/_globalvar"},
        {"$ref": "#/definitions/_phonvar"},
        {"$ref": "#/definitions/_version"},
        {"$ref": "#/definitions/_conformance"}
      ] },
      "additionalItems": false
    },

    "langtag": {
      "type": "object",
      "properties": {
        "tag": {
          "$ref": "#/definitions/bcp47"
        },
        "full": {
          "type": "string"
        },
        "tags": {
          "type": "array",
          "items": { "$ref": "#/definitions/bcp47" },
          "additionalItems": false,
          "uniqueItems": true
        },
        "variants": {
          "type": "array",
          "items": { "$ref": "#/definitions/bcp47_variant" },
          "additionalItems": false,
          "uniqueItems": true
        },
        "iso639_3": {
          "$ref": "#/definitions/iso639_3"
        },
        "region": {
          "$ref": "#/definitions/iso3166_1"
        },
        "regions": {
          "type": "array",
          "items": { "$ref": "#/definitions/iso3166_1" },
          "additionalItems": false,
          "uniqueItems": true
        },
        "regionname": {
          "type": "string"
        },
        "iana": {
          "type": "array",
          "items": { "type": "string" },
          "uniqueItems": true
        },
        "name": {
          "type": "string"
        },
        "names": {
          "type": "array",
          "items": { "type": "string" },
          "uniqueItems": true
        },
        "localname": {
          "type": "string"
        },
        "sldr": {
          "type": "boolean"
        },
        "nophonvars": {
          "type": "boolean"
        },
        "obsolete": {
          "type": "boolean"
        },
        "script": {
          "$ref": "#/definitions/iso15924"
        },
        "localnames": {
          "type": "array",
          "items": { "type": "string" },
          "uniqueItems": true
        },
        "latnnames": {
          "type": "array",
          "items": { "type": "string" }
        },
        "suppress": {
          "type": "boolean"
        },
        "unwritten": {
          "type": "boolean"
        },
        "windows": {
          "$ref": "#/definitions/bcp47"
        },
        "rod": { "type": "string"},
        "macrolang": {
          "$ref": "#/definitions/bcp47"
        }
      },
      "required": ["full"],
      "additionalProperties": false
    },

    "_globalvar": {
      "type": "object",
      "properties": {
        "tag": {
          "type": "string",
          "const": "_globalvar"
        },
        "variants": {
          "type": "array",
          "items": { "$ref": "#/definitions/bcp47_variant" },
          "additionalItems": false
        }
      },
      "required": ["tag", "variants"],
      "additionalProperties": false
    },

    "_phonvar": {
      "type": "object",
      "properties": {
        "tag": {
          "type": "string",
          "const": "_phonvar"
        },
        "variants": {
          "type": "array",
          "items": { "$ref": "#/definitions/bcp47_variant" },
          "additionalItems": false
        }
      },
      "required": ["tag", "variants"],
      "additionalProperties": false
    },

    "_version": {
      "type": "object",
      "properties": {
        "tag": {
          "type": "string",
          "const": "_version"
        },
        "api": {
          "type": "string",
          "pattern": "^\\d+\\.\\d+(\\.\\d+)?$"
        },
        "date": {
          "type": "string",
          "pattern": "^\\d+-\\d+-\\d+$"
        }
      },
      "required": ["tag", "api", "date"],
      "additionalProperties": false
    },

    "_conformance": {
      "type": "object",
      "properties": {
        "tag": {
          "type": "string",
          "const": "_conformance"
        },
        "regions": {
          "type": "array",
          "items": { "$ref": "#/definitions/iso3166_1"}
        },
        "scripts": {
          "type": "array",
          "items": { "$ref": "#/definitions/iso15924"}
        }
      },
      "required": ["regions", "scripts"],
      "additionalProperties": false
    },

    "bcp47": {
      "type": "string",
      "pattern": "^(((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)|(brv-(Thai|TH)-x-(dongluang|khongchiem|sakonnakon)|cek-(Latn-)?(MM-)?x-asangkhongso|cek-(Latn-)?(MM-)?x-khawngtuu|dao-(Latn-)?(MM-)?x-khengdaai|1901|1996|dgl-(Copt-)?(SD-)?x-oldnubian|ers-(Zzzz-)?(CN-)?x-ersushaba|fia-(Copt-)?(SD-)?x-oldnubian|mnc-(Mong-)?(CN-)?x-oldmanchu|nst-(Latn-)?(MM-)?x-moshanghawa|onw-(Copt-)?(SD-)?x-oldnubian|sgn-(Zxxx-)?MY-(Zxxx-)?MM|sgn-MY-Zxxx|sgn-Zxxx-MY-mm|tew-(Latn-)?(US-)?x-santaclara|tzo-(Latn-)?(MX-)?x-sanandres|tzo-(Latn-)?(MX-)?x-zinacantan|xnz-(Copt-)?(EG-)?x-oldnubian))|((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+))$"
    },

    "bcp47_variant": {
      "description": "A single variant subtag: either a digit followed by 3-8 alphanumerics, or a letter followed by 4-8 alphanumerics. The alternation is grouped so that both anchors apply to both branches.",
      "type": "string",
      "pattern": "^([0-9][a-zA-Z0-9]{3,8}|[a-zA-Z][a-zA-Z0-9]{4,8})$"
    },

    "iso639_3": {
      "type": "string",
      "pattern": "^[a-z]{3}$"
    },

    "iso3166_1": {
      "description": "A region code: exactly two uppercase ASCII letters or exactly three digits. The alternation is grouped so that both anchors apply to both branches.",
      "type": "string",
      "pattern": "^([A-Z]{2}|\\d{3})$"
    },

    "iso15924": {
      "type": "string",
      "pattern": "^[A-Z]([a-z]{3})$"
    }
  }
}

ermshiperete added a commit to ermshiperete/langtags that referenced this issue May 8, 2024
latnnames don't have to be unique (see silnrsi#17).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants