From b08229f50b1bd5051f2ff0222e3e3746197fd02f Mon Sep 17 00:00:00 2001 From: Craig Cornelius Date: Mon, 15 Jul 2024 16:04:47 -0700 Subject: [PATCH] Add hash for test data (#252) * Adding ICU4C running collation tests - first try * Cache ICU4C binaries in GH and locally, only if they don't exist * Install JSON-C dependency if not installed at beginning of CI or e2e script * Fix bash if condition syntax * Hash code for tests and updating schema * Adding hex hash to all tests data * updating hexhash generation for tests * Add common hash function for JavaScript * Clean up some unneeded warning/error messges * Revise white space * Update testgen/generators/generate_test_hash.js Co-authored-by: Shane F. Carr --------- Co-authored-by: Elango Cheran Co-authored-by: Shane F. Carr --- schema/check_generated_data.py | 7 +- schema/collation_short/test_schema.json | 4 + schema/datetime_fmt/test_schema.json | 4 + .../result_schema.json | 0 .../test_schema.json | 4 + .../verify_schema.json | 0 schema/likely_subtags/test_schema.json | 4 + schema/list_fmt/test_schema.json | 4 + schema/message_fmt2/test_schema.json | 4 + schema/number_format/test_schema.json | 4 + schema/rdt_fmt/test_schema.json | 118 ++++++++++++++++++ schema/schema_files.py | 68 ++++++---- schema/schema_validator.py | 5 +- testgen/generators/base.py | 61 +++++++++ testgen/generators/datetime_gen.js | 23 ++-- testgen/generators/generate_test_hash.js | 34 +++++ testgen/generators/lang_names.py | 2 + testgen/generators/list_fmt_gen.js | 10 +- testgen/generators/rdt_fmt_gen.js | 18 ++- 19 files changed, 329 insertions(+), 45 deletions(-) rename schema/{language_names => lang_names}/result_schema.json (100%) rename schema/{language_names => lang_names}/test_schema.json (92%) rename schema/{language_names => lang_names}/verify_schema.json (100%) create mode 100644 schema/rdt_fmt/test_schema.json create mode 100644 testgen/generators/generate_test_hash.js diff --git a/schema/check_generated_data.py b/schema/check_generated_data.py index 9bea6be7..f2a20e27 100644 --- a/schema/check_generated_data.py +++ b/schema/check_generated_data.py @@ -74,7 +74,7 @@ def main(args): # Create .json summary_json = { 'validation_type': 'Generated test data files', - 'description': 'Results of validating generated test data agains schema', + 'description': 'Results of validating generated test data against schema', 'when_processed': datetime.now().strftime('%Y-%m-%d T%H%M%S.%f'), 'validations': { 'failed': failed_validations, @@ -82,7 +82,10 @@ def main(args): } } - summary_data = json.dumps(summary_json) + try: + summary_data = json.dumps(summary_json) + except BaseException as error: + logging.error('json.dumps Summary data problem: %s, ') try: output_filename = os.path.join(test_data_path, 'test_data_validation_summary.json') diff --git a/schema/collation_short/test_schema.json b/schema/collation_short/test_schema.json index cb2bd7c5..b6344bae 100644 --- a/schema/collation_short/test_schema.json +++ b/schema/collation_short/test_schema.json @@ -29,6 +29,10 @@ "description": "A numeric ID, unique for the set of tests", "type": "string" }, + "hexhash": { + "description": "A hexadecimal hash code for the test without the label", + "type": "string" + }, "s1": { "description": "First string for comparison", "type": "string" diff --git a/schema/datetime_fmt/test_schema.json b/schema/datetime_fmt/test_schema.json index 87968e41..67cb5159 100644 --- a/schema/datetime_fmt/test_schema.json +++ b/schema/datetime_fmt/test_schema.json @@ -20,6 +20,10 @@ "description": "A numeric ID, unique for the set of tests", "type": "string" }, + "hexhash": { + "description": "A hexadecimal hash code for the test without the label", + "type": "string" + }, "locale": { "description": "language tag for formatting the output", "type": "string" diff --git a/schema/language_names/result_schema.json b/schema/lang_names/result_schema.json similarity index 100% rename from schema/language_names/result_schema.json rename to schema/lang_names/result_schema.json diff --git a/schema/language_names/test_schema.json b/schema/lang_names/test_schema.json similarity index 92% rename from schema/language_names/test_schema.json rename to schema/lang_names/test_schema.json index b5b23ff0..b0716b28 100644 --- a/schema/language_names/test_schema.json +++ b/schema/lang_names/test_schema.json @@ -41,6 +41,10 @@ "description": "A numeric ID, unique for the set of tests", "type": "string" }, + "hexhash": { + "description": "A hexadecimal hash code for the test without the label", + "type": "string" + }, "language_label": { "description": "locale tag of the source language ", "type": "string" diff --git a/schema/language_names/verify_schema.json b/schema/lang_names/verify_schema.json similarity index 100% rename from schema/language_names/verify_schema.json rename to schema/lang_names/verify_schema.json diff --git a/schema/likely_subtags/test_schema.json b/schema/likely_subtags/test_schema.json index 556d5c56..b5083173 100644 --- a/schema/likely_subtags/test_schema.json +++ b/schema/likely_subtags/test_schema.json @@ -75,6 +75,10 @@ "description": "A numeric ID, unique for the set of tests", "type": "string" }, + "hexhash": { + "description": "A hexadecimal hash code for the test without the label", + "type": "string" + }, "locale": { "description": "locale tag source ", "type": "string" diff --git a/schema/list_fmt/test_schema.json b/schema/list_fmt/test_schema.json index 1e4b5743..fd9b9012 100644 --- a/schema/list_fmt/test_schema.json +++ b/schema/list_fmt/test_schema.json @@ -74,6 +74,10 @@ "description": "A numeric ID, unique for the set of tests", "type": "string" }, + "hexhash": { + "description": "A hexadecimal hash code for the test without the label", + "type": "string" + }, "locale": { "description": "locale tag source ", "type": "string" diff --git a/schema/message_fmt2/test_schema.json b/schema/message_fmt2/test_schema.json index 3bc9e7cc..92208178 100644 --- a/schema/message_fmt2/test_schema.json +++ b/schema/message_fmt2/test_schema.json @@ -27,6 +27,10 @@ "type": "string", "description": "Identifier for the test." }, + "hexhash": { + "description": "A hexadecimal hash code for the test without the label", + "type": "string" + }, "test_description": { "type": "string", "description": "Information about the test." diff --git a/schema/number_format/test_schema.json b/schema/number_format/test_schema.json index ddc8a161..2a2a1288 100644 --- a/schema/number_format/test_schema.json +++ b/schema/number_format/test_schema.json @@ -42,6 +42,10 @@ "description": "A numeric ID, unique for the set of tests", "type": "string" }, + "hexhash": { + "description": "A hexadecimal hash code for the test without the label", + "type": "string" + }, "locale": { "description": "language tag for formatting the output", "type": "string" diff --git a/schema/rdt_fmt/test_schema.json b/schema/rdt_fmt/test_schema.json new file mode 100644 index 00000000..35a1b5ef --- /dev/null +++ b/schema/rdt_fmt/test_schema.json @@ -0,0 +1,118 @@ +{"$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://github.com/unicode/conformance/_test_schema.json'", + "title": "ICU Conformance test data description", + "description": "This documents the format of relative date time format test data for execution", + "type": "object", + "properties": { + "additionalProperties": false, + "test_type": { + "description": "The name of the test", + "type": "string", + "enum": ["rdt_fmt"] + }, + "Test scenario": { + "type": "string", + "description":" Obsolete tag to be removed and replaced with test Type" + }, + "source_file": { + "description": "origin of conformance data", + "type": "string" + }, + "source_version": { + "description": "version of conformance data", + "type": "string" + }, + "test_ environmment": { + "test_language": { + "description": "Programming language of execution", + "type": "string" + }, + "executor": { + "description": "path to the executor for this output", + "type": "string" + }, + "test_type": { + "description": "the kind of tests run", + "type": "string" + }, + "date_time": { + "description": "timestamp of output", + "type": "string" + }, + "input_file": { + "description": "path to test data", + "type": "string" + }, + "icu_version": { + "description": "ICU data version used in testing", + "type": "string" + }, + "cldr_version": { + "description": "CLDR version used in testing", + "type": "string" + }, + "test_count": { + "description": "Number of tests run", + "type": "integer" + } + }, + "error_info": { + "description": "count of detected errors", + "type": "integer" + }, + "platform": { + "description": "the executor's platorm", + "type": "integer" + }, + "tests": { + "description": "list of N tests for relative date time format testing", + "type": "array", + "items": { + "additionalProperties": false, + "type": "object", + "properties": { + "label": { + "description": "A numeric ID, unique for the set of tests", + "type": "string" + }, + "hexhash": { + "description": "A hexadecimal hash code for the test without the label", + "type": "string" + }, + "unit": { + "description": "time interval, e.g., year, month, day, hour, etc. ", + "type": "string" + }, + "unit": { + "count": "how may of the units in past or future", + "type": "string" + }, + "options": { + "type": "string", + "description": "Optional parameters for the output", + "type": "object", + "properties": { + "style": { + "type": "string", + "description": "size of formated output, e.g., long" + }, + "numberingSystem": { + "type": "string", + "description": "4 letter script code of numbering system" + } + } + + } + } + }, + "required": [ + "label", + "locale", + "count", + "unit" + ] + } + }, + "required": ["test_type"] +} + diff --git a/schema/schema_files.py b/schema/schema_files.py index 0be27d13..cb088e3d 100644 --- a/schema/schema_files.py +++ b/schema/schema_files.py @@ -2,21 +2,26 @@ ALL_TEST_TYPES = ['collation_short', 'datetime_fmt', + 'lang_names', + 'likely_subtags', + 'lang_names', 'list_fmt', 'message_fmt2', 'number_format', - 'language_names', - 'likely_subtags' + 'plural_rules', + 'rdt_fmt' ] TEST_FILE_TO_TEST_TYPE_MAP = { 'collation_test': 'collation_short', 'datetime_fmt_test_file': 'datetime_fmt', - 'lang_name_test_file': 'language_names', + 'lang_names_test_file': 'lang_names', 'likely_subtags_test': 'likely_subtags', + 'list_fmt_test_file': 'list_fmt', 'message_fmt2_test_file': 'message_fmt2', 'num_fmt_test_file': 'number_fmt', - 'list_fmt_test_file': 'list_fmt' + 'plural_rules_test_file': 'plural_rules', + 'rdt_fmt_test_file': 'rdt' } SCHEMA_FILE_MAP = { @@ -83,32 +88,17 @@ } }, - "language_names": { - "test_data": { - "schema_file": "language_names/test_schema.json", - 'prod_file': 'lang_name_test_file.json' - }, - "verify_data": { - "schema_file": "language_names/verify_schema.json", - 'prod_file': 'pass.json' - }, - "result_data": { - "schema_file": "language_names/result_schema.json", - "prod_file": "lang_name_test_file.json" - } - }, - "lang_names": { "test_data": { - "schema_file": "language_names/test_schema.json", + "schema_file": "lang_names/test_schema.json", 'prod_file': 'lang_name_test_file.json' }, "verify_data": { - "schema_file": "language_names/verify_schema.json", + "schema_file": "lang_names/verify_schema.json", 'prod_file': 'pass.json' }, "result_data": { - "schema_file": "language_names/result_schema.json", + "schema_file": "lang_names/result_schema.json", "prod_file": "lang_name_test_file.json" } }, @@ -139,8 +129,20 @@ "schema_file": "list_fmt/result_schema.json", "prod_file": "list_fmt_test.json" } - - # Additional tests + }, + "plural_rules" : { + "test_data": { + "schema_file": "plural_rules/test_schema.json", + 'prod_file': 'plural_rules.json' + }, + "verify_data": { + "schema_file": "plural_rules/verify_schema.json", + 'prod_file': 'plural_rules.json' + }, + "result_data": { + "schema_file": "plural_rules/result_schema.json", + "prod_file": "plural_rules.json" + } }, "message_fmt2": { "test_data": { @@ -155,5 +157,21 @@ "schema_file": "message_fmt2/result_schema.json", 'prod_file': 'message_fmt2_result.json' } - } + }, + "rdt_fmt": { + "test_data": { + "schema_file": "rdt_fmt/test_schema.json", + 'prod_file': 'rdt_fmt.json' + }, + "verify_data": { + "schema_file": "rdt_fmt/verify_schema.json", + 'prod_file': 'rdt_fmt.json' + }, + "result_data": { + "schema_file": "plural_rules/result_schema.json", + "prod_file": "rdt_fmt.json" + } + }, + # Additional tests + } diff --git a/schema/schema_validator.py b/schema/schema_validator.py index b76ebd36..051a2d1b 100644 --- a/schema/schema_validator.py +++ b/schema/schema_validator.py @@ -133,7 +133,9 @@ def validate_test_data_with_schema(self): file_path_pair = self.get_schema_data_info(icu_version, test_type) if file_path_pair: schema_test_info.append(file_path_pair) - + else: + # logging.info('No data test file %s for %s, %s', file_path_pair, test_type, icu_version) + pass results = self.parallel_check_test_data_schema(schema_test_info) for result_data in results: @@ -176,6 +178,7 @@ def get_schema_data_info(self, icu_version, test_type): 'test_result_file': test_file_name } else: + # logging.warning('## get_schema_data_info. No file at test_file_name: %s', test_file_name); return None def check_test_data_against_schema(self, schema_info): diff --git a/testgen/generators/base.py b/testgen/generators/base.py index 44c30900..99d8a853 100644 --- a/testgen/generators/base.py +++ b/testgen/generators/base.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- from abc import ABC, abstractmethod +import copy +import hashlib import json import logging import logging.config @@ -8,6 +10,24 @@ import requests +def remove_none(obj): + # Recursively removes any parts with None as value + if isinstance(obj, str): + return obj + result = copy.copy(obj) + if hasattr(obj, "items"): + for (key, value) in obj.items(): + if value is None: + del result[key] + else: + result[key] = remove_none(value) + elif hasattr(obj, "__iter__"): + if len(obj) == 1 and obj[0] == obj: + return result + for (i, value) in enumerate(obj): + result[i] = remove_none(value) + return result + class DataGenerator(ABC): def __init__(self, icu_version, run_limit=None): self.icu_version = icu_version @@ -20,7 +40,48 @@ def __init__(self, icu_version, run_limit=None): def process_test_data(self): pass + + def generateTestHashValues(self, testdata): + # For each test item, copy it. Omit 'label' from that copy. + # Create the string representation of that copy with json.dumps() + # Then make a hex hash value for that string. + # Add it to that item. + + try: + all_tests = testdata['tests'] + except BaseException as error: + logging.error('# generateTestHashValues: %s does not have "tests": %s', + error, testdata.keys()) + return None + + for test in all_tests: + try: + test_no_label = test.copy() + except BaseException as error: + logging.error('error: %s, Item with no label found here: %s, %s' , + error, testdata['test_type'], test) + continue + del test_no_label['label'] + + # Make it compact and consistent + test_no_nones = remove_none(test_no_label) + test_no_label_string = json.dumps(test_no_nones, separators=(',', ':'), sort_keys=True) + + # Create the 32 byte hasn, consisten with Javascript + hasher = hashlib.sha1() + hasher.update(test_no_label_string.encode("utf-8")) + hex_digest = hasher.hexdigest() + test['hexhash'] = hex_digest + + return True # Indicates OK + def saveJsonFile(self, filename, data, indent=None): + if 'tests' in data: + hash_ok = self.generateTestHashValues(data) + if not hash_ok: + logging.error('### Problems generating hash codes for file %s', + filename) + output_path = os.path.join(self.icu_version, filename) output_file = open(output_path, "w", encoding="UTF-8") json.dump(data, output_file, indent=indent) diff --git a/testgen/generators/datetime_gen.js b/testgen/generators/datetime_gen.js index ca3ef472..625de31c 100644 --- a/testgen/generators/datetime_gen.js +++ b/testgen/generators/datetime_gen.js @@ -13,6 +13,8 @@ // Set up Node version to generate data specific to ICU/CLDR version // e.g., `nvm install 21.6.0;nvm use 21.6.0` (ICU 74) +const gen_hash = require("./generate_test_hash.js"); + require("temporal-polyfill/global"); const fs = require('node:fs'); @@ -333,7 +335,7 @@ function generateAll(run_limit) { // If not, skip the test. if (!supported_calendars || !supported_calendars.includes(calendar)) { console. - continue; + continue; } } catch(error) { console.log('Supported calendars for %sError: %s', @@ -415,7 +417,7 @@ function generateAll(run_limit) { formatter = new Intl.DateTimeFormat(locale, all_options); } catch (error) { console.error(error, ' with locale ', - locale, ' and options: ', all_options); + locale, ' and options: ', all_options); continue; } @@ -491,8 +493,6 @@ function generateAll(run_limit) { } catch(error) { // This item isn't in the output. Just return the entire string. result = parts.map((x) => x.value).join(""); - // console.error('BAD PARTS?: ', JSON.stringify(parts)); - // console.error(' result: ', JSON.stringify(result)); } if (!result || debug) { console.log('OK! key = %s, %s', @@ -529,9 +529,9 @@ function generateAll(run_limit) { const label_string = String(label_num); - let test_case = {'label': label_string, - 'input_string': input_string - }; + let test_case = { + 'input_string': input_string + }; if (skeleton) { test_case['skeleton'] = skeleton; @@ -552,6 +552,11 @@ function generateAll(run_limit) { if (!result || debug) { console.debug("TEST CASE :", test_case); } + + gen_hash.generate_hash_for_test(test_case); + + test_case['label'] = label_string; + test_cases.push(test_case); // Generate what we get. @@ -566,7 +571,7 @@ function generateAll(run_limit) { } } catch (error) { console.error('!!! error ', error, ' in label ', label_num, - ' for date = ', d); + ' for date = ', d); } } // !!! @@ -578,7 +583,7 @@ function generateAll(run_limit) { test_obj['tests'] = sample_tests(test_cases, run_limit); try { - fs.writeFileSync('datetime_fmt_test.json', JSON.stringify(test_obj, null, 2)); + fs.writeFileSync('datetime_fmt_test.json', JSON.stringify(test_obj, null)); // file written successfully } catch (err) { console.error(err); diff --git a/testgen/generators/generate_test_hash.js b/testgen/generators/generate_test_hash.js new file mode 100644 index 00000000..933c00db --- /dev/null +++ b/testgen/generators/generate_test_hash.js @@ -0,0 +1,34 @@ +// Adds a hex hash code based on the test without the label item. + +const crypto = require('crypto'); + +function remove_none(obj) { + // Recursively removes all null items + if (typeof obj !== "object") { + return obj; + } + const result = new obj.constructor; + const entries = Object.entries(obj); + entries.sort(); + for (const [key, value] of entries) { + if (value !== null) { + result[key] = remove_none(value); + } + } + return result; +} + +function generate_hash_for_test(test_case) { + // Computes a 32 byte hex hash code for the test case + // Note that the test case should not include 'label'. + + obj = remove_none(test_case) + json_str = JSON.stringify(obj); + + const hasher = crypto.createHash("sha1"); + hasher.update(json_str, "utf-8"); + + test_case['hexhash'] = hasher.digest("hex"); +} + +module.exports = {generate_hash_for_test}; diff --git a/testgen/generators/lang_names.py b/testgen/generators/lang_names.py index 5abbf83c..ce23b44b 100644 --- a/testgen/generators/lang_names.py +++ b/testgen/generators/lang_names.py @@ -12,6 +12,7 @@ class LangNamesGenerator(DataGenerator): json_test = {"test_type": "lang_names"} json_verify = {"test_type": "lang_names"} + def process_test_data(self): self.languageNameDescr() filename = "languageNameTable.txt" @@ -22,6 +23,7 @@ def process_test_data(self): # TODO: add standard vs. dialect vs. alternate names self.generateLanguageNameTestDataObjects(rawlangnametestdata) + self.generateTestHashValues(self.json_test) output_path = os.path.join(self.icu_version, "lang_name_test_file.json") lang_name_test_file = open(output_path, "w", encoding="UTF-8") json.dump(self.json_test, lang_name_test_file, indent=1) diff --git a/testgen/generators/list_fmt_gen.js b/testgen/generators/list_fmt_gen.js index c0ad841e..b278bca9 100644 --- a/testgen/generators/list_fmt_gen.js +++ b/testgen/generators/list_fmt_gen.js @@ -8,6 +8,8 @@ // Set up Node version to generate data specific to ICU/CLDR version // e.g., `nvm install 21.6.0;nvm use 21.6.0` (ICU 74) +const gen_hash = require("./generate_test_hash.js"); + const fs = require('node:fs'); let debug = false; @@ -119,10 +121,12 @@ function generateAll() { // TODO: Save this as a test case. let test_list; - let test_case = {'label': label_string, + let test_case = { 'input_list': list, 'options': {...all_options} }; + gen_hash.generate_hash_for_test(test_case); + test_case['label'] = label_string; if (locale != '') { test_case["locale"] = locale; @@ -156,7 +160,7 @@ function generateAll() { test_obj['tests'] = test_cases; try { - fs.writeFileSync('list_fmt_test.json', JSON.stringify(test_obj, null, 2)); + fs.writeFileSync('list_fmt_test.json', JSON.stringify(test_obj, null)); // file written successfully } catch (err) { console.error(err); @@ -164,7 +168,7 @@ function generateAll() { verify_obj['verifications'] = verify_cases; try { - fs.writeFileSync('list_fmt_verify.json', JSON.stringify(verify_obj, null, 2)); + fs.writeFileSync('list_fmt_verify.json', JSON.stringify(verify_obj, null)); // file written successfully } catch (err) { console.error(err); diff --git a/testgen/generators/rdt_fmt_gen.js b/testgen/generators/rdt_fmt_gen.js index 9bc58c9e..5dd9af28 100644 --- a/testgen/generators/rdt_fmt_gen.js +++ b/testgen/generators/rdt_fmt_gen.js @@ -11,6 +11,8 @@ // Set up Node version to generate data specific to ICU/CLDR version // e.g., `nvm install 21.6.0;nvm use 21.6.0` (ICU 74) +const gen_hash = require("./generate_test_hash.js"); + const fs = require('node:fs'); const debug = false; @@ -116,10 +118,12 @@ function generateAll() { } const label_string = String(label_num); - let test_case = {'label': label_string, - 'unit': unit, - 'count': String(count), - }; + + // Without label + let test_case = { + 'unit': unit, + 'count': String(count), + }; if (locale != '') { test_case["locale"] = locale; @@ -132,6 +136,10 @@ function generateAll() { if (debug) { console.log("TEST CASE :", test_case); } + + gen_hash.generate_hash_for_test(test_case); + test_case['label'] = label_string; + test_cases.push(test_case); // Generate what we get. @@ -158,7 +166,7 @@ function generateAll() { test_obj['tests'] = test_cases; try { - fs.writeFileSync('rdt_fmt_test.json', JSON.stringify(test_obj, null, 2)); + fs.writeFileSync('rdt_fmt_test.json', JSON.stringify(test_obj, null)); // file written successfully } catch (err) { console.error(err);