From 0a85ccbe2736e124dfeeb4b007b6191a2916fd7f Mon Sep 17 00:00:00 2001 From: Conrad Nied Date: Wed, 8 Jan 2025 15:25:25 -0800 Subject: [PATCH] CLDR-18155 Make likely script first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the latest review Mark brought up the idea that, regardless of population, the current likely subtag should be first in this list. This change makes that happen. This mostly affects historic languages but also a few with known overrides to their scripts. Changed languages: * Azerbaijani [az] gets Latin first. There are more Azerbaijani speakers in Iran (thereby using the Arabic script) but the online presence of Azeris is centered in Azerbaijan (thereby Latin script) * Lingua Franca Nova [lfn], a constructed language, gets Latin first * Panjabi [pa] (whose current likely subtag is the Gurmukhī script in India) no longer has Perso-Arabic first, because online usage differs from population * Pali [pi] gets Sinhala first in the secondary tags -- in fact though I need to fix this with my other script changes * Samaritan Aramaic [sam], an extinct language, gets the Samaritan alphabet first * Old Irish [sga] gets Ogham writing first * Umbrian [xum] swaps to get Latin first; I'm not sure this is correct, but it is the current value in Likely subtags so I'll leave it be (for now). 
--- common/supplemental/likelySubtags.xml | 2 +- common/supplemental/supplementalData.xml | 16 ++++++++-------- .../testData/localeIdentifiers/likelySubtags.txt | 2 +- .../unicode/cldr/tool/ConvertLanguageData.java | 11 +++++++++++ 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/common/supplemental/likelySubtags.xml b/common/supplemental/likelySubtags.xml index 1cf6e396413..0f0ff297d8a 100644 --- a/common/supplemental/likelySubtags.xml +++ b/common/supplemental/likelySubtags.xml @@ -1,7 +1,7 @@ @@ -1346,7 +1346,7 @@ XXX Code for transations where no currency is involved - + @@ -1861,7 +1861,7 @@ XXX Code for transations where no currency is involved - + @@ -2073,7 +2073,7 @@ XXX Code for transations where no currency is involved - + @@ -2092,7 +2092,7 @@ XXX Code for transations where no currency is involved - + @@ -2165,7 +2165,7 @@ XXX Code for transations where no currency is involved - + @@ -2198,7 +2198,7 @@ XXX Code for transations where no currency is involved - + @@ -2425,7 +2425,7 @@ XXX Code for transations where no currency is involved - + diff --git a/common/testData/localeIdentifiers/likelySubtags.txt b/common/testData/localeIdentifiers/likelySubtags.txt index 3b1261d5247..cd9cfed8fc2 100644 --- a/common/testData/localeIdentifiers/likelySubtags.txt +++ b/common/testData/localeIdentifiers/likelySubtags.txt @@ -1,5 +1,5 @@ # Test data for Likely Subtags -# Copyright © 1991-2024 Unicode, Inc. +# Copyright © 1991-2025 Unicode, Inc. 
# For terms of use, see http://www.unicode.org/copyright.html # SPDX-License-Identifier: Unicode-3.0 # CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/) diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertLanguageData.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertLanguageData.java index b9cee165b2c..dfdc42a6b96 100644 --- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertLanguageData.java +++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertLanguageData.java @@ -404,6 +404,7 @@ private static void writeNewBasicData2(PrintWriter out, Set sortedInput Set territories = status_territories == null ? null : status_territories.getAll(status); Map scriptsByPopulationAtThisLevel = new TreeMap<>(); + String likelyScript = supplementalData.getDefaultScript(languageSubtag); if (status_scripts != null) { Set scriptsAtThisLevel = status_scripts.getAll(status); if (scriptsAtThisLevel != null) { @@ -414,6 +415,16 @@ private static void writeNewBasicData2(PrintWriter out, Set sortedInput population = scriptsByPopulationAnyLevel.get(script); } scriptsByPopulationAtThisLevel.put(script, population); + + // Artificially add 1 billion population to the current likely subtag. + // This overrides the order for a few languages where there is a good + // reason for the likely subtag to not match the population. For + // instance, Azerbaijani's online presence is focused in Latin. This + // also orders the scripts when we don't have population data but have a + // distinct likely subtag. + if (script.equals(likelyScript)) { + scriptsByPopulationAtThisLevel.put(script, 1000000000); + } } } }