From 0a85ccbe2736e124dfeeb4b007b6191a2916fd7f Mon Sep 17 00:00:00 2001
From: Conrad Nied <conrad.logos@gmail.com>
Date: Wed, 8 Jan 2025 15:25:25 -0800
Subject: [PATCH] CLDR-18155 Make likely script first
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the latest review, Mark mentioned that he liked the idea that, regardless of population, the current likely subtag should be first in this list. This change makes that happen. It mostly affects historic languages, but also a few with known overrides to their scripts.

Changed languages:
* Azerbaijani [az] gets Latin first. There are more Azerbaijani speakers in Iran (thereby using the Arabic script), but the online presence of Azeris is centered in Azerbaijan (thereby Latin script).
* Lingua Franca Nova [lfn], a constructed language, gets Latin first
* Panjabi [pa] (whose current likely subtag is the Gurmukhī script in India) no longer has Perso-Arabic first, because online usage differs from population
* Pali [pi] gets Sinhala first in the secondary tags -- in fact, though, I need to fix this with my other script changes
* Samaritan Aramaic [sam], an extinct language, gets the Samaritan alphabet first
* Old Irish [sga] gets Ogham writing first
* Umbrian [xum] swaps to get Latin first. I'm not sure this is correct, but it is the current value in Likely subtags, so I'll leave it be (for now).
---
 common/supplemental/likelySubtags.xml            |  2 +-
 common/supplemental/supplementalData.xml         | 16 ++++++++--------
 .../testData/localeIdentifiers/likelySubtags.txt |  2 +-
 .../unicode/cldr/tool/ConvertLanguageData.java   | 11 +++++++++++
 4 files changed, 21 insertions(+), 10 deletions(-)
diff --git a/common/supplemental/likelySubtags.xml b/common/supplemental/likelySubtags.xml
index 1cf6e396413..0f0ff297d8a 100644
--- a/common/supplemental/likelySubtags.xml
+++ b/common/supplemental/likelySubtags.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
 <!--
-Copyright © 1991-2024 Unicode, Inc.
+Copyright © 1991-2025 Unicode, Inc.
 For terms of use, see http://www.unicode.org/copyright.html
 SPDX-License-Identifier: Unicode-3.0
 CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
diff --git a/common/supplemental/supplementalData.xml b/common/supplemental/supplementalData.xml
index 5b44590c312..97a26d634b8 100644
--- a/common/supplemental/supplementalData.xml
+++ b/common/supplemental/supplementalData.xml
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
 <!--
-Copyright © 1991-2023 Unicode, Inc.
+Copyright © 1991-2025 Unicode, Inc.
 CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
 For terms of use, see https://www.unicode.org/copyright.html
 -->
@@ -1346,7 +1346,7 @@ XXX Code for transations where no currency is involved
 		<language type="awa" scripts="Deva"/>
 		<language type="awa" territories="IN" alt="secondary"/>
 		<language type="ay" scripts="Latn" territories="BO"/>
-		<language type="az" scripts="Arab Latn Cyrl" territories="AZ"/>
+		<language type="az" scripts="Latn Arab Cyrl" territories="AZ"/>
 		<language type="az" territories="IQ IR RU" alt="secondary"/>
 		<language type="ba" scripts="Cyrl"/>
 		<language type="ba" territories="RU" alt="secondary"/>
@@ -1861,7 +1861,7 @@ XXX Code for transations where no currency is involved
 		<language type="lep" scripts="Lepc"/>
 		<language type="lez" scripts="Cyrl"/>
 		<language type="lez" scripts="Aghb" territories="RU" alt="secondary"/>
-		<language type="lfn" scripts="Cyrl Latn" alt="secondary"/>
+		<language type="lfn" scripts="Latn Cyrl" alt="secondary"/>
 		<language type="lg" scripts="Latn"/>
 		<language type="lg" territories="UG" alt="secondary"/>
 		<language type="li" scripts="Latn"/>
@@ -2073,7 +2073,7 @@ XXX Code for transations where no currency is involved
 		<language type="osa" scripts="Latn" alt="secondary"/>
 		<language type="osc" scripts="Ital Latn" alt="secondary"/>
 		<language type="otk" scripts="Orkh"/>
-		<language type="pa" scripts="Arab Guru"/>
+		<language type="pa" scripts="Guru Arab"/>
 		<language type="pa" territories="CA GB IN PK" alt="secondary"/>
 		<language type="pag" scripts="Latn"/>
 		<language type="pag" territories="PH" alt="secondary"/>
@@ -2092,7 +2092,7 @@ XXX Code for transations where no currency is involved
 		<language type="pfl" scripts="Latn"/>
 		<language type="phn" scripts="Phnx"/>
 		<language type="pi" scripts="Mymr"/>
-		<language type="pi" scripts="Deva Sinh Thai" alt="secondary"/>
+		<language type="pi" scripts="Sinh Deva Thai" alt="secondary"/>
 		<language type="pis" scripts="Latn"/>
 		<language type="pis" territories="SB" alt="secondary"/>
 		<language type="pko" scripts="Latn"/>
@@ -2165,7 +2165,7 @@ XXX Code for transations where no currency is involved
 		<language type="saf" scripts="Latn"/>
 		<language type="sah" scripts="Cyrl"/>
 		<language type="sah" territories="RU" alt="secondary"/>
-		<language type="sam" scripts="Hebr Samr" alt="secondary"/>
+		<language type="sam" scripts="Samr Hebr" alt="secondary"/>
 		<language type="saq" scripts="Latn"/>
 		<language type="sas" scripts="Latn"/>
 		<language type="sas" territories="ID" alt="secondary"/>
@@ -2198,7 +2198,7 @@ XXX Code for transations where no currency is involved
 		<language type="sel" scripts="Cyrl"/>
 		<language type="ses" scripts="Latn"/>
 		<language type="sg" scripts="Latn" territories="CF"/>
-		<language type="sga" scripts="Latn Ogam" alt="secondary"/>
+		<language type="sga" scripts="Ogam Latn" alt="secondary"/>
 		<language type="sgs" scripts="Latn"/>
 		<language type="shi" scripts="Tfng Latn Arab"/>
 		<language type="shi" territories="MA" alt="secondary"/>
@@ -2425,7 +2425,7 @@ XXX Code for transations where no currency is involved
 		<language type="xpr" scripts="Prti"/>
 		<language type="xsa" scripts="Sarb"/>
 		<language type="xsr" scripts="Deva"/>
-		<language type="xum" scripts="Ital Latn" alt="secondary"/>
+		<language type="xum" scripts="Latn Ital" alt="secondary"/>
 		<language type="yao" scripts="Latn"/>
 		<language type="yap" scripts="Latn"/>
 		<language type="yav" scripts="Latn"/>
diff --git a/common/testData/localeIdentifiers/likelySubtags.txt b/common/testData/localeIdentifiers/likelySubtags.txt
index 3b1261d5247..cd9cfed8fc2 100644
--- a/common/testData/localeIdentifiers/likelySubtags.txt
+++ b/common/testData/localeIdentifiers/likelySubtags.txt
@@ -1,5 +1,5 @@
 # Test data for Likely Subtags
-#  Copyright © 1991-2024 Unicode, Inc.
+#  Copyright © 1991-2025 Unicode, Inc.
 #  For terms of use, see http://www.unicode.org/copyright.html
 #  SPDX-License-Identifier: Unicode-3.0
 #  CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertLanguageData.java b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertLanguageData.java
index b9cee165b2c..dfdc42a6b96 100644
--- a/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertLanguageData.java
+++ b/tools/cldr-code/src/main/java/org/unicode/cldr/tool/ConvertLanguageData.java
@@ -404,6 +404,7 @@ private static void writeNewBasicData2(PrintWriter out, Set<RowData> sortedInput
                 Set<String> territories =
                         status_territories == null ? null : status_territories.getAll(status);
                 Map<String, Integer> scriptsByPopulationAtThisLevel = new TreeMap<>();
+                String likelyScript = supplementalData.getDefaultScript(languageSubtag);
                 if (status_scripts != null) {
                     Set<String> scriptsAtThisLevel = status_scripts.getAll(status);
                     if (scriptsAtThisLevel != null) {
@@ -414,6 +415,16 @@ private static void writeNewBasicData2(PrintWriter out, Set<RowData> sortedInput
                                 population = scriptsByPopulationAnyLevel.get(script);
                             }
                             scriptsByPopulationAtThisLevel.put(script, population);
+
+                            // Artificially add 1 billion population to the current likely subtag.
+                            // This overrides the order for a few languages where there is a good
+                            // reason for the likely subtag to not match the population. For
+                            // instance, Azerbaijani's online presence is focused in Latin. This
+                            // also orders the scripts when we don't have population data but have a
+                            // distinct likely subtag.
+                            if (script.equals(likelyScript)) {
+                                scriptsByPopulationAtThisLevel.put(script, 1000000000);
+                            }
                         }
                     }
                 }