Skip to content

Commit

Permalink
CLDR-18155 Make likely script first
Browse files Browse the repository at this point in the history
In the latest review, Mark mentioned that he liked the idea that, regardless of population, the current likely subtag script should be first in this list. This change makes that happen. It mostly affects historic languages, but also a few languages with known overrides to their scripts.

Changed languages:
* Azerbaijani [az] gets Latin first. There are more Azerbaijani speakers in Iran (thereby using the Arabic script), but the online presence of Azeris is centered in Azerbaijan (thereby Latin script).
* Lingua Franca Nova [lfn], a constructed language, gets Latin first
* Panjabi [pa] (whose current likely subtag is the Gurmukhī script in India) no longer has Perso-Arabic first, because online usage differs from the population figures
* Pali [pi] gets Sinhala first in the secondary tags -- though in fact I need to fix this along with my other script changes
* Samaritan Aramaic [sam], an extinct language, gets the Samaritan alphabet first
* Old Irish [sga] gets Ogham writing first
* Umbrian [xum] swaps to get Latin first. I'm not sure this is correct, but it is the current value in Likely subtags, so I'll leave it be (for now).
  • Loading branch information
conradarcturus committed Jan 14, 2025
1 parent addae40 commit 0a85ccb
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 10 deletions.
2 changes: 1 addition & 1 deletion common/supplemental/likelySubtags.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
<!--
Copyright © 1991-2024 Unicode, Inc.
Copyright © 1991-2025 Unicode, Inc.
For terms of use, see http://www.unicode.org/copyright.html
SPDX-License-Identifier: Unicode-3.0
CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
Expand Down
16 changes: 8 additions & 8 deletions common/supplemental/supplementalData.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
<!--
Copyright © 1991-2023 Unicode, Inc.
Copyright © 1991-2025 Unicode, Inc.
CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
For terms of use, see https://www.unicode.org/copyright.html
-->
Expand Down Expand Up @@ -1346,7 +1346,7 @@ XXX Code for transations where no currency is involved
<language type="awa" scripts="Deva"/>
<language type="awa" territories="IN" alt="secondary"/>
<language type="ay" scripts="Latn" territories="BO"/>
<language type="az" scripts="Arab Latn Cyrl" territories="AZ"/>
<language type="az" scripts="Latn Arab Cyrl" territories="AZ"/>
<language type="az" territories="IQ IR RU" alt="secondary"/>
<language type="ba" scripts="Cyrl"/>
<language type="ba" territories="RU" alt="secondary"/>
Expand Down Expand Up @@ -1861,7 +1861,7 @@ XXX Code for transations where no currency is involved
<language type="lep" scripts="Lepc"/>
<language type="lez" scripts="Cyrl"/>
<language type="lez" scripts="Aghb" territories="RU" alt="secondary"/>
<language type="lfn" scripts="Cyrl Latn" alt="secondary"/>
<language type="lfn" scripts="Latn Cyrl" alt="secondary"/>
<language type="lg" scripts="Latn"/>
<language type="lg" territories="UG" alt="secondary"/>
<language type="li" scripts="Latn"/>
Expand Down Expand Up @@ -2073,7 +2073,7 @@ XXX Code for transations where no currency is involved
<language type="osa" scripts="Latn" alt="secondary"/>
<language type="osc" scripts="Ital Latn" alt="secondary"/>
<language type="otk" scripts="Orkh"/>
<language type="pa" scripts="Arab Guru"/>
<language type="pa" scripts="Guru Arab"/>
<language type="pa" territories="CA GB IN PK" alt="secondary"/>
<language type="pag" scripts="Latn"/>
<language type="pag" territories="PH" alt="secondary"/>
Expand All @@ -2092,7 +2092,7 @@ XXX Code for transations where no currency is involved
<language type="pfl" scripts="Latn"/>
<language type="phn" scripts="Phnx"/>
<language type="pi" scripts="Mymr"/>
<language type="pi" scripts="Deva Sinh Thai" alt="secondary"/>
<language type="pi" scripts="Sinh Deva Thai" alt="secondary"/>
<language type="pis" scripts="Latn"/>
<language type="pis" territories="SB" alt="secondary"/>
<language type="pko" scripts="Latn"/>
Expand Down Expand Up @@ -2165,7 +2165,7 @@ XXX Code for transations where no currency is involved
<language type="saf" scripts="Latn"/>
<language type="sah" scripts="Cyrl"/>
<language type="sah" territories="RU" alt="secondary"/>
<language type="sam" scripts="Hebr Samr" alt="secondary"/>
<language type="sam" scripts="Samr Hebr" alt="secondary"/>
<language type="saq" scripts="Latn"/>
<language type="sas" scripts="Latn"/>
<language type="sas" territories="ID" alt="secondary"/>
Expand Down Expand Up @@ -2198,7 +2198,7 @@ XXX Code for transations where no currency is involved
<language type="sel" scripts="Cyrl"/>
<language type="ses" scripts="Latn"/>
<language type="sg" scripts="Latn" territories="CF"/>
<language type="sga" scripts="Latn Ogam" alt="secondary"/>
<language type="sga" scripts="Ogam Latn" alt="secondary"/>
<language type="sgs" scripts="Latn"/>
<language type="shi" scripts="Tfng Latn Arab"/>
<language type="shi" territories="MA" alt="secondary"/>
Expand Down Expand Up @@ -2425,7 +2425,7 @@ XXX Code for transations where no currency is involved
<language type="xpr" scripts="Prti"/>
<language type="xsa" scripts="Sarb"/>
<language type="xsr" scripts="Deva"/>
<language type="xum" scripts="Ital Latn" alt="secondary"/>
<language type="xum" scripts="Latn Ital" alt="secondary"/>
<language type="yao" scripts="Latn"/>
<language type="yap" scripts="Latn"/>
<language type="yav" scripts="Latn"/>
Expand Down
2 changes: 1 addition & 1 deletion common/testData/localeIdentifiers/likelySubtags.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Test data for Likely Subtags
# Copyright © 1991-2024 Unicode, Inc.
# Copyright © 1991-2025 Unicode, Inc.
# For terms of use, see http://www.unicode.org/copyright.html
# SPDX-License-Identifier: Unicode-3.0
# CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,7 @@ private static void writeNewBasicData2(PrintWriter out, Set<RowData> sortedInput
Set<String> territories =
status_territories == null ? null : status_territories.getAll(status);
Map<String, Integer> scriptsByPopulationAtThisLevel = new TreeMap<>();
String likelyScript = supplementalData.getDefaultScript(languageSubtag);
if (status_scripts != null) {
Set<String> scriptsAtThisLevel = status_scripts.getAll(status);
if (scriptsAtThisLevel != null) {
Expand All @@ -414,6 +415,16 @@ private static void writeNewBasicData2(PrintWriter out, Set<RowData> sortedInput
population = scriptsByPopulationAnyLevel.get(script);
}
scriptsByPopulationAtThisLevel.put(script, population);

// Artificially add 1 billion population to the current likely subtag.
// This overrides the order for a few languages where there is a good
// reason for the likely subtag to not match the population. For
// instance, Azerbaijani's online presence is focused in Latin. This
// also orders the scripts when we don't have population data but have a
// distinct likely subtag.
if (script.equals(likelyScript)) {
scriptsByPopulationAtThisLevel.put(script, 1000000000);
}
}
}
}
Expand Down

0 comments on commit 0a85ccb

Please sign in to comment.