forked from kangjianwei/LearningJDK
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStringUTF16.java
1768 lines (1551 loc) · 72.8 KB
/
StringUTF16.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package java.lang;
import jdk.internal.HotSpotIntrinsicCandidate;
import java.util.Arrays;
import java.util.Locale;
import java.util.Spliterator;
import java.util.function.Consumer;
import java.util.function.IntConsumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import static java.lang.String.LATIN1;
import static java.lang.String.UTF16;
// UTF16-String
final class StringUTF16 {
static final int HI_BYTE_SHIFT, LO_BYTE_SHIFT; // 大小端标记
static final int MAX_LENGTH = Integer.MAX_VALUE >> 1;
// 设置大小端标记
static {
if(isBigEndian()) {
HI_BYTE_SHIFT = 8;
LO_BYTE_SHIFT = 0;
} else {
HI_BYTE_SHIFT = 0;
LO_BYTE_SHIFT = 8;
}
}
/*▼ 获取char/char[] ████████████████████████████████████████████████████████████████████████████████┓ */
/**
 * Reads the char stored at char position {@code index} of a UTF16 byte array.
 * No bounds check is performed here beyond the assertion; callers are trusted.
 *
 * @param val   byte array holding two bytes per char
 * @param index char index (not byte index)
 * @return the char assembled from bytes {@code 2*index} and {@code 2*index + 1}
 */
@HotSpotIntrinsicCandidate
static char getChar(byte[] val, int index) {
    assert index >= 0 && index < length(val) : "Trusted caller missed bounds check";
    final int pos = index << 1; // byte offset of the pair
    final int first = (val[pos] & 0xff) << HI_BYTE_SHIFT;
    final int second = (val[pos + 1] & 0xff) << LO_BYTE_SHIFT;
    return (char) (first | second);
}
/**
 * Bounds-checked read of the char at char position {@code index}.
 *
 * @param value UTF16 byte array
 * @param index char index, validated with {@code checkIndex} before the read
 * @return the char at {@code index}
 */
public static char charAt(byte[] value, int index) {
    checkIndex(index, value);
    final char c = getChar(value, index);
    return c;
}
/**
 * Copies the chars at positions {@code [srcBegin, srcEnd)} of a UTF16 byte array
 * into {@code dst}, starting at {@code dstBegin}.
 * The source range is validated only when it is non-empty.
 *
 * @param value    UTF16 byte array to read from
 * @param srcBegin first char index to copy (inclusive)
 * @param srcEnd   last char index to copy (exclusive)
 * @param dst      destination char array
 * @param dstBegin first index written in {@code dst}
 */
@HotSpotIntrinsicCandidate
public static void getChars(byte[] value, int srcBegin, int srcEnd, char dst[], int dstBegin) {
    if (srcBegin < srcEnd) {
        checkBoundsOffCount(srcBegin, srcEnd - srcBegin, value);
    }
    int out = dstBegin;
    int in = srcBegin;
    while (in < srcEnd) {
        dst[out] = getChar(value, in);
        out++;
        in++;
    }
}
/**
 * Converts a whole UTF16 byte array into a freshly allocated char array.
 *
 * @param value UTF16 byte array
 * @return a char array of length {@code value.length >> 1}
 */
public static char[] toChars(byte[] value) {
    final int count = value.length >> 1;
    final char[] chars = new char[count];
    getChars(value, 0, count, chars, 0);
    return chars;
}
/*▲ 获取char/char[] ████████████████████████████████████████████████████████████████████████████████┛ */
/*▼ 获取byte/byte[] ████████████████████████████████████████████████████████████████████████████████┓ */
/**
 * Stores the low 16 bits of {@code c} as the char at char position {@code index}.
 * No bounds check is performed here beyond the assertion; callers are trusted.
 *
 * @param val   UTF16 byte array to write into
 * @param index char index (not byte index)
 * @param c     value whose two low bytes are written
 */
@HotSpotIntrinsicCandidate
static void putChar(byte[] val, int index, int c) {
    assert index >= 0 && index < length(val) : "Trusted caller missed bounds check";
    final int pos = index << 1; // byte offset of the pair
    val[pos] = (byte) (c >> HI_BYTE_SHIFT);
    val[pos + 1] = (byte) (c >> LO_BYTE_SHIFT);
}
/**
 * Bounds-checked variant of {@code putChar}: validates {@code index} with
 * {@code checkIndex}, then stores the low 16 bits of {@code c} at that char position.
 *
 * @param val   UTF16 byte array to write into
 * @param index char index to write
 * @param c     value whose two low bytes are written
 */
public static void putCharSB(byte[] val, int index, int c) {
    checkIndex(index, val);
    putChar(val, index, c);
}
/**
 * Writes the chars {@code s[off, end)} into {@code val}, starting at char
 * position {@code index}. The destination range is validated up front.
 *
 * @param val   UTF16 byte array to write into
 * @param index first destination char index
 * @param s     source sequence
 * @param off   first source index (inclusive)
 * @param end   last source index (exclusive)
 */
public static void putCharsSB(byte[] val, int index, CharSequence s, int off, int end) {
    checkBoundsBeginEnd(index, index + end - off, val);
    int dstPos = index;
    int srcPos = off;
    while (srcPos < end) {
        putChar(val, dstPos, s.charAt(srcPos));
        dstPos++;
        srcPos++;
    }
}
/**
 * Writes the chars {@code str[off, end)} into {@code val}, starting at char
 * position {@code index}. Performs no range validation of its own.
 */
private static void putChars(byte[] val, int index, char[] str, int off, int end) {
    for (int src = off, dstPos = index; src < end; src++, dstPos++) {
        putChar(val, dstPos, str[src]);
    }
}
/**
 * Range-checked copy of {@code ca[off, end)} into {@code val}, starting at char
 * position {@code index}. Only the destination range is validated here.
 *
 * @param val   UTF16 byte array to write into
 * @param index first destination char index
 * @param ca    source chars
 * @param off   first source index (inclusive)
 * @param end   last source index (exclusive)
 */
public static void putCharsSB(byte[] val, int index, char[] ca, int off, int end) {
    final int dstEnd = index + end - off;
    checkBoundsBeginEnd(index, dstEnd, val);
    putChars(val, index, ca, off, end);
}
/**
 * Stores four chars at consecutive char positions starting at {@code i}.
 *
 * @param value UTF16 byte array to write into
 * @param i     first char index written
 * @return the char index just past the last one written ({@code i + 4})
 */
public static int putCharsAt(byte[] value, int i, char c1, char c2, char c3, char c4) {
    final int end = i + 4;
    checkBoundsBeginEnd(i, end, value);
    int cursor = i;
    putChar(value, cursor++, c1);
    putChar(value, cursor++, c2);
    putChar(value, cursor++, c3);
    putChar(value, cursor++, c4);
    assert (cursor == end);
    return end;
}
/**
 * Stores five chars at consecutive char positions starting at {@code i}.
 *
 * @param value UTF16 byte array to write into
 * @param i     first char index written
 * @return the char index just past the last one written ({@code i + 5})
 */
public static int putCharsAt(byte[] value, int i, char c1, char c2, char c3, char c4, char c5) {
    final int end = i + 5;
    checkBoundsBeginEnd(i, end, value);
    int cursor = i;
    putChar(value, cursor++, c1);
    putChar(value, cursor++, c2);
    putChar(value, cursor++, c3);
    putChar(value, cursor++, c4);
    putChar(value, cursor++, c5);
    assert (cursor == end);
    return end;
}
/**
 * Encodes a single char as a two-byte UTF16 array.
 *
 * @param c the char to encode
 * @return a new 2-byte array holding {@code c}
 */
public static byte[] toBytes(char c) {
    final byte[] pair = new byte[2];
    putChar(pair, 0, c);
    return pair;
}
/**
 * Encodes the chars {@code value[off, off + len)} into a new UTF16 byte array.
 *
 * @param value source chars
 * @param off   first source index
 * @param len   number of chars to encode
 * @return the array obtained from {@code newBytesFor(len)}, filled with the encoded chars
 */
@HotSpotIntrinsicCandidate
public static byte[] toBytes(char[] value, int off, int len) {
    final byte[] encoded = newBytesFor(len);
    int src = off;
    for (int dstPos = 0; dstPos < len; dstPos++, src++) {
        putChar(encoded, dstPos, value[src]);
    }
    return encoded;
}
/**
* int[] val = new int[]{0x56DB, 0x6761, 0x2A6A5}; // 分别是【四】【条】【𪚥】这三个字的Unicode编码值
* toBytes(val, 0, 3); // 返回字节数组:[0x56, 0xDB, 0x67, 0x61, 0xD8, 0x69, 0xDE, 0xA5]
* 注:0x2A6A5是一个增补字符的编码,需要先将其拆分为高低代理单元对<0xD869,0xDEA5>,然后再转为字节存储
*/
/**
 * Encodes the code points {@code val[index, index + len)} into a new UTF16 byte
 * array. BMP code points take one char; valid supplementary code points are
 * split into a high/low surrogate pair and take two.
 *
 * @param val   source code points (not modified)
 * @param index first code point to encode
 * @param len   number of code points to encode
 * @return a new byte array holding the encoded chars
 * @throws IllegalArgumentException if a value is not a valid Unicode code point
 */
public static byte[] toBytes(int[] val, int index, int len) {
    final int end = index + len;

    // Pass 1: count the chars needed; each supplementary code point needs one extra.
    int units = len;
    for (int i = index; i < end; i++) {
        final int cp = val[i];
        if (!Character.isBmpCodePoint(cp)) {
            if (!Character.isValidCodePoint(cp)) {
                throw new IllegalArgumentException(Integer.toString(cp));
            }
            units++;
        }
    }

    // Pass 2: write each code point, expanding supplementary ones to surrogate pairs.
    final byte[] buf = newBytesFor(units);
    int out = 0;
    for (int i = index; i < end; i++) {
        final int cp = val[i];
        if (Character.isBmpCodePoint(cp)) {
            putChar(buf, out++, cp);
        } else {
            putChar(buf, out++, Character.highSurrogate(cp));
            putChar(buf, out++, Character.lowSurrogate(cp));
        }
    }
    return buf;
}
/**
 * Encodes a supplementary code point as a four-byte UTF16 array
 * (high surrogate followed by low surrogate).
 *
 * @param cp the code point; not validated here
 * @return a new 4-byte array holding the surrogate pair
 */
static byte[] toBytesSupplementary(int cp) {
    final char high = Character.highSurrogate(cp);
    final char low = Character.lowSurrogate(cp);
    final byte[] pair = new byte[4];
    putChar(pair, 0, high);
    putChar(pair, 1, low);
    return pair;
}
/**
 * Widens {@code len} single-byte (Latin1) values from {@code src} into UTF16
 * chars in {@code dst}. Only the destination range is validated here.
 *
 * @param src    Latin1 bytes, one byte per char
 * @param srcOff first source byte index
 * @param dst    UTF16 byte array to write into
 * @param dstOff first destination char index
 * @param len    number of chars to widen
 */
public static void inflate(byte[] src, int srcOff, byte[] dst, int dstOff, int len) {
    checkBoundsOffCount(dstOff, len, dst);
    for (int k = 0; k < len; k++) {
        // mask to treat the source byte as unsigned
        putChar(dst, dstOff + k, src[srcOff + k] & 0xff);
    }
}
/*▲ 获取byte/byte[] ████████████████████████████████████████████████████████████████████████████████┛ */
/*▼ 压缩 ████████████████████████████████████████████████████████████████████████████████┓ */
/**
* 先将UTF16-String内部的字节存入一个char,再取这个char的一个低字节存入LATIN1-String内部的字节
* 如遇转换后的char在Latin1字符集之外,即遇到超出[0x00, 0xFF]范围的char,则停止压缩。
*
* byte[] src = new byte[]{0x00,0x12, 0x00,0x34, 0x00,0x56};
* byte[] dst = new byte[2];
* compress(src, 1, dst, 0, 2); // dst:[0x34, 0x56]
*/
/**
 * Narrows {@code len} UTF16 chars from {@code src} into single bytes in {@code dst}.
 * Stops at the first char above 0xFF; bytes already written to {@code dst} stay there.
 * Only the source range is validated here.
 *
 * @param src    UTF16 byte array
 * @param srcOff first source char index
 * @param dst    destination, one byte per char
 * @param dstOff first destination byte index
 * @param len    number of chars to narrow
 * @return {@code len} if every char fit in one byte, otherwise 0
 */
@HotSpotIntrinsicCandidate
public static int compress(byte[] src, int srcOff, byte[] dst, int dstOff, int len) {
    checkBoundsOffCount(srcOff, len, src);
    for (int k = 0; k < len; k++) {
        final char c = getChar(src, srcOff + k);
        if (c > 0xFF) {
            // outside the single-byte range: report failure
            return 0;
        }
        dst[dstOff + k] = (byte) c;
    }
    return len;
}
/**
* 先将UTF16-String内部的字节存入一个char,再取这个char的一个低字节存入LATIN1-String内部的字节,再返回
* 如遇转换后的char在Latin1字符集之外,即遇到超出[0x00, 0xFF]范围的char,则停止压缩。
*
* byte[] val = new byte[]{0x00,0x12, 0x00,0x34, 0x00,0x56};
* compress(val, 1, 2); // 返回字节数组:[0x34, 0x56]
*/
/**
 * Narrows {@code len} UTF16 chars of {@code val}, starting at char index
 * {@code off}, into a new single-byte array.
 *
 * @param val UTF16 byte array
 * @param off first char index
 * @param len number of chars
 * @return the narrowed bytes, or {@code null} if the narrowing did not cover all {@code len} chars
 */
public static byte[] compress(byte[] val, int off, int len) {
    final byte[] latin1 = new byte[len];
    final int done = compress(val, off, latin1, 0, len);
    return done == len ? latin1 : null;
}
/**
* 将UTF16-String内部的字节转换为LATIN1-String内部的字节
* 如遇char在Latin1字符集之外,即遇到超出[0x00, 0xFF]范围的char,则停止压缩。
*
* char[] src = new char[]{'\u0012', '\u0034', '\u0056'};
* byte[] dst = new byte[2];
* compress(src, 1, dst, 0, 2); // dst:[0x34, 0x56]
*/
/**
 * Narrows {@code len} chars from {@code src} into single bytes in {@code dst}.
 * Stops at the first char above 0xFF; bytes already written to {@code dst} stay there.
 * No explicit range validation is performed.
 *
 * @param src    source chars
 * @param srcOff first source index
 * @param dst    destination, one byte per char
 * @param dstOff first destination index
 * @param len    number of chars to narrow
 * @return {@code len} if every char fit in one byte, otherwise 0
 */
@HotSpotIntrinsicCandidate
public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
    for (int k = 0; k < len; k++) {
        final char c = src[srcOff + k];
        if (c > 0xFF) {
            // outside the single-byte range: report failure
            return 0;
        }
        dst[dstOff + k] = (byte) c;
    }
    return len;
}
/**
* 将UTF16-String内部的字节转换为LATIN1-String内部的字节
* 如遇char在Latin1字符集之外,即遇到超出[0x00, 0xFF]范围的char,则停止压缩。
*
* char[] src = new char[]{'\u0012', '\u0034', '\u0056'};
* compress(src, 1, 2); // 返回字节数组:[0x34, 0x56]
*/
/**
 * Narrows the chars {@code val[off, off + len)} into a new single-byte array.
 *
 * @param val source chars
 * @param off first source index
 * @param len number of chars
 * @return the narrowed bytes, or {@code null} if the narrowing did not cover all {@code len} chars
 */
public static byte[] compress(char[] val, int off, int len) {
    final byte[] latin1 = new byte[len];
    final int done = compress(val, off, latin1, 0, len);
    return done == len ? latin1 : null;
}
/*
* 将UTF16-String内部的字节转换为LATIN1-String内部的字节,类似于压缩
* 要想数据无损转换,则原始字节对表示的char必须在[0x00, 0xFF]范围
*
* byte[] value = new byte[]{0x12,0x34, 0x56,0x78, 0xAB,0xCD};
* getBytes(value, 1, 3, dst, 0); // dst数组:[0x78, 0xCD]
*/
/**
 * Copies one byte per char, for chars {@code [srcBegin, srcEnd)}, into {@code dst}.
 * The byte taken from each pair is the one that {@code getChar} shifts by
 * {@code LO_BYTE_SHIFT}, i.e. the low byte of the char; the other byte is dropped
 * without inspection. No range validation is performed.
 *
 * @param value    UTF16 byte array
 * @param srcBegin first char index (inclusive)
 * @param srcEnd   last char index (exclusive)
 * @param dst      destination byte array
 * @param dstBegin first index written in {@code dst}
 */
public static void getBytes(byte[] value, int srcBegin, int srcEnd, byte dst[], int dstBegin) {
    final int byteEnd = srcEnd << 1;
    // (1 >> LO_BYTE_SHIFT) is 1 when LO_BYTE_SHIFT is 0 and 0 when it is 8,
    // which selects the position of the low byte inside each pair.
    int in = (srcBegin << 1) + (1 >> LO_BYTE_SHIFT);
    int out = dstBegin;
    while (in < byteEnd) {
        dst[out++] = value[in];
        in += 2;
    }
}
/*▲ 压缩 ████████████████████████████████████████████████████████████████████████████████┛ */
/*▼ 大小写转换 ████████████████████████████████████████████████████████████████████████████████┓ */
/**
 * Lower-cases a UTF16 string for the given locale.
 * Returns {@code str} itself when no char needs to change. Strings containing
 * surrogates, GREEK CAPITAL LETTER SIGMA, LATIN CAPITAL LETTER I WITH DOT ABOVE,
 * or any mapping that leaves the BMP are handed to {@code toLowerCaseEx}, as is
 * everything for the languages "tr", "az" and "lt".
 *
 * @param str    the string being converted
 * @param value  the UTF16 byte representation of {@code str}
 * @param locale locale whose case rules apply
 * @return the lower-cased string
 * @throws NullPointerException if {@code locale} is null
 */
public static String toLowerCase(String str, byte[] value, Locale locale) {
    if (locale == null) {
        throw new NullPointerException();
    }
    int first;
    boolean hasSurr = false;
    final int len = value.length >> 1;
    // Find the first char that changes under lower-casing, or the first surrogate.
    for (first = 0; first < len; first++) {
        int cp = (int) getChar(value, first);
        if (Character.isSurrogate((char) cp)) {
            hasSurr = true;
            break;
        }
        if (cp != Character.toLowerCase(cp)) { // no need to check Character.ERROR
            break;
        }
    }
    if (first == len) {
        // nothing to change
        return str;
    }
    byte[] result = new byte[value.length];
    // The leading `first` chars are already lower case: copy them verbatim.
    System.arraycopy(value, 0, result, 0, first << 1);
    String lang = locale.getLanguage();
    // Compare by content, not by reference, so the check does not depend on
    // the language string being interned.
    if ("tr".equals(lang) || "az".equals(lang) || "lt".equals(lang)) {
        return toLowerCaseEx(str, value, result, first, locale, true);
    }
    if (hasSurr) {
        return toLowerCaseEx(str, value, result, first, locale, false);
    }
    int bits = 0; // OR of every char written, to detect whether all fit in one byte
    for (int i = first; i < len; i++) {
        int cp = (int) getChar(value, i);
        if (cp == '\u03A3' || // GREEK CAPITAL LETTER SIGMA
                Character.isSurrogate((char) cp)) {
            return toLowerCaseEx(str, value, result, i, locale, false);
        }
        if (cp == '\u0130') { // LATIN CAPITAL LETTER I WITH DOT ABOVE
            return toLowerCaseEx(str, value, result, i, locale, true);
        }
        cp = Character.toLowerCase(cp);
        if (!Character.isBmpCodePoint(cp)) {
            return toLowerCaseEx(str, value, result, i, locale, false);
        }
        bits |= cp;
        putChar(result, i, cp);
    }
    if (bits > 0xFF) {
        return new String(result, UTF16);
    } else {
        // every rewritten char fits in one byte; newString decides the final form
        return newString(result, 0, len);
    }
}
/**
 * Upper-cases a UTF16 string for the given locale.
 * Returns {@code str} itself when no char needs to change. Strings containing
 * surrogates, or any mapping that is not a single BMP char, are handed to
 * {@code toUpperCaseEx}, as is everything for the languages "tr", "az" and "lt".
 *
 * @param str    the string being converted
 * @param value  the UTF16 byte representation of {@code str}
 * @param locale locale whose case rules apply
 * @return the upper-cased string
 * @throws NullPointerException if {@code locale} is null
 */
public static String toUpperCase(String str, byte[] value, Locale locale) {
    if (locale == null) {
        throw new NullPointerException();
    }
    int first;
    boolean hasSurr = false;
    final int len = value.length >> 1;
    // Find the first char that changes under upper-casing, or the first surrogate.
    for (first = 0; first < len; first++) {
        int cp = (int) getChar(value, first);
        if (Character.isSurrogate((char) cp)) {
            hasSurr = true;
            break;
        }
        if (cp != Character.toUpperCaseEx(cp)) { // no need to check Character.ERROR
            break;
        }
    }
    if (first == len) {
        // nothing to change
        return str;
    }
    byte[] result = new byte[value.length];
    // The leading `first` chars are already upper case: copy them verbatim.
    System.arraycopy(value, 0, result, 0, first << 1);
    String lang = locale.getLanguage();
    // Compare by content, not by reference, so the check does not depend on
    // the language string being interned.
    if ("tr".equals(lang) || "az".equals(lang) || "lt".equals(lang)) {
        return toUpperCaseEx(str, value, result, first, locale, true);
    }
    if (hasSurr) {
        return toUpperCaseEx(str, value, result, first, locale, false);
    }
    int bits = 0; // OR of every char written, to detect whether all fit in one byte
    for (int i = first; i < len; i++) {
        int cp = (int) getChar(value, i);
        if (Character.isSurrogate((char) cp)) {
            return toUpperCaseEx(str, value, result, i, locale, false);
        }
        cp = Character.toUpperCaseEx(cp);
        if (!Character.isBmpCodePoint(cp)) { // Character.ERROR is not bmp
            return toUpperCaseEx(str, value, result, i, locale, false);
        }
        bits |= cp;
        putChar(result, i, cp);
    }
    if (bits > 0xFF) {
        return new String(result, UTF16);
    } else {
        // every rewritten char fits in one byte; newString decides the final form
        return newString(result, 0, len);
    }
}
// Slow path of lower-casing: handles surrogate pairs, locale-dependent rules and
// mappings whose result is not a single BMP char.
// `result` already holds the first `first` chars unchanged; conversion resumes at
// char index `first` and appends at `resultOffset`, which may drift from the
// source index when a mapping changes the number of chars.
// Returns newString(result, 0, resultOffset).
private static String toLowerCaseEx(String str, byte[] value, byte[] result, int first, Locale locale, boolean localeDependent) {
assert (result.length == value.length);
assert (first >= 0);
int resultOffset = first;
int length = value.length >> 1;
// number of source chars consumed by the current code point (1 or 2)
int srcCount;
for(int i = first; i < length; i += srcCount) {
int srcChar = getChar(value, i);
int lowerChar;
char[] lowerCharArray;
srcCount = 1;
if(Character.isSurrogate((char) srcChar)) {
// try to combine with the following char into one code point
srcChar = codePointAt(value, i, length);
srcCount = Character.charCount(srcChar);
}
if(localeDependent || srcChar == '\u03A3' || // GREEK CAPITAL LETTER SIGMA
srcChar == '\u0130') { // LATIN CAPITAL LETTER I WITH DOT ABOVE
lowerChar = ConditionalSpecialCasing.toLowerCaseEx(str, i, locale);
} else {
lowerChar = Character.toLowerCase(srcChar);
}
if(Character.isBmpCodePoint(lowerChar)) { // Character.ERROR is not a bmp
putChar(result, resultOffset++, lowerChar);
} else {
if(lowerChar == Character.ERROR) {
// the mapping is a char sequence rather than a single code point
lowerCharArray = ConditionalSpecialCasing.toLowerCaseCharArray(str, i, locale);
} else {
// supplementary code point: split it into its high/low surrogate chars
lowerCharArray = Character.toChars(lowerChar);
}
/* Grow result if needed */
int mapLen = lowerCharArray.length;
if(mapLen > srcCount) {
// allocate room for the extra chars and carry over what was written so far
byte[] result2 = newBytesFor((result.length >> 1) + mapLen - srcCount);
System.arraycopy(result, 0, result2, 0, resultOffset << 1);
result = result2;
}
assert resultOffset >= 0;
assert resultOffset + mapLen <= length(result);
for(int x = 0; x < mapLen; ++x) {
putChar(result, resultOffset++, lowerCharArray[x]);
}
}
}
return newString(result, 0, resultOffset);
}
// Slow path of upper-casing: handles surrogate pairs, locale-dependent rules and
// mappings whose result is not a single BMP char.
// `result` already holds the first `first` chars unchanged; conversion resumes at
// char index `first` and appends at `resultOffset`, which may drift from the
// source index when a mapping changes the number of chars.
// Returns newString(result, 0, resultOffset).
private static String toUpperCaseEx(String str, byte[] value, byte[] result, int first, Locale locale, boolean localeDependent) {
assert (result.length == value.length);
assert (first >= 0);
int resultOffset = first;
int length = value.length >> 1;
// number of source chars consumed by the current code point (1 or 2)
int srcCount;
for(int i = first; i < length; i += srcCount) {
int srcChar = getChar(value, i);
int upperChar;
char[] upperCharArray;
srcCount = 1;
if(Character.isSurrogate((char) srcChar)) {
// try to combine with the following char into one code point
srcChar = codePointAt(value, i, length);
srcCount = Character.charCount(srcChar);
}
if(localeDependent) {
upperChar = ConditionalSpecialCasing.toUpperCaseEx(str, i, locale);
} else {
upperChar = Character.toUpperCaseEx(srcChar);
}
if(Character.isBmpCodePoint(upperChar)) {
putChar(result, resultOffset++, upperChar);
} else {
if(upperChar == Character.ERROR) {
// the mapping is a char sequence rather than a single code point
if(localeDependent) {
upperCharArray = ConditionalSpecialCasing.toUpperCaseCharArray(str, i, locale);
} else {
upperCharArray = Character.toUpperCaseCharArray(srcChar);
}
} else {
// supplementary code point: split it into its high/low surrogate chars
upperCharArray = Character.toChars(upperChar);
}
/* Grow result if needed */
int mapLen = upperCharArray.length;
if(mapLen > srcCount) {
// allocate room for the extra chars and carry over what was written so far
byte[] result2 = newBytesFor((result.length >> 1) + mapLen - srcCount);
System.arraycopy(result, 0, result2, 0, resultOffset << 1);
result = result2;
}
assert resultOffset >= 0;
assert resultOffset + mapLen <= length(result);
for(int x = 0; x < mapLen; ++x) {
putChar(result, resultOffset++, upperCharArray[x]);
}
}
}
return newString(result, 0, resultOffset);
}
/*▲ 大小写转换 ████████████████████████████████████████████████████████████████████████████████┛ */
/*▼ 码点/码元 ████████████████████████████████████████████████████████████████████████████████┓ */
/*
* ▶ 1 返回byte[]:value中某处所代表的符号的Unicode编码(从前到后试探)
*
* byte[] value = new byte[]{0x56,0xDB, 0x67,0x61, 0xD8,0x69, 0xDE,0xA5};
* codePointAt(value, 1, 2, true); // 返回0x6761,这是汉字【条】的Unicode编码值
* codePointAt(value, 2, 4, true); // 返回0x2A6A5,这是汉字【𪚥】的Unicode编码值,这是个增补字符,其UTF-16BE编码的编码为:\uD869\uDEA5
* codePointAt(value, 2, 3, true); // 返回0xD869,索引限制了继续往后判断,所以只返回了一个高代理单元,无法构成正确的Unicode符号
* codePointAt(value, 3, 4, true); // 返回0xDEA5,后面没有字节了,所以只返回了一个低代理单元,无法构成正确的Unicode符号
*/
/**
 * Returns the code point that starts at char position {@code index}, looking
 * forward. A high surrogate followed (before {@code end}) by a low surrogate
 * yields the combined supplementary code point; otherwise the single char at
 * {@code index} is returned as is.
 *
 * @param value   UTF16 byte array
 * @param index   char index to read
 * @param end     exclusive limit for looking at the following char
 * @param checked whether each accessed index is validated with {@code checkIndex}
 * @return the code point, or a lone char value when no pair can be formed
 */
private static int codePointAt(byte[] value, int index, int end, boolean checked) {
    assert index < end;
    if (checked) {
        checkIndex(index, value);
    }
    final char lead = getChar(value, index);
    if (!Character.isHighSurrogate(lead)) {
        return lead;
    }
    final int next = index + 1;
    if (next >= end) {
        // no room for a low surrogate inside the limit
        return lead;
    }
    if (checked) {
        checkIndex(next, value);
    }
    final char trail = getChar(value, next);
    return Character.isLowSurrogate(trail) ? Character.toCodePoint(lead, trail) : lead;
}
/**
 * Index-checked forward code point lookup; see the private {@code codePointAt}
 * overload, called here with {@code checked = true}.
 */
public static int codePointAtSB(byte[] val, int index, int end) {
    final boolean checked = true;
    return codePointAt(val, index, end, checked);
}
/**
 * Unchecked forward code point lookup; see the private {@code codePointAt}
 * overload, called here with {@code checked = false}.
 */
public static int codePointAt(byte[] value, int index, int end) {
    final boolean checked = false;
    return codePointAt(value, index, end, checked);
}
/*
* byte[] value = new byte[]{0x56,0xDB, 0x67,0x61, 0xD8,0x69, 0xDE,0xA5}; // 四条𪚥
* codePointBefore(value, 2, true); // 返回0x6761,这是汉字【条】的Unicode编码值
* codePointBefore(value, 4, true); // 返回0x2A6A5,这是汉字【𪚥】的Unicode编码值,这是个增补字符,其UTF-16BE编码的编码为:\uD869\uDEA5
* codePointBefore(value, 3, true); // 返回0xD869,只返回了一个高代理单元,无法构成正确的Unicode符号
*/
/**
 * Returns the code point that ends just before char position {@code index},
 * looking backward. A low surrogate at {@code index - 1} preceded by a high
 * surrogate yields the combined supplementary code point; otherwise the single
 * char at {@code index - 1} is returned as is.
 *
 * @param value   UTF16 byte array
 * @param index   char index just past the code point of interest
 * @param checked whether each accessed index is validated with {@code checkIndex}
 * @return the code point, or a lone char value when no pair can be formed
 */
private static int codePointBefore(byte[] value, int index, boolean checked) {
    final int last = index - 1;
    if (checked) {
        checkIndex(last, value);
    }
    final char trail = getChar(value, last);
    if (!Character.isLowSurrogate(trail) || last <= 0) {
        return trail;
    }
    final int prev = last - 1;
    if (checked) {
        checkIndex(prev, value);
    }
    final char lead = getChar(value, prev);
    return Character.isHighSurrogate(lead) ? Character.toCodePoint(lead, trail) : trail;
}
/**
 * Index-checked backward code point lookup; see the private
 * {@code codePointBefore} overload, called here with {@code checked = true}.
 */
public static int codePointBeforeSB(byte[] val, int index) {
    final boolean checked = true;
    return codePointBefore(val, index, checked);
}
/**
 * Unchecked backward code point lookup; see the private
 * {@code codePointBefore} overload, called here with {@code checked = false}.
 */
public static int codePointBefore(byte[] value, int index) {
    final boolean checked = false;
    return codePointBefore(value, index, checked);
}
/**
* 四U+56DB,条U+6761,𪚥U+2A6A5,其中𪚥的UTF-16大端法表示形式是:\uD869\uDEA5
* byte[] value = new byte[]{0x56,0xDB, 0x67,0x61, 0xD8,0x69, 0xDE,0xA5};
* codePointCount(value, 0, 4, true); // 返回3,识别了全部三个Unicode符号
* codePointCount(value, 0, 3, true); // 返回3,识别出了前两个Unicode符号和一个只存在高代理单元的符号
* codePointCount(value, 0, 2, true); // 返回2,识别了前两个Unicode符号
*/
/**
 * Counts the code points in the char range {@code [beginIndex, endIndex)}.
 * Every well-formed high/low surrogate pair inside the range counts once;
 * every other char, including an unpaired surrogate, counts once on its own.
 *
 * @param value      UTF16 byte array
 * @param beginIndex first char index (inclusive)
 * @param endIndex   last char index (exclusive)
 * @param checked    whether a non-empty range is validated with {@code checkBoundsBeginEnd}
 * @return the number of code points in the range
 */
private static int codePointCount(byte[] value, int beginIndex, int endIndex, boolean checked) {
    assert beginIndex <= endIndex;
    if (checked && beginIndex < endIndex) {
        checkBoundsBeginEnd(beginIndex, endIndex, value);
    }
    int pairs = 0;
    int pos = beginIndex;
    final int lastStart = endIndex - 1; // a pair cannot start at the last char
    while (pos < lastStart) {
        final char unit = getChar(value, pos);
        pos++;
        if (Character.isHighSurrogate(unit) && Character.isLowSurrogate(getChar(value, pos))) {
            pairs++;
            pos++; // skip the low surrogate just consumed
        }
    }
    // each pair occupies two chars but is one code point
    return (endIndex - beginIndex) - pairs;
}
/**
 * Range-checked code point count; see the private {@code codePointCount}
 * overload, called here with {@code checked = true}.
 */
public static int codePointCountSB(byte[] val, int beginIndex, int endIndex) {
    final boolean checked = true;
    return codePointCount(val, beginIndex, endIndex, checked);
}
/**
 * Unchecked code point count; see the private {@code codePointCount}
 * overload, called here with {@code checked = false}.
 */
public static int codePointCount(byte[] value, int beginIndex, int endIndex) {
    final boolean checked = false;
    return codePointCount(value, beginIndex, endIndex, checked);
}
/*▲ 码点/码元 ████████████████████████████████████████████████████████████████████████████████┛ */
/*▼ 比较/判等 ████████████████████████████████████████████████████████████████████████████████┓ */
/**
 * Compares the first {@code len1} chars of {@code value} with the first
 * {@code len2} chars of {@code other}, char by char.
 *
 * @return the difference of the first mismatching chars, or {@code len1 - len2}
 *         when one range is a prefix of the other
 */
private static int compareValues(byte[] value, byte[] other, int len1, int len2) {
    final int shared = Math.min(len1, len2);
    int k = 0;
    while (k < shared) {
        final int diff = getChar(value, k) - getChar(other, k);
        if (diff != 0) {
            return diff;
        }
        k++;
    }
    return len1 - len2;
}
/**
 * Compares two UTF16 byte arrays char by char over their full lengths.
 *
 * @return negative, zero or positive as {@code value} sorts before, equal to
 *         or after {@code other}
 */
@HotSpotIntrinsicCandidate
public static int compareTo(byte[] value, byte[] other) {
    return compareValues(value, other, length(value), length(other));
}
/**
 * Compares the first {@code len1} chars of {@code value} with the first
 * {@code len2} chars of {@code other}. Both lengths are validated with
 * {@code checkOffset} before the comparison.
 */
public static int compareTo(byte[] value, byte[] other, int len1, int len2) {
    checkOffset(len1, value);
    checkOffset(len2, other);
    final int order = compareValues(value, other, len1, len2);
    return order;
}
/**
 * Compares a UTF16 byte array ({@code value}) with a Latin1 byte array
 * ({@code other}) by delegating to {@code StringLatin1.compareToUTF16} with the
 * operands swapped and negating its result.
 */
@HotSpotIntrinsicCandidate
public static int compareToLatin1(byte[] value, byte[] other) {
    final int reversed = StringLatin1.compareToUTF16(other, value);
    return -reversed;
}
/**
 * Length-limited comparison of a UTF16 byte array ({@code value}, {@code len1})
 * with a Latin1 byte array ({@code other}, {@code len2}). Delegates to
 * {@code StringLatin1.compareToUTF16} with operands and lengths swapped and
 * negates its result.
 */
public static int compareToLatin1(byte[] value, byte[] other, int len1, int len2) {
    final int reversed = StringLatin1.compareToUTF16(other, value, len2, len1);
    return -reversed;
}
/**
 * Case-insensitive comparison of two UTF16 byte arrays.
 * Two chars are treated as equal if they match directly, after upper-casing,
 * or after lower-casing the upper-cased forms.
 *
 * @return the difference of the first chars that still differ after folding,
 *         or the difference of the lengths when one is a prefix of the other
 */
public static int compareToCI(byte[] value, byte[] other) {
    final int len1 = length(value);
    final int len2 = length(other);
    final int lim = Math.min(len1, len2);
    for (int k = 0; k < lim; k++) {
        final char a = getChar(value, k);
        final char b = getChar(other, k);
        if (a == b) {
            continue;
        }
        final char upperA = Character.toUpperCase(a);
        final char upperB = Character.toUpperCase(b);
        if (upperA == upperB) {
            continue;
        }
        final char lowerA = Character.toLowerCase(upperA);
        final char lowerB = Character.toLowerCase(upperB);
        if (lowerA != lowerB) {
            return lowerA - lowerB;
        }
    }
    return len1 - len2;
}
/**
 * Case-insensitive test of whether {@code len} chars of {@code value} starting
 * at {@code toffset} match {@code len} chars of {@code other} starting at
 * {@code ooffset}. Ranges are only asserted, not validated.
 *
 * @return {@code true} if every char pair matches directly, after upper-casing,
 *         or after lower-casing the upper-cased forms
 */
public static boolean regionMatchesCI(byte[] value, int toffset, byte[] other, int ooffset, int len) {
    final int last = toffset + len;
    assert toffset >= 0 && ooffset >= 0;
    assert ooffset + len <= length(other);
    assert last <= length(value);
    for (int t = toffset, o = ooffset; t < last; t++, o++) {
        final char c1 = getChar(value, t);
        final char c2 = getChar(other, o);
        if (c1 == c2) {
            continue;
        }
        // Try both chars in upper case first.
        final char u1 = Character.toUpperCase(c1);
        final char u2 = Character.toUpperCase(c2);
        if (u1 == u2) {
            continue;
        }
        // Upper-casing is not reliable for the Georgian alphabet, so make one
        // last attempt in lower case before giving up.
        if (Character.toLowerCase(u1) != Character.toLowerCase(u2)) {
            return false;
        }
    }
    return true;
}
/**
 * Case-insensitive comparison of a UTF16 byte array ({@code value}) with a
 * Latin1 byte array ({@code other}). Delegates to
 * {@code StringLatin1.compareToCI_UTF16} with the operands swapped and negates
 * its result.
 */
public static int compareToCI_Latin1(byte[] value, byte[] other) {
    final int reversed = StringLatin1.compareToCI_UTF16(other, value);
    return -reversed;
}
/**
 * Case-insensitive region match between a UTF16 byte array ({@code value},
 * from {@code toffset}) and a Latin1 byte array ({@code other}, from
 * {@code ooffset}). Delegates to {@code StringLatin1.regionMatchesCI_UTF16}
 * with the operands swapped.
 */
public static boolean regionMatchesCI_Latin1(byte[] value, int toffset, byte[] other, int ooffset, int len) {
    final boolean matches = StringLatin1.regionMatchesCI_UTF16(other, ooffset, value, toffset, len);
    return matches;
}
/**
 * Tests whether two UTF16 byte arrays hold the same chars.
 *
 * @return {@code true} if both arrays have the same length and every char matches
 */
@HotSpotIntrinsicCandidate
public static boolean equals(byte[] value, byte[] other) {
    if (value.length != other.length) {
        return false;
    }
    final int chars = value.length >> 1;
    for (int i = 0; i < chars; i++) {
        if (getChar(value, i) != getChar(other, i)) {
            return false;
        }
    }
    return true;
}
/**
 * Tests whether the first {@code len} single-byte values of {@code v1} equal
 * the first {@code len} UTF16 chars of {@code v2}. Only the range in
 * {@code v2} is validated here.
 *
 * @param v1  bytes read one per char, treated as unsigned
 * @param v2  UTF16 byte array
 * @param len number of chars to compare
 */
public static boolean contentEquals(byte[] v1, byte[] v2, int len) {
    checkBoundsOffCount(0, len, v2);
    for (int i = 0; i < len; i++) {
        final int narrow = v1[i] & 0xff;
        if (narrow != getChar(v2, i)) {
            return false;
        }
    }
    return true;
}
/**
 * Tests whether the first {@code len} chars of a UTF16 byte array equal the
 * first {@code len} chars of {@code cs}. {@code len} is validated against
 * {@code value} with {@code checkOffset}.
 */
public static boolean contentEquals(byte[] value, CharSequence cs, int len) {
    checkOffset(len, value);
    int i = 0;
    while (i < len) {
        if (getChar(value, i) != cs.charAt(i)) {
            return false;
        }
        i++;
    }
    return true;
}
/*▲ 比较/判等 ████████████████████████████████████████████████████████████████████████████████┛ */
/*▼ 查找Unicode符号下标 ████████████████████████████████████████████████████████████████████████████████┓ */
/*
* 注意此节中术语【位置】与byte数组中的元素索引的区别
*
* 比如对于byte[] bs = {0x12,0x34, 0x56,0x78}:
* 我们说字符'\u5678'在bs中的位置是1。
*/
/**
* byte[] value = new byte[]{0x12,0x34, 0x56,0x78, 0xAB,0xCD};
* indexOfCharUnsafe(value, 0x5678, 0, 3); // 返回1
*/
/**
 * Scans char positions {@code [fromIndex, max)} for the first char equal to
 * {@code ch}. Performs no range validation.
 *
 * @return the char index of the first match, or -1 if none
 */
private static int indexOfCharUnsafe(byte[] value, int ch, int fromIndex, int max) {
    int pos = fromIndex;
    while (pos < max) {
        if (getChar(value, pos) == ch) {
            return pos;
        }
        pos++;
    }
    return -1;
}
/**
 * Range-checked search for {@code ch} in char positions {@code [fromIndex, max)}.
 *
 * @return the char index of the first match, or -1 if none
 */
@HotSpotIntrinsicCandidate
private static int indexOfChar(byte[] value, int ch, int fromIndex, int max) {
    checkBoundsBeginEnd(fromIndex, max, value);
    final int found = indexOfCharUnsafe(value, ch, fromIndex, max);
    return found;
}
/**
* Handles (rare) calls of indexOf with a supplementary character.
*/
/*
* ▶ 2 返回增补符号ch在UTF16-String的字节值value中的下标,加入范围检查
*
* byte[] value = new byte[]{0x56,0xDB, 0x67,0x61, 0xD8,0x69, 0xDE,0xA5};
* indexOfSupplementary(value, 0x2A6A5, 0, 4); // 返回2
* 因为0x2A6A5这个Unicode编码值拆成UTF-16编码大端表示法后就是:0xD869 0xDEA5
*/
/**
 * Searches char positions {@code [fromIndex, max)} for the surrogate pair that
 * encodes the code point {@code ch}. The range is validated only when
 * {@code ch} is a valid code point.
 *
 * @return the char index of the pair's high surrogate, or -1 if {@code ch} is
 *         not a valid code point or the pair does not occur
 */
private static int indexOfSupplementary(byte[] value, int ch, int fromIndex, int max) {
    if (!Character.isValidCodePoint(ch)) {
        return -1;
    }
    final char hi = Character.highSurrogate(ch);
    final char lo = Character.lowSurrogate(ch);
    checkBoundsBeginEnd(fromIndex, max, value);
    final int lastStart = max - 1; // a pair needs two chars
    for (int i = fromIndex; i < lastStart; i++) {
        if (getChar(value, i) == hi && getChar(value, i + 1) == lo) {
            return i;
        }
    }
    return -1;
}
/**
 * Finds the first occurrence of the code point {@code ch} at or after char
 * position {@code fromIndex}. A negative {@code fromIndex} is treated as 0.
 *
 * @return the char index of the match, or -1 if {@code fromIndex} is at or past
 *         the end or no match exists
 */
public static int indexOf(byte[] value, int ch, int fromIndex) {
    final int max = value.length >> 1;
    if (fromIndex >= max) {
        // Note: fromIndex might be near -1>>>1.
        return -1;
    }
    final int start = Math.max(fromIndex, 0);
    // Values below the supplementary range (BMP code points and negative,
    // i.e. invalid, values) take the single-char path.
    return ch < Character.MIN_SUPPLEMENTARY_CODE_POINT
            ? indexOfChar(value, ch, start, max)
            : indexOfSupplementary(value, ch, start, max);
}
/**
 * Finds the last occurrence of the code point {@code ch} at or before char
 * position {@code fromIndex}, searching backward.
 *
 * @return the char index of the match, or -1 if none
 */
public static int lastIndexOf(byte[] value, int ch, int fromIndex) {
    if (ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
        return lastIndexOfSupplementary(value, ch, fromIndex);
    }
    // BMP code point or a negative (invalid) value: compare single chars.
    int i = Math.min(fromIndex, (value.length >> 1) - 1);
    while (i >= 0) {
        if (getChar(value, i) == ch) {
            return i;
        }
        i--;
    }
    return -1;
}
/**
* Handles (rare) calls of lastIndexOf with a supplementary character.
*/
/**
 * Searches backward, starting no later than {@code fromIndex}, for the
 * surrogate pair that encodes the code point {@code ch}.
 *
 * @return the char index of the pair's high surrogate, or -1 if {@code ch} is
 *         not a valid code point or the pair does not occur
 */
private static int lastIndexOfSupplementary(final byte[] value, int ch, int fromIndex) {
    if (!Character.isValidCodePoint(ch)) {
        return -1;
    }
    final char hi = Character.highSurrogate(ch);
    final char lo = Character.lowSurrogate(ch);
    // a pair needs two chars, so the latest possible start is length - 2
    int i = Math.min(fromIndex, (value.length >> 1) - 2);
    while (i >= 0) {
        if (getChar(value, i) == hi && getChar(value, i + 1) == lo) {
            return i;
        }
        i--;
    }
    return -1;
}
/*▲ 查找Unicode符号下标 ████████████████████████████████████████████████████████████████████████████████┛ */
/*▼ 查找子串下标 ████████████████████████████████████████████████████████████████████████████████┓ */
/**
 * Finds the first occurrence of the UTF16 byte array {@code str} inside the
 * UTF16 byte array {@code value}.
 *
 * @return 0 for an empty {@code str}, -1 if {@code str} is longer than
 *         {@code value}, otherwise the result of {@code indexOfUnsafe}
 */
@HotSpotIntrinsicCandidate
public static int indexOf(byte[] value, byte[] str) {
    final int needleBytes = str.length;
    if (needleBytes == 0) {
        return 0;
    }
    if (value.length < needleBytes) {
        return -1;
    }
    return indexOfUnsafe(value, length(value), str, length(str), 0);
}
/*
* 比对两个UTF16-String,返回子串str在主串value中第一次出现的下标
* 搜索时只比对主串的前valueCount个字符和子串的前strCount个字符,且从主串的fromIndex索引处向后搜索
*/
/**
 * Range-checked substring search: looks for the first {@code strCount} chars of
 * {@code str} within the first {@code valueCount} chars of {@code value},
 * starting at char position {@code fromIndex}.
 *
 * @return the result of {@code indexOfUnsafe} for the validated arguments
 */
@HotSpotIntrinsicCandidate
public static int indexOf(byte[] value, int valueCount, byte[] str, int strCount, int fromIndex) {
    checkBoundsBeginEnd(fromIndex, valueCount, value);
    checkBoundsBeginEnd(0, strCount, str);
    final int found = indexOfUnsafe(value, valueCount, str, strCount, fromIndex);
    return found;
}
/**
* 比对两个UTF16-String,返回子串str在主串value中第一次出现的下标
* 搜索时只比对主串的前valueCount个字符和子串的前strCount个字符,且从主串fromIndex索引处向后搜索
*/
private static int indexOfUnsafe(byte[] value, int valueCount, byte[] str, int strCount, int fromIndex) {
assert fromIndex >= 0;
assert strCount > 0;
assert strCount <= length(str);
assert valueCount >= strCount;
char first = getChar(str, 0); // 子串第一个字符
// 主串长度-子串长度
int max = (valueCount - strCount);
for(int i = fromIndex; i <= max; i++) {
// 用i遍历主串,直到主串和子串第一个字符相等为止
if(getChar(value, i) != first) {
while(++i <= max && getChar(value, i) != first)
;
}
// 找到了第一个相等字符,此时游标i最远也必须满足i<=valueCount - strCount,否则就没必要比较了,可画图理解
if(i <= max) {
int j = i + 1;
int end = j + strCount - 1;
for(int k = 1; j < end && getChar(value, j) == getChar(str, k); j++, k++)
;
if(j == end) {