From 150466b865593dc168bbcfb163daca7016271793 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 1 Aug 2024 11:19:12 -0700 Subject: [PATCH] ICU-22843 UnicodeString <-> std::u16string_view / wstring_view --- icu4c/source/common/unicode/char16ptr.h | 31 ++ icu4c/source/common/unicode/platform.h | 4 +- icu4c/source/common/unicode/unistr.h | 329 +++++++++++++++++-- icu4c/source/common/uniset_props.cpp | 23 +- icu4c/source/common/unistr.cpp | 72 +++- icu4c/source/i18n/number_decimalquantity.cpp | 7 +- icu4c/source/test/intltest/ustrtest.cpp | 185 +++++++++++ icu4c/source/test/intltest/ustrtest.h | 2 + 8 files changed, 614 insertions(+), 39 deletions(-) diff --git a/icu4c/source/common/unicode/char16ptr.h b/icu4c/source/common/unicode/char16ptr.h index de8182c7ada4..2fb8238f9a26 100644 --- a/icu4c/source/common/unicode/char16ptr.h +++ b/icu4c/source/common/unicode/char16ptr.h @@ -12,6 +12,7 @@ #if U_SHOW_CPLUSPLUS_API #include +#include /** * \file @@ -306,6 +307,36 @@ inline OldUChar *toOldUCharPtr(char16_t *p) { return reinterpret_cast(p); } +#ifndef U_FORCE_HIDE_INTERNAL_API +/** + * Is T convertible to a std::u16string_view or to a 16-bit std::wstring_view? + * @internal + */ +template +constexpr bool ConvertibleToU16StringView = + std::is_convertible_v || + (U_SIZEOF_WCHAR_T==2 && std::is_convertible_v); + +namespace internal { +/** + * Pass-through overload. + * @internal + */ +inline std::u16string_view toU16StringView(std::u16string_view sv) { return sv; } + +#if U_SIZEOF_WCHAR_T==2 +/** + * Basically undefined behavior but sometimes necessary conversion + * from std::wstring_view to std::u16string_view. 
+ * @internal + */ +inline std::u16string_view toU16StringView(std::wstring_view sv) { + return { ConstChar16Ptr(sv.data()), sv.length() }; +} +#endif +} // internal +#endif // U_FORCE_HIDE_INTERNAL_API + U_NAMESPACE_END #endif /* U_SHOW_CPLUSPLUS_API */ diff --git a/icu4c/source/common/unicode/platform.h b/icu4c/source/common/unicode/platform.h index 7aca76c67db8..cbd9098dab98 100644 --- a/icu4c/source/common/unicode/platform.h +++ b/icu4c/source/common/unicode/platform.h @@ -735,7 +735,9 @@ * @{ * \def U_DECLARE_UTF16 * Do not use this macro because it is not defined on all platforms. - * Use the UNICODE_STRING or U_STRING_DECL macros instead. + * In C++, use std::u16string_view literals, see the UNICODE_STRING docs. + * In C, use u"UTF-16 literals". + * See also the public U_STRING_DECL macro. * @internal */ #ifdef U_DECLARE_UTF16 diff --git a/icu4c/source/common/unicode/unistr.h b/icu4c/source/common/unicode/unistr.h index 52f0fdbaf049..00e173862d6c 100644 --- a/icu4c/source/common/unicode/unistr.h +++ b/icu4c/source/common/unicode/unistr.h @@ -33,6 +33,7 @@ #if U_SHOW_CPLUSPLUS_API #include +#include #include "unicode/char16ptr.h" #include "unicode/rep.h" #include "unicode/std_string.h" @@ -97,16 +98,21 @@ class UnicodeStringAppendable; // unicode/appendable.h #define US_INV icu::UnicodeString::kInvariant /** - * Unicode String literals in C++. + * \def UNICODE_STRING + * Obsolete macro approximating UnicodeString literals. * - * Note: these macros are not recommended for new code. - * Prior to the availability of C++11 and u"unicode string literals", - * these macros were provided for portability and efficiency when + * Prior to the availability of C++11 and u"UTF-16 string literals", + * this macro was provided for portability and efficiency when * initializing UnicodeStrings from literals. * - * They work only for strings that contain "invariant characters", i.e., - * only latin letters, digits, and some punctuation. - * See utypes.h for details. 
+ * Since C++17 and ICU 76, you can use std::u16string_view literals with compile-time + * length determination: + * \code + * #include <string_view> + * using namespace std::string_view_literals; + * UnicodeString str(u"literal"sv); + * if (str == u"other literal"sv) { ... } + * \endcode * * The string parameter must be a C string literal. * The length of the string, not including the terminating @@ -121,16 +127,12 @@ class UnicodeStringAppendable; // unicode/appendable.h /** * Unicode String literals in C++. - * Dependent on the platform properties, different UnicodeString - * constructors should be used to create a UnicodeString object from - * a string literal. - * The macros are defined for improved performance. - * They work only for strings that contain "invariant characters", i.e., - * only latin letters, digits, and some punctuation. - * See utypes.h for details. + * Obsolete macro approximating UnicodeString literals. + * See UNICODE_STRING. * * The string parameter must be a C string literal. * @stable ICU 2.0 + * @see UNICODE_STRING */ #define UNICODE_STRING_SIMPLE(cs) UNICODE_STRING(cs, -1) @@ -327,6 +329,32 @@ class U_COMMON_API UnicodeString : public Replaceable */ inline bool operator== (const UnicodeString& text) const; +#ifndef U_HIDE_DRAFT_API + /** + * Equality operator. Performs only bitwise comparison with `text` + * which is, or which is implicitly convertible to, + * a std::u16string_view or (if U_SIZEOF_WCHAR_T==2) std::wstring_view. + * + * For performance, you can use std::u16string_view literals with compile-time + * length determination: + * \code + * #include <string_view> + * using namespace std::string_view_literals; + * UnicodeString str = ...; + * if (str == u"literal"sv) { ... } + * \endcode + * @param text The string view to compare to this string. + * @return true if `text` contains the same characters as this one, false otherwise. 
+ * @draft ICU 76 + */ + template>> + inline bool operator==(const S &text) const { + std::u16string_view sv(internal::toU16StringView(text)); + uint32_t len; // unsigned to avoid a compiler warning + return !isBogus() && (len = length()) == sv.length() && doEquals(sv.data(), len); + } +#endif // U_HIDE_DRAFT_API + /** * Inequality operator. Performs only bitwise comparison. * @param text The UnicodeString to compare to this one. @@ -1897,6 +1925,24 @@ class U_COMMON_API UnicodeString : public Replaceable */ UnicodeString &fastCopyFrom(const UnicodeString &src); +#ifndef U_HIDE_DRAFT_API + /** + * Assignment operator. Replaces the characters in this UnicodeString + * with a copy of the characters from the `src` + * which is, or which is implicitly convertible to, + * a std::u16string_view or (if U_SIZEOF_WCHAR_T==2) std::wstring_view. + * + * @param src The string view containing the characters to copy. + * @return a reference to this + * @draft ICU 76 + */ + template>> + inline UnicodeString &operator=(const S &src) { + unBogus(); + return doReplace(0, length(), internal::toU16StringView(src)); + } +#endif // U_HIDE_DRAFT_API + /** * Move assignment operator; might leave src in bogus state. * This string will have the same contents and state that the source string had. @@ -2146,6 +2192,23 @@ class U_COMMON_API UnicodeString : public Replaceable */ inline UnicodeString& operator+= (const UnicodeString& srcText); +#ifndef U_HIDE_DRAFT_API + /** + * Append operator. Appends the characters in `src` + * which is, or which is implicitly convertible to, + * a std::u16string_view or (if U_SIZEOF_WCHAR_T==2) std::wstring_view, + * to the UnicodeString object. 
+ * + * @param src the source for the new characters + * @return a reference to this + * @draft ICU 76 + */ + template>> + inline UnicodeString& operator+=(const S &src) { + return doAppend(internal::toU16StringView(src)); + } +#endif // U_HIDE_DRAFT_API + /** * Append the characters * in `srcText` in the range @@ -2191,8 +2254,8 @@ class U_COMMON_API UnicodeString : public Replaceable int32_t srcLength); /** - * Append the characters in `srcChars` to the UnicodeString object - * at offset `start`. `srcChars` is not modified. + * Append the characters in `srcChars` to the UnicodeString object. + * `srcChars` is not modified. * @param srcChars the source for the new characters * @param srcLength the number of Unicode characters in `srcChars`; * can be -1 if `srcChars` is NUL-terminated @@ -2202,6 +2265,23 @@ class U_COMMON_API UnicodeString : public Replaceable inline UnicodeString& append(ConstChar16Ptr srcChars, int32_t srcLength); +#ifndef U_HIDE_DRAFT_API + /** + * Appends the characters in `src` + * which is, or which is implicitly convertible to, + * a std::u16string_view or (if U_SIZEOF_WCHAR_T==2) std::wstring_view, + * to the UnicodeString object. + * + * @param src the source for the new characters + * @return a reference to this + * @draft ICU 76 + */ + template>> + inline UnicodeString& append(const S &src) { + return doAppend(internal::toU16StringView(src)); + } +#endif // U_HIDE_DRAFT_API + /** * Append the code unit `srcChar` to the UnicodeString object. * @param srcChar the code unit to append @@ -2925,6 +3005,37 @@ class U_COMMON_API UnicodeString : public Replaceable */ const char16_t *getTerminatedBuffer(); +#ifndef U_HIDE_DRAFT_API + /** + * Converts to a std::u16string_view. 
+ * + * @return a string view of the contents of this string + * @draft ICU 76 + */ + inline operator std::u16string_view() const { + return { getBuffer(), (std::u16string_view::size_type)length() }; + } + +#if U_SIZEOF_WCHAR_T==2 || defined(U_IN_DOXYGEN) + /** + * Converts to a std::wstring_view. + * + * Note: This should remain draft until C++ standard plans + * about char16_t vs. wchar_t become clearer. + * + * @return a string view of the contents of this string + * @draft ICU 76 + */ + inline operator std::wstring_view() const { + const char16_t *p = getBuffer(); +#ifdef U_ALIASING_BARRIER + U_ALIASING_BARRIER(p); +#endif + return { reinterpret_cast(p), (std::wstring_view::size_type)length() }; + } +#endif // U_SIZEOF_WCHAR_T +#endif // U_HIDE_DRAFT_API + //======================================== // Constructors //======================================== @@ -2975,6 +3086,17 @@ class U_COMMON_API UnicodeString : public Replaceable * It is recommended to mark this constructor "explicit" by * `-DUNISTR_FROM_STRING_EXPLICIT=explicit` * on the compiler command line or similar. + * + * Note, for string literals: + * Since C++17 and ICU 76, you can use std::u16string_view literals with compile-time + * length determination: + * \code + * #include <string_view> + * using namespace std::string_view_literals; + * UnicodeString str(u"literal"sv); + * if (str == u"other literal"sv) { ... } + * \endcode + * * @param text The characters to place in the UnicodeString. `text` * must be NUL (U+0000) terminated. * @stable ICU 2.0 @@ -2989,6 +3111,17 @@ class U_COMMON_API UnicodeString : public Replaceable * It is recommended to mark this constructor "explicit" by * `-DUNISTR_FROM_STRING_EXPLICIT=explicit` * on the compiler command line or similar. 
+ * + * Note, for string literals: + * Since C++17 and ICU 76, you can use std::u16string_view literals with compile-time + * length determination: + * \code + * #include <string_view> + * using namespace std::string_view_literals; + * UnicodeString str(u"literal"sv); + * if (str == u"other literal"sv) { ... } + * \endcode + * * @param text NUL-terminated UTF-16 string * @stable ICU 59 */ @@ -3005,6 +3138,17 @@ class U_COMMON_API UnicodeString : public Replaceable * It is recommended to mark this constructor "explicit" by * `-DUNISTR_FROM_STRING_EXPLICIT=explicit` * on the compiler command line or similar. + * + * Note, for string literals: + * Since C++17 and ICU 76, you can use std::u16string_view literals with compile-time + * length determination: + * \code + * #include <string_view> + * using namespace std::string_view_literals; + * UnicodeString str(u"literal"sv); + * if (str == u"other literal"sv) { ... } + * \endcode + * * @param text NUL-terminated UTF-16 string * @stable ICU 59 */ @@ -3026,6 +3170,17 @@ class U_COMMON_API UnicodeString : public Replaceable /** * char16_t* constructor. + * + * Note, for string literals: + * Since C++17 and ICU 76, you can use std::u16string_view literals with compile-time + * length determination: + * \code + * #include <string_view> + * using namespace std::string_view_literals; + * UnicodeString str(u"literal"sv); + * if (str == u"other literal"sv) { ... } + * \endcode + * * @param text The characters to place in the UnicodeString. * @param textLength The number of Unicode characters in `text` * to copy. @@ -3038,6 +3193,17 @@ class U_COMMON_API UnicodeString : public Replaceable /** * uint16_t * constructor. * Delegates to UnicodeString(const char16_t *, int32_t). 
+ * + * Note, for string literals: + * Since C++17 and ICU 76, you can use std::u16string_view literals with compile-time + * length determination: + * \code + * #include <string_view> + * using namespace std::string_view_literals; + * UnicodeString str(u"literal"sv); + * if (str == u"other literal"sv) { ... } + * \endcode + * * @param text UTF-16 string * @param textLength string length * @stable ICU 59 @@ -3051,7 +3217,18 @@ class U_COMMON_API UnicodeString : public Replaceable * wchar_t * constructor. * (Only defined if U_SIZEOF_WCHAR_T==2.) * Delegates to UnicodeString(const char16_t *, int32_t). - * @param text NUL-terminated UTF-16 string + * + * Note, for string literals: + * Since C++17 and ICU 76, you can use std::u16string_view literals with compile-time + * length determination: + * \code + * #include <string_view> + * using namespace std::string_view_literals; + * UnicodeString str(u"literal"sv); + * if (str == u"other literal"sv) { ... } + * \endcode + * + * @param text UTF-16 string * @param textLength string length * @stable ICU 59 */ @@ -3068,6 +3245,26 @@ class U_COMMON_API UnicodeString : public Replaceable */ inline UnicodeString(const std::nullptr_t text, int32_t textLength); +#ifndef U_HIDE_DRAFT_API + /** + * Constructor from `text` + * which is, or which is implicitly convertible to, + * a std::u16string_view or (if U_SIZEOF_WCHAR_T==2) std::wstring_view. + * The string is bogus if the string view is too long. + * + * If you need a UnicodeString but need not copy the string view contents, + * then you can call the UnicodeString::readOnlyAlias() function instead of this constructor. + * + * @param text UTF-16 string + * @draft ICU 76 + */ + template>> + explicit inline UnicodeString(const S &text) { + fUnion.fFields.fLengthAndFlags = kShortString; + doAppend(internal::toU16StringView(text)); + } +#endif // U_HIDE_DRAFT_API + /** * Readonly-aliasing char16_t* constructor. 
+ * The text will be used for the UnicodeString object, but @@ -3082,6 +3279,16 @@ class U_COMMON_API UnicodeString : public Replaceable * When using fastCopyFrom(), the text will be aliased again, * so that both strings then alias the same readonly-text. * + * Note, for string literals: + * Since C++17 and ICU 76, you can use std::u16string_view literals with compile-time + * length determination: + * \code + * #include <string_view> + * using namespace std::string_view_literals; + * UnicodeString alias = UnicodeString::readOnlyAlias(u"literal"sv); + * if (alias == u"other literal"sv) { ... } + * \endcode + * * @param isTerminated specifies if `text` is `NUL`-terminated. * This must be true if `textLength==-1`. * @param text The characters to alias for the UnicodeString. @@ -3160,8 +3367,16 @@ class U_COMMON_API UnicodeString : public Replaceable * * For ASCII (really "invariant character") strings it is more efficient to use * the constructor that takes a US_INV (for its enum EInvariant). - * For ASCII (invariant-character) string literals, see UNICODE_STRING and - * UNICODE_STRING_SIMPLE. + * + * Note, for string literals: + * Since C++17 and ICU 76, you can use std::u16string_view literals with compile-time + * length determination: + * \code + * #include <string_view> + * using namespace std::string_view_literals; + * UnicodeString str(u"literal"sv); + * if (str == u"other literal"sv) { ... } + * \endcode * * It is recommended to mark this constructor "explicit" by * `-DUNISTR_FROM_STRING_EXPLICIT=explicit` @@ -3169,8 +3384,6 @@ class U_COMMON_API UnicodeString : public Replaceable * @param codepageData an array of bytes, null-terminated, * in the platform's default codepage. * @stable ICU 2.0 - * @see UNICODE_STRING - * @see UNICODE_STRING_SIMPLE */ UNISTR_FROM_STRING_EXPLICIT UnicodeString(const char *codepageData); @@ -3270,6 +3483,17 @@ class U_COMMON_API UnicodeString : public Replaceable * // use ustr ...
* } * \endcode + * + * Note, for string literals: + * Since C++17 and ICU 76, you can use std::u16string_view literals with compile-time + * length determination: + * \code + * #include <string_view> + * using namespace std::string_view_literals; + * UnicodeString str(u"literal"sv); + * if (str == u"other literal"sv) { ... } + * \endcode + * * @param src String using only invariant characters. * @param textLength Length of src, or -1 if NUL-terminated. * @param inv Signature-distinguishing parameter, use US_INV. @@ -3343,6 +3567,35 @@ class U_COMMON_API UnicodeString : public Replaceable */ virtual ~UnicodeString(); +#ifndef U_HIDE_DRAFT_API + /** + * Readonly-aliasing factory method. + * Aliases the same buffer as the input `text` + * which is, or which is implicitly convertible to, + * a std::u16string_view or (if U_SIZEOF_WCHAR_T==2) std::wstring_view. + * The string is bogus if the string view is too long. + * + * The text will be used for the UnicodeString object, but + * it will not be released when the UnicodeString is destroyed. + * This has copy-on-write semantics: + * When the string is modified, then the buffer is first copied into + * newly allocated memory. + * The aliased buffer is never modified. + * + * In an assignment to another UnicodeString, when using the copy constructor + * or the assignment operator, the text will be copied. + * When using fastCopyFrom(), the text will be aliased again, + * so that both strings then alias the same readonly-text. + * + * @param text The string view to alias for the UnicodeString. + * @draft ICU 76 + */ + template>> + static inline UnicodeString readOnlyAlias(const S &text) { + return readOnlyAliasFromU16StringView(internal::toU16StringView(text)); + } +#endif // U_HIDE_DRAFT_API + /** * Create a UnicodeString from a UTF-8 string. * Illegal input is replaced with U+FFFD. Otherwise, errors result in a bogus string. 
@@ -3470,6 +3723,8 @@ class U_COMMON_API UnicodeString : public Replaceable virtual UChar32 getChar32At(int32_t offset) const override; private: + static UnicodeString readOnlyAliasFromU16StringView(std::u16string_view text); + // For char* constructors. Could be made public. UnicodeString &setToUTF8(StringPiece utf8); // For extract(char*). @@ -3485,7 +3740,10 @@ class U_COMMON_API UnicodeString : public Replaceable * Internal string contents comparison, called by operator==. * Requires: this & text not bogus and have same lengths. */ - UBool doEquals(const UnicodeString &text, int32_t len) const; + inline UBool doEquals(const UnicodeString &text, int32_t len) const { + return doEquals(text.getArrayStart(), len); + } + UBool doEquals(const char16_t *text, int32_t len) const; inline UBool doEqualsSubstring(int32_t start, @@ -3580,9 +3838,11 @@ class U_COMMON_API UnicodeString : public Replaceable const char16_t *srcChars, int32_t srcStart, int32_t srcLength); + UnicodeString& doReplace(int32_t start, int32_t length, std::u16string_view src); UnicodeString& doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength); UnicodeString& doAppend(const char16_t *srcChars, int32_t srcStart, int32_t srcLength); + UnicodeString& doAppend(std::u16string_view src); UnicodeString& doReverse(int32_t start, int32_t length); @@ -3802,7 +4062,7 @@ class U_COMMON_API UnicodeString : public Replaceable }; /** - * Create a new UnicodeString with the concatenation of two others. + * Creates a new UnicodeString from the concatenation of two others. * * @param s1 The first string to be copied to the new one. * @param s2 The second string to be copied to the new one, after s1. 
@@ -3812,6 +4072,29 @@ class U_COMMON_API UnicodeString : public Replaceable U_COMMON_API UnicodeString U_EXPORT2 operator+ (const UnicodeString &s1, const UnicodeString &s2); +#ifndef U_HIDE_DRAFT_API +/** + * Creates a new UnicodeString from the concatenation of a UnicodeString and `s2` + * which is, or which is implicitly convertible to, + * a std::u16string_view or (if U_SIZEOF_WCHAR_T==2) std::wstring_view. + * + * @param s1 The string to be copied to the new one. + * @param s2 The string view to be copied to the new string, after s1. + * @return UnicodeString(s1).append(s2) + * @draft ICU 76 + */ +template>> +inline UnicodeString operator+(const UnicodeString &s1, const S &s2) { + return unistr_internalConcat(s1, internal::toU16StringView(s2)); +} +#endif // U_HIDE_DRAFT_API + +#ifndef U_FORCE_HIDE_INTERNAL_API +/** @internal */ +U_COMMON_API UnicodeString U_EXPORT2 +unistr_internalConcat(const UnicodeString &s1, std::u16string_view s2); +#endif + //======================================== // Inline members //======================================== diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index c572a1fa4391..acb73b1788f3 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -18,6 +18,8 @@ * Character property dependent functions moved here from uniset.cpp */ +#include + #include "unicode/utypes.h" #include "unicode/uniset.h" #include "unicode/parsepos.h" @@ -45,16 +47,23 @@ #include "uassert.h" #include "hash.h" +// Makes u"literal"sv std::u16string_view literals possible. 
+// https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv +using namespace std::string_view_literals; + U_NAMESPACE_USE +namespace { + // Special property set IDs -static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF] -static const char ASCII[] = "ASCII"; // [\u0000-\u007F] -static const char ASSIGNED[] = "Assigned"; // [:^Cn:] +constexpr char ANY[] = "ANY"; // [\u0000-\U0010FFFF] +constexpr char ASCII[] = "ASCII"; // [\u0000-\u007F] +constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:] // Unicode name property alias -#define NAME_PROP "na" -#define NAME_PROP_LENGTH 2 +constexpr std::u16string_view NAME_PROP(u"na"sv); + +} // namespace // Cached sets ------------------------------------------------------------- *** @@ -83,7 +92,7 @@ namespace { // Cache some sets for other services -------------------------------------- *** void U_CALLCONV createUni32Set(UErrorCode &errorCode) { U_ASSERT(uni32Singleton == nullptr); - uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode); + uni32Singleton = new UnicodeSet(UnicodeString(u"[:age=3.2:]"sv), errorCode); if(uni32Singleton==nullptr) { errorCode=U_MEMORY_ALLOCATION_ERROR; } else { @@ -1105,7 +1114,7 @@ UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, // support args of (UProperty, char*) then we can remove // NAME_PROP and make this a little more efficient. 
valueName = propName; - propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); + propName = NAME_PROP; } } diff --git a/icu4c/source/common/unistr.cpp b/icu4c/source/common/unistr.cpp index d6dcfc5ca446..6389c85134f1 100644 --- a/icu4c/source/common/unistr.cpp +++ b/icu4c/source/common/unistr.cpp @@ -20,6 +20,8 @@ ****************************************************************************** */ +#include + #include "unicode/utypes.h" #include "unicode/appendable.h" #include "unicode/putil.h" @@ -107,12 +109,34 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString) UnicodeString U_EXPORT2 operator+ (const UnicodeString &s1, const UnicodeString &s2) { - return - UnicodeString(s1.length() + s2.length() + 1, static_cast(0), 0). - append(s1). - append(s2); + int32_t sumLengths; + if (uprv_add32_overflow(s1.length(), s2.length(), &sumLengths)) { + UnicodeString bogus; + bogus.setToBogus(); + return bogus; + } + if (sumLengths != INT32_MAX) { + ++sumLengths; // space for a terminating NUL if we need one + } + return UnicodeString(sumLengths, static_cast(0), 0).append(s1).append(s2); +} + +U_COMMON_API UnicodeString U_EXPORT2 +unistr_internalConcat(const UnicodeString &s1, std::u16string_view s2) { + int32_t sumLengths; + if (s2.length() > INT32_MAX || + uprv_add32_overflow(s1.length(), (int32_t)s2.length(), &sumLengths)) { + UnicodeString bogus; + bogus.setToBogus(); + return bogus; + } + if (sumLengths != INT32_MAX) { + ++sumLengths; // space for a terminating NUL if we need one + } + return UnicodeString(sumLengths, static_cast(0), 0).append(s1).append(s2); } + //======================================== // Reference Counting functions, put at top of file so that optimizing compilers // have a chance to automatically inline. 
@@ -279,6 +303,16 @@ UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) { } } +UnicodeString UnicodeString::readOnlyAliasFromU16StringView(std::u16string_view text) { + UnicodeString result; + if (text.length() <= INT32_MAX) { + result.setTo(false, text.data(), (int32_t)text.length()); + } else { + result.setToBogus(); + } + return result; +} + #if U_CHARSET_IS_UTF8 UnicodeString::UnicodeString(const char *codepageData) { @@ -656,10 +690,10 @@ UChar32 UnicodeString::unescapeAt(int32_t &offset) const { // Read-only implementation //======================================== UBool -UnicodeString::doEquals(const UnicodeString &text, int32_t len) const { - // Requires: this & text not bogus and have same lengths. +UnicodeString::doEquals(const char16_t *text, int32_t len) const { + // Requires: this string is not bogus, and text has the same length (len) as this string. // Byte-wise comparison works for equality regardless of endianness. - return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0; + return uprv_memcmp(getArrayStart(), text, len * U_SIZEOF_UCHAR) == 0; } UBool @@ -1574,6 +1608,18 @@ UnicodeString::doReplace(int32_t start, return *this; } +UnicodeString& +UnicodeString::doReplace(int32_t start, int32_t length, std::u16string_view src) { + if (!isWritable()) { + return *this; + } + if (src.length() > INT32_MAX) { + setToBogus(); + return *this; + } + return doReplace(start, length, src.data(), 0, (int32_t)src.length()); +} + // Versions of doReplace() only for append() variants. // doReplace() and doAppend() optimize for different cases.
@@ -1662,6 +1708,18 @@ UnicodeString::doAppend(const char16_t *srcChars, int32_t srcStart, int32_t srcL return *this; } +UnicodeString& +UnicodeString::doAppend(std::u16string_view src) { + if (!isWritable() || src.empty()) { + return *this; + } + if (src.length() > INT32_MAX) { + setToBogus(); + return *this; + } + return doAppend(src.data(), 0, (int32_t)src.length()); +} + /** * Replaceable API */ diff --git a/icu4c/source/i18n/number_decimalquantity.cpp b/icu4c/source/i18n/number_decimalquantity.cpp index a8c033261d09..c3fda6c8ab95 100644 --- a/icu4c/source/i18n/number_decimalquantity.cpp +++ b/icu4c/source/i18n/number_decimalquantity.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "unicode/plurrule.h" @@ -22,6 +23,10 @@ #include "uassert.h" #include "util.h" +// Makes u"literal"sv std::u16string_view literals possible. +// https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv +using namespace std::string_view_literals; + using namespace icu; using namespace icu::number; using namespace icu::number::impl; @@ -1082,7 +1087,7 @@ UnicodeString DecimalQuantity::toScientificString() const { result.append(u'E'); int32_t _scale = upperPos + scale + exponent; if (_scale == INT32_MIN) { - result.append({u"-2147483648", -1}); + result.append(u"-2147483648"sv); return result; } else if (_scale < 0) { _scale *= -1; diff --git a/icu4c/source/test/intltest/ustrtest.cpp b/icu4c/source/test/intltest/ustrtest.cpp index fea50e881309..7a34b8a2f560 100644 --- a/icu4c/source/test/intltest/ustrtest.cpp +++ b/icu4c/source/test/intltest/ustrtest.cpp @@ -6,6 +6,8 @@ * others. All Rights Reserved. ********************************************************************/ +#include +#include #include #include "ustrtest.h" @@ -22,6 +24,13 @@ #include "cmemory.h" #include "charstr.h" +// Makes u"literal"sv std::u16string_view literals possible. 
+// https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv +using namespace std::string_view_literals; + +// Same for u"literal"s std::u16string literals. +using namespace std::string_literals; + #if 0 #include "unicode/ustream.h" @@ -68,6 +77,8 @@ void UnicodeStringTest::runIndexedTest( int32_t index, UBool exec, const char* & TESTCASE_AUTO(TestNullPointers); TESTCASE_AUTO(TestUnicodeStringInsertAppendToSelf); TESTCASE_AUTO(TestLargeAppend); + TESTCASE_AUTO(TestU16StringView); + TESTCASE_AUTO(TestWStringView); TESTCASE_AUTO_END; } @@ -2400,3 +2411,177 @@ void UnicodeStringTest::TestLargeAppend() { } } } + +void UnicodeStringTest::TestU16StringView() { + IcuTestErrorCode status(*this, "TestU16StringView"); + // ICU-22843 Test ICU 76 new UnicodeString APIs that take or return a std::u16string_view + // or something convertible to it. + // NOTE: Keep this function very parallel with TestWStringView()! + const char16_t *p16 = u"p16"; + std::u16string_view sv16 = u"sv16"; + std::u16string str16 = u"str16"; + + // These copy the string contents. 
+ UnicodeString fromPtr(p16); // pointer is convertible to std::u16string_view + UnicodeString fromSV(sv16); // std::u16string_view itself + UnicodeString fromSV2(u"sv16_2"sv); // std::u16string_view literal + UnicodeString fromStr(str16); // std::u16string is convertible to std::u16string_view + assertEquals("UnicodeString(const char16_t *pointer)", UnicodeString(u"p16", 3), fromPtr); + assertEquals("UnicodeString(std::u16string_view)", UnicodeString(u"sv16", 4), fromSV); + assertEquals("UnicodeString(std::u16string_view literal)", UnicodeString(u"sv16_2", 6), fromSV2); + assertEquals("UnicodeString(std::u16string)", UnicodeString(u"str16", 5), fromStr); + + // Read-only aliases + UnicodeString aliasFromPtr = UnicodeString::readOnlyAlias(p16); + assertTrue("aliasFromPtr pointer alias", aliasFromPtr.getBuffer() == p16); + assertEquals("aliasFromPtr length", 3, aliasFromPtr.length()); + + UnicodeString aliasFromSV = UnicodeString::readOnlyAlias(sv16); + assertTrue("aliasFromSV pointer alias", aliasFromSV.getBuffer() == sv16.data()); + assertEquals("aliasFromSV length", (int32_t)sv16.length(), aliasFromSV.length()); + + UnicodeString aliasFromStr = UnicodeString::readOnlyAlias(str16); + assertTrue("aliasFromStr pointer alias", aliasFromStr.getBuffer() == str16.data()); + assertEquals("aliasFromStr length", (int32_t)str16.length(), aliasFromStr.length()); + + // operator== + UnicodeString any(true, u"any", 3); + assertFalse("any == pointer-p16", any == p16); + assertTrue("any == pointer-any", any == u"any"); + assertFalse("any == string_view-sv16", any == sv16); + assertTrue("any == string_view-any", any == u"any"sv); + assertFalse("any == string-str16", any == str16); + assertTrue("any == string-any", any == u"any"s); + + // Assignment copies the string contents. 
+    UnicodeString x;
+    x = p16;
+    assertEquals("x = p16", UnicodeString(true, u"p16", 3), x);
+    x = sv16;
+    assertEquals("x = sv16", UnicodeString(true, u"sv16", 4), x);
+    x = str16;
+    assertEquals("x = str16", UnicodeString(true, u"str16", 5), x);
+
+    // Append
+    x += p16;
+    assertEquals("+= p16", UnicodeString(true, u"str16p16", 8), x);
+    x += sv16;
+    assertEquals("+= sv16", UnicodeString(true, u"str16p16sv16", 12), x);
+    x += str16;
+    assertEquals("+= str16", UnicodeString(true, u"str16p16sv16str16", 17), x);
+
+    x = u"x"sv;
+    x.append(p16);
+    assertEquals("append(p16)", UnicodeString(true, u"xp16", 4), x);
+    x.append(sv16);
+    assertEquals("append(sv16)", UnicodeString(true, u"xp16sv16", 8), x);
+    x.append(str16);
+    assertEquals("append(str16)", UnicodeString(true, u"xp16sv16str16", 13), x);
+
+    // Convert UnicodeString to string view.
+    std::u16string_view sv16FromUniStr(any);
+    assertTrue("sv16FromUniStr buffer alias", sv16FromUniStr.data() == any.getBuffer());
+    assertEquals("sv16FromUniStr length", any.length(), (int32_t)sv16FromUniStr.length());
+
+    // Just to show convenience: Convert UnicodeString to string view, then to std string.
+    std::u16string str16FromUniStr(any);
+    assertTrue("str16FromUniStr contents", str16FromUniStr == u"any"s);
+
+    // operator+
+    x = any + p16;
+    assertEquals("any + p16", UnicodeString(true, u"anyp16", 6), x);
+    x = any + sv16;
+    assertEquals("any + sv16", UnicodeString(true, u"anysv16", 7), x);
+    x = any + str16;
+    assertEquals("any + str16", UnicodeString(true, u"anystr16", 8), x);
+}
+
+void UnicodeStringTest::TestWStringView() {
+#if U_SIZEOF_WCHAR_T==2  // the wstring_view overloads exist only for a 16-bit wchar_t; otherwise this test is empty
+    IcuTestErrorCode status(*this, "TestWStringView");  // name must match this test so failures are attributed correctly
+    // ICU-22843 Test ICU 76 new UnicodeString APIs that take or return a std::wstring_view
+    // or something convertible to it (const wchar_t *, std::wstring).
+    // NOTE: Keep this function very parallel with TestU16StringView()!
+    const wchar_t *p16 = L"p16";
+    std::wstring_view sv16 = L"sv16";
+    std::wstring str16 = L"str16";
+
+    // These copy the string contents.
+    UnicodeString fromPtr(p16);  // pointer is convertible to std::wstring_view
+    UnicodeString fromSV(sv16);  // std::wstring_view itself
+    UnicodeString fromSV2(L"sv16_2"sv);  // std::wstring_view literal
+    UnicodeString fromStr(str16);  // std::wstring is convertible to std::wstring_view
+    assertEquals("UnicodeString(const wchar_t *pointer)", UnicodeString(L"p16", 3), fromPtr);
+    assertEquals("UnicodeString(std::wstring_view)", UnicodeString(L"sv16", 4), fromSV);
+    assertEquals("UnicodeString(std::wstring_view literal)", UnicodeString(L"sv16_2", 6), fromSV2);
+    assertEquals("UnicodeString(std::wstring)", UnicodeString(L"str16", 5), fromStr);
+
+    // Read-only aliases: getBuffer() must point at the caller's storage, not at a copy.
+    UnicodeString aliasFromPtr = UnicodeString::readOnlyAlias(p16);
+    assertTrue("aliasFromPtr pointer alias",
+               aliasFromPtr.getBuffer() == reinterpret_cast<const char16_t *>(p16));
+    assertEquals("aliasFromPtr length", 3, aliasFromPtr.length());
+
+    UnicodeString aliasFromSV = UnicodeString::readOnlyAlias(sv16);
+    assertTrue("aliasFromSV pointer alias",
+               aliasFromSV.getBuffer() == reinterpret_cast<const char16_t *>(sv16.data()));
+    assertEquals("aliasFromSV length", (int32_t)sv16.length(), aliasFromSV.length());
+
+    UnicodeString aliasFromStr = UnicodeString::readOnlyAlias(str16);
+    assertTrue("aliasFromStr pointer alias",
+               aliasFromStr.getBuffer() == reinterpret_cast<const char16_t *>(str16.data()));
+    assertEquals("aliasFromStr length", (int32_t)str16.length(), aliasFromStr.length());
+
+    // operator== against pointer, string view and std string, with unequal and equal contents.
+    UnicodeString any(true, L"any", 3);
+    assertFalse("any == pointer-p16", any == p16);
+    assertTrue("any == pointer-any", any == L"any");
+    assertFalse("any == string_view-sv16", any == sv16);
+    assertTrue("any == string_view-any", any == L"any"sv);
+    assertFalse("any == string-str16", any == str16);
+    assertTrue("any == string-any", any == L"any"s);
+
+    // Assignment copies the string contents.
+    UnicodeString x;
+    x = p16;
+    assertEquals("x = p16", UnicodeString(true, L"p16", 3), x);
+    x = sv16;
+    assertEquals("x = sv16", UnicodeString(true, L"sv16", 4), x);
+    x = str16;
+    assertEquals("x = str16", UnicodeString(true, L"str16", 5), x);
+
+    // Append via operator+=; x starts out as "str16" from the assignment above.
+    x += p16;
+    assertEquals("+= p16", UnicodeString(true, L"str16p16", 8), x);
+    x += sv16;
+    assertEquals("+= sv16", UnicodeString(true, L"str16p16sv16", 12), x);
+    x += str16;
+    assertEquals("+= str16", UnicodeString(true, L"str16p16sv16str16", 17), x);
+
+    x = L"x"sv;  // reset to a short string before exercising append()
+    x.append(p16);
+    assertEquals("append(p16)", UnicodeString(true, L"xp16", 4), x);
+    x.append(sv16);
+    assertEquals("append(sv16)", UnicodeString(true, L"xp16sv16", 8), x);
+    x.append(str16);
+    assertEquals("append(str16)", UnicodeString(true, L"xp16sv16str16", 13), x);
+
+    // Convert UnicodeString to string view; the view must alias the string's own buffer.
+    std::wstring_view sv16FromUniStr(any);
+    assertTrue("sv16FromUniStr buffer alias",
+               reinterpret_cast<const char16_t *>(sv16FromUniStr.data()) == any.getBuffer());
+    assertEquals("sv16FromUniStr length", any.length(), (int32_t)sv16FromUniStr.length());
+
+    // Just to show convenience: Convert UnicodeString to string view, then to std string.
+    std::wstring str16FromUniStr(any);
+    assertTrue("str16FromUniStr contents", str16FromUniStr == L"any"s);
+
+    // operator+ with each right-hand operand type; the result is a new UnicodeString.
+    x = any + p16;
+    assertEquals("any + p16", UnicodeString(true, L"anyp16", 6), x);
+    x = any + sv16;
+    assertEquals("any + sv16", UnicodeString(true, L"anysv16", 7), x);
+    x = any + str16;
+    assertEquals("any + str16", UnicodeString(true, L"anystr16", 8), x);
+#endif  // U_SIZEOF_WCHAR_T==2
+}
diff --git a/icu4c/source/test/intltest/ustrtest.h b/icu4c/source/test/intltest/ustrtest.h
index 088b71399d10..47d4a8acc825 100644
--- a/icu4c/source/test/intltest/ustrtest.h
+++ b/icu4c/source/test/intltest/ustrtest.h
@@ -98,6 +98,8 @@ class UnicodeStringTest: public IntlTest {
     void TestNullPointers();
     void TestUnicodeStringInsertAppendToSelf();
     void TestLargeAppend();
+    void TestU16StringView();
+    void TestWStringView();
 };
 
 #endif