forked from andrewprock/ustl
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutf8.h
213 lines (191 loc) · 8.6 KB
/
utf8.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
// This file is part of the uSTL library, an STL implementation.
//
// Copyright (c) 2005 by Mike Sharov <[email protected]>
// This file is free software, distributed under the MIT License.
//
// This file contains stream iterators that read and write UTF-8 encoded
// characters. The encoding is defined as follows:
//
// U-00000000 - U-0000007F: 0xxxxxxx
// U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
// U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
// U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// U-80000000 - U-FFFFFFFF: 11111110 100000xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
#pragma once
#include "uiterator.h"
namespace ustl {
//----------------------------------------------------------------------
typedef uint8_t utf8subchar_t; ///< Type of one byte of a UTF-8 encoded sequence (an encoding "subcharacter").
//----------------------------------------------------------------------
// Forward declarations of the free helper functions defined below. They carry
// the GCC function attributes:
//   const - the result depends only on the argument values;
//   pure  - no side effects, but the function may read memory through its
//           pointer arguments (here, the [first, last) array).
inline size_t Utf8Bytes (wchar_t v) __attribute__((const));
inline size_t Utf8Bytes (const wchar_t* first, const wchar_t* last) __attribute__((pure));
inline size_t Utf8SequenceBytes (wchar_t c) __attribute__((const));
//----------------------------------------------------------------------
/// Computes how many bytes the UTF-8 encoding of the character \p v occupies.
/// Values below 128 take one byte; anything larger takes between 2 and 7.
inline size_t Utf8Bytes (wchar_t v)
{
    const uint32_t code = uint32_t(v);
    if (code < 128)
        return 1;
    size_t n;
#if __x86__
    // n = (index of the highest set bit + 4) / 5, computed with bsr/add/div.
    // r provides the zero upper half of the dividend and receives the remainder.
    uint32_t r = 0;
    asm ("bsr\t%2, %%eax\n\t"
         "add\t$4, %0\n\t"
         "div\t%3":"=a"(n),"+d"(r):"r"(v),"c"(5));
#else
    // c_Bounds[k] is the largest value that fits into a (k + 1)-byte sequence.
    static const uint32_t c_Bounds[7] = { 0x0000007F, 0x000007FF, 0x0000FFFF, 0x001FFFFF, 0x03FFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
    // Walk the table until a bound that can hold the value is found; the last
    // entry is 0xFFFFFFFF, so the scan always stops inside the table.
    n = 1;
    while (c_Bounds[n - 1] < code)
        ++n;
#endif
    return n;
}
/// Totals the UTF-8 encoded size, in bytes, of the wchar_t range [first, last).
/// Returns 0 when \p first is not before \p last.
inline size_t Utf8Bytes (const wchar_t* first, const wchar_t* last)
{
    size_t total = 0;
    while (first < last) {
        total += Utf8Bytes(*first);	// per-character size from the single-value overload
        ++first;
    }
    return total;
}
/// Returns the length in bytes of the UTF-8 sequence whose first byte is \p c.
/// The parameter is a wchar_t only to keep \p c in a full register.
inline size_t Utf8SequenceBytes (wchar_t c)
{
    // The header of a lead byte is one 1 bit per byte of the sequence followed
    // by a 0 bit, so the length is the count of consecutive 1 bits from bit 7 down:
    //   0 ones  - plain single byte character;
    //   1 one   - a continuation byte, i.e. the middle of a character. It is
    //             reported as length 1, so a misaligned reader steps one byte
    //             at a time until it reaches the next lead byte;
    //   2+ ones - lead byte of a multibyte character.
    // Malformed input is not reported, since the user can not correct it.
    size_t leadingOnes = 0;
    wchar_t probe = 0x80;
    while (c & probe) {
        probe >>= 1;
        ++leadingOnes;
    }
    // A sequence is never shorter than one byte.
    return leadingOnes == 0 ? 1 : leadingOnes;
}
//----------------------------------------------------------------------
/// \class utf8in_iterator utf8.h ustl.h
/// \ingroup IteratorAdaptors
///
/// \brief An iterator adaptor to character containers for reading UTF-8 encoded text.
///
/// For example, you can copy from ustl::string to ustl::vector<wchar_t> with
/// copy (utf8in (str.begin()), utf8in (str.end()), back_inserter(wvect));
/// There is no error handling; if the reading frame slips you'll get extra
/// characters, one for every misaligned byte. Although it is possible to skip
/// to the start of the next character, that would result in omitting the
/// misformatted character and the one after it, making it very difficult to
/// detect by the user. It is better to write some strange characters and let
/// the user know his file is corrupted. Another problem is overflow on bad
/// encodings (like a 0xFF on the end of a string). This is checked through
/// the end-of-string nul character, which will always be there as long as
/// you are using the string class.
///
template <typename Iterator, typename WChar = wchar_t>
class utf8in_iterator {
public:
    typedef typename iterator_traits<Iterator>::value_type	value_type;
    typedef typename iterator_traits<Iterator>::difference_type	difference_type;
    typedef typename iterator_traits<Iterator>::pointer		pointer;
    typedef typename iterator_traits<Iterator>::reference	reference;
    typedef input_iterator_tag					iterator_category;
public:
    /// Wraps \p is and immediately decodes the character located there, so
    /// \p is is dereferenced on construction.
    explicit utf8in_iterator (const Iterator& is) : _i (is), _v (0) { Read(); }
    /// Copies the position and the already decoded value; nothing is re-read.
    utf8in_iterator (const utf8in_iterator& i) : _i (i._i), _v (i._v) {}
    /// Assigns the position and decoded value of \p i.
    inline const utf8in_iterator& operator= (const utf8in_iterator& i) { _i = i._i; _v = i._v; return *this; }
    /// Returns the wrapped position stepped back by Utf8Bytes(current value) - 1
    /// bytes. NOTE(review): this is the first byte of the current character only
    /// if it was stored with the length Utf8Bytes reports for its value.
    inline Iterator base (void) const { return _i - (Utf8Bytes(_v) - 1); }
    /// Returns the character decoded by the most recent read; the underlying
    /// data is not accessed again.
    inline WChar operator* (void) const { return _v; }
    /// Moves to the next character and decodes it.
    inline utf8in_iterator& operator++ (void) { ++_i; Read(); return *this; }
    /// Moves to the next character; returns a copy of the previous state.
    inline utf8in_iterator operator++ (int) { utf8in_iterator old (*this); operator++(); return old; }
    /// Advances by \p n characters, one increment at a time.
    inline utf8in_iterator& operator+= (uoff_t n) { while (n--) operator++(); return *this; }
    /// Returns a copy advanced by \p n characters. Const-qualified because it
    /// only works on a copy, so it is usable on const iterators as well.
    inline utf8in_iterator operator+ (uoff_t n) const { utf8in_iterator v (*this); return v += n; }
    /// Compares the wrapped positions only; decoded values are not compared.
    inline bool operator== (const utf8in_iterator& i) const { return _i == i._i; }
    /// Orders by the wrapped positions only.
    inline bool operator< (const utf8in_iterator& i) const { return _i < i._i; }
    /// Distance from \p i to this iterator, counted in characters rather than bytes.
    difference_type operator- (const utf8in_iterator& i) const;
private:
    /// Decodes the character at _i into _v.
    void Read (void);
private:
    Iterator _i;	///< Wrapped position; after a read it rests on the last byte consumed.
    WChar _v;		///< Value of the most recently decoded character.
};
/// Decodes the character that starts at the current position into _v and
/// leaves _i on the last byte that was consumed. Decoding stops early when a
/// zero byte is met where a continuation byte was expected; _i is then left
/// on that zero byte.
template <typename Iterator, typename WChar>
void utf8in_iterator<Iterator,WChar>::Read (void)
{
    const utf8subchar_t lead = *_i;
    size_t remaining = Utf8SequenceBytes (lead);
    // The lead byte contributes whatever bits follow its length header.
    _v = lead & (0xFF >> remaining);
    for (;;) {
        if (!--remaining)	// every byte of the sequence has been consumed
            break;
        if (!*++_i)		// hit a zero byte: stop rather than run past it
            break;
        // Each subsequent byte carries 6 payload bits.
        _v = (_v << 6) | (*_i & 0x3F);
    }
}
/// Counts the characters (not the bytes) between \p last and this iterator by
/// hopping from sequence to sequence, starting at the position of \p last.
template <typename Iterator, typename WChar>
typename utf8in_iterator<Iterator,WChar>::difference_type
utf8in_iterator<Iterator,WChar>::operator- (const utf8in_iterator<Iterator,WChar>& last) const
{
    difference_type count = 0;
    Iterator pos (last._i);
    while (pos < _i) {
        // Skip the whole sequence announced by the byte at pos.
        pos = advance (pos, Utf8SequenceBytes (*pos));
        ++count;
    }
    return count;
}
//----------------------------------------------------------------------
/// \class utf8out_iterator utf8.h ustl.h
/// \ingroup IteratorAdaptors
///
/// \brief An iterator adaptor to character containers for writing UTF-8 encoded text.
///
template <typename Iterator, typename WChar = wchar_t>
class utf8out_iterator {
public:
    typedef output_iterator_tag					iterator_category;
    typedef typename iterator_traits<Iterator>::value_type	value_type;
    typedef typename iterator_traits<Iterator>::difference_type	difference_type;
    typedef typename iterator_traits<Iterator>::pointer		pointer;
    typedef typename iterator_traits<Iterator>::reference	reference;
public:
    /// Wraps the output position \p os; nothing is written until a value is assigned.
    explicit utf8out_iterator (const Iterator& os)
        : _i (os)
    {
    }
    /// Copies the wrapped output position of \p i.
    utf8out_iterator (const utf8out_iterator& i)
        : _i (i._i)
    {
    }
    /// Encodes \p v as UTF-8 and writes it through the wrapped iterator.
    utf8out_iterator& operator= (WChar v);
    /// Gives access to the wrapped output position.
    const Iterator& base () const
    {
        return _i;
    }
    // Dereference and both increments are no-ops that return the adaptor itself.
    utf8out_iterator& operator* ()
    {
        return *this;
    }
    utf8out_iterator& operator++ ()
    {
        return *this;
    }
    utf8out_iterator& operator++ (int)
    {
        return *this;
    }
    /// Compares the wrapped positions.
    bool operator== (const utf8out_iterator& i) const
    {
        return _i == i._i;
    }
    /// Orders by the wrapped positions.
    bool operator< (const utf8out_iterator& i) const
    {
        return _i < i._i;
    }
private:
    Iterator _i;	///< Wrapped output position.
};
/// Encodes \p v as UTF-8 and writes the resulting bytes through the wrapped
/// iterator, post-incrementing it once per byte written.
/// \return *this.
template <typename Iterator, typename WChar>
utf8out_iterator<Iterator,WChar>& utf8out_iterator<Iterator,WChar>::operator= (WChar v)
{
    const size_t nBytes = Utf8Bytes (v);
    if (nBytes > 1) {
        // Encode from an unsigned 32-bit copy so that right shifts never drag
        // a sign bit into the payload of values at or above 0x80000000.
        const uint32_t bits = uint32_t(v);
        // The payload is written 6 bits at a time, most significant group
        // first; shift is the position of the group being written.
        size_t shift = nBytes * 6 - 6;
        // Lead byte: nBytes header 1 bits plus the payload bits that lie above
        // all continuation groups. For a 7-byte sequence shift is 36, which is
        // beyond the 32-bit value: shifting that far is undefined and there
        // are no payload bits up there, so the lead byte carries none.
        const uint32_t leadPayload = (shift < 32) ? (bits >> shift) : 0;
        *_i++ = (leadPayload & 0x3F) | (0xFF << (8 - nBytes));
        // Continuation bytes: 10xxxxxx, until the lowest group is written.
        while (shift) {
            shift -= 6;
            *_i++ = ((bits >> shift) & 0x3F) | 0x80;
        }
    } else // If only one byte, there is no header.
        *_i++ = v;
    return *this;
}
//----------------------------------------------------------------------
/// Builds a utf8out_iterator that UTF-8 encodes into \p i. Useful in
/// conjunction with back_insert_iterator.
template <typename Iterator>
inline utf8out_iterator<Iterator> utf8out (Iterator i)
{
    typedef utf8out_iterator<Iterator> adaptor_type;
    return adaptor_type (i);
}
/// Builds a utf8in_iterator that decodes UTF-8 text starting at \p i.
template <typename Iterator>
inline utf8in_iterator<Iterator> utf8in (Iterator i)
{
    typedef utf8in_iterator<Iterator> adaptor_type;
    return adaptor_type (i);
}
//----------------------------------------------------------------------
} // namespace ustl