18
18
#include < windows.h>
19
19
#endif
20
20
21
- // / Determine endianness of the architecture
22
- // / \return True if the architecture is little_endian
23
- bool is_little_endian_arch ()
24
- {
25
- uint32_t i=1 ;
26
- return reinterpret_cast <uint8_t &>(i) != 0 ;
27
- }
28
-
29
- #define BUFSIZE 100
30
-
31
21
std::string narrow (const wchar_t *s)
32
22
{
33
23
#ifdef _WIN32
@@ -138,9 +128,10 @@ static void utf8_append_code(unsigned int c, std::string &result)
138
128
}
139
129
}
140
130
141
- // / \param utf32: encoded wide string
131
+ // / \param s UTF-32 encoded wide string
142
132
// / \return utf8-encoded string with the same unicode characters as the input.
143
- std::string utf32_to_utf8 (const std::basic_string<unsigned int > &s)
133
+ std::string
134
+ utf32_native_endian_to_utf8 (const std::basic_string<unsigned int > &s)
144
135
{
145
136
std::string result;
146
137
@@ -166,51 +157,37 @@ std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
166
157
return argv_narrow;
167
158
}
168
159
169
- // / A helper function for dealing with different UTF16 endians
170
- // / \par parameters: A 16-bit integer
171
- // / \return A 16-bit integer with bytes swapped
172
- uint16_t do_swap_bytes (uint16_t x)
173
- {
174
- uint16_t b1=x & 0xFF ;
175
- uint16_t b2=x & 0xFF00 ;
176
- return (b1 << 8 ) | (b2 >> 8 );
177
- }
178
-
179
-
180
/// Append one Unicode code point to a wide string as UTF-16 code unit(s).
/// Code points up to and including 0xFFFF are appended as a single code unit;
/// larger code points are appended as a high/low surrogate pair.
/// \param code: Unicode code point to encode
/// \param result: wide string that receives the encoded code unit(s)
static void utf16_append_code(unsigned int code, std::wstring &result)
{
  // we do not treat 0xD800 to 0xDFFF, although
  // they are not valid unicode symbols

  if(code <= 0xFFFF)
  {
    // code is encoded as one UTF16 character; 0xFFFF itself belongs here,
    // otherwise the subtraction of 0x10000 below would wrap around and
    // produce the surrogate pair of a different code point
    result += static_cast<wchar_t>(code);
  }
  else // code is encoded as two UTF16 characters
  {
    // if this is valid unicode, we have
    // code<=0x10FFFF
    // but let's not check it programmatically

    // encode the code in UTF16: 10 high bits go into the first (high)
    // surrogate, 10 low bits into the second (low) surrogate
    code = code - 0x10000;
    const uint16_t i1 = static_cast<uint16_t>(((code >> 10) & 0x3ff) | 0xD800);
    result += static_cast<wchar_t>(i1);
    const uint16_t i2 = static_cast<uint16_t>((code & 0x3ff) | 0xDC00);
    result += static_cast<wchar_t>(i2);
  }
}
207
184
208
185
209
- // / \par parameters: String in UTF-8 format, bool value indicating whether the
210
- // / endianness should be different from the architecture one.
186
+ // / Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
187
+ // / \par parameters: String in UTF-8 format
211
188
// / \return String in UTF-16 format. The encoding follows the endianness of the
212
189
// / architecture; no byte swapping is performed.
213
- std::wstring utf8_to_utf16 (const std::string& in, bool swap_bytes )
190
+ std::wstring utf8_to_utf16_native_endian (const std::string &in )
214
191
{
215
192
std::wstring result;
216
193
result.reserve (in.size ());
@@ -263,33 +240,17 @@ std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
263
240
code=32 ;
264
241
}
265
242
266
- utf16_append_code (code, swap_bytes, result);
243
+ utf16_append_code (code, result);
267
244
}
268
245
269
246
return result;
270
247
}
271
248
272
- // / \par parameters: String in UTF-8 format
273
- // / \return String in UTF-16BE format
274
- std::wstring utf8_to_utf16_big_endian (const std::string &in)
275
- {
276
- bool swap_bytes=is_little_endian_arch ();
277
- return utf8_to_utf16 (in, swap_bytes);
278
- }
279
-
280
- // / \par parameters: String in UTF-8 format
281
- // / \return String in UTF-16LE format
282
- std::wstring utf8_to_utf16_little_endian (const std::string &in)
283
- {
284
- bool swap_bytes=!is_little_endian_arch ();
285
- return utf8_to_utf16 (in, swap_bytes);
286
- }
287
-
288
- // / \param ch: UTF-16LE character
249
+ // / \param ch: UTF-16 character in architecture-native endianness encoding
289
250
// / \param result: stream to receive string in US-ASCII format, with \\uxxxx
290
251
// / escapes for other characters
291
252
// / \param loc: locale to check for printable characters
292
- static void utf16_little_endian_to_java (
253
+ static void utf16_native_endian_to_java (
293
254
const wchar_t ch,
294
255
std::ostringstream &result,
295
256
const std::locale &loc)
@@ -326,23 +287,23 @@ static void utf16_little_endian_to_java(
326
287
}
327
288
}
328
289
329
- // / \param ch: UTF-16LE character
290
+ // / \param ch: UTF-16 character in architecture-native endianness encoding
330
291
// / \return String in US-ASCII format, with \\uxxxx escapes for other characters
331
- std::string utf16_little_endian_to_java (const wchar_t ch)
292
+ std::string utf16_native_endian_to_java (const wchar_t ch)
332
293
{
333
294
std::ostringstream result;
334
295
const std::locale loc;
335
- utf16_little_endian_to_java (ch, result, loc);
296
+ utf16_native_endian_to_java (ch, result, loc);
336
297
return result.str ();
337
298
}
338
299
339
- // / \param in: String in UTF-16LE format
300
+ // / \param in: String in UTF-16 (native endianness) format
340
301
// / \return String in US-ASCII format, with \\uxxxx escapes for other characters
341
- std::string utf16_little_endian_to_java (const std::wstring &in)
302
+ std::string utf16_native_endian_to_java (const std::wstring &in)
342
303
{
343
304
std::ostringstream result;
344
305
const std::locale loc;
345
306
for (const auto ch : in)
346
- utf16_little_endian_to_java (ch, result, loc);
307
+ utf16_native_endian_to_java (ch, result, loc);
347
308
return result.str ();
348
309
}
0 commit comments