8
8
9
9
#include < cstring>
10
10
#include < locale>
11
- #include < codecvt>
12
11
#include < iomanip>
13
12
#include < sstream>
13
+ #include < cstdint>
14
14
15
15
#include " unicode.h"
16
16
20
20
21
21
/* ******************************************************************\
22
22
23
+ Function: is_little_endian_arch
24
+
25
+ Inputs:
26
+
27
+ Outputs: True if the architecture is little_endian
28
+
29
+ Purpose: Determine endianness of the architecture
30
+
31
+ \*******************************************************************/
32
+
33
bool is_little_endian_arch()
{
  // Store the value 1 in a 32-bit word and look at the byte located at the
  // lowest address: it is non-zero when the least significant byte comes
  // first in memory.
  const uint32_t probe=1;
  const unsigned char *lowest_byte=
    reinterpret_cast<const unsigned char *>(&probe);
  return *lowest_byte!=0;
}
38
+
39
+ /* ******************************************************************\
40
+
23
41
Function: narrow
24
42
25
43
Inputs:
@@ -154,17 +172,17 @@ std::wstring widen(const std::string &s)
154
172
155
173
/* ******************************************************************\
156
174
157
- Function: utf32_to_utf8
175
+ Function: utf8_append_code
158
176
159
- Inputs:
177
+ Inputs: character to append, string to append to
160
178
161
179
Outputs:
162
180
163
- Purpose:
181
+ Purpose: Appends a unicode character to a utf8-encoded string
164
182
165
183
\*******************************************************************/
166
184
167
- void utf32_to_utf8 (unsigned int c, std::string &result)
185
+ static void utf8_append_code (unsigned int c, std::string &result)
168
186
{
169
187
if (c<=0x7f )
170
188
result+=static_cast <char >(c);
@@ -192,9 +210,10 @@ void utf32_to_utf8(unsigned int c, std::string &result)
192
210
193
211
Function: utf32_to_utf8
194
212
195
- Inputs:
213
+ Inputs: utf32-encoded wide string
196
214
197
- Outputs:
215
+ Outputs: utf8-encoded string with the same unicode characters
216
+ as the input.
198
217
199
218
Purpose:
200
219
@@ -207,14 +226,14 @@ std::string utf32_to_utf8(const std::basic_string<unsigned int> &s)
207
226
result.reserve (s.size ()); // at least that long
208
227
209
228
for (const auto c : s)
210
- utf32_to_utf8 (c, result);
229
+ utf8_append_code (c, result);
211
230
212
231
return result;
213
232
}
214
233
215
234
/* ******************************************************************\
216
235
217
- Function: utf16_to_utf8
236
+ Function: narrow_argv
218
237
219
238
Inputs:
220
239
@@ -224,43 +243,140 @@ Function: utf16_to_utf8
224
243
225
244
\*******************************************************************/
226
245
227
- std::string utf16_to_utf8 ( const std::basic_string< unsigned short int > &s )
246
+ const char ** narrow_argv ( int argc, const wchar_t **argv_wide )
228
247
{
229
- std::string result;
248
+ if (argv_wide==NULL )
249
+ return NULL ;
230
250
231
- result.reserve (s.size ()); // at least that long
251
+ // the following never gets deleted
252
+ const char **argv_narrow=new const char *[argc+1 ];
253
+ argv_narrow[argc]=0 ;
232
254
233
- for (const auto c : s )
234
- utf32_to_utf8 (c, result );
255
+ for (int i= 0 ; i<argc; i++ )
256
+ argv_narrow[i]= strdup ( narrow (argv_wide[i]). c_str () );
235
257
236
- return result ;
258
+ return argv_narrow ;
237
259
}
238
260
239
261
/* ******************************************************************\
240
262
241
- Function: narrow_argv
263
+ Function: do_swap_bytes
242
264
243
- Inputs:
265
+ Inputs: A 16-bit integer
244
266
245
- Outputs:
267
+ Outputs: A 16-bit integer with bytes swapped
246
268
247
- Purpose:
269
+ Purpose: A helper function for dealing with different UTF16 endians
248
270
249
271
\*******************************************************************/
250
272
251
- const char ** narrow_argv ( int argc, const wchar_t **argv_wide )
273
uint16_t do_swap_bytes(uint16_t x)
{
  // Isolate the two bytes, then put each one in the other's position.
  const unsigned int low_byte=x & 0xFFu;
  const unsigned int high_byte=(x >> 8) & 0xFFu;
  return static_cast<uint16_t>((low_byte << 8) | high_byte);
}
255
279
256
- // the following never gets deleted
257
- const char **argv_narrow=new const char *[argc+1 ];
258
- argv_narrow[argc]=0 ;
259
280
260
- for (int i=0 ; i<argc; i++)
261
- argv_narrow[i]=strdup (narrow (argv_wide[i]).c_str ());
281
+ void utf16_append_code (unsigned int code, bool swap_bytes, std::wstring &result)
282
+ {
283
+ // we do not treat 0xD800 to 0xDFFF, although
284
+ // they are not valid unicode symbols
285
+
286
+ if (code<0xFFFF )
287
+ { // code is encoded as one UTF16 character
288
+ // we just take the code and possibly swap the bytes
289
+ unsigned int a=(swap_bytes)?do_swap_bytes (code):code;
290
+ result+=static_cast <wchar_t >(a);
291
+ }
292
+ else // code is encoded as two UTF16 characters
293
+ {
294
+ // if this is valid unicode, we have
295
+ // code<0x10FFFF
296
+ // but let's not check it programmatically
297
+
298
+ // encode the code in UTF16, possibly swapping bytes.
299
+ code=code-0x10000 ;
300
+ unsigned int i1=((code>>10 ) & 0x3ff ) | 0xD800 ;
301
+ unsigned int a1=(swap_bytes)?do_swap_bytes (static_cast <uint16_t >(i1)):i1;
302
+ result+=static_cast <wchar_t >(a1);
303
+ unsigned int i2=(code & 0x3ff ) | 0xDC00 ;
304
+ unsigned int a2=(swap_bytes)?do_swap_bytes (static_cast <uint16_t >(i2)):i2;
305
+ result+=static_cast <wchar_t >(a2);
306
+ }
307
+ }
262
308
263
- return argv_narrow;
309
+
310
+ /* ******************************************************************\
311
+
312
+ Function: utf8_to_utf16
313
+
314
+ Inputs: String in UTF-8 format, bool value indicating whether the
315
+ endianness should be different from the architecture one.
316
+
317
+ Outputs: String in UTF-16 format. The encoding follows the
318
+ endianness of the architecture iff swap_bytes is false.
319
+
320
+ Purpose:
321
+
322
+ \*******************************************************************/
323
+ std::wstring utf8_to_utf16 (const std::string& in, bool swap_bytes)
324
+ {
325
+ std::wstring result;
326
+ result.reserve (in.size ());
327
+ int i=0 ;
328
+ while (i<in.size ())
329
+ {
330
+ unsigned char c=in[i++];
331
+ unsigned int code=0 ;
332
+ // the ifs that follow find out how many UTF8 characters (1-4) store the
333
+ // next unicode character. This is determined by the few most
334
+ // significant bits.
335
+ if (c<=0x7F )
336
+ {
337
+ // if it's one character, then code is exactly the value
338
+ code=c;
339
+ }
340
+ else if (c<=0xDF && i<in.size ())
341
+ { // in other cases, we need to read the right number of chars and decode
342
+ // note: if we wanted to make sure that we capture incorrect strings,
343
+ // we should check that whatever follows first character starts with
344
+ // bits 10.
345
+ code=(c & 0x1F ) << 6 ;
346
+ c=in[i++];
347
+ code+=c & 0x3F ;
348
+ }
349
+ else if (c<=0xEF && i+1 <in.size ())
350
+ {
351
+ code=(c & 0xF ) << 12 ;
352
+ c=in[i++];
353
+ code+=(c & 0x3F ) << 6 ;
354
+ c=in[i++];
355
+ code+=c & 0x3F ;
356
+ }
357
+ else if (c<=0xF7 && i+2 <in.size ())
358
+ {
359
+ code=(c & 0x7 ) << 18 ;
360
+ c=in[i++];
361
+ code+=(c & 0x3F ) << 12 ;
362
+ c=in[i++];
363
+ code+=(c & 0x3F ) << 6 ;
364
+ c=in[i++];
365
+ code+=c & 0x3F ;
366
+ }
367
+ else
368
+ {
369
+ // The string is not a valid UTF8 string! Either it has some characters
370
+ // missing from a multi-character unicode symbol, or it has a char with
371
+ // too high value.
372
+ // For now, let's replace the character with a space
373
+ code=32 ;
374
+ }
375
+
376
+ utf16_append_code (code, swap_bytes, result);
377
+ }
378
+
379
+ return result;
264
380
}
265
381
266
382
/* ******************************************************************\
@@ -271,14 +387,14 @@ Function: utf8_to_utf16_big_endian
271
387
272
388
Outputs: String in UTF-16BE format
273
389
274
- Purpose: Note this requires g++-5 libstdc++ / libc++ / MSVC2010+
390
+ Purpose:
275
391
276
392
\*******************************************************************/
277
393
278
394
std::wstring utf8_to_utf16_big_endian (const std::string& in)
279
395
{
280
- std::wstring_convert<std::codecvt_utf8_utf16< wchar_t > > converter ;
281
- return converter. from_bytes (in);
396
+ bool swap_bytes= is_little_endian_arch () ;
397
+ return utf8_to_utf16 (in, swap_bytes );
282
398
}
283
399
284
400
/* ******************************************************************\
@@ -289,21 +405,14 @@ Function: utf8_to_utf16_little_endian
289
405
290
406
Outputs: String in UTF-16LE format
291
407
292
- Purpose: Note this requires g++-5 libstdc++ / libc++ / MSVC2010+
408
+ Purpose:
293
409
294
410
\*******************************************************************/
295
411
296
412
std::wstring utf8_to_utf16_little_endian (const std::string& in)
297
413
{
298
- const std::codecvt_mode mode=std::codecvt_mode::little_endian;
299
-
300
- // default largest value codecvt_utf8_utf16 reads without error is 0x10ffff
301
- // see: http://en.cppreference.com/w/cpp/locale/codecvt_utf8_utf16
302
- const unsigned long maxcode=0x10ffff ;
303
-
304
- typedef std::codecvt_utf8_utf16<wchar_t , maxcode, mode> codecvt_utf8_utf16t;
305
- std::wstring_convert<codecvt_utf8_utf16t> converter;
306
- return converter.from_bytes (in);
414
+ bool swap_bytes=!is_little_endian_arch ();
415
+ return utf8_to_utf16 (in, swap_bytes);
307
416
}
308
417
309
418
/* ******************************************************************\
0 commit comments