|
7 | 7 | \*******************************************************************/
|
8 | 8 |
|
9 | 9 | #include <cstring>
|
| 10 | +#include <cassert> |
| 11 | +#include <vector> |
10 | 12 |
|
11 | 13 | #include "unicode.h"
|
12 | 14 |
|
@@ -253,3 +255,79 @@ const char **narrow_argv(int argc, const wchar_t **argv_wide)
|
253 | 255 |
|
254 | 256 | return argv_narrow;
|
255 | 257 | }
|
| 258 | + |
| 259 | + |
/*******************************************************************\

Function: utf8_to_utf16_little_endian

  Inputs: utf8 - a string holding UTF-8 encoded text

 Outputs: a std::u16string holding the same text as UTF-16 code units

 Purpose: decodes the input one code point at a time and re-encodes
          each code point as UTF-16. Code points above U+FFFF become
          a surrogate pair, high surrogate first, as UTF-16 requires.

          No byte swapping is performed: every code unit is stored
          as a char16_t value in the host representation.

          Malformed input (a stray continuation byte or a byte above
          0xF7 in lead position, a truncated sequence, a missing
          continuation byte, an encoded surrogate, or a value above
          U+10FFFF) triggers an assertion failure. When assertions
          are compiled out, the offending sequence is replaced by
          U+FFFD instead of invoking undefined behaviour.

\*******************************************************************/

std::u16string utf8_to_utf16_little_endian(const std::string& utf8)
{
  std::u16string utf16;
  // Every UTF-8 byte yields at most one UTF-16 code unit.
  utf16.reserve(utf8.size());

  size_t i=0;
  while(i<utf8.size())
  {
    const unsigned char lead=static_cast<unsigned char>(utf8[i++]);

    unsigned long code_point=0;
    size_t size=1;
    bool well_formed=true;

    if(lead<=0x7F)
    {
      code_point=lead;
    }
    else if(lead>=0xC0 && lead<=0xDF)
    {
      code_point=lead&0x1F;
      size=2;
    }
    else if(lead>=0xE0 && lead<=0xEF)
    {
      code_point=lead&0x0F;
      size=3;
    }
    else if(lead>=0xF0 && lead<=0xF7)
    {
      code_point=lead&0x07;
      size=4;
    }
    else
    {
      // 0x80..0xBF are continuation bytes and cannot start a sequence;
      // 0xF8..0xFF do not start any sequence handled here.
      well_formed=false;
    }

    // Fold in the payload (low six bits) of each continuation byte.
    for(size_t j=1; well_formed && j<size; ++j)
    {
      if(i>=utf8.size())
      {
        well_formed=false; // sequence is cut off by the end of the input
        break;
      }

      const unsigned char continuation=static_cast<unsigned char>(utf8[i]);
      if((continuation&0xC0)!=0x80)
      {
        // Leave the offending byte in place so that it is decoded as
        // the start of the next sequence.
        well_formed=false;
        break;
      }

      ++i;
      code_point=(code_point<<6)|(continuation&0x3F);
    }

    // Surrogates are not encodable characters and U+10FFFF is the
    // largest code point UTF-16 can represent.
    if((code_point>=0xD800 && code_point<=0xDFFF) || code_point>0x10FFFF)
      well_formed=false;

    assert(well_formed);
    if(!well_formed)
      code_point=0xFFFD; // replacement character, reached only with NDEBUG

    if(code_point<=0xFFFF)
    {
      utf16+=static_cast<char16_t>(code_point);
    }
    else
    {
      // Split the 20-bit offset into a surrogate pair. The high
      // surrogate always precedes the low one; byte order only
      // affects the bytes inside a single code unit.
      const unsigned long offset=code_point-0x10000;
      utf16+=static_cast<char16_t>(0xD800+(offset>>10));
      utf16+=static_cast<char16_t>(0xDC00+(offset&0x3FF));
    }
  }

  return utf16;
}
0 commit comments