Skip to content

Commit 23485bf

Browse files
Using our own implementation of utf8 to utf16
Since codecvt does not seem to be available to all compiler, I put an implementation in util/unicode
1 parent cb822e0 commit 23485bf

File tree

3 files changed

+81
-1
lines changed

3 files changed

+81
-1
lines changed

src/java_bytecode/java_bytecode_typecheck_expr.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ static void escape_non_alnum(std::string &toescape)
101101
ch='_';
102102
}
103103

104-
static array_exprt utf16_to_array(const std::wstring& in)
104+
static array_exprt utf16_to_array(const std::u16string& in)
105105
{
106106
const auto jchar=java_char_type();
107107
array_exprt ret(array_typet(jchar, infinity_exprt(java_int_type())));

src/util/unicode.cpp

+78
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ Author: Daniel Kroening, [email protected]
77
\*******************************************************************/
88

99
#include <cstring>
10+
#include <cassert>
11+
#include <vector>
1012

1113
#include "unicode.h"
1214

@@ -253,3 +255,79 @@ const char **narrow_argv(int argc, const wchar_t **argv_wide)
253255

254256
return argv_narrow;
255257
}
258+
259+
260+
/*******************************************************************\
261+
262+
Function: utf8_to_utf16_little_endian
263+
264+
Inputs: a utf8 string
265+
266+
Outputs: a utf16 u16string
267+
268+
Purpose: converts between utf8 and utf16 strings encoded in little endian
269+
270+
\*******************************************************************/
271+
272+
std::u16string utf8_to_utf16_little_endian(const std::string& utf8)
273+
{
274+
std::vector<unsigned long> unicode;
275+
size_t i=0;
276+
while(i<utf8.size())
277+
{
278+
unsigned long unicode_char;
279+
size_t size;
280+
unsigned char ch=utf8[i++];
281+
282+
if(ch<=0x7F)
283+
{
284+
unicode_char=ch;
285+
size=1;
286+
}
287+
else if(ch<=0xDF)
288+
{
289+
unicode_char=ch&0x1F;
290+
size=2;
291+
}
292+
else if(ch<=0xEF)
293+
{
294+
unicode_char=ch&0x0F;
295+
size=3;
296+
}
297+
else if(ch<=0xF7)
298+
{
299+
unicode_char=ch&0x07;
300+
size=4;
301+
}
302+
else
303+
assert(false);
304+
305+
for(size_t j=1; j<size; ++j)
306+
{
307+
assert(i<utf8.size());
308+
unsigned char ch=utf8[i++];
309+
assert(ch>=0x80 && ch<=0xBF);
310+
unicode_char<<=6;
311+
unicode_char+=ch&0x3F;
312+
}
313+
assert(unicode_char<0xD800 || unicode_char>0xDFFF);
314+
assert(unicode_char<=0x10FFFF);
315+
unicode.push_back(unicode_char);
316+
}
317+
318+
std::u16string utf16;
319+
for(size_t i=0; i<unicode.size(); ++i)
320+
{
321+
unsigned long uchar=unicode[i];
322+
if(uchar<=0xFFFF)
323+
utf16+=(char16_t)uchar;
324+
else
325+
{
326+
// We have to take care of endianness
327+
uchar-=0x10000;
328+
utf16+=(char16_t)((uchar&0x3FF)+0xDC00);
329+
utf16+=(char16_t)((uchar >> 10)+0xD800);
330+
}
331+
}
332+
return utf16;
333+
}

src/util/unicode.h

+2
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,6 @@ std::string utf16_to_utf8(const std::basic_string<unsigned short int> &s);
2424

2525
const char **narrow_argv(int argc, const wchar_t **argv_wide);
2626

27+
std::u16string utf8_to_utf16_little_endian(const std::string& utf8);
28+
2729
#endif // CPROVER_UTIL_UNICODE_H

0 commit comments

Comments
 (0)