Using our own implementation of utf8 to utf16

romainbrenguier · romainbrenguier · commit 23485bfa78d7 · 2017-02-10T16:28:59.000Z
Since codecvt does not seem to be available to all compiler, I put an
implementation in util/unicode
diff --git a/src/java_bytecode/java_bytecode_typecheck_expr.cpp b/src/java_bytecode/java_bytecode_typecheck_expr.cpp
@@ -101,7 +101,7 @@ static void escape_non_alnum(std::string &toescape)
       ch='_';
 }
 
-static array_exprt utf16_to_array(const std::wstring& in)
+static array_exprt utf16_to_array(const std::u16string& in)
 {
   const auto jchar=java_char_type();
   array_exprt ret(array_typet(jchar, infinity_exprt(java_int_type())));
diff --git a/src/util/unicode.cpp b/src/util/unicode.cpp
@@ -7,6 +7,8 @@ Author: Daniel Kroening, kroening@kroening.com
 \*******************************************************************/
 
 #include <cstring>
+#include <cassert>
+#include <vector>
 
 #include "unicode.h"
 
@@ -253,3 +255,79 @@ const char **narrow_argv(int argc, const wchar_t **argv_wide)
 
   return argv_narrow;
 }
+
+
+/*******************************************************************\
+
+Function: utf8_to_utf16_little_endian
+
+  Inputs: a utf8 string
+
+ Outputs: a utf16 u16string
+
+ Purpose: converts between utf8 and utf16 strings encoded in little endian
+
+\*******************************************************************/
+
+std::u16string utf8_to_utf16_little_endian(const std::string& utf8)
+{
+  std::vector<unsigned long> unicode;
+  size_t i=0;
+  while(i<utf8.size())
+  {
+    unsigned long unicode_char;
+    size_t size;
+    unsigned char ch=utf8[i++];
+
+    if(ch<=0x7F)
+    {
+      unicode_char=ch;
+      size=1;
+    }
+    else if(ch<=0xDF)
+    {
+      unicode_char=ch&0x1F;
+      size=2;
+    }
+    else if(ch<=0xEF)
+    {
+      unicode_char=ch&0x0F;
+      size=3;
+    }
+    else if(ch<=0xF7)
+    {
+      unicode_char=ch&0x07;
+      size=4;
+    }
+    else
+      assert(false);
+
+    for(size_t j=1; j<size; ++j)
+    {
+      assert(i<utf8.size());
+      unsigned char ch=utf8[i++];
+      assert(ch>=0x80 && ch<=0xBF);
+      unicode_char<<=6;
+      unicode_char+=ch&0x3F;
+    }
+    assert(unicode_char<0xD800 || unicode_char>0xDFFF);
+    assert(unicode_char<=0x10FFFF);
+    unicode.push_back(unicode_char);
+  }
+
+  std::u16string utf16;
+  for(size_t i=0; i<unicode.size(); ++i)
+  {
+    unsigned long uchar=unicode[i];
+    if(uchar<=0xFFFF)
+      utf16+=(char16_t)uchar;
+    else
+    {
+      // We have to take care of endianness
+      uchar-=0x10000;
+      utf16+=(char16_t)((uchar&0x3FF)+0xDC00);
+      utf16+=(char16_t)((uchar >> 10)+0xD800);
+    }
+  }
+  return utf16;
+}
diff --git a/src/util/unicode.h b/src/util/unicode.h
@@ -24,4 +24,6 @@ std::string utf16_to_utf8(const std::basic_string<unsigned short int> &s);
 
 const char **narrow_argv(int argc, const wchar_t **argv_wide);
 
+std::u16string utf8_to_utf16_little_endian(const std::string& utf8);
+
 #endif // CPROVER_UTIL_UNICODE_H

Original file line number	Diff line number	Diff line change
`@@ -101,7 +101,7 @@ static void escape_non_alnum(std::string &toescape)`
`101`	`101`	`ch='_';`
`102`	`102`	`}`
`103`	`103`
`104`		`-static array_exprt utf16_to_array(const std::wstring& in)`
	`104`	`+static array_exprt utf16_to_array(const std::u16string& in)`
`105`	`105`	`{`
`106`	`106`	`const auto jchar=java_char_type();`
`107`	`107`	`array_exprt ret(array_typet(jchar, infinity_exprt(java_int_type())));`