diff --git a/jbmc/src/java_bytecode/expr2java.cpp b/jbmc/src/java_bytecode/expr2java.cpp index 8bbd9b093c5..69518dd1c45 100644 --- a/jbmc/src/java_bytecode/expr2java.cpp +++ b/jbmc/src/java_bytecode/expr2java.cpp @@ -202,7 +202,7 @@ std::string expr2javat::convert_constant( if(to_integer(src, int_value)) UNREACHABLE; - dest += "(char)'" + utf16_little_endian_to_java(int_value.to_long()) + '\''; + dest += "(char)'" + utf16_native_endian_to_java(int_value.to_long()) + '\''; return dest; } else if(src.type()==java_byte_type()) diff --git a/jbmc/src/java_bytecode/java_string_literals.cpp b/jbmc/src/java_bytecode/java_string_literals.cpp index 324988544ac..464a1e38274 100644 --- a/jbmc/src/java_bytecode/java_string_literals.cpp +++ b/jbmc/src/java_bytecode/java_string_literals.cpp @@ -106,7 +106,7 @@ symbol_exprt get_or_create_string_literal_symbol( if(string_refinement_enabled) { const array_exprt data = - utf16_to_array(utf8_to_utf16(id2string(value), false)); + utf16_to_array(utf8_to_utf16_native_endian(id2string(value))); struct_exprt literal_init(new_symbol.type); literal_init.operands().resize(jls_struct.components().size()); diff --git a/src/ansi-c/literals/convert_string_literal.cpp b/src/ansi-c/literals/convert_string_literal.cpp index b115697badf..34aebb34331 100644 --- a/src/ansi-c/literals/convert_string_literal.cpp +++ b/src/ansi-c/literals/convert_string_literal.cpp @@ -34,7 +34,7 @@ std::basic_string convert_one_string_literal( unescape_wide_string(std::string(src, 3, src.size()-4)); // turn into utf-8 - std::string utf8_value=utf32_to_utf8(value); + const std::string utf8_value = utf32_native_endian_to_utf8(value); // pad into wide string value.resize(utf8_value.size()); diff --git a/src/ansi-c/literals/unescape_string.cpp b/src/ansi-c/literals/unescape_string.cpp index 7e0cef9fe23..619183b1ba3 100644 --- a/src/ansi-c/literals/unescape_string.cpp +++ b/src/ansi-c/literals/unescape_string.cpp @@ -23,7 +23,7 @@ static void append_universal_char( 
std::basic_string value_str(1, value); // turn into utf-8 - std::string utf8_value=utf32_to_utf8(value_str); + const std::string utf8_value = utf32_native_endian_to_utf8(value_str); dest.append(utf8_value); } diff --git a/src/ansi-c/scanner.l b/src/ansi-c/scanner.l index 765a30fbf0f..71e894e3682 100644 --- a/src/ansi-c/scanner.l +++ b/src/ansi-c/scanner.l @@ -67,7 +67,7 @@ int make_identifier() utf32+=letter; // turn into utf-8 - std::string utf8_value=utf32_to_utf8(utf32); + const std::string utf8_value = utf32_native_endian_to_utf8(utf32); final_base_name+=utf8_value; } else diff --git a/src/solvers/refinement/string_constraint_generator_format.cpp b/src/solvers/refinement/string_constraint_generator_format.cpp index 1b34a98cdf4..20b9d332472 100644 --- a/src/solvers/refinement/string_constraint_generator_format.cpp +++ b/src/solvers/refinement/string_constraint_generator_format.cpp @@ -444,7 +444,7 @@ utf16_constant_array_to_java(const array_exprt &arr, std::size_t length) INVARIANT(!conversion_failed, "constant should be convertible to unsigned"); out[i]=c; } - return utf16_little_endian_to_java(out); + return utf16_native_endian_to_java(out); } /// Formatted string using a format string and list of arguments diff --git a/src/util/file_util.cpp b/src/util/file_util.cpp index 9a943a3c68e..8c3c17ba3a5 100644 --- a/src/util/file_util.cpp +++ b/src/util/file_util.cpp @@ -95,7 +95,7 @@ void delete_directory_utf16(const std::wstring &path) void delete_directory(const std::string &path) { #ifdef _WIN32 - delete_directory_utf16(utf8_to_utf16_little_endian(path)); + delete_directory_utf16(utf8_to_utf16_native_endian(path)); #else DIR *dir=opendir(path.c_str()); if(dir!=nullptr) diff --git a/src/util/unicode.cpp b/src/util/unicode.cpp index 6900048e425..ca9ef98b3fd 100644 --- a/src/util/unicode.cpp +++ b/src/util/unicode.cpp @@ -18,16 +18,6 @@ Author: Daniel Kroening, kroening@kroening.com #include #endif -/// Determine endianness of the architecture -/// \return True if 
the architecture is little_endian -bool is_little_endian_arch() -{ - uint32_t i=1; - return reinterpret_cast(i) != 0; -} - -#define BUFSIZE 100 - std::string narrow(const wchar_t *s) { #ifdef _WIN32 @@ -138,9 +128,10 @@ static void utf8_append_code(unsigned int c, std::string &result) } } -/// \param utf32:encoded wide string +/// \param s UTF-32 encoded wide string /// \return utf8-encoded string with the same unicode characters as the input. -std::string utf32_to_utf8(const std::basic_string &s) +std::string +utf32_native_endian_to_utf8(const std::basic_string &s) { std::string result; @@ -166,27 +157,15 @@ std::vector narrow_argv(int argc, const wchar_t **argv_wide) return argv_narrow; } -/// A helper function for dealing with different UTF16 endians -/// \par parameters: A 16-bit integer -/// \return A 16-bit integer with bytes swapped -uint16_t do_swap_bytes(uint16_t x) -{ - uint16_t b1=x & 0xFF; - uint16_t b2=x & 0xFF00; - return (b1 << 8) | (b2 >> 8); -} - - -void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result) +static void utf16_append_code(unsigned int code, std::wstring &result) { // we do not treat 0xD800 to 0xDFFF, although // they are not valid unicode symbols if(code<0xFFFF) - { // code is encoded as one UTF16 character - // we just take the code and possibly swap the bytes - unsigned int a=(swap_bytes)?do_swap_bytes(code):code; - result+=static_cast(a); + { + // code is encoded as one UTF16 character + result += static_cast(code); } else // code is encoded as two UTF16 characters { @@ -194,23 +173,21 @@ void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result) // code<0x10FFFF // but let's not check it programmatically - // encode the code in UTF16, possibly swapping bytes. 
+ // encode the code in UTF16 code=code-0x10000; - unsigned int i1=((code>>10) & 0x3ff) | 0xD800; - unsigned int a1=(swap_bytes)?do_swap_bytes(static_cast(i1)):i1; - result+=static_cast(a1); - unsigned int i2=(code & 0x3ff) | 0xDC00; - unsigned int a2=(swap_bytes)?do_swap_bytes(static_cast(i2)):i2; - result+=static_cast(a2); + const uint16_t i1 = static_cast(((code >> 10) & 0x3ff) | 0xD800); + result += static_cast(i1); + const uint16_t i2 = static_cast((code & 0x3ff) | 0xDC00); + result += static_cast(i2); } } -/// \par parameters: String in UTF-8 format, bool value indicating whether the -/// endianness should be different from the architecture one. +/// Convert UTF8-encoded string to UTF-16 with architecture-native endianness. +/// \par parameters: String in UTF-8 format -/// \return String in UTF-16 format. The encoding follows the endianness of the -/// architecture iff swap_bytes is true. +/// \return String in UTF-16 format. The encoding follows the endianness of the +/// architecture. -std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes) +std::wstring utf8_to_utf16_native_endian(const std::string &in) { std::wstring result; result.reserve(in.size()); @@ -263,33 +240,17 @@ std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes) code=32; } - utf16_append_code(code, swap_bytes, result); + utf16_append_code(code, result); } return result; } -/// \par parameters: String in UTF-8 format -/// \return String in UTF-16BE format -std::wstring utf8_to_utf16_big_endian(const std::string &in) -{ - bool swap_bytes=is_little_endian_arch(); - return utf8_to_utf16(in, swap_bytes); -} - -/// \par parameters: String in UTF-8 format -/// \return String in UTF-16LE format -std::wstring utf8_to_utf16_little_endian(const std::string &in) -{ - bool swap_bytes=!is_little_endian_arch(); - return utf8_to_utf16(in, swap_bytes); -} - -/// \param ch: UTF-16LE character +/// \param ch: UTF-16 character in architecture-native endianness encoding /// \param result: stream to receive string in US-ASCII format, with \\uxxxx /// escapes for other characters /// \param 
loc: locale to check for printable characters -static void utf16_little_endian_to_java( +static void utf16_native_endian_to_java( const wchar_t ch, std::ostringstream &result, const std::locale &loc) @@ -326,23 +287,23 @@ static void utf16_little_endian_to_java( } } -/// \param ch: UTF-16LE character +/// \param ch: UTF-16 character in architecture-native endianness encoding /// \return String in US-ASCII format, with \\uxxxx escapes for other characters -std::string utf16_little_endian_to_java(const wchar_t ch) +std::string utf16_native_endian_to_java(const wchar_t ch) { std::ostringstream result; const std::locale loc; - utf16_little_endian_to_java(ch, result, loc); + utf16_native_endian_to_java(ch, result, loc); return result.str(); } -/// \param in: String in UTF-16LE format +/// \param in: String in UTF-16 (native endianness) format /// \return String in US-ASCII format, with \\uxxxx escapes for other characters -std::string utf16_little_endian_to_java(const std::wstring &in) +std::string utf16_native_endian_to_java(const std::wstring &in) { std::ostringstream result; const std::locale loc; for(const auto ch : in) - utf16_little_endian_to_java(ch, result, loc); + utf16_native_endian_to_java(ch, result, loc); return result.str(); } diff --git a/src/util/unicode.h b/src/util/unicode.h index aeee035de1b..dba750dea2c 100644 --- a/src/util/unicode.h +++ b/src/util/unicode.h @@ -22,13 +22,12 @@ std::wstring widen(const char *s); std::string narrow(const std::wstring &s); std::wstring widen(const std::string &s); -std::string utf32_to_utf8(const std::basic_string &s); +std::string +utf32_native_endian_to_utf8(const std::basic_string &s); -std::wstring utf8_to_utf16(const std::string &in, bool swap_bytes); -std::wstring utf8_to_utf16_big_endian(const std::string &); -std::wstring utf8_to_utf16_little_endian(const std::string &); -std::string utf16_little_endian_to_java(const wchar_t ch); -std::string utf16_little_endian_to_java(const std::wstring &in); +std::wstring 
utf8_to_utf16_native_endian(const std::string &in); +std::string utf16_native_endian_to_java(const wchar_t ch); +std::string utf16_native_endian_to_java(const std::wstring &in); std::vector narrow_argv(int argc, const wchar_t **argv_wide); diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index 9f57805af26..6def86d4887 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -13,7 +13,6 @@ list(REMOVE_ITEM sources ${CMAKE_CURRENT_SOURCE_DIR}/json.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp_parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/osx_fat_reader.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/unicode.cpp ${CMAKE_CURRENT_SOURCE_DIR}/wp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp_scanner.cpp ${CMAKE_CURRENT_SOURCE_DIR}/float_utils.cpp diff --git a/unit/Makefile b/unit/Makefile index 496f4cb0143..1f92f2be2d6 100644 --- a/unit/Makefile +++ b/unit/Makefile @@ -37,6 +37,7 @@ SRC += unit_tests.cpp \ util/string_utils/split_string.cpp \ util/string_utils/strip_string.cpp \ util/symbol_table.cpp \ + util/unicode.cpp \ catch_example.cpp \ # Empty last line diff --git a/unit/unicode.cpp b/unit/util/unicode.cpp similarity index 52% rename from unit/unicode.cpp rename to unit/util/unicode.cpp index 9670856419f..ff4901f2e45 100644 --- a/unit/unicode.cpp +++ b/unit/util/unicode.cpp @@ -6,20 +6,22 @@ Author: Vojtech Forejt, forejtv@diffblue.com \*******************************************************************/ -#include +#include + #include #include #include -#include -#include #include #include +// the u8 prefix is only available from VS 2015 onwards +#if !defined(_MSC_VER) || _MSC_VER >= 1900 + // This unit test compares our implementation with codecvt implementation, // checking bit-by-bit equivalence of results. 
-bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b) +static bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b) { if(a.size() != b.size()) return false; @@ -35,7 +37,10 @@ bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b) } // helper print function, can be called for debugging problem -void wstr_print(const std::wstring &a, const std::wstring &b) +#if 0 +#include + +static void wstr_print(const std::wstring &a, const std::wstring &b) { int endi=(a.size()>b.size())?a.size():b.size(); const unsigned char @@ -49,46 +54,46 @@ void wstr_print(const std::wstring &a, const std::wstring &b) } std::cout << '\n'; } +#endif -void compare_utf8_to_utf16_big_endian(std::string& in) +static bool compare_utf8_to_utf16(const std::string &in) { - std::wstring s1=utf8_to_utf16_big_endian(in); + const std::wstring s1 = utf8_to_utf16_native_endian(in); typedef std::codecvt_utf8_utf16 codecvt_utf8_utf16t; std::wstring_convert converter; std::wstring s2=converter.from_bytes(in); - assert(paranoid_wstr_equals(s1, s2)); + return paranoid_wstr_equals(s1, s2); } -void compare_utf8_to_utf16_little_endian(std::string& in) +TEST_CASE("unicode0", "[core][util][unicode]") { - std::wstring s1=utf8_to_utf16_little_endian(in); - - const std::codecvt_mode mode=std::codecvt_mode::little_endian; - const unsigned long maxcode=0x10ffff; + const std::string s = u8"abc"; + REQUIRE(compare_utf8_to_utf16(s)); +} - typedef std::codecvt_utf8_utf16 codecvt_utf8_utf16t; - std::wstring_convert converter; - std::wstring s2=converter.from_bytes(in); +TEST_CASE("unicode1", "[core][util][unicode]") +{ + const std::string s = u8"\u0070\u00DF\u00E0\u00EF\u00F0\u00F7\u00F8"; + REQUIRE(compare_utf8_to_utf16(s)); +} - assert(paranoid_wstr_equals(s1, s2)); +TEST_CASE("unicode2", "[core][util][unicode]") +{ + const std::string s = u8"$¢€𐍈"; + REQUIRE(compare_utf8_to_utf16(s)); } -int main() +TEST_CASE("unicode3", "[core][util][unicode]") { - std::string s; - 
s=u8"\u0070\u00DF\u00E0\u00EF\u00F0\u00F7\u00F8"; - compare_utf8_to_utf16_big_endian(s); - compare_utf8_to_utf16_little_endian(s); - s=u8"$¢€𐍈"; - compare_utf8_to_utf16_big_endian(s); - compare_utf8_to_utf16_little_endian(s); - s=u8"𐐏𤭢"; - compare_utf8_to_utf16_big_endian(s); - compare_utf8_to_utf16_little_endian(s); - s=u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ"; - compare_utf8_to_utf16_big_endian(s); - compare_utf8_to_utf16_little_endian(s); + const std::string s = u8"𐐏𤭢"; + REQUIRE(compare_utf8_to_utf16(s)); } +TEST_CASE("unicode4", "[core][util][unicode]") +{ + const std::string s = u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ"; + REQUIRE(compare_utf8_to_utf16(s)); +} +#endif