diff --git a/src/solvers/strings/string_refinement_util.cpp b/src/solvers/strings/string_refinement_util.cpp index 9efc6a9afbd..dd0cd19cfd0 100644 --- a/src/solvers/strings/string_refinement_util.cpp +++ b/src/solvers/strings/string_refinement_util.cpp @@ -63,7 +63,7 @@ utf16_constant_array_to_java(const array_exprt &arr, std::size_t length) for(std::size_t i = 0; i < arr.operands().size() && i < length; i++) out[i] = numeric_cast_v(to_constant_expr(arr.operands()[i])); - return utf16_native_endian_to_java(out); + return utf16_native_endian_to_java_string(out); } sparse_arrayt::sparse_arrayt(const with_exprt &expr) diff --git a/src/util/unicode.cpp b/src/util/unicode.cpp index b879cae7369..1216d86a099 100644 --- a/src/util/unicode.cpp +++ b/src/util/unicode.cpp @@ -257,11 +257,15 @@ std::wstring utf8_to_utf16_native_endian(const std::string &in) return result; } +/// Escapes non-printable characters, whitespace except for spaces, double +/// quotes and backslashes. This should yield a valid Java string literal. +/// Note that this specifically does not escape single quotes, as these are not +/// required to be escaped for Java string literals. /// \param ch: UTF-16 character in architecture-native endianness encoding /// \param result: stream to receive string in US-ASCII format, with \\uxxxx /// escapes for other characters /// \param loc: locale to check for printable characters -static void utf16_native_endian_to_java( +static void utf16_native_endian_to_java_string( const wchar_t ch, std::ostringstream &result, const std::locale &loc) @@ -284,8 +288,9 @@ static void utf16_native_endian_to_java( else if(ch <= 255 && isprint(ch, loc)) { const auto uch = static_cast(ch); - // ", \ and ' need to be escaped. - if(uch == '"' || uch == '\\' || uch == '\'') + // ", and \ need to be escaped, but not ' for java strings + // e.g. "\"\\" needs escaping but "'" does not. + if(uch == '"' || uch == '\\') result << '\\'; result << uch; } @@ -298,6 +303,29 @@ static void utf16_native_endian_to_java( } } +/// Escapes non-printable characters, whitespace except for spaces, double- and +/// single-quotes and backslashes. This should yield a valid Java identifier. +/// \param ch: UTF-16 character in architecture-native endianness encoding +/// \param result: stream to receive string in US-ASCII format, with \\uxxxx +/// escapes for other characters +/// \param loc: locale to check for printable characters +static void utf16_native_endian_to_java( + const wchar_t ch, + std::ostringstream &result, + const std::locale &loc) +{ + if(ch == (wchar_t)'\'') + { + const auto uch = static_cast(ch); + // ' needs to be escaped for java characters, e.g. '\'' + result << '\\' << uch; + } + else + { + utf16_native_endian_to_java_string(ch, result, loc); + } +} + /// \param ch: UTF-16 character in architecture-native endianness encoding /// \return String in US-ASCII format, with \\uxxxx escapes for other characters std::string utf16_native_endian_to_java(const char16_t ch) @@ -308,14 +336,19 @@ std::string utf16_native_endian_to_java(const char16_t ch) return result.str(); } +/// Escapes non-printable characters, whitespace except for spaces, double +/// quotes and backslashes. This should yield a valid Java string literal. +/// Note that this specifically does not escape single quotes, as these are not +/// required to be escaped for Java string literals. /// \param in: String in UTF-16 (native endianness) format -/// \return String in US-ASCII format, with \\uxxxx escapes for other characters -std::string utf16_native_endian_to_java(const std::wstring &in) +/// \return Valid Java string literal in US-ASCII format, with \\uxxxx escapes +/// for other characters +std::string utf16_native_endian_to_java_string(const std::wstring &in) { std::ostringstream result; const std::locale loc; for(const auto ch : in) - utf16_native_endian_to_java(ch, result, loc); + utf16_native_endian_to_java_string(ch, result, loc); return result.str(); } diff --git a/src/util/unicode.h b/src/util/unicode.h index e317aca8564..307c1551307 100644 --- a/src/util/unicode.h +++ b/src/util/unicode.h @@ -28,6 +28,7 @@ utf32_native_endian_to_utf8(const std::basic_string &s); std::wstring utf8_to_utf16_native_endian(const std::string &in); std::string utf16_native_endian_to_java(const char16_t ch); std::string utf16_native_endian_to_java(const std::wstring &in); +std::string utf16_native_endian_to_java_string(const std::wstring &in); std::vector narrow_argv(int argc, const wchar_t **argv_wide); diff --git a/unit/util/unicode.cpp b/unit/util/unicode.cpp index ef4537f1676..2424e0174bc 100644 --- a/unit/util/unicode.cpp +++ b/unit/util/unicode.cpp @@ -93,3 +93,10 @@ TEST_CASE("unicode4", "[core][util][unicode]") const std::string s = u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ"; REQUIRE(compare_utf8_to_utf16(s)); } + +TEST_CASE("utf16_native_endian_to_java_string", "[core][util][unicode]") +{ + const std::wstring in = L"abc \" ' \\ub374 \\"; + REQUIRE( + utf16_native_endian_to_java_string(in) == "abc \\\" \' \\\\ub374 \\\\"); +}