Skip to content

Do not escape ' in java strings #5038

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/solvers/strings/string_refinement_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ utf16_constant_array_to_java(const array_exprt &arr, std::size_t length)
for(std::size_t i = 0; i < arr.operands().size() && i < length; i++)
out[i] = numeric_cast_v<unsigned>(to_constant_expr(arr.operands()[i]));

return utf16_native_endian_to_java(out);
return utf16_native_endian_to_java_string(out);
}

sparse_arrayt::sparse_arrayt(const with_exprt &expr)
Expand Down
45 changes: 39 additions & 6 deletions src/util/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,11 +257,15 @@ std::wstring utf8_to_utf16_native_endian(const std::string &in)
return result;
}

/// Escapes non-printable characters, whitespace except for spaces, double
/// quotes and backslashes. This should yield a valid Java string literal.
/// Note that this specifically does not escape single quotes, as these are not
/// required to be escaped for Java string literals.
/// \param ch: UTF-16 character in architecture-native endianness encoding
/// \param result: stream to receive string in US-ASCII format, with \\uxxxx
/// escapes for other characters
/// \param loc: locale to check for printable characters
static void utf16_native_endian_to_java(
static void utf16_native_endian_to_java_string(
const wchar_t ch,
std::ostringstream &result,
const std::locale &loc)
Expand All @@ -284,8 +288,9 @@ static void utf16_native_endian_to_java(
else if(ch <= 255 && isprint(ch, loc))
{
const auto uch = static_cast<unsigned char>(ch);
// ", \ and ' need to be escaped.
if(uch == '"' || uch == '\\' || uch == '\'')
// ", and \ need to be escaped, but not ' for java strings
// e.g. "\"\\" needs escaping but "'" does not.
if(uch == '"' || uch == '\\')
result << '\\';
result << uch;
}
Expand All @@ -298,6 +303,29 @@ static void utf16_native_endian_to_java(
}
}

/// Escapes non-printable characters, whitespace except for spaces, double- and
/// single-quotes and backslashes. This should yield a valid Java identifier.
/// \param ch: UTF-16 character in architecture-native endianness encoding
/// \param result: stream to receive string in US-ASCII format, with \\uxxxx
/// escapes for other characters
/// \param loc: locale to check for printable characters
static void utf16_native_endian_to_java(
const wchar_t ch,
std::ostringstream &result,
const std::locale &loc)
{
if(ch == (wchar_t)'\'')
{
const auto uch = static_cast<unsigned char>(ch);
// ' needs to be escaped for java characters, e.g. '\''
result << '\\' << uch;
}
else
{
utf16_native_endian_to_java_string(ch, result, loc);
}
}

/// \param ch: UTF-16 character in architecture-native endianness encoding
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
std::string utf16_native_endian_to_java(const char16_t ch)
Expand All @@ -308,14 +336,19 @@ std::string utf16_native_endian_to_java(const char16_t ch)
return result.str();
}

/// Escapes non-printable characters, whitespace except for spaces, double
/// quotes and backslashes. This should yield a valid Java string literal.
/// Note that this specifically does not escape single quotes, as these are not
/// required to be escaped for Java string literals.
/// \param in: String in UTF-16 (native endianness) format
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
std::string utf16_native_endian_to_java(const std::wstring &in)
/// \return Valid Java string literal in US-ASCII format, with \\uxxxx escapes
/// for other characters
std::string utf16_native_endian_to_java_string(const std::wstring &in)
{
std::ostringstream result;
const std::locale loc;
for(const auto ch : in)
utf16_native_endian_to_java(ch, result, loc);
utf16_native_endian_to_java_string(ch, result, loc);
return result.str();
}

Expand Down
1 change: 1 addition & 0 deletions src/util/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s);
std::wstring utf8_to_utf16_native_endian(const std::string &in);
std::string utf16_native_endian_to_java(const char16_t ch);
std::string utf16_native_endian_to_java(const std::wstring &in);
std::string utf16_native_endian_to_java_string(const std::wstring &in);

std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide);

Expand Down
7 changes: 7 additions & 0 deletions unit/util/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,10 @@ TEST_CASE("unicode4", "[core][util][unicode]")
const std::string s = u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ";
REQUIRE(compare_utf8_to_utf16(s));
}

TEST_CASE("utf16_native_endian_to_java_string", "[core][util][unicode]")
{
const std::wstring in = L"abc \" ' \\ub374 \\";
REQUIRE(
utf16_native_endian_to_java_string(in) == "abc \\\" \' \\\\ub374 \\\\");
}