Skip to content

Commit 153a4b9

Browse files
Merge pull request diffblue#5038 from jeannielynnmoulton/jeannie/NoEscape
Do not escape ' in java strings
2 parents 04aa83e + 93004ed commit 153a4b9

File tree

4 files changed

+48
-7
lines changed

4 files changed

+48
-7
lines changed

src/solvers/strings/string_refinement_util.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ utf16_constant_array_to_java(const array_exprt &arr, std::size_t length)
6363
for(std::size_t i = 0; i < arr.operands().size() && i < length; i++)
6464
out[i] = numeric_cast_v<unsigned>(to_constant_expr(arr.operands()[i]));
6565

66-
return utf16_native_endian_to_java(out);
66+
return utf16_native_endian_to_java_string(out);
6767
}
6868

6969
sparse_arrayt::sparse_arrayt(const with_exprt &expr)

src/util/unicode.cpp

+39-6
Original file line numberDiff line numberDiff line change
@@ -257,11 +257,15 @@ std::wstring utf8_to_utf16_native_endian(const std::string &in)
257257
return result;
258258
}
259259

260+
/// Escapes non-printable characters, whitespace except for spaces, double
261+
/// quotes and backslashes. This should yield a valid Java string literal.
262+
/// Note that this specifically does not escape single quotes, as these are not
263+
/// required to be escaped for Java string literals.
260264
/// \param ch: UTF-16 character in architecture-native endianness encoding
261265
/// \param result: stream to receive string in US-ASCII format, with \\uxxxx
262266
/// escapes for other characters
263267
/// \param loc: locale to check for printable characters
264-
static void utf16_native_endian_to_java(
268+
static void utf16_native_endian_to_java_string(
265269
const wchar_t ch,
266270
std::ostringstream &result,
267271
const std::locale &loc)
@@ -284,8 +288,9 @@ static void utf16_native_endian_to_java(
284288
else if(ch <= 255 && isprint(ch, loc))
285289
{
286290
const auto uch = static_cast<unsigned char>(ch);
287-
// ", \ and ' need to be escaped.
288-
if(uch == '"' || uch == '\\' || uch == '\'')
291+
// ", and \ need to be escaped, but not ' for java strings
292+
// e.g. "\"\\" needs escaping but "'" does not.
293+
if(uch == '"' || uch == '\\')
289294
result << '\\';
290295
result << uch;
291296
}
@@ -298,6 +303,29 @@ static void utf16_native_endian_to_java(
298303
}
299304
}
300305

306+
/// Escapes non-printable characters, whitespace except for spaces, double- and
307+
/// single-quotes and backslashes. This should yield a valid Java identifier.
308+
/// \param ch: UTF-16 character in architecture-native endianness encoding
309+
/// \param result: stream to receive string in US-ASCII format, with \\uxxxx
310+
/// escapes for other characters
311+
/// \param loc: locale to check for printable characters
312+
static void utf16_native_endian_to_java(
313+
const wchar_t ch,
314+
std::ostringstream &result,
315+
const std::locale &loc)
316+
{
317+
if(ch == (wchar_t)'\'')
318+
{
319+
const auto uch = static_cast<unsigned char>(ch);
320+
// ' needs to be escaped for java characters, e.g. '\''
321+
result << '\\' << uch;
322+
}
323+
else
324+
{
325+
utf16_native_endian_to_java_string(ch, result, loc);
326+
}
327+
}
328+
301329
/// \param ch: UTF-16 character in architecture-native endianness encoding
302330
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
303331
std::string utf16_native_endian_to_java(const char16_t ch)
@@ -308,14 +336,19 @@ std::string utf16_native_endian_to_java(const char16_t ch)
308336
return result.str();
309337
}
310338

339+
/// Escapes non-printable characters, whitespace except for spaces, double
340+
/// quotes and backslashes. This should yield a valid Java string literal.
341+
/// Note that this specifically does not escape single quotes, as these are not
342+
/// required to be escaped for Java string literals.
311343
/// \param in: String in UTF-16 (native endianness) format
312-
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
313-
std::string utf16_native_endian_to_java(const std::wstring &in)
344+
/// \return Valid Java string literal in US-ASCII format, with \\uxxxx escapes
345+
/// for other characters
346+
std::string utf16_native_endian_to_java_string(const std::wstring &in)
314347
{
315348
std::ostringstream result;
316349
const std::locale loc;
317350
for(const auto ch : in)
318-
utf16_native_endian_to_java(ch, result, loc);
351+
utf16_native_endian_to_java_string(ch, result, loc);
319352
return result.str();
320353
}
321354

src/util/unicode.h

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s);
2828
std::wstring utf8_to_utf16_native_endian(const std::string &in);
2929
std::string utf16_native_endian_to_java(const char16_t ch);
3030
std::string utf16_native_endian_to_java(const std::wstring &in);
31+
std::string utf16_native_endian_to_java_string(const std::wstring &in);
3132

3233
std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide);
3334

unit/util/unicode.cpp

+7
Original file line numberDiff line numberDiff line change
@@ -93,3 +93,10 @@ TEST_CASE("unicode4", "[core][util][unicode]")
9393
const std::string s = u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ";
9494
REQUIRE(compare_utf8_to_utf16(s));
9595
}
96+
97+
TEST_CASE("utf16_native_endian_to_java_string", "[core][util][unicode]")
98+
{
99+
const std::wstring in = L"abc \" ' \\ub374 \\";
100+
REQUIRE(
101+
utf16_native_endian_to_java_string(in) == "abc \\\" \' \\\\ub374 \\\\");
102+
}

0 commit comments

Comments
 (0)