Skip to content

Commit a53f5bf

Browse files
committed
Split UTF-16 conversion code into two cases
Previously there was only one conversion function, which takes a string argument. This commit adds an overload that applies the same logic to a single wchar_t argument. The code common to the two functions is moved into a static helper function. This commit also adds an escape for single-quote characters.
1 parent 5297646 commit a53f5bf

File tree

2 files changed

+55
-29
lines changed

2 files changed

+55
-29
lines changed

src/util/unicode.cpp

+54-29
Original file line numberDiff line numberDiff line change
@@ -284,39 +284,64 @@ std::wstring utf8_to_utf16_little_endian(const std::string &in)
284284
return utf8_to_utf16(in, swap_bytes);
285285
}
286286

287-
/// \par parameters: String in UTF-16LE format
288-
/// \return String in US-ASCII format, with \uxxxx escapes for other characters
287+
/// Appends the Java source-code representation of a single character to
/// \p result. Printable US-ASCII characters are copied through (with a
/// backslash in front of ", \ and '), common control characters use their
/// short escapes, and everything else is written as \\uxxxx escapes.
/// \param ch: UTF-16LE character
/// \param result: stream to receive string in US-ASCII format, with \\uxxxx
///   escapes for other characters
/// \param loc: locale to check for printable characters
static void utf16_little_endian_to_java(
  const wchar_t ch,
  std::ostringstream &result,
  const std::locale &loc)
{
  // Writes one UTF-16 code unit as a Java unicode escape: a hexadecimal number
  // padded to four digits with zeros.
  const auto write_unicode_escape = [&result](const unsigned int unit) {
    result << "\\u" << std::hex << std::setw(4) << std::setfill('0') << unit;
  };

  // Work on an unsigned copy so that the range checks below behave the same
  // whether wchar_t is signed or unsigned on the target platform.
  const auto code = static_cast<unsigned int>(ch);

  // \u unicode characters are translated very early by the Java compiler and so
  // \u000a or \u000d would become a newline character in a char constant, which
  // is illegal. Instead use \n or \r.
  if(ch == '\n')
    result << "\\n";
  else if(ch == '\r')
    result << "\\r";
  // \f, \b and \t do not need to be escaped, but this will improve readability
  // of generated tests.
  else if(ch == '\f')
    result << "\\f";
  else if(ch == '\b')
    result << "\\b";
  else if(ch == '\t')
    result << "\\t";
  // Only 7-bit characters are copied verbatim: the output is documented to be
  // US-ASCII, and a locale that classifies characters in 128..255 as printable
  // must not cause raw non-ASCII bytes to be emitted.
  else if(code <= 127 && isprint(ch, loc))
  {
    const auto uch = static_cast<unsigned char>(ch);
    // ", \ and ' need to be escaped.
    if(uch == '"' || uch == '\\' || uch == '\'')
      result << '\\';
    result << uch;
  }
  else if(code >= 0x10000 && code <= 0x10ffff)
  {
    // Only reachable where wchar_t is wider than 16 bits. A Java \u escape
    // takes exactly four hex digits, so a supplementary code point has to be
    // written as a UTF-16 surrogate pair rather than as \u plus five digits.
    const unsigned int offset = code - 0x10000;
    write_unicode_escape(0xd800 + (offset >> 10));
    write_unicode_escape(0xdc00 + (offset & 0x3ff));
  }
  else
  {
    // Values above 0x10ffff are not valid code points; they are formatted
    // unchanged, as before.
    write_unicode_escape(code);
  }
}
327+
328+
/// \param ch: UTF-16LE character
329+
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
330+
std::string utf16_little_endian_to_java(const wchar_t ch)
331+
{
332+
std::ostringstream result;
333+
const std::locale loc;
334+
utf16_little_endian_to_java(ch, result, loc);
335+
return result.str();
336+
}
337+
338+
/// \param in: String in UTF-16LE format
339+
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
289340
std::string utf16_little_endian_to_java(const std::wstring &in)
290341
{
291342
std::ostringstream result;
292343
const std::locale loc;
293344
for(const auto ch : in)
294-
{
295-
if(ch=='\n')
296-
result << "\\n";
297-
else if(ch=='\r')
298-
result << "\\r";
299-
else if(ch=='\f')
300-
result << "\\f";
301-
else if(ch=='\b')
302-
result << "\\b";
303-
else if(ch=='\t')
304-
result << "\\t";
305-
else if(ch<=255 && isprint(ch, loc))
306-
{
307-
const auto uch=static_cast<unsigned char>(ch);
308-
if(uch=='"' || uch=='\\')
309-
result << '\\';
310-
result << uch;
311-
}
312-
else
313-
{
314-
result << "\\u"
315-
<< std::hex
316-
<< std::setw(4)
317-
<< std::setfill('0')
318-
<< static_cast<unsigned int>(ch);
319-
}
320-
}
345+
utf16_little_endian_to_java(ch, result, loc);
321346
return result.str();
322347
}

src/util/unicode.h

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ std::string utf32_to_utf8(const std::basic_string<unsigned int> &s);
2626

2727
std::wstring utf8_to_utf16_big_endian(const std::string &);
2828
std::wstring utf8_to_utf16_little_endian(const std::string &);
29+
/// Escapes a single UTF-16LE character for use in Java source text.
/// \param ch: UTF-16LE character
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
std::string utf16_little_endian_to_java(const wchar_t ch);
2930
/// Escapes a UTF-16LE string for use in Java source text.
/// \param in: String in UTF-16LE format
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
std::string utf16_little_endian_to_java(const std::wstring &in);
3031

3132
std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide);

0 commit comments

Comments
 (0)