Skip to content

Commit 9d2ca43

Browse files
authored
Merge pull request #4594 from antlechner/antonia/json-parsing-utf16
Add support for all Unicode codepoints of the BMP in the JSON parser [TG-7634]
2 parents 5871906 + 4d502e7 commit 9d2ca43

File tree

4 files changed

+88
-11
lines changed

4 files changed

+88
-11
lines changed

src/json/parser.y

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
%{
1919
#include "json_parser.h"
2020

21+
#include <util/unicode.h>
22+
2123
int yyjsonlex();
2224
extern char *yyjsontext;
2325
extern int yyjsonleng; // really an int, not a size_t
@@ -49,10 +51,11 @@ static std::string convert_TOK_STRING()
4951
{
5052
// Character in hexadecimal Unicode representation, in the format
5153
// \uABCD, i.e. the following four digits are part of this character.
52-
assert(p + 4 < yyjsontext + len - 1);
54+
char *last_hex_digit = p + 4;
55+
assert(last_hex_digit < yyjsontext + len - 1);
5356
std::string hex(++p, 4);
54-
result += std::stoi(hex, nullptr, 16);
55-
p += 3;
57+
result += codepoint_hex_to_utf8(hex);
58+
p = last_hex_digit;
5659
break;
5760
}
5861
default:; /* an error */

src/util/unicode.cpp

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ Author: Daniel Kroening, [email protected]
88

99
#include "unicode.h"
1010

11+
#include <codecvt>
12+
#include <cstdint>
1113
#include <cstring>
12-
#include <locale>
1314
#include <iomanip>
15+
#include <locale>
1416
#include <sstream>
15-
#include <cstdint>
17+
18+
#include "invariant.h"
1619

1720
#ifdef _WIN32
1821
#include <util/pragma_push.def>
@@ -315,3 +318,33 @@ std::string utf16_native_endian_to_java(const std::wstring &in)
315318
utf16_native_endian_to_java(ch, result, loc);
316319
return result.str();
317320
}
321+
322+
std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
323+
{
324+
return utf16_native_endian_to_utf8(std::u16string(1, utf16_char));
325+
}
326+
327+
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str)
328+
{
329+
#ifdef _MSC_VER
330+
// Workaround for Visual Studio bug, see
331+
// https://stackoverflow.com/questions/32055357
332+
std::wstring wide_string(utf16_str.begin(), utf16_str.end());
333+
return std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t>{}
334+
.to_bytes(wide_string);
335+
#else
336+
return std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}
337+
.to_bytes(utf16_str);
338+
#endif
339+
}
340+
341+
char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
342+
{
343+
PRECONDITION(hex.length() == 4);
344+
return std::strtol(hex.c_str(), nullptr, 16);
345+
}
346+
347+
std::string codepoint_hex_to_utf8(const std::string &hex)
348+
{
349+
return utf16_native_endian_to_utf8(codepoint_hex_to_utf16_native_endian(hex));
350+
}

src/util/unicode.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,26 @@ std::string utf16_native_endian_to_java(const std::wstring &in);
3131

3232
std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide);
3333

34+
/// \param utf16_char: UTF-16 character in architecture-native endianness
35+
/// encoding
36+
/// \return UTF-8 encoding of the same codepoint
37+
std::string utf16_native_endian_to_utf8(char16_t utf16_char);
38+
39+
/// \param utf16_str: UTF-16 string in architecture-native endianness encoding
40+
/// \return UTF-8 encoding of the string
41+
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str);
42+
43+
/// \param hex: representation of a BMP codepoint as a four-digit string
44+
/// (e.g.\ "0041" for \\u0041)
45+
/// \return encoding of the codepoint as a single UTF-16 character in
46+
/// architecture-native endianness encoding
47+
char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex);
48+
49+
/// \param hex: representation of a BMP codepoint as a four-digit string
50+
/// (e.g.\ "0041" for \\u0041)
51+
/// \return UTF-8 encoding of the codepoint
52+
std::string codepoint_hex_to_utf8(const std::string &hex);
53+
3454
template <typename It>
3555
std::vector<const char *> to_c_str_array(It b, It e)
3656
{

unit/json/json_parser.cpp

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,17 +84,22 @@ SCENARIO("Loading JSON files")
8484
}
8585
}
8686
}
87-
GIVEN("A JSON file containing a hexadecimal Unicode character")
87+
GIVEN("A JSON file containing hexadecimal Unicode symbols")
8888
{
8989
temporary_filet unicode_json_file("cbmc_unit_json_parser_unicode", ".json");
9090
const std::string unicode_json_path = unicode_json_file();
9191
{
9292
std::ofstream unicode_json_out(unicode_json_path);
9393
unicode_json_out << "{\n"
94-
<< " \"special character\": \"\\u0001\"\n"
94+
<< " \"one\": \"\\u0001\",\n"
95+
<< " \"latin\": \"\\u0042\",\n"
96+
<< " \"grave\": \"\\u00E0\",\n"
97+
<< " \"trema\": \"\\u00FF\",\n"
98+
<< " \"high\": \"\\uFFFF\",\n"
99+
<< " \"several\": \"a\\u0041b\\u2FC3\\uFFFF\"\n"
95100
<< "}\n";
96101
}
97-
WHEN("Loading the JSON file with the special character")
102+
WHEN("Loading the JSON file with the Unicode symbols")
98103
{
99104
jsont unicode_json;
100105
const auto unicode_parse_error =
@@ -105,9 +110,25 @@ SCENARIO("Loading JSON files")
105110
REQUIRE(unicode_json.is_object());
106111

107112
const json_objectt &json_object = to_json_object(unicode_json);
108-
REQUIRE(json_object.find("special character") != json_object.end());
109-
REQUIRE(json_object["special character"].value.size() == 1);
110-
REQUIRE(json_object["special character"].value == "\u0001");
113+
114+
REQUIRE(json_object.find("one") != json_object.end());
115+
REQUIRE(json_object["one"].value.size() == 1);
116+
REQUIRE(json_object["one"].value == u8"\u0001");
117+
118+
REQUIRE(json_object.find("latin") != json_object.end());
119+
REQUIRE(json_object["latin"].value == "B");
120+
121+
REQUIRE(json_object.find("grave") != json_object.end());
122+
REQUIRE(json_object["grave"].value == "à");
123+
124+
REQUIRE(json_object.find("trema") != json_object.end());
125+
REQUIRE(json_object["trema"].value == "ÿ");
126+
127+
REQUIRE(json_object.find("high") != json_object.end());
128+
REQUIRE(json_object["high"].value == u8"\uFFFF");
129+
130+
REQUIRE(json_object.find("several") != json_object.end());
131+
REQUIRE(json_object["several"].value == u8"aAb\u2FC3\uFFFF");
111132
}
112133
}
113134
}

0 commit comments

Comments
 (0)