Skip to content

Commit 0a3ddf8

Browse files
committed
Add support for whole BMP in the JSON parser
The previous implementation only supported codepoints up to 0x7f as characters, and all remaining codepoints up to 0xff as integers. The new implementation supports all codepoints in the BMP, i.e. up to 0xffff.
1 parent a847d8c commit 0a3ddf8

File tree

4 files changed

+85
-9
lines changed

4 files changed

+85
-9
lines changed

src/json/parser.y

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
%{
1919
#include "json_parser.h"
2020

21+
#include <util/unicode.h>
22+
2123
int yyjsonlex();
2224
extern char *yyjsontext;
2325
extern int yyjsonleng; // really an int, not a size_t
@@ -51,7 +53,7 @@ static std::string convert_TOK_STRING()
5153
// \uABCD, i.e. the following four digits are part of this character.
5254
assert(p + 4 < yyjsontext + len - 1);
5355
std::string hex(++p, 4);
54-
result += std::stoi(hex, nullptr, 16);
56+
result += codepoint_hex_to_utf8(hex);
5557
p += 3;
5658
break;
5759
}

src/util/unicode.cpp

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ Author: Daniel Kroening, [email protected]
88

99
#include "unicode.h"
1010

11+
#include <codecvt>
12+
#include <cstdint>
1113
#include <cstring>
12-
#include <locale>
1314
#include <iomanip>
15+
#include <locale>
1416
#include <sstream>
15-
#include <cstdint>
17+
18+
#include "invariant.h"
1619

1720
#ifdef _WIN32
1821
#include <util/pragma_push.def>
@@ -315,3 +318,33 @@ std::string utf16_native_endian_to_java(const std::wstring &in)
315318
utf16_native_endian_to_java(ch, result, loc);
316319
return result.str();
317320
}
321+
322+
std::string utf16_native_endian_to_utf8(const char16_t utf16_char)
323+
{
324+
return utf16_native_endian_to_utf8(std::u16string(1, utf16_char));
325+
}
326+
327+
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str)
328+
{
329+
#ifdef _MSC_VER
330+
// Workaround for Visual Studio bug, see
331+
// https://stackoverflow.com/questions/32055357
332+
std::wstring wide_string(utf16_str.begin(), utf16_str.end());
333+
return std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t>{}
334+
.to_bytes(wide_string);
335+
#else
336+
return std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}
337+
.to_bytes(utf16_str);
338+
#endif
339+
}
340+
341+
char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex)
342+
{
343+
PRECONDITION(hex.length() == 4);
344+
return std::strtol(hex.c_str(), nullptr, 16);
345+
}
346+
347+
std::string codepoint_hex_to_utf8(const std::string &hex)
348+
{
349+
return utf16_native_endian_to_utf8(codepoint_hex_to_utf16_native_endian(hex));
350+
}

src/util/unicode.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,26 @@ std::string utf16_native_endian_to_java(const std::wstring &in);
3131

3232
std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide);
3333

34+
/// \param utf16_char: UTF-16 character in architecture-native endianness
35+
/// encoding
36+
/// \return UTF-8 encoding of the same codepoint
37+
std::string utf16_native_endian_to_utf8(char16_t utf16_char);
38+
39+
/// \param utf16_str: UTF-16 string in architecture-native endianness encoding
40+
/// \return UTF-8 encoding of the string
41+
std::string utf16_native_endian_to_utf8(const std::u16string &utf16_str);
42+
43+
/// \param hex: representation of a BMP codepoint as a four-digit string
44+
/// (e.g.\ "0041" for \\u0041)
45+
/// \return encoding of the codepoint as a single UTF-16 character in
46+
/// architecture-native endianness encoding
47+
char16_t codepoint_hex_to_utf16_native_endian(const std::string &hex);
48+
49+
/// \param hex: representation of a BMP codepoint as a four-digit string
50+
/// (e.g.\ "0041" for \\u0041)
51+
/// \return UTF-8 encoding of the codepoint
52+
std::string codepoint_hex_to_utf8(const std::string &hex);
53+
3454
template <typename It>
3555
std::vector<const char *> to_c_str_array(It b, It e)
3656
{

unit/json/json_parser.cpp

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,17 +84,22 @@ SCENARIO("Loading JSON files")
8484
}
8585
}
8686
}
87-
GIVEN("A JSON file containing a hexadecimal Unicode character")
87+
GIVEN("A JSON file containing hexadecimal Unicode symbols")
8888
{
8989
temporary_filet unicode_json_file("cbmc_unit_json_parser_unicode", ".json");
9090
const std::string unicode_json_path = unicode_json_file();
9191
{
9292
std::ofstream unicode_json_out(unicode_json_path);
9393
unicode_json_out << "{\n"
94-
<< " \"special character\": \"\\u0001\"\n"
94+
<< " \"one\": \"\\u0001\",\n"
95+
<< " \"latin\": \"\\u0042\",\n"
96+
<< " \"grave\": \"\\u00E0\",\n"
97+
<< " \"trema\": \"\\u00FF\",\n"
98+
<< " \"high\": \"\\uFFFF\",\n"
99+
<< " \"several\": \"a\\u0041b\\u2FC3\\uFFFF\"\n"
95100
<< "}\n";
96101
}
97-
WHEN("Loading the JSON file with the special character")
102+
WHEN("Loading the JSON file with the Unicode symbols")
98103
{
99104
jsont unicode_json;
100105
const auto unicode_parse_error =
@@ -105,9 +110,25 @@ SCENARIO("Loading JSON files")
105110
REQUIRE(unicode_json.is_object());
106111

107112
const json_objectt &json_object = to_json_object(unicode_json);
108-
REQUIRE(json_object.find("special character") != json_object.end());
109-
REQUIRE(json_object["special character"].value.size() == 1);
110-
REQUIRE(json_object["special character"].value == "\u0001");
113+
114+
REQUIRE(json_object.find("one") != json_object.end());
115+
REQUIRE(json_object["one"].value.size() == 1);
116+
REQUIRE(json_object["one"].value == u8"\u0001");
117+
118+
REQUIRE(json_object.find("latin") != json_object.end());
119+
REQUIRE(json_object["latin"].value == "B");
120+
121+
REQUIRE(json_object.find("grave") != json_object.end());
122+
REQUIRE(json_object["grave"].value == "à");
123+
124+
REQUIRE(json_object.find("trema") != json_object.end());
125+
REQUIRE(json_object["trema"].value == "ÿ");
126+
127+
REQUIRE(json_object.find("high") != json_object.end());
128+
REQUIRE(json_object["high"].value == u8"\uFFFF");
129+
130+
REQUIRE(json_object.find("several") != json_object.end());
131+
REQUIRE(json_object["several"].value == u8"aAb\u2FC3\uFFFF");
111132
}
112133
}
113134
}

0 commit comments

Comments
 (0)