Skip to content

Commit c7457fb

Browse files
author
Daniel Kroening
authored
Merge pull request diffblue#2543 from tautschnig/vs-unsigned-byte-swap
Explicit unsigned -> uint16_t casts to avoid conversion warnings
2 parents 0e72433 + 9b9aecf commit c7457fb

12 files changed

+73
-108
lines changed

jbmc/src/java_bytecode/expr2java.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ std::string expr2javat::convert_constant(
202202
if(to_integer(src, int_value))
203203
UNREACHABLE;
204204

205-
dest += "(char)'" + utf16_little_endian_to_java(int_value.to_long()) + '\'';
205+
dest += "(char)'" + utf16_native_endian_to_java(int_value.to_long()) + '\'';
206206
return dest;
207207
}
208208
else if(src.type()==java_byte_type())

jbmc/src/java_bytecode/java_string_literals.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ symbol_exprt get_or_create_string_literal_symbol(
106106
if(string_refinement_enabled)
107107
{
108108
const array_exprt data =
109-
utf16_to_array(utf8_to_utf16(id2string(value), false));
109+
utf16_to_array(utf8_to_utf16_native_endian(id2string(value)));
110110

111111
struct_exprt literal_init(new_symbol.type);
112112
literal_init.operands().resize(jls_struct.components().size());

src/ansi-c/literals/convert_string_literal.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ std::basic_string<unsigned int> convert_one_string_literal(
3434
unescape_wide_string(std::string(src, 3, src.size()-4));
3535

3636
// turn into utf-8
37-
std::string utf8_value=utf32_to_utf8(value);
37+
const std::string utf8_value = utf32_native_endian_to_utf8(value);
3838

3939
// pad into wide string
4040
value.resize(utf8_value.size());

src/ansi-c/literals/unescape_string.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ static void append_universal_char(
2323
std::basic_string<unsigned int> value_str(1, value);
2424

2525
// turn into utf-8
26-
std::string utf8_value=utf32_to_utf8(value_str);
26+
const std::string utf8_value = utf32_native_endian_to_utf8(value_str);
2727

2828
dest.append(utf8_value);
2929
}

src/ansi-c/scanner.l

+1-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ int make_identifier()
6767
utf32+=letter;
6868

6969
// turn into utf-8
70-
std::string utf8_value=utf32_to_utf8(utf32);
70+
const std::string utf8_value = utf32_native_endian_to_utf8(utf32);
7171
final_base_name+=utf8_value;
7272
}
7373
else

src/solvers/refinement/string_constraint_generator_format.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -444,7 +444,7 @@ utf16_constant_array_to_java(const array_exprt &arr, std::size_t length)
444444
INVARIANT(!conversion_failed, "constant should be convertible to unsigned");
445445
out[i]=c;
446446
}
447-
return utf16_little_endian_to_java(out);
447+
return utf16_native_endian_to_java(out);
448448
}
449449

450450
/// Formatted string using a format string and list of arguments

src/util/file_util.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ void delete_directory_utf16(const std::wstring &path)
9595
void delete_directory(const std::string &path)
9696
{
9797
#ifdef _WIN32
98-
delete_directory_utf16(utf8_to_utf16_little_endian(path));
98+
delete_directory_utf16(utf8_to_utf16_native_endian(path));
9999
#else
100100
DIR *dir=opendir(path.c_str());
101101
if(dir!=nullptr)

src/util/unicode.cpp

+24-63
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,6 @@ Author: Daniel Kroening, [email protected]
1818
#include <windows.h>
1919
#endif
2020

21-
/// Determine endianness of the architecture
22-
/// \return True if the architecture is little_endian
23-
bool is_little_endian_arch()
24-
{
25-
uint32_t i=1;
26-
return reinterpret_cast<uint8_t &>(i) != 0;
27-
}
28-
29-
#define BUFSIZE 100
30-
3121
std::string narrow(const wchar_t *s)
3222
{
3323
#ifdef _WIN32
@@ -138,9 +128,10 @@ static void utf8_append_code(unsigned int c, std::string &result)
138128
}
139129
}
140130

141-
/// \param utf32:encoded wide string
131+
/// \param s UTF-32 encoded wide string
142132
/// \return utf8-encoded string with the same unicode characters as the input.
143-
std::string utf32_to_utf8(const std::basic_string<unsigned int> &s)
133+
std::string
134+
utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s)
144135
{
145136
std::string result;
146137

@@ -166,51 +157,37 @@ std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
166157
return argv_narrow;
167158
}
168159

169-
/// A helper function for dealing with different UTF16 endians
170-
/// \par parameters: A 16-bit integer
171-
/// \return A 16-bit integer with bytes swapped
172-
uint16_t do_swap_bytes(uint16_t x)
173-
{
174-
uint16_t b1=x & 0xFF;
175-
uint16_t b2=x & 0xFF00;
176-
return (b1 << 8) | (b2 >> 8);
177-
}
178-
179-
180-
void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result)
160+
static void utf16_append_code(unsigned int code, std::wstring &result)
181161
{
182162
// we do not give 0xD800 to 0xDFFF any special treatment, although
183163
// they are not valid unicode symbols
184164

185165
if(code<0xFFFF)
186-
{ // code is encoded as one UTF16 character
187-
// we just take the code and possibly swap the bytes
188-
unsigned int a=(swap_bytes)?do_swap_bytes(code):code;
189-
result+=static_cast<wchar_t>(a);
166+
{
167+
// code is encoded as one UTF16 character
168+
result += static_cast<wchar_t>(code);
190169
}
191170
else // code is encoded as two UTF16 characters
192171
{
193172
// if this is valid unicode, we have
194173
// code<=0x10FFFF
195174
// but let's not check it programmatically
196175

197-
// encode the code in UTF16, possibly swapping bytes.
176+
// encode the code in UTF16
198177
code=code-0x10000;
199-
unsigned int i1=((code>>10) & 0x3ff) | 0xD800;
200-
unsigned int a1=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i1)):i1;
201-
result+=static_cast<wchar_t>(a1);
202-
unsigned int i2=(code & 0x3ff) | 0xDC00;
203-
unsigned int a2=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i2)):i2;
204-
result+=static_cast<wchar_t>(a2);
178+
const uint16_t i1 = static_cast<uint16_t>(((code >> 10) & 0x3ff) | 0xD800);
179+
result += static_cast<wchar_t>(i1);
180+
const uint16_t i2 = static_cast<uint16_t>((code & 0x3ff) | 0xDC00);
181+
result += static_cast<wchar_t>(i2);
205182
}
206183
}
207184

208185

209-
/// \par parameters: String in UTF-8 format, bool value indicating whether the
210-
/// endianness should be different from the architecture one.
186+
/// Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
187+
/// \par parameters: String in UTF-8 format
211188
/// \return String in UTF-16 format. The encoding follows the endianness of the
212189
/// architecture.
213-
std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
190+
std::wstring utf8_to_utf16_native_endian(const std::string &in)
214191
{
215192
std::wstring result;
216193
result.reserve(in.size());
@@ -263,33 +240,17 @@ std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
263240
code=32;
264241
}
265242

266-
utf16_append_code(code, swap_bytes, result);
243+
utf16_append_code(code, result);
267244
}
268245

269246
return result;
270247
}
271248

272-
/// \par parameters: String in UTF-8 format
273-
/// \return String in UTF-16BE format
274-
std::wstring utf8_to_utf16_big_endian(const std::string &in)
275-
{
276-
bool swap_bytes=is_little_endian_arch();
277-
return utf8_to_utf16(in, swap_bytes);
278-
}
279-
280-
/// \par parameters: String in UTF-8 format
281-
/// \return String in UTF-16LE format
282-
std::wstring utf8_to_utf16_little_endian(const std::string &in)
283-
{
284-
bool swap_bytes=!is_little_endian_arch();
285-
return utf8_to_utf16(in, swap_bytes);
286-
}
287-
288-
/// \param ch: UTF-16LE character
249+
/// \param ch: UTF-16 character in architecture-native endianness encoding
289250
/// \param result: stream to receive string in US-ASCII format, with \\uxxxx
290251
/// escapes for other characters
291252
/// \param loc: locale to check for printable characters
292-
static void utf16_little_endian_to_java(
253+
static void utf16_native_endian_to_java(
293254
const wchar_t ch,
294255
std::ostringstream &result,
295256
const std::locale &loc)
@@ -326,23 +287,23 @@ static void utf16_little_endian_to_java(
326287
}
327288
}
328289

329-
/// \param ch: UTF-16LE character
290+
/// \param ch: UTF-16 character in architecture-native endianness encoding
330291
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
331-
std::string utf16_little_endian_to_java(const wchar_t ch)
292+
std::string utf16_native_endian_to_java(const wchar_t ch)
332293
{
333294
std::ostringstream result;
334295
const std::locale loc;
335-
utf16_little_endian_to_java(ch, result, loc);
296+
utf16_native_endian_to_java(ch, result, loc);
336297
return result.str();
337298
}
338299

339-
/// \param in: String in UTF-16LE format
300+
/// \param in: String in UTF-16 (native endianness) format
340301
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
341-
std::string utf16_little_endian_to_java(const std::wstring &in)
302+
std::string utf16_native_endian_to_java(const std::wstring &in)
342303
{
343304
std::ostringstream result;
344305
const std::locale loc;
345306
for(const auto ch : in)
346-
utf16_little_endian_to_java(ch, result, loc);
307+
utf16_native_endian_to_java(ch, result, loc);
347308
return result.str();
348309
}

src/util/unicode.h

+5-6
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,12 @@ std::wstring widen(const char *s);
2222
std::string narrow(const std::wstring &s);
2323
std::wstring widen(const std::string &s);
2424

25-
std::string utf32_to_utf8(const std::basic_string<unsigned int> &s);
25+
std::string
26+
utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s);
2627

27-
std::wstring utf8_to_utf16(const std::string &in, bool swap_bytes);
28-
std::wstring utf8_to_utf16_big_endian(const std::string &);
29-
std::wstring utf8_to_utf16_little_endian(const std::string &);
30-
std::string utf16_little_endian_to_java(const wchar_t ch);
31-
std::string utf16_little_endian_to_java(const std::wstring &in);
28+
std::wstring utf8_to_utf16_native_endian(const std::string &in);
29+
std::string utf16_native_endian_to_java(const wchar_t ch);
30+
std::string utf16_native_endian_to_java(const std::wstring &in);
3231

3332
std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide);
3433

unit/CMakeLists.txt

-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ list(REMOVE_ITEM sources
1313
${CMAKE_CURRENT_SOURCE_DIR}/json.cpp
1414
${CMAKE_CURRENT_SOURCE_DIR}/cpp_parser.cpp
1515
${CMAKE_CURRENT_SOURCE_DIR}/osx_fat_reader.cpp
16-
${CMAKE_CURRENT_SOURCE_DIR}/unicode.cpp
1716
${CMAKE_CURRENT_SOURCE_DIR}/wp.cpp
1817
${CMAKE_CURRENT_SOURCE_DIR}/cpp_scanner.cpp
1918
${CMAKE_CURRENT_SOURCE_DIR}/float_utils.cpp

unit/Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ SRC += unit_tests.cpp \
3737
util/string_utils/split_string.cpp \
3838
util/string_utils/strip_string.cpp \
3939
util/symbol_table.cpp \
40+
util/unicode.cpp \
4041
catch_example.cpp \
4142
# Empty last line
4243

unit/unicode.cpp renamed to unit/util/unicode.cpp

+36-31
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,22 @@ Author: Vojtech Forejt, [email protected]
66
77
\*******************************************************************/
88

9-
#include <cassert>
9+
#include <testing-utils/catch.hpp>
10+
1011
#include <vector>
1112
#include <string>
1213
#include <codecvt>
13-
#include <iomanip>
14-
#include <iostream>
1514
#include <locale>
1615

1716
#include <util/unicode.h>
1817

18+
// the u8 prefix is only available from VS 2015 onwards
19+
#if !defined(_MSC_VER) || _MSC_VER >= 1900
20+
1921
// This unit test compares our implementation with codecvt implementation,
2022
// checking bit-by-bit equivalence of results.
2123

22-
bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b)
24+
static bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b)
2325
{
2426
if(a.size() != b.size())
2527
return false;
@@ -35,7 +37,10 @@ bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b)
3537
}
3638

3739
// helper print function; can be called when debugging problems
38-
void wstr_print(const std::wstring &a, const std::wstring &b)
40+
#if 0
41+
#include <iostream>
42+
43+
static void wstr_print(const std::wstring &a, const std::wstring &b)
3944
{
4045
int endi=(a.size()>b.size())?a.size():b.size();
4146
const unsigned char
@@ -49,46 +54,46 @@ void wstr_print(const std::wstring &a, const std::wstring &b)
4954
}
5055
std::cout << '\n';
5156
}
57+
#endif
5258

53-
void compare_utf8_to_utf16_big_endian(std::string& in)
59+
static bool compare_utf8_to_utf16(const std::string &in)
5460
{
55-
std::wstring s1=utf8_to_utf16_big_endian(in);
61+
const std::wstring s1 = utf8_to_utf16_native_endian(in);
5662

5763
typedef std::codecvt_utf8_utf16<wchar_t> codecvt_utf8_utf16t;
5864
std::wstring_convert<codecvt_utf8_utf16t> converter;
5965
std::wstring s2=converter.from_bytes(in);
6066

61-
assert(paranoid_wstr_equals(s1, s2));
67+
return paranoid_wstr_equals(s1, s2);
6268
}
6369

64-
void compare_utf8_to_utf16_little_endian(std::string& in)
70+
TEST_CASE("unicode0", "[core][util][unicode]")
6571
{
66-
std::wstring s1=utf8_to_utf16_little_endian(in);
67-
68-
const std::codecvt_mode mode=std::codecvt_mode::little_endian;
69-
const unsigned long maxcode=0x10ffff;
72+
const std::string s = u8"abc";
73+
REQUIRE(compare_utf8_to_utf16(s));
74+
}
7075

71-
typedef std::codecvt_utf8_utf16<wchar_t, maxcode, mode> codecvt_utf8_utf16t;
72-
std::wstring_convert<codecvt_utf8_utf16t> converter;
73-
std::wstring s2=converter.from_bytes(in);
76+
TEST_CASE("unicode1", "[core][util][unicode]")
77+
{
78+
const std::string s = u8"\u0070\u00DF\u00E0\u00EF\u00F0\u00F7\u00F8";
79+
REQUIRE(compare_utf8_to_utf16(s));
80+
}
7481

75-
assert(paranoid_wstr_equals(s1, s2));
82+
TEST_CASE("unicode2", "[core][util][unicode]")
83+
{
84+
const std::string s = u8"$¢€𐍈";
85+
REQUIRE(compare_utf8_to_utf16(s));
7686
}
7787

78-
int main()
88+
TEST_CASE("unicode3", "[core][util][unicode]")
7989
{
80-
std::string s;
81-
s=u8"\u0070\u00DF\u00E0\u00EF\u00F0\u00F7\u00F8";
82-
compare_utf8_to_utf16_big_endian(s);
83-
compare_utf8_to_utf16_little_endian(s);
84-
s=u8"$¢€𐍈";
85-
compare_utf8_to_utf16_big_endian(s);
86-
compare_utf8_to_utf16_little_endian(s);
87-
s=u8"𐐏𤭢";
88-
compare_utf8_to_utf16_big_endian(s);
89-
compare_utf8_to_utf16_little_endian(s);
90-
s=u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ";
91-
compare_utf8_to_utf16_big_endian(s);
92-
compare_utf8_to_utf16_little_endian(s);
90+
const std::string s = u8"𐐏𤭢";
91+
REQUIRE(compare_utf8_to_utf16(s));
9392
}
9493

94+
TEST_CASE("unicode4", "[core][util][unicode]")
95+
{
96+
const std::string s = u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ";
97+
REQUIRE(compare_utf8_to_utf16(s));
98+
}
99+
#endif

0 commit comments

Comments
 (0)