Skip to content

Explicit unsigned -> uint16_t casts to avoid conversion warnings #2543

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion jbmc/src/java_bytecode/expr2java.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ std::string expr2javat::convert_constant(
if(to_integer(src, int_value))
UNREACHABLE;

dest += "(char)'" + utf16_little_endian_to_java(int_value.to_long()) + '\'';
dest += "(char)'" + utf16_native_endian_to_java(int_value.to_long()) + '\'';
return dest;
}
else if(src.type()==java_byte_type())
Expand Down
2 changes: 1 addition & 1 deletion jbmc/src/java_bytecode/java_string_literals.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ symbol_exprt get_or_create_string_literal_symbol(
if(string_refinement_enabled)
{
const array_exprt data =
utf16_to_array(utf8_to_utf16(id2string(value), false));
utf16_to_array(utf8_to_utf16_native_endian(id2string(value)));

struct_exprt literal_init(new_symbol.type);
literal_init.operands().resize(jls_struct.components().size());
Expand Down
2 changes: 1 addition & 1 deletion src/ansi-c/literals/convert_string_literal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ std::basic_string<unsigned int> convert_one_string_literal(
unescape_wide_string(std::string(src, 3, src.size()-4));

// turn into utf-8
std::string utf8_value=utf32_to_utf8(value);
const std::string utf8_value = utf32_native_endian_to_utf8(value);

// pad into wide string
value.resize(utf8_value.size());
Expand Down
2 changes: 1 addition & 1 deletion src/ansi-c/literals/unescape_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ static void append_universal_char(
std::basic_string<unsigned int> value_str(1, value);

// turn into utf-8
std::string utf8_value=utf32_to_utf8(value_str);
const std::string utf8_value = utf32_native_endian_to_utf8(value_str);

dest.append(utf8_value);
}
Expand Down
2 changes: 1 addition & 1 deletion src/ansi-c/scanner.l
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ int make_identifier()
utf32+=letter;

// turn into utf-8
std::string utf8_value=utf32_to_utf8(utf32);
const std::string utf8_value = utf32_native_endian_to_utf8(utf32);
final_base_name+=utf8_value;
}
else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ utf16_constant_array_to_java(const array_exprt &arr, std::size_t length)
INVARIANT(!conversion_failed, "constant should be convertible to unsigned");
out[i]=c;
}
return utf16_little_endian_to_java(out);
return utf16_native_endian_to_java(out);
}

/// Formatted string using a format string and list of arguments
Expand Down
2 changes: 1 addition & 1 deletion src/util/file_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ void delete_directory_utf16(const std::wstring &path)
void delete_directory(const std::string &path)
{
#ifdef _WIN32
delete_directory_utf16(utf8_to_utf16_little_endian(path));
delete_directory_utf16(utf8_to_utf16_native_endian(path));
#else
DIR *dir=opendir(path.c_str());
if(dir!=nullptr)
Expand Down
87 changes: 24 additions & 63 deletions src/util/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,6 @@ Author: Daniel Kroening, [email protected]
#include <windows.h>
#endif

/// Determine endianness of the architecture
/// \return True if the architecture is little_endian
bool is_little_endian_arch()
{
uint32_t i=1;
return reinterpret_cast<uint8_t &>(i) != 0;
}

#define BUFSIZE 100

std::string narrow(const wchar_t *s)
{
#ifdef _WIN32
Expand Down Expand Up @@ -138,9 +128,10 @@ static void utf8_append_code(unsigned int c, std::string &result)
}
}

/// \param utf32:encoded wide string
/// \param s UTF-32 encoded wide string
/// \return utf8-encoded string with the same unicode characters as the input.
std::string utf32_to_utf8(const std::basic_string<unsigned int> &s)
std::string
utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s)
{
std::string result;

Expand All @@ -166,51 +157,37 @@ std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide)
return argv_narrow;
}

/// A helper function for dealing with different UTF16 endians
/// \par parameters: A 16-bit integer
/// \return A 16-bit integer with bytes swapped
uint16_t do_swap_bytes(uint16_t x)
{
uint16_t b1=x & 0xFF;
uint16_t b2=x & 0xFF00;
return (b1 << 8) | (b2 >> 8);
}


/// Append the UTF-16 encoding of a Unicode code point to a wide string, in
/// architecture-native endianness.
/// \param code: Unicode code point to encode
/// \param result: wide string that receives the one or two UTF-16 code units
static void utf16_append_code(unsigned int code, std::wstring &result)
{
  // we do not treat 0xD800 to 0xDFFF, although
  // they are not valid unicode symbols

  if(code <= 0xFFFF)
  {
    // code is encoded as one UTF16 character; this covers the whole Basic
    // Multilingual Plane, including U+FFFF itself. Testing `code < 0xFFFF`
    // would push U+FFFF into the surrogate branch below, where
    // `code - 0x10000` wraps around and yields a bogus surrogate pair.
    result += static_cast<wchar_t>(code);
  }
  else // code is encoded as two UTF16 characters
  {
    // if this is valid unicode, we have
    // code<0x10FFFF
    // but let's not check it programmatically

    // encode the code in UTF16: the offset from 0x10000 is split into a high
    // surrogate (upper 10 bits) and a low surrogate (lower 10 bits)
    code = code - 0x10000;
    const uint16_t i1 = static_cast<uint16_t>(((code >> 10) & 0x3ff) | 0xD800);
    result += static_cast<wchar_t>(i1);
    const uint16_t i2 = static_cast<uint16_t>((code & 0x3ff) | 0xDC00);
    result += static_cast<wchar_t>(i2);
  }
}


/// \par parameters: String in UTF-8 format, bool value indicating whether the
/// endianness should be different from the architecture one.
/// Convert UTF8-encoded string to UTF-16 with architecture-native endianness.
/// \par parameters: String in UTF-8 format
/// \return String in UTF-16 format, encoded with the endianness of the
/// architecture.
std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
std::wstring utf8_to_utf16_native_endian(const std::string &in)
{
std::wstring result;
result.reserve(in.size());
Expand Down Expand Up @@ -263,33 +240,17 @@ std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
code=32;
}

utf16_append_code(code, swap_bytes, result);
utf16_append_code(code, result);
}

return result;
}

/// \par parameters: String in UTF-8 format
/// \return String in UTF-16BE format
std::wstring utf8_to_utf16_big_endian(const std::string &in)
{
bool swap_bytes=is_little_endian_arch();
return utf8_to_utf16(in, swap_bytes);
}

/// \par parameters: String in UTF-8 format
/// \return String in UTF-16LE format
std::wstring utf8_to_utf16_little_endian(const std::string &in)
{
bool swap_bytes=!is_little_endian_arch();
return utf8_to_utf16(in, swap_bytes);
}

/// \param ch: UTF-16LE character
/// \param ch: UTF-16 character in architecture-native endianness encoding
/// \param result: stream to receive string in US-ASCII format, with \\uxxxx
/// escapes for other characters
/// \param loc: locale to check for printable characters
static void utf16_little_endian_to_java(
static void utf16_native_endian_to_java(
const wchar_t ch,
std::ostringstream &result,
const std::locale &loc)
Expand Down Expand Up @@ -326,23 +287,23 @@ static void utf16_little_endian_to_java(
}
}

/// \param ch: UTF-16LE character
/// \param ch: UTF-16 character in architecture-native endianness encoding
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
std::string utf16_little_endian_to_java(const wchar_t ch)
std::string utf16_native_endian_to_java(const wchar_t ch)
{
std::ostringstream result;
const std::locale loc;
utf16_little_endian_to_java(ch, result, loc);
utf16_native_endian_to_java(ch, result, loc);
return result.str();
}

/// \param in: String in UTF-16LE format
/// \param in: String in UTF-16 (native endianness) format
/// \return String in US-ASCII format, with \\uxxxx escapes for other characters
std::string utf16_little_endian_to_java(const std::wstring &in)
std::string utf16_native_endian_to_java(const std::wstring &in)
{
std::ostringstream result;
const std::locale loc;
for(const auto ch : in)
utf16_little_endian_to_java(ch, result, loc);
utf16_native_endian_to_java(ch, result, loc);
return result.str();
}
11 changes: 5 additions & 6 deletions src/util/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,12 @@ std::wstring widen(const char *s);
std::string narrow(const std::wstring &s);
std::wstring widen(const std::string &s);

std::string utf32_to_utf8(const std::basic_string<unsigned int> &s);
std::string
utf32_native_endian_to_utf8(const std::basic_string<unsigned int> &s);

std::wstring utf8_to_utf16(const std::string &in, bool swap_bytes);
std::wstring utf8_to_utf16_big_endian(const std::string &);
std::wstring utf8_to_utf16_little_endian(const std::string &);
std::string utf16_little_endian_to_java(const wchar_t ch);
std::string utf16_little_endian_to_java(const std::wstring &in);
std::wstring utf8_to_utf16_native_endian(const std::string &in);
std::string utf16_native_endian_to_java(const wchar_t ch);
std::string utf16_native_endian_to_java(const std::wstring &in);

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that everything is "native endian", does that need to be mentioned in the name of the function?
It's implicit in any function that takes numbers as arguments.

std::vector<std::string> narrow_argv(int argc, const wchar_t **argv_wide);

Expand Down
1 change: 0 additions & 1 deletion unit/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ list(REMOVE_ITEM sources
${CMAKE_CURRENT_SOURCE_DIR}/json.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp_parser.cpp
${CMAKE_CURRENT_SOURCE_DIR}/osx_fat_reader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/unicode.cpp
${CMAKE_CURRENT_SOURCE_DIR}/wp.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp_scanner.cpp
${CMAKE_CURRENT_SOURCE_DIR}/float_utils.cpp
Expand Down
1 change: 1 addition & 0 deletions unit/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ SRC += unit_tests.cpp \
util/string_utils/split_string.cpp \
util/string_utils/strip_string.cpp \
util/symbol_table.cpp \
util/unicode.cpp \
catch_example.cpp \
# Empty last line

Expand Down
67 changes: 36 additions & 31 deletions unit/unicode.cpp → unit/util/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,22 @@ Author: Vojtech Forejt, [email protected]

\*******************************************************************/

#include <cassert>
#include <testing-utils/catch.hpp>

#include <vector>
#include <string>
#include <codecvt>
#include <iomanip>
#include <iostream>
#include <locale>

#include <util/unicode.h>

// the u8 prefix is only available from VS 2015 onwards
#if !defined(_MSC_VER) || _MSC_VER >= 1900
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check with @kroening but I think this is our minimum compiler on MS environments anyhow.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, at least AppVeyor's VS seems to be older than that (it does not support the u8 prefix).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll do a PR that upgrades AppVeyor to 2015. There are no stakeholders that I know of that still care about VS 2013.


// This unit test compares our implementation with codecvt implementation,
// checking bit-by-bit equivalence of results.

bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b)
static bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b)
{
if(a.size() != b.size())
return false;
Expand All @@ -35,7 +37,10 @@ bool paranoid_wstr_equals(const std::wstring &a, const std::wstring &b)
}

// helper print function, can be called when debugging a problem
void wstr_print(const std::wstring &a, const std::wstring &b)
#if 0
#include <iostream>

static void wstr_print(const std::wstring &a, const std::wstring &b)
{
int endi=(a.size()>b.size())?a.size():b.size();
const unsigned char
Expand All @@ -49,46 +54,46 @@ void wstr_print(const std::wstring &a, const std::wstring &b)
}
std::cout << '\n';
}
#endif

void compare_utf8_to_utf16_big_endian(std::string& in)
static bool compare_utf8_to_utf16(const std::string &in)
{
std::wstring s1=utf8_to_utf16_big_endian(in);
const std::wstring s1 = utf8_to_utf16_native_endian(in);

typedef std::codecvt_utf8_utf16<wchar_t> codecvt_utf8_utf16t;
std::wstring_convert<codecvt_utf8_utf16t> converter;
std::wstring s2=converter.from_bytes(in);

assert(paranoid_wstr_equals(s1, s2));
return paranoid_wstr_equals(s1, s2);
}

void compare_utf8_to_utf16_little_endian(std::string& in)
TEST_CASE("unicode0", "[core][util][unicode]")
{
std::wstring s1=utf8_to_utf16_little_endian(in);

const std::codecvt_mode mode=std::codecvt_mode::little_endian;
const unsigned long maxcode=0x10ffff;
const std::string s = u8"abc";
REQUIRE(compare_utf8_to_utf16(s));
}

typedef std::codecvt_utf8_utf16<wchar_t, maxcode, mode> codecvt_utf8_utf16t;
std::wstring_convert<codecvt_utf8_utf16t> converter;
std::wstring s2=converter.from_bytes(in);
TEST_CASE("unicode1", "[core][util][unicode]")
{
const std::string s = u8"\u0070\u00DF\u00E0\u00EF\u00F0\u00F7\u00F8";
REQUIRE(compare_utf8_to_utf16(s));
}

assert(paranoid_wstr_equals(s1, s2));
TEST_CASE("unicode2", "[core][util][unicode]")
{
const std::string s = u8"$¢€𐍈";
REQUIRE(compare_utf8_to_utf16(s));
}

int main()
TEST_CASE("unicode3", "[core][util][unicode]")
{
std::string s;
s=u8"\u0070\u00DF\u00E0\u00EF\u00F0\u00F7\u00F8";
compare_utf8_to_utf16_big_endian(s);
compare_utf8_to_utf16_little_endian(s);
s=u8"$¢€𐍈";
compare_utf8_to_utf16_big_endian(s);
compare_utf8_to_utf16_little_endian(s);
s=u8"𐐏𤭢";
compare_utf8_to_utf16_big_endian(s);
compare_utf8_to_utf16_little_endian(s);
s=u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ";
compare_utf8_to_utf16_big_endian(s);
compare_utf8_to_utf16_little_endian(s);
const std::string s = u8"𐐏𤭢";
REQUIRE(compare_utf8_to_utf16(s));
}

TEST_CASE("unicode4", "[core][util][unicode]")
{
const std::string s = u8"дȚȨɌṡʒʸͼἨѶݔݺ→⅒⅀▤▞╢◍⛳⻥龍ンㄗㄸ";
REQUIRE(compare_utf8_to_utf16(s));
}
#endif