Unicode support cleanup #868
src/util/unicode.cpp
@@ -8,9 +8,9 @@ Author: Daniel Kroening, [email protected]

 #include <cstring>
-#include <locale>
-#include <codecvt>
 #include <iomanip>
 #include <sstream>
+#include <cstdint>

 #include "unicode.h"
@@ -20,6 +20,24 @@ Author: Daniel Kroening, [email protected]

 /*******************************************************************\

+Function: is_little_endian_arch
+
+  Inputs:
+
+  Outputs: True if the architecture is little-endian
+
+  Purpose: Determine endianness of the architecture
+
+\*******************************************************************/
+
+bool is_little_endian_arch()
+{
+  uint32_t i=1;
+  return reinterpret_cast<uint8_t &>(i);
+}
+
+/*******************************************************************\

 Function: narrow

   Inputs:
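Side note: the detection idiom added above reads the lowest-addressed byte of a 32-bit value. A minimal self-contained sketch (mine, not part of the diff) showing the same check as a standalone program:

#include <cstdint>
#include <cstdio>

bool is_little_endian_arch()
{
  // the value 1 is stored as the bytes 01 00 00 00 on a little-endian
  // machine, so the first byte read through a uint8_t reference is 1;
  // on a big-endian machine it is 0
  uint32_t i=1;
  return reinterpret_cast<uint8_t &>(i);
}

int main()
{
  std::printf("little-endian: %s\n", is_little_endian_arch()?"yes":"no");
  return 0;
}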
@@ -154,17 +172,17 @@ std::wstring widen(const std::string &s)

 /*******************************************************************\

-Function: utf32_to_utf8
+Function: utf8_append_code

-  Inputs:
+  Inputs: character to append, string to append to

   Outputs:

-  Purpose:
+  Purpose: Appends a unicode character to a utf8-encoded string

 \*******************************************************************/

-void utf32_to_utf8(unsigned int c, std::string &result)
+static void utf8_append_code(unsigned int c, std::string &result)
 {
   if(c<=0x7f)
     result+=static_cast<char>(c);
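For reference, the standard UTF-8 layout that utf8_append_code implements (summary table mine; the hunk above only shows the one-byte case):

// code point range    | bytes | encoding
// U+0000..U+007F      |   1   | 0xxxxxxx
// U+0080..U+07FF      |   2   | 110xxxxx 10xxxxxx
// U+0800..U+FFFF      |   3   | 1110xxxx 10xxxxxx 10xxxxxx
// U+10000..U+10FFFF   |   4   | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx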
@@ -192,9 +210,10 @@ void utf32_to_utf8(unsigned int c, std::string &result)

 Function: utf32_to_utf8

-  Inputs:
+  Inputs: utf32-encoded wide string

-  Outputs:
+  Outputs: utf8-encoded string with the same unicode characters
+           as the input.

   Purpose:
@@ -207,14 +226,14 @@ std::string utf32_to_utf8(const std::basic_string<unsigned int> &s)

   result.reserve(s.size()); // at least that long

   for(const auto c : s)
-    utf32_to_utf8(c, result);
+    utf8_append_code(c, result);

   return result;
 }

 /*******************************************************************\

-Function: utf16_to_utf8
+Function: narrow_argv

   Inputs:
@@ -224,43 +243,140 @@ Function: utf16_to_utf8

 \*******************************************************************/

-std::string utf16_to_utf8(const std::basic_string<unsigned short int> &s)
+const char **narrow_argv(int argc, const wchar_t **argv_wide)
 {
-  std::string result;
+  if(argv_wide==NULL)
+    return NULL;

-  result.reserve(s.size()); // at least that long
+  // the following never gets deleted
+  const char **argv_narrow=new const char *[argc+1];
+  argv_narrow[argc]=0;

-  for(const auto c : s)
-    utf32_to_utf8(c, result);
+  for(int i=0; i<argc; i++)
+    argv_narrow[i]=strdup(narrow(argv_wide[i]).c_str());

-  return result;
+  return argv_narrow;
 }
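For context, narrow_argv converts a wide argv into a narrow one so that a wide-character entry point can hand off to narrow-string code. A hedged usage sketch (wmain in this role and run_main are my assumptions, not code from this PR):

const char **narrow_argv(int argc, const wchar_t **argv_wide); // from above

int run_main(int argc, const char **argv); // hypothetical narrow entry point

// Windows-style wide entry point handing off to narrow-string code
int wmain(int argc, const wchar_t **argv_wide)
{
  const char **argv=narrow_argv(argc, argv_wide);
  return run_main(argc, argv);
}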

 /*******************************************************************\

-Function: narrow_argv
+Function: do_swap_bytes

-  Inputs:
+  Inputs: A 16-bit integer

-  Outputs:
+  Outputs: A 16-bit integer with bytes swapped

-  Purpose:
+  Purpose: A helper function for dealing with different UTF16 endiannesses

 \*******************************************************************/

-const char **narrow_argv(int argc, const wchar_t **argv_wide)
+uint16_t do_swap_bytes(uint16_t x)
 {
-  if(argv_wide==NULL)
-    return NULL;
+  uint16_t b1=x & 0xFF;
+  uint16_t b2=x & 0xFF00;
+  return (b1 << 8) | (b2 >> 8);
 }
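A quick worked example of the swap (numbers mine): for x=0x1234, b1=0x0034 and b2=0x1200, so the result is 0x3400|0x0012=0x3412. Applying the helper twice returns the original value, so the same function converts in both directions:

#include <cassert>
#include <cstdint>

uint16_t do_swap_bytes(uint16_t x); // as defined in the diff above

int main()
{
  assert(do_swap_bytes(0x1234)==0x3412);
  // swapping is an involution: doing it twice is the identity
  assert(do_swap_bytes(do_swap_bytes(0x1234))==0x1234);
  return 0;
}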

-  // the following never gets deleted
-  const char **argv_narrow=new const char *[argc+1];
-  argv_narrow[argc]=0;

-  for(int i=0; i<argc; i++)
-    argv_narrow[i]=strdup(narrow(argv_wide[i]).c_str());
+void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result)
+{
+  // we do not treat the range 0xD800 to 0xDFFF specially, even though
+  // these values are not valid unicode code points
+
+  if(code<0xFFFF)
+  { // code is encoded as one UTF16 character:
+    // we just take the code and possibly swap the bytes
+    unsigned int a=(swap_bytes)?do_swap_bytes(code):code;
+    result+=static_cast<wchar_t>(a);
+  }
+  else // code is encoded as two UTF16 characters
+  {
+    // if this is valid unicode, we have
+    // code<0x10FFFF
+    // but let's not check it programmatically
+
+    // encode the code in UTF16, possibly swapping bytes
+    code=code-0x10000;
+    unsigned int i1=((code>>10) & 0x3ff) | 0xD800;
+    unsigned int a1=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i1)):i1;
+    result+=static_cast<wchar_t>(a1);
+    unsigned int i2=(code & 0x3ff) | 0xDC00;
+    unsigned int a2=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i2)):i2;
+    result+=static_cast<wchar_t>(a2);
+  }
+}
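To make the surrogate-pair branch concrete, here is a worked example (mine, not from the PR) for U+1F600. Note in passing that the one-unit test reads code<0xFFFF, so U+FFFF itself falls into the surrogate branch; the single-unit range is conventionally delimited with <=0xFFFF.

// encoding U+1F600 as a UTF-16 surrogate pair:
//   code = 0x1F600 - 0x10000                  = 0x0F600
//   i1   = ((0x0F600 >> 10) & 0x3FF) | 0xD800 = 0x003D | 0xD800 = 0xD83D
//   i2   = (0x0F600 & 0x3FF) | 0xDC00         = 0x0200 | 0xDC00 = 0xDE00
// so the string receives the two code units 0xD83D 0xDE00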

-  return argv_narrow;

+/*******************************************************************\
+
+Function: utf8_to_utf16
+
+  Inputs: String in UTF-8 format, bool value indicating whether the
+          endianness should be different from the architecture one.
+
+  Outputs: String in UTF-16 format. The encoding follows the
+           endianness of the architecture iff swap_bytes is false.
+
+  Purpose:
+
+\*******************************************************************/
+std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
+{
+  std::wstring result;
+  result.reserve(in.size());
+  int i=0;
+  while(i<in.size())
+  {
+    unsigned char c=in[i++];
+    unsigned int code=0;
+    // the ifs that follow determine how many UTF8 bytes (1-4) encode the
+    // next unicode character, based on the most significant bits of the
+    // first byte
+    if(c<=0x7F)
+    {
+      // a single-byte sequence: the code is exactly the byte value
+      code=c;
+    }
+    else if(c<=0xDF && i<in.size())
+    { // in the other cases, we need to read the right number of bytes and
+      // decode them; note: to catch malformed strings we would also check
+      // that each byte after the first one starts with the bits 10
+      code=(c & 0x1F) << 6;
+      c=in[i++];
+      code+=c & 0x3F;
+    }
+    else if(c<=0xEF && i+1<in.size())
+    {
+      code=(c & 0xF) << 12;
+      c=in[i++];
+      code+=(c & 0x3F) << 6;
+      c=in[i++];
+      code+=c & 0x3F;
+    }
+    else if(c<=0xF7 && i+2<in.size())
+    {
+      code=(c & 0x7) << 18;
+      c=in[i++];
+      code+=(c & 0x3F) << 12;
+      c=in[i++];
+      code+=(c & 0x3F) << 6;
+      c=in[i++];
+      code+=c & 0x3F;
+    }
+    else
+    {
+      // the string is not valid UTF8: either bytes are missing from a
+      // multi-byte sequence, or the lead byte has too high a value;
+      // for now, replace the offending character with a space
+      code=32;
+    }

+    utf16_append_code(code, swap_bytes, result);
+  }

+  return result;
+}
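A worked example of the decoder (mine): the euro sign U+20AC arrives as the three-byte sequence E2 82 AC and takes the c<=0xEF branch:

// c = 0xE2 (lead byte of a three-byte sequence)
//   code  = (0xE2 & 0x0F) << 12 = 0x2000
//   code += (0x82 & 0x3F) << 6  = 0x2000 + 0x0080 = 0x2080
//   code += (0xAC & 0x3F)       = 0x2080 + 0x002C = 0x20AC
// utf16_append_code(0x20AC, ...) then emits the single code unit 0x20AC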

 /*******************************************************************\

@@ -271,14 +387,14 @@ Function: utf8_to_utf16_big_endian

   Outputs: String in UTF-16BE format

-  Purpose: Note this requires g++-5 libstdc++ / libc++ / MSVC2010+
+  Purpose:

 \*******************************************************************/

 std::wstring utf8_to_utf16_big_endian(const std::string& in)
 {
-  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t> > converter;
-  return converter.from_bytes(in);
+  bool swap_bytes=is_little_endian_arch();
Review thread on this line:

- Following up to my own comment above: do we actually have use cases for this function where the operating environment decides endianness, as well as cases where it's the program under analysis that matters (and thus information in

- As far as I understand the context (and how the original codecvt-based function was implemented), this has nothing to do with analysis endianness. Anyway, this particular function seems never to be used, but I still implemented it, as it was quite easy having the other one, which is used, and I was not sure if we might need this one in the future when we extend string support in cbmc.

- The last one is the runtime environment, which I believe we would thus know at compile time. The first one processes analysis input into an internal format. I do not think that the internal format should vary across architectures, but instead should always be of a single endianness. (I believe internally we're actually big-endian. @kroening?) The middle case I'm not entirely sure about...

- I lack deeper knowledge of the use cases, and the more I look at them the more I am confused. We use the little-endian function in refinements; is there a reason for this (@smowton?). I thought this had something to do with Java strings, but a quick Google search suggests that Java uses big-endian UTF16 no matter what the platform is.

- The use case I originally wrote this for was to translate UTF-8 strings stored in Java class files into UTF-16 (in analysis-host endianness) string literals. It doesn't matter to us that the JVM would use big-endian UTF-16, rather that arithmetic on code points corresponds to arithmetic on the host doing the analysis. I suspect the code in https://github.com/diffblue/cbmc/blob/c8c9085f38317359bf175d8bb99ab16e4605c1e1/src/java_bytecode/java_bytecode_typecheck_expr.cpp only works on little-endian architectures as a result.

- @smowton Should this be raised as a bug and dealt with in a separate PR? And is there any additional functionality needed for this in

- Probably, yes. There's already a to-big-endian function too, so no extra work in unicode.cpp to my knowledge.

- Do we thus really need to run-time check
+  return utf8_to_utf16(in, swap_bytes);
 }

 /*******************************************************************\

@@ -289,21 +405,14 @@ Function: utf8_to_utf16_little_endian

   Outputs: String in UTF-16LE format

-  Purpose: Note this requires g++-5 libstdc++ / libc++ / MSVC2010+
+  Purpose:

 \*******************************************************************/

 std::wstring utf8_to_utf16_little_endian(const std::string& in)
 {
-  const std::codecvt_mode mode=std::codecvt_mode::little_endian;
-
-  // default largest value codecvt_utf8_utf16 reads without error is 0x10ffff
-  // see: http://en.cppreference.com/w/cpp/locale/codecvt_utf8_utf16
-  const unsigned long maxcode=0x10ffff;
-
-  typedef std::codecvt_utf8_utf16<wchar_t, maxcode, mode> codecvt_utf8_utf16t;
-  std::wstring_convert<codecvt_utf8_utf16t> converter;
-  return converter.from_bytes(in);
+  bool swap_bytes=!is_little_endian_arch();
+  return utf8_to_utf16(in, swap_bytes);
 }
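Taken together: utf8_to_utf16 writes each code unit into a wchar_t in host byte order unless swap_bytes is set, so each wrapper swaps exactly when the host endianness differs from the requested one. A hedged sanity check (mine, linking against the functions above): "a" is U+0061, so the single code unit is 0x0061 unswapped and 0x6100 byte-swapped.

#include <cassert>
#include <string>

std::wstring utf8_to_utf16_big_endian(const std::string &);
std::wstring utf8_to_utf16_little_endian(const std::string &);
bool is_little_endian_arch();

int main()
{
  std::wstring be=utf8_to_utf16_big_endian("a");
  std::wstring le=utf8_to_utf16_little_endian("a");
  // exactly one of the two results is byte-swapped relative to the host
  assert(be[0]==(is_little_endian_arch()?0x6100:0x0061));
  assert(le[0]==(is_little_endian_arch()?0x0061:0x6100));
  return 0;
}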

 /*******************************************************************\
src/util/unicode.h
@@ -20,7 +20,6 @@ std::string narrow(const std::wstring &s);
 std::wstring widen(const std::string &s);

 std::string utf32_to_utf8(const std::basic_string<unsigned int> &s);
Review thread on this line:

- The above needs to be renamed or even removed (see my suggestion for marking it

- I don't follow this comment. Are you confusing this function, which still exists, with a function that got renamed to

- I may well be - there is a function that has been renamed and I cannot see a corresponding change in the header file.

- Yes, the renamed one was
-std::string utf16_to_utf8(const std::basic_string<unsigned short int> &s);

 std::wstring utf8_to_utf16_big_endian(const std::string &);
 std::wstring utf8_to_utf16_little_endian(const std::string &);
Review comment:

May I hug you for this change? This obsoletes #752 if I'm right? And thus "COMPILING" should say we can actually build using GCC 4.9 again!? THANK YOU!!!