Skip to content

Commit 065810f

Browse files
author
Daniel Kroening
authored
Merge pull request #868 from forejtv/feature/unicode-cleanup
Unicode support cleanup
2 parents b0196c5 + 49b77ce commit 065810f

File tree

5 files changed

+249
-43
lines changed

5 files changed

+249
-43
lines changed

COMPILING

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ We assume that you have a Debian/Ubuntu or Red Hat-like distribution.
3838

3939
yum install gcc gcc-c++ flex bison perl-libwww-perl patch devtoolset-6
4040

41-
Note that you need g++ version 5.2 or newer.
41+
Note that you need g++ version 4.9 or newer.
4242

4343
1) As a user, get the CBMC source via
4444

src/util/unicode.cpp

+150-41
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@ Author: Daniel Kroening, [email protected]
88

99
#include <cstring>
1010
#include <locale>
11-
#include <codecvt>
1211
#include <iomanip>
1312
#include <sstream>
13+
#include <cstdint>
1414

1515
#include "unicode.h"
1616

@@ -20,6 +20,24 @@ Author: Daniel Kroening, [email protected]
2020

2121
/*******************************************************************\
2222
23+
Function: is_little_endian_arch
24+
25+
Inputs:
26+
27+
Outputs: True if the architecture is little_endian
28+
29+
Purpose: Determine endianness of the architecture
30+
31+
\*******************************************************************/
32+
33+
bool is_little_endian_arch()
34+
{
35+
uint32_t i=1;
36+
return reinterpret_cast<uint8_t &>(i);
37+
}
38+
39+
/*******************************************************************\
40+
2341
Function: narrow
2442
2543
Inputs:
@@ -154,17 +172,17 @@ std::wstring widen(const std::string &s)
154172

155173
/*******************************************************************\
156174
157-
Function: utf32_to_utf8
175+
Function: utf8_append_code
158176
159-
Inputs:
177+
Inputs: character to append, string to append to
160178
161179
Outputs:
162180
163-
Purpose:
181+
Purpose: Appends a unicode character to a utf8-encoded string
164182
165183
\*******************************************************************/
166184

167-
void utf32_to_utf8(unsigned int c, std::string &result)
185+
static void utf8_append_code(unsigned int c, std::string &result)
168186
{
169187
if(c<=0x7f)
170188
result+=static_cast<char>(c);
@@ -192,9 +210,10 @@ void utf32_to_utf8(unsigned int c, std::string &result)
192210
193211
Function: utf32_to_utf8
194212
195-
Inputs:
213+
Inputs: utf32-encoded wide string
196214
197-
Outputs:
215+
Outputs: utf8-encoded string with the same unicode characters
216+
as the input.
198217
199218
Purpose:
200219
@@ -207,14 +226,14 @@ std::string utf32_to_utf8(const std::basic_string<unsigned int> &s)
207226
result.reserve(s.size()); // at least that long
208227

209228
for(const auto c : s)
210-
utf32_to_utf8(c, result);
229+
utf8_append_code(c, result);
211230

212231
return result;
213232
}
214233

215234
/*******************************************************************\
216235
217-
Function: utf16_to_utf8
236+
Function: narrow_argv
218237
219238
Inputs:
220239
@@ -224,43 +243,140 @@ Function: utf16_to_utf8
224243
225244
\*******************************************************************/
226245

227-
std::string utf16_to_utf8(const std::basic_string<unsigned short int> &s)
246+
const char **narrow_argv(int argc, const wchar_t **argv_wide)
228247
{
229-
std::string result;
248+
if(argv_wide==NULL)
249+
return NULL;
230250

231-
result.reserve(s.size()); // at least that long
251+
// the following never gets deleted
252+
const char **argv_narrow=new const char *[argc+1];
253+
argv_narrow[argc]=0;
232254

233-
for(const auto c : s)
234-
utf32_to_utf8(c, result);
255+
for(int i=0; i<argc; i++)
256+
argv_narrow[i]=strdup(narrow(argv_wide[i]).c_str());
235257

236-
return result;
258+
return argv_narrow;
237259
}
238260

239261
/*******************************************************************\
240262
241-
Function: narrow_argv
263+
Function: do_swap_bytes
242264
243-
Inputs:
265+
Inputs: A 16-bit integer
244266
245-
Outputs:
267+
Outputs: A 16-bit integer with bytes swapped
246268
247-
Purpose:
269+
Purpose: A helper function for dealing with different UTF16 endians
248270
249271
\*******************************************************************/
250272

251-
const char **narrow_argv(int argc, const wchar_t **argv_wide)
273+
uint16_t do_swap_bytes(uint16_t x)
252274
{
253-
if(argv_wide==NULL)
254-
return NULL;
275+
uint16_t b1=x & 0xFF;
276+
uint16_t b2=x & 0xFF00;
277+
return (b1 << 8) | (b2 >> 8);
278+
}
255279

256-
// the following never gets deleted
257-
const char **argv_narrow=new const char *[argc+1];
258-
argv_narrow[argc]=0;
259280

260-
for(int i=0; i<argc; i++)
261-
argv_narrow[i]=strdup(narrow(argv_wide[i]).c_str());
281+
void utf16_append_code(unsigned int code, bool swap_bytes, std::wstring &result)
282+
{
283+
// we do not treat 0xD800 to 0xDFFF, although
284+
// they are not valid unicode symbols
285+
286+
if(code<0xFFFF)
287+
{ // code is encoded as one UTF16 character
288+
// we just take the code and possibly swap the bytes
289+
unsigned int a=(swap_bytes)?do_swap_bytes(code):code;
290+
result+=static_cast<wchar_t>(a);
291+
}
292+
else // code is encoded as two UTF16 characters
293+
{
294+
// if this is valid unicode, we have
295+
// code<0x10FFFF
296+
// but let's not check it programmatically
297+
298+
// encode the code in UTF16, possibly swapping bytes.
299+
code=code-0x10000;
300+
unsigned int i1=((code>>10) & 0x3ff) | 0xD800;
301+
unsigned int a1=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i1)):i1;
302+
result+=static_cast<wchar_t>(a1);
303+
unsigned int i2=(code & 0x3ff) | 0xDC00;
304+
unsigned int a2=(swap_bytes)?do_swap_bytes(static_cast<uint16_t>(i2)):i2;
305+
result+=static_cast<wchar_t>(a2);
306+
}
307+
}
262308

263-
return argv_narrow;
309+
310+
/*******************************************************************\
311+
312+
Function: utf8_to_utf16
313+
314+
Inputs: String in UTF-8 format, bool value indicating whether the
315+
endianness should be different from the architecture one.
316+
317+
Outputs: String in UTF-16 format. The encoding follows the
318+
endianness of the architecture iff swap_bytes is true.
319+
320+
Purpose:
321+
322+
\*******************************************************************/
323+
std::wstring utf8_to_utf16(const std::string& in, bool swap_bytes)
324+
{
325+
std::wstring result;
326+
result.reserve(in.size());
327+
int i=0;
328+
while(i<in.size())
329+
{
330+
unsigned char c=in[i++];
331+
unsigned int code=0;
332+
// the ifs that follow find out how many UTF8 characters (1-4) store the
333+
// next unicode character. This is determined by the few most
334+
// significant bits.
335+
if(c<=0x7F)
336+
{
337+
// if it's one character, then code is exactly the value
338+
code=c;
339+
}
340+
else if(c<=0xDF && i<in.size())
341+
{ // in other cases, we need to read the right number of chars and decode
342+
// note: if we wanted to make sure that we capture incorrect strings,
343+
// we should check that whatever follows first character starts with
344+
// bits 10.
345+
code=(c & 0x1F) << 6;
346+
c=in[i++];
347+
code+=c & 0x3F;
348+
}
349+
else if(c<=0xEF && i+1<in.size())
350+
{
351+
code=(c & 0xF) << 12;
352+
c=in[i++];
353+
code+=(c & 0x3F) << 6;
354+
c=in[i++];
355+
code+=c & 0x3F;
356+
}
357+
else if(c<=0xF7 && i+2<in.size())
358+
{
359+
code=(c & 0x7) << 18;
360+
c=in[i++];
361+
code+=(c & 0x3F) << 12;
362+
c=in[i++];
363+
code+=(c & 0x3F) << 6;
364+
c=in[i++];
365+
code+=c & 0x3F;
366+
}
367+
else
368+
{
369+
// The string is not a valid UTF8 string! Either it has some characters
370+
// missing from a multi-character unicode symbol, or it has a char with
371+
// too high value.
372+
// For now, let's replace the character with a space
373+
code=32;
374+
}
375+
376+
utf16_append_code(code, swap_bytes, result);
377+
}
378+
379+
return result;
264380
}
265381

266382
/*******************************************************************\
@@ -271,14 +387,14 @@ Function: utf8_to_utf16_big_endian
271387
272388
Outputs: String in UTF-16BE format
273389
274-
Purpose: Note this requires g++-5 libstdc++ / libc++ / MSVC2010+
390+
Purpose:
275391
276392
\*******************************************************************/
277393

278394
std::wstring utf8_to_utf16_big_endian(const std::string& in)
279395
{
280-
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t> > converter;
281-
return converter.from_bytes(in);
396+
bool swap_bytes=is_little_endian_arch();
397+
return utf8_to_utf16(in, swap_bytes);
282398
}
283399

284400
/*******************************************************************\
@@ -289,21 +405,14 @@ Function: utf8_to_utf16_little_endian
289405
290406
Outputs: String in UTF-16LE format
291407
292-
Purpose: Note this requires g++-5 libstdc++ / libc++ / MSVC2010+
408+
Purpose:
293409
294410
\*******************************************************************/
295411

296412
std::wstring utf8_to_utf16_little_endian(const std::string& in)
297413
{
298-
const std::codecvt_mode mode=std::codecvt_mode::little_endian;
299-
300-
// default largest value codecvt_utf8_utf16 reads without error is 0x10ffff
301-
// see: http://en.cppreference.com/w/cpp/locale/codecvt_utf8_utf16
302-
const unsigned long maxcode=0x10ffff;
303-
304-
typedef std::codecvt_utf8_utf16<wchar_t, maxcode, mode> codecvt_utf8_utf16t;
305-
std::wstring_convert<codecvt_utf8_utf16t> converter;
306-
return converter.from_bytes(in);
414+
bool swap_bytes=!is_little_endian_arch();
415+
return utf8_to_utf16(in, swap_bytes);
307416
}
308417

309418
/*******************************************************************\

src/util/unicode.h

-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ std::string narrow(const std::wstring &s);
2020
std::wstring widen(const std::string &s);
2121

2222
std::string utf32_to_utf8(const std::basic_string<unsigned int> &s);
23-
std::string utf16_to_utf8(const std::basic_string<unsigned short int> &s);
2423

2524
std::wstring utf8_to_utf16_big_endian(const std::string &);
2625
std::wstring utf8_to_utf16_little_endian(const std::string &);

unit/Makefile

+4
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ SRC = cpp_parser.cpp \
1010
sharing_node.cpp \
1111
smt2_parser.cpp \
1212
string_utils.cpp \
13+
unicode.cpp \
1314
wp.cpp \
1415
# Empty last line
1516

@@ -76,3 +77,6 @@ sharing_map$(EXEEXT): sharing_map$(OBJEXT)
7677
sharing_node$(EXEEXT): sharing_node$(OBJEXT)
7778
$(LINKBIN)
7879

80+
unicode$(EXEEXT): unicode$(OBJEXT)
81+
$(LINKBIN)
82+

0 commit comments

Comments
 (0)