Skip to content

Commit 5170153

Browse files
author
Alan Wright
committed
Fix logic error when detecting the next valid UTF-8 sequence.
Closes #31.
1 parent c39e211 commit 5170153

File tree

1 file changed

+52
-52
lines changed

1 file changed

+52
-52
lines changed

src/core/util/UTF8Stream.cpp

Lines changed: 52 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,22 @@ namespace Lucene
1616
const uint16_t UTF8Base::TRAIL_SURROGATE_MAX = 0xdfffu;
1717
const uint16_t UTF8Base::LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10);
1818
const uint32_t UTF8Base::SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
19-
19+
2020
// Maximum valid value for a Unicode code point
2121
const uint32_t UTF8Base::CODE_POINT_MAX = 0x0010ffffu;
22-
22+
2323
#ifdef LPP_UNICODE_CHAR_SIZE_2
2424
const wchar_t UTF8Base::UNICODE_REPLACEMENT_CHAR = (wchar_t)0xfffd;
2525
const wchar_t UTF8Base::UNICODE_TERMINATOR = (wchar_t)0xffff;
2626
#else
2727
const wchar_t UTF8Base::UNICODE_REPLACEMENT_CHAR = (wchar_t)0x0001fffd;
2828
const wchar_t UTF8Base::UNICODE_TERMINATOR = (wchar_t)0x0001ffff;
2929
#endif
30-
30+
3131
UTF8Base::~UTF8Base()
3232
{
3333
}
34-
34+
3535
inline uint8_t UTF8Base::mask8(uint32_t b)
3636
{
3737
return static_cast<uint8_t>(0xff & b);
@@ -46,7 +46,7 @@ namespace Lucene
4646
{
4747
return ((mask8(b) >> 6) == 0x2);
4848
}
49-
49+
5050
inline bool UTF8Base::isSurrogate(uint32_t cp)
5151
{
5252
return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
@@ -61,47 +61,47 @@ namespace Lucene
6161
{
6262
return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
6363
}
64-
64+
6565
inline bool UTF8Base::isValidCodePoint(uint32_t cp)
6666
{
6767
return (cp <= CODE_POINT_MAX && !isSurrogate(cp) && cp != 0xfffe && cp != 0xffff);
6868
}
69-
69+
7070
inline bool UTF8Base::isOverlongSequence(uint32_t cp, int32_t length)
7171
{
7272
if (cp < 0x80)
7373
{
74-
if (length != 1)
74+
if (length != 1)
7575
return true;
7676
}
7777
else if (cp < 0x800)
7878
{
79-
if (length != 2)
79+
if (length != 2)
8080
return true;
8181
}
8282
else if (cp < 0x10000)
8383
{
84-
if (length != 3)
84+
if (length != 3)
8585
return true;
8686
}
8787
return false;
8888
}
89-
89+
9090
UTF8Encoder::UTF8Encoder(const wchar_t* unicodeBegin, const wchar_t* unicodeEnd)
9191
{
9292
this->unicodeBegin = unicodeBegin;
9393
this->unicodeEnd = unicodeEnd;
9494
}
95-
95+
9696
UTF8Encoder::~UTF8Encoder()
9797
{
9898
}
99-
99+
100100
uint32_t UTF8Encoder::readNext()
101101
{
102102
return unicodeBegin == unicodeEnd ? (uint32_t)UNICODE_TERMINATOR : (uint32_t)*unicodeBegin++;
103103
}
104-
104+
105105
inline uint8_t* UTF8Encoder::appendChar(uint8_t* utf8, uint32_t cp)
106106
{
107107
if (cp < 0x80) // one octet
@@ -126,12 +126,12 @@ namespace Lucene
126126
}
127127
return utf8;
128128
}
129-
129+
130130
int32_t UTF8Encoder::utf16to8(uint8_t* utf8, int32_t length)
131131
{
132132
uint8_t* start = utf8;
133133
uint32_t next = readNext();
134-
134+
135135
while (next != UNICODE_TERMINATOR)
136136
{
137137
uint32_t cp = mask16(next);
@@ -154,15 +154,15 @@ namespace Lucene
154154
break;
155155
next = readNext();
156156
}
157-
157+
158158
return ((utf8 - start) == 0 && next == UNICODE_TERMINATOR) ? Reader::READER_EOF : (utf8 - start);
159159
}
160-
160+
161161
int32_t UTF8Encoder::utf32to8(uint8_t* utf8, int32_t length)
162162
{
163163
uint8_t* start = utf8;
164164
uint32_t next = readNext();
165-
165+
166166
while (next != UNICODE_TERMINATOR)
167167
{
168168
if (!isValidCodePoint(next))
@@ -172,10 +172,10 @@ namespace Lucene
172172
break;
173173
next = readNext();
174174
}
175-
175+
176176
return ((utf8 - start) == 0 && next == UNICODE_TERMINATOR) ? Reader::READER_EOF : (utf8 - start);
177177
}
178-
178+
179179
int32_t UTF8Encoder::encode(uint8_t* utf8, int32_t length)
180180
{
181181
#ifdef LPP_UNICODE_CHAR_SIZE_2
@@ -184,37 +184,37 @@ namespace Lucene
184184
return utf32to8(utf8, length);
185185
#endif
186186
}
187-
187+
188188
UTF8EncoderStream::UTF8EncoderStream(ReaderPtr reader) : UTF8Encoder(NULL, NULL)
189189
{
190190
this->reader = reader;
191191
}
192-
192+
193193
UTF8EncoderStream::~UTF8EncoderStream()
194194
{
195195
}
196-
196+
197197
uint32_t UTF8EncoderStream::readNext()
198198
{
199199
int32_t next = reader->read();
200200
return next == Reader::READER_EOF ? UNICODE_TERMINATOR : (uint32_t)next;
201201
}
202-
202+
203203
UTF8Decoder::UTF8Decoder(const uint8_t* utf8Begin, const uint8_t* utf8End)
204204
{
205205
this->utf8Begin = utf8Begin;
206206
this->utf8End = utf8End;
207207
}
208-
208+
209209
UTF8Decoder::~UTF8Decoder()
210210
{
211211
}
212-
212+
213213
uint32_t UTF8Decoder::readNext()
214214
{
215215
return utf8Begin == utf8End ? (uint32_t)UNICODE_TERMINATOR : (uint32_t)*utf8Begin++;
216216
}
217-
217+
218218
inline int32_t UTF8Decoder::sequenceLength(uint32_t cp)
219219
{
220220
uint8_t lead = mask8(cp);
@@ -228,7 +228,7 @@ namespace Lucene
228228
return 4;
229229
return 0;
230230
}
231-
231+
232232
inline bool UTF8Decoder::getSequence(uint32_t& cp, int32_t length)
233233
{
234234
cp = mask8(cp);
@@ -267,27 +267,27 @@ namespace Lucene
267267
cp += next & 0x3f;
268268
return true;
269269
}
270-
270+
271271
inline bool UTF8Decoder::isValidNext(uint32_t& cp)
272272
{
273273
// Determine the sequence length based on the lead octet
274274
int32_t length = sequenceLength(cp);
275-
if (length < 1 && length > 4)
275+
if (length < 1 || length > 4)
276276
return false;
277277

278278
// Now that we have a valid sequence length, get trail octets and calculate the code point
279279
if (!getSequence(cp, length))
280280
return false;
281-
281+
282282
// Decoding succeeded; now apply the security checks (reject invalid code points and overlong sequences)
283283
return (isValidCodePoint(cp) && !isOverlongSequence(cp, length));
284284
}
285-
285+
286286
int32_t UTF8Decoder::utf8to16(wchar_t* unicode, int32_t length)
287287
{
288288
int32_t position = 0;
289289
uint32_t next = readNext();
290-
290+
291291
while (next != UNICODE_TERMINATOR)
292292
{
293293
if (!isValidNext(next))
@@ -303,15 +303,15 @@ namespace Lucene
303303
break;
304304
next = readNext();
305305
}
306-
306+
307307
return (position == 0 && next == UNICODE_TERMINATOR) ? Reader::READER_EOF : position;
308308
}
309-
309+
310310
int32_t UTF8Decoder::utf8to32(wchar_t* unicode, int32_t length)
311311
{
312312
int32_t position = 0;
313313
uint32_t next = readNext();
314-
314+
315315
while (next != UNICODE_TERMINATOR)
316316
{
317317
if (!isValidNext(next))
@@ -321,10 +321,10 @@ namespace Lucene
321321
break;
322322
next = readNext();
323323
}
324-
324+
325325
return (position == 0 && next == UNICODE_TERMINATOR) ? Reader::READER_EOF : position;
326326
}
327-
327+
328328
int32_t UTF8Decoder::decode(wchar_t* unicode, int32_t length)
329329
{
330330
#ifdef LPP_UNICODE_CHAR_SIZE_2
@@ -333,42 +333,42 @@ namespace Lucene
333333
return utf8to32(unicode, length);
334334
#endif
335335
}
336-
336+
337337
UTF8DecoderStream::UTF8DecoderStream(ReaderPtr reader) : UTF8Decoder(NULL, NULL)
338338
{
339339
this->reader = reader;
340340
}
341-
341+
342342
UTF8DecoderStream::~UTF8DecoderStream()
343343
{
344344
}
345-
345+
346346
uint32_t UTF8DecoderStream::readNext()
347347
{
348348
int32_t next = reader->read();
349349
return next == Reader::READER_EOF ? UNICODE_TERMINATOR : (uint32_t)next;
350350
}
351-
351+
352352
UTF16Decoder::UTF16Decoder(const uint16_t* utf16Begin, const uint16_t* utf16End)
353353
{
354354
this->utf16Begin = utf16Begin;
355355
this->utf16End = utf16End;
356356
}
357-
357+
358358
UTF16Decoder::~UTF16Decoder()
359359
{
360360
}
361-
361+
362362
uint32_t UTF16Decoder::readNext()
363363
{
364364
return utf16Begin == utf16End ? (uint32_t)UNICODE_TERMINATOR : (uint32_t)*utf16Begin++;
365365
}
366-
366+
367367
int32_t UTF16Decoder::utf16to32(wchar_t* unicode, int32_t length)
368368
{
369369
int32_t position = 0;
370370
uint32_t next = readNext();
371-
371+
372372
while (next != UNICODE_TERMINATOR)
373373
{
374374
uint32_t cp = mask16(next);
@@ -390,26 +390,26 @@ namespace Lucene
390390
break;
391391
next = readNext();
392392
}
393-
393+
394394
return (position == 0 && next == UNICODE_TERMINATOR) ? Reader::READER_EOF : position;
395395
}
396-
396+
397397
int32_t UTF16Decoder::utf16to16(wchar_t* unicode, int32_t length)
398398
{
399399
int32_t position = 0;
400400
uint32_t next = readNext();
401-
401+
402402
while (next != UNICODE_TERMINATOR)
403403
{
404404
unicode[position++] = static_cast<wchar_t>(next);
405405
if (position >= length)
406406
break;
407407
next = readNext();
408408
}
409-
409+
410410
return (position == 0 && next == UNICODE_TERMINATOR) ? Reader::READER_EOF : position;
411411
}
412-
412+
413413
int32_t UTF16Decoder::decode(wchar_t* unicode, int32_t length)
414414
{
415415
#ifdef LPP_UNICODE_CHAR_SIZE_2

0 commit comments

Comments
 (0)