@@ -16,22 +16,22 @@ namespace Lucene
16
16
const uint16_t UTF8Base::TRAIL_SURROGATE_MAX = 0xdfffu ;
17
17
const uint16_t UTF8Base::LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10 );
18
18
const uint32_t UTF8Base::SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10 ) - TRAIL_SURROGATE_MIN;
19
-
19
+
20
20
// Maximum valid value for a Unicode code point
21
21
const uint32_t UTF8Base::CODE_POINT_MAX = 0x0010ffffu ;
22
-
22
+
23
23
#ifdef LPP_UNICODE_CHAR_SIZE_2
24
24
const wchar_t UTF8Base::UNICODE_REPLACEMENT_CHAR = (wchar_t )0xfffd ;
25
25
const wchar_t UTF8Base::UNICODE_TERMINATOR = (wchar_t )0xffff ;
26
26
#else
27
27
const wchar_t UTF8Base::UNICODE_REPLACEMENT_CHAR = (wchar_t )0x0001fffd ;
28
28
const wchar_t UTF8Base::UNICODE_TERMINATOR = (wchar_t )0x0001ffff ;
29
29
#endif
30
-
30
+
31
31
// Destructor with an empty body; no cleanup is performed here.
UTF8Base::~UTF8Base()
{
}
34
-
34
+
35
35
inline uint8_t UTF8Base::mask8 (uint32_t b)
36
36
{
37
37
return static_cast <uint8_t >(0xff & b);
@@ -46,7 +46,7 @@ namespace Lucene
46
46
{
47
47
return ((mask8 (b) >> 6 ) == 0x2 );
48
48
}
49
-
49
+
50
50
inline bool UTF8Base::isSurrogate (uint32_t cp)
51
51
{
52
52
return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
@@ -61,47 +61,47 @@ namespace Lucene
61
61
{
62
62
return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
63
63
}
64
-
64
+
65
65
inline bool UTF8Base::isValidCodePoint (uint32_t cp)
66
66
{
67
67
return (cp <= CODE_POINT_MAX && !isSurrogate (cp) && cp != 0xfffe && cp != 0xffff );
68
68
}
69
-
69
+
70
70
// Returns true when cp was carried by a sequence whose octet count differs
// from the count expected for its magnitude:
//   cp < 0x80    -> 1 octet
//   cp < 0x800   -> 2 octets
//   cp < 0x10000 -> 3 octets
// Code points of 0x10000 and above are never reported as overlong.
inline bool UTF8Base::isOverlongSequence(uint32_t cp, int32_t length)
{
    int32_t expectedLength;

    if (cp < 0x80)
        expectedLength = 1;
    else if (cp < 0x800)
        expectedLength = 2;
    else if (cp < 0x10000)
        expectedLength = 3;
    else
        return false; // no length constraint is applied at or above 0x10000

    return length != expectedLength;
}
89
-
89
+
90
90
// Stores the input range to encode. unicodeEnd acts as the stop position:
// readNext() reports the terminator once unicodeBegin has reached it.
UTF8Encoder::UTF8Encoder(const wchar_t* unicodeBegin, const wchar_t* unicodeEnd)
{
    this->unicodeEnd = unicodeEnd;
    this->unicodeBegin = unicodeBegin;
}
95
-
95
+
96
96
// Destructor with an empty body; no cleanup is performed here.
UTF8Encoder::~UTF8Encoder()
{
}
99
-
99
+
100
100
uint32_t UTF8Encoder::readNext ()
101
101
{
102
102
return unicodeBegin == unicodeEnd ? (uint32_t )UNICODE_TERMINATOR : (uint32_t )*unicodeBegin++;
103
103
}
104
-
104
+
105
105
inline uint8_t * UTF8Encoder::appendChar (uint8_t * utf8, uint32_t cp)
106
106
{
107
107
if (cp < 0x80 ) // one octet
@@ -126,12 +126,12 @@ namespace Lucene
126
126
}
127
127
return utf8;
128
128
}
129
-
129
+
130
130
int32_t UTF8Encoder::utf16to8 (uint8_t * utf8, int32_t length)
131
131
{
132
132
uint8_t * start = utf8;
133
133
uint32_t next = readNext ();
134
-
134
+
135
135
while (next != UNICODE_TERMINATOR)
136
136
{
137
137
uint32_t cp = mask16 (next);
@@ -154,15 +154,15 @@ namespace Lucene
154
154
break ;
155
155
next = readNext ();
156
156
}
157
-
157
+
158
158
return ((utf8 - start) == 0 && next == UNICODE_TERMINATOR) ? Reader::READER_EOF : (utf8 - start);
159
159
}
160
-
160
+
161
161
int32_t UTF8Encoder::utf32to8 (uint8_t * utf8, int32_t length)
162
162
{
163
163
uint8_t * start = utf8;
164
164
uint32_t next = readNext ();
165
-
165
+
166
166
while (next != UNICODE_TERMINATOR)
167
167
{
168
168
if (!isValidCodePoint (next))
@@ -172,10 +172,10 @@ namespace Lucene
172
172
break ;
173
173
next = readNext ();
174
174
}
175
-
175
+
176
176
return ((utf8 - start) == 0 && next == UNICODE_TERMINATOR) ? Reader::READER_EOF : (utf8 - start);
177
177
}
178
-
178
+
179
179
int32_t UTF8Encoder::encode (uint8_t * utf8, int32_t length)
180
180
{
181
181
#ifdef LPP_UNICODE_CHAR_SIZE_2
@@ -184,37 +184,37 @@ namespace Lucene
184
184
return utf32to8 (utf8, length);
185
185
#endif
186
186
}
187
-
187
+
188
188
// Builds an encoder that pulls its input from a reader. The base class is
// given a NULL/NULL range; readNext() in this class reads from the reader.
UTF8EncoderStream::UTF8EncoderStream(ReaderPtr reader)
    : UTF8Encoder(NULL, NULL)
{
    this->reader = reader;
}
192
-
192
+
193
193
// Destructor with an empty body; no cleanup is performed here.
UTF8EncoderStream::~UTF8EncoderStream()
{
}
196
-
196
+
197
197
uint32_t UTF8EncoderStream::readNext ()
198
198
{
199
199
int32_t next = reader->read ();
200
200
return next == Reader::READER_EOF ? UNICODE_TERMINATOR : (uint32_t )next;
201
201
}
202
-
202
+
203
203
// Stores the octet range to decode. utf8End acts as the stop position:
// readNext() reports the terminator once utf8Begin has reached it.
UTF8Decoder::UTF8Decoder(const uint8_t* utf8Begin, const uint8_t* utf8End)
{
    this->utf8End = utf8End;
    this->utf8Begin = utf8Begin;
}
208
-
208
+
209
209
// Destructor with an empty body; no cleanup is performed here.
UTF8Decoder::~UTF8Decoder()
{
}
212
-
212
+
213
213
uint32_t UTF8Decoder::readNext ()
214
214
{
215
215
return utf8Begin == utf8End ? (uint32_t )UNICODE_TERMINATOR : (uint32_t )*utf8Begin++;
216
216
}
217
-
217
+
218
218
inline int32_t UTF8Decoder::sequenceLength (uint32_t cp)
219
219
{
220
220
uint8_t lead = mask8 (cp);
@@ -228,7 +228,7 @@ namespace Lucene
228
228
return 4 ;
229
229
return 0 ;
230
230
}
231
-
231
+
232
232
inline bool UTF8Decoder::getSequence (uint32_t & cp, int32_t length)
233
233
{
234
234
cp = mask8 (cp);
@@ -267,27 +267,27 @@ namespace Lucene
267
267
cp += next & 0x3f ;
268
268
return true ;
269
269
}
270
-
270
+
271
271
inline bool UTF8Decoder::isValidNext (uint32_t & cp)
272
272
{
273
273
// Determine the sequence length based on the lead octet
274
274
int32_t length = sequenceLength (cp);
275
- if (length < 1 && length > 4 )
275
+ if (length < 1 || length > 4 )
276
276
return false ;
277
277
278
278
// Now that we have a valid sequence length, get trail octets and calculate the code point
279
279
if (!getSequence (cp, length))
280
280
return false ;
281
-
281
+
282
282
// Decoding succeeded, now security checks
283
283
return (isValidCodePoint (cp) && !isOverlongSequence (cp, length));
284
284
}
285
-
285
+
286
286
int32_t UTF8Decoder::utf8to16 (wchar_t * unicode, int32_t length)
287
287
{
288
288
int32_t position = 0 ;
289
289
uint32_t next = readNext ();
290
-
290
+
291
291
while (next != UNICODE_TERMINATOR)
292
292
{
293
293
if (!isValidNext (next))
@@ -303,15 +303,15 @@ namespace Lucene
303
303
break ;
304
304
next = readNext ();
305
305
}
306
-
306
+
307
307
return (position == 0 && next == UNICODE_TERMINATOR) ? Reader::READER_EOF : position;
308
308
}
309
-
309
+
310
310
int32_t UTF8Decoder::utf8to32 (wchar_t * unicode, int32_t length)
311
311
{
312
312
int32_t position = 0 ;
313
313
uint32_t next = readNext ();
314
-
314
+
315
315
while (next != UNICODE_TERMINATOR)
316
316
{
317
317
if (!isValidNext (next))
@@ -321,10 +321,10 @@ namespace Lucene
321
321
break ;
322
322
next = readNext ();
323
323
}
324
-
324
+
325
325
return (position == 0 && next == UNICODE_TERMINATOR) ? Reader::READER_EOF : position;
326
326
}
327
-
327
+
328
328
int32_t UTF8Decoder::decode (wchar_t * unicode, int32_t length)
329
329
{
330
330
#ifdef LPP_UNICODE_CHAR_SIZE_2
@@ -333,42 +333,42 @@ namespace Lucene
333
333
return utf8to32 (unicode, length);
334
334
#endif
335
335
}
336
-
336
+
337
337
// Builds a decoder that pulls its input from a reader. The base class is
// given a NULL/NULL range; readNext() in this class reads from the reader.
UTF8DecoderStream::UTF8DecoderStream(ReaderPtr reader)
    : UTF8Decoder(NULL, NULL)
{
    this->reader = reader;
}
341
-
341
+
342
342
// Destructor with an empty body; no cleanup is performed here.
UTF8DecoderStream::~UTF8DecoderStream()
{
}
345
-
345
+
346
346
uint32_t UTF8DecoderStream::readNext ()
347
347
{
348
348
int32_t next = reader->read ();
349
349
return next == Reader::READER_EOF ? UNICODE_TERMINATOR : (uint32_t )next;
350
350
}
351
-
351
+
352
352
// Stores the code-unit range to decode. utf16End acts as the stop position:
// readNext() reports the terminator once utf16Begin has reached it.
UTF16Decoder::UTF16Decoder(const uint16_t* utf16Begin, const uint16_t* utf16End)
{
    this->utf16End = utf16End;
    this->utf16Begin = utf16Begin;
}
357
-
357
+
358
358
// Destructor with an empty body; no cleanup is performed here.
UTF16Decoder::~UTF16Decoder()
{
}
361
-
361
+
362
362
uint32_t UTF16Decoder::readNext ()
363
363
{
364
364
return utf16Begin == utf16End ? (uint32_t )UNICODE_TERMINATOR : (uint32_t )*utf16Begin++;
365
365
}
366
-
366
+
367
367
int32_t UTF16Decoder::utf16to32 (wchar_t * unicode, int32_t length)
368
368
{
369
369
int32_t position = 0 ;
370
370
uint32_t next = readNext ();
371
-
371
+
372
372
while (next != UNICODE_TERMINATOR)
373
373
{
374
374
uint32_t cp = mask16 (next);
@@ -390,26 +390,26 @@ namespace Lucene
390
390
break ;
391
391
next = readNext ();
392
392
}
393
-
393
+
394
394
return (position == 0 && next == UNICODE_TERMINATOR) ? Reader::READER_EOF : position;
395
395
}
396
-
396
+
397
397
int32_t UTF16Decoder::utf16to16 (wchar_t * unicode, int32_t length)
398
398
{
399
399
int32_t position = 0 ;
400
400
uint32_t next = readNext ();
401
-
401
+
402
402
while (next != UNICODE_TERMINATOR)
403
403
{
404
404
unicode[position++] = static_cast <wchar_t >(next);
405
405
if (position >= length)
406
406
break ;
407
407
next = readNext ();
408
408
}
409
-
409
+
410
410
return (position == 0 && next == UNICODE_TERMINATOR) ? Reader::READER_EOF : position;
411
411
}
412
-
412
+
413
413
int32_t UTF16Decoder::decode (wchar_t * unicode, int32_t length)
414
414
{
415
415
#ifdef LPP_UNICODE_CHAR_SIZE_2
0 commit comments