@@ -228,52 +228,80 @@ static inline bool IsContinuationCharacter(byte chr) {
228
228
// This method decodes an UTF-8 value according to RFC 3629.
229
229
uchar Utf8::CalculateValue (const byte* str, size_t max_length, size_t * cursor) {
230
230
size_t length = NonASCIISequenceLength (str[0 ]);
231
-
232
- // Check continuation characters.
233
- size_t max_count = std::min (length, max_length);
234
- size_t count = 1 ;
235
- while (count < max_count && IsContinuationCharacter (str[count])) {
236
- count++;
231
+ if (length == 0 || max_length < length) {
232
+ *cursor += 1 ;
233
+ return kBadChar ;
237
234
}
238
- *cursor += count;
239
-
240
- // There must be enough continuation characters.
241
- if (count != length) return kBadChar ;
242
-
243
- // Check overly long sequences & other conditions.
244
- if (length == 3 ) {
245
- if (str[0 ] == 0xE0 && (str[1 ] < 0xA0 || str[1 ] > 0xBF )) {
246
- // Overlong three-byte sequence?
235
+ if (length == 2 ) {
236
+ if (!IsContinuationCharacter (str[1 ])) {
237
+ *cursor += 1 ;
247
238
return kBadChar ;
248
- } else if (str[0 ] == 0xED && (str[1 ] < 0x80 || str[1 ] > 0x9F )) {
249
- // High and low surrogate halves?
239
+ }
240
+ *cursor += 2 ;
241
+ return ((str[0 ] << 6 ) + str[1 ]) - 0x00003080 ;
242
+ }
243
+ if (length == 3 ) {
244
+ switch (str[0 ]) {
245
+ case 0xE0 :
246
+ // Overlong three-byte sequence.
247
+ if (str[1 ] < 0xA0 || str[1 ] > 0xBF ) {
248
+ *cursor += 1 ;
249
+ return kBadChar ;
250
+ }
251
+ break ;
252
+ case 0xED :
253
+ // High and low surrogate halves.
254
+ if (str[1 ] < 0x80 || str[1 ] > 0x9F ) {
255
+ *cursor += 1 ;
256
+ return kBadChar ;
257
+ }
258
+ break ;
259
+ default :
260
+ if (!IsContinuationCharacter (str[1 ])) {
261
+ *cursor += 1 ;
262
+ return kBadChar ;
263
+ }
264
+ }
265
+ if (!IsContinuationCharacter (str[2 ])) {
266
+ *cursor += 1 ;
250
267
return kBadChar ;
251
268
}
252
- } else if (length == 4 ) {
253
- if (str[0 ] == 0xF0 && (str[1 ] < 0x90 || str[1 ] > 0xBF )) {
269
+ *cursor += 3 ;
270
+ return ((str[0 ] << 12 ) + (str[1 ] << 6 ) + str[2 ]) - 0x000E2080 ;
271
+ }
272
+ DCHECK (length == 4 );
273
+ switch (str[0 ]) {
274
+ case 0xF0 :
254
275
// Overlong four-byte sequence.
255
- return kBadChar ;
256
- } else if (str[0 ] == 0xF4 && (str[1 ] < 0x80 || str[1 ] > 0x8F )) {
276
+ if (str[1 ] < 0x90 || str[1 ] > 0xBF ) {
277
+ *cursor += 1 ;
278
+ return kBadChar ;
279
+ }
280
+ break ;
281
+ case 0xF4 :
257
282
// Code points outside of the unicode range.
258
- return kBadChar ;
259
- }
283
+ if (str[1 ] < 0x80 || str[1 ] > 0x8F ) {
284
+ *cursor += 1 ;
285
+ return kBadChar ;
286
+ }
287
+ break ;
288
+ default :
289
+ if (!IsContinuationCharacter (str[1 ])) {
290
+ *cursor += 1 ;
291
+ return kBadChar ;
292
+ }
260
293
}
261
-
262
- // All errors have been handled, so we only have to assemble the result.
263
- switch (length) {
264
- case 1 :
265
- return str[0 ];
266
- case 2 :
267
- return ((str[0 ] << 6 ) + str[1 ]) - 0x00003080 ;
268
- case 3 :
269
- return ((str[0 ] << 12 ) + (str[1 ] << 6 ) + str[2 ]) - 0x000E2080 ;
270
- case 4 :
271
- return ((str[0 ] << 18 ) + (str[1 ] << 12 ) + (str[2 ] << 6 ) + str[3 ]) -
272
- 0x03C82080 ;
294
+ if (!IsContinuationCharacter (str[2 ])) {
295
+ *cursor += 1 ;
296
+ return kBadChar ;
273
297
}
274
-
275
- UNREACHABLE ();
276
- return kBadChar ;
298
+ if (!IsContinuationCharacter (str[3 ])) {
299
+ *cursor += 1 ;
300
+ return kBadChar ;
301
+ }
302
+ *cursor += 4 ;
303
+ return ((str[0 ] << 18 ) + (str[1 ] << 12 ) + (str[2 ] << 6 ) + str[3 ]) -
304
+ 0x03C82080 ;
277
305
}
278
306
279
307
uchar Utf8::ValueOfIncremental (byte next, Utf8IncrementalBuffer* buffer) {
@@ -295,10 +323,9 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
295
323
// with one shift.
296
324
uint8_t mask = 0x7f >> kind;
297
325
298
- // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
299
- // in 2nd nibble, and the value in the bottom three. The 2nd nibble is
300
- // intended as a counter about how many bytes are still needed.
301
- *buffer = kind << 28 | (kind - 1 ) << 24 | (next & mask);
326
+ // Store the kind - 1 (i.e., remaining bytes) in the top byte, value
327
+ // in the bottom three.
328
+ *buffer = (kind - 1 ) << 24 | (next & mask);
302
329
return kIncomplete ;
303
330
} else {
304
331
// No buffer, and not the start of a 1-byte char (handled at the
@@ -327,19 +354,15 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
327
354
// We're inside of a character, as described by buffer.
328
355
329
356
// How many bytes (excluding this one) do we still expect?
330
- uint8_t bytes_expected = *buffer >> 28 ;
331
- uint8_t bytes_left = (*buffer >> 24 ) & 0x0f ;
332
- bytes_left--;
357
+ uint8_t count = (*buffer >> 24 ) - 1 ;
333
358
// Update the value.
334
359
uint32_t value = ((*buffer & 0xffffff ) << 6 ) | (next & 0x3F );
335
- if (bytes_left ) {
336
- *buffer = (bytes_expected << 28 | bytes_left << 24 | value) ;
360
+ if (count ) {
361
+ *buffer = count << 24 | value;
337
362
return kIncomplete ;
338
363
} else {
339
364
*buffer = 0 ;
340
- bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80 ) ||
341
- (bytes_expected == 3 && value < 0x800 );
342
- return sequence_was_too_long ? kBadChar : value;
365
+ return value;
343
366
}
344
367
} else {
345
368
// Within a character, but not a continuation character? Then the
0 commit comments