Skip to content

Commit 028bb63

Browse files
targos authored and italoacasas committed
deps: revert breaking UTF-8 decoder changes in V8
Refs: v8/v8@7c46245
Refs: v8/v8@aadb1c8
PR-URL: #11029
Reviewed-By: Ben Noordhuis <[email protected]>
Reviewed-By: Anna Henningsen <[email protected]>
Reviewed-By: Myles Borins <[email protected]>
1 parent 22e2288 commit 028bb63

File tree

6 files changed

+302
-310
lines changed

6 files changed

+302
-310
lines changed

deps/v8/src/unicode-decoder.h

+1-2
Original file line number | Diff line number | Diff line change
@@ -7,11 +7,10 @@
77

88
#include <sys/types.h>
99
#include "src/globals.h"
10-
#include "src/utils.h"
1110

1211
namespace unibrow {
1312

14-
class V8_EXPORT_PRIVATE Utf8DecoderBase {
13+
class Utf8DecoderBase {
1514
public:
1615
// Initialization done in subclass.
1716
inline Utf8DecoderBase();

deps/v8/src/unicode.cc

+73-50
Original file line number | Diff line number | Diff line change
@@ -228,52 +228,80 @@ static inline bool IsContinuationCharacter(byte chr) {
228228
// This method decodes an UTF-8 value according to RFC 3629.
229229
uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
230230
size_t length = NonASCIISequenceLength(str[0]);
231-
232-
// Check continuation characters.
233-
size_t max_count = std::min(length, max_length);
234-
size_t count = 1;
235-
while (count < max_count && IsContinuationCharacter(str[count])) {
236-
count++;
231+
if (length == 0 || max_length < length) {
232+
*cursor += 1;
233+
return kBadChar;
237234
}
238-
*cursor += count;
239-
240-
// There must be enough continuation characters.
241-
if (count != length) return kBadChar;
242-
243-
// Check overly long sequences & other conditions.
244-
if (length == 3) {
245-
if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
246-
// Overlong three-byte sequence?
235+
if (length == 2) {
236+
if (!IsContinuationCharacter(str[1])) {
237+
*cursor += 1;
247238
return kBadChar;
248-
} else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
249-
// High and low surrogate halves?
239+
}
240+
*cursor += 2;
241+
return ((str[0] << 6) + str[1]) - 0x00003080;
242+
}
243+
if (length == 3) {
244+
switch (str[0]) {
245+
case 0xE0:
246+
// Overlong three-byte sequence.
247+
if (str[1] < 0xA0 || str[1] > 0xBF) {
248+
*cursor += 1;
249+
return kBadChar;
250+
}
251+
break;
252+
case 0xED:
253+
// High and low surrogate halves.
254+
if (str[1] < 0x80 || str[1] > 0x9F) {
255+
*cursor += 1;
256+
return kBadChar;
257+
}
258+
break;
259+
default:
260+
if (!IsContinuationCharacter(str[1])) {
261+
*cursor += 1;
262+
return kBadChar;
263+
}
264+
}
265+
if (!IsContinuationCharacter(str[2])) {
266+
*cursor += 1;
250267
return kBadChar;
251268
}
252-
} else if (length == 4) {
253-
if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
269+
*cursor += 3;
270+
return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
271+
}
272+
DCHECK(length == 4);
273+
switch (str[0]) {
274+
case 0xF0:
254275
// Overlong four-byte sequence.
255-
return kBadChar;
256-
} else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
276+
if (str[1] < 0x90 || str[1] > 0xBF) {
277+
*cursor += 1;
278+
return kBadChar;
279+
}
280+
break;
281+
case 0xF4:
257282
// Code points outside of the unicode range.
258-
return kBadChar;
259-
}
283+
if (str[1] < 0x80 || str[1] > 0x8F) {
284+
*cursor += 1;
285+
return kBadChar;
286+
}
287+
break;
288+
default:
289+
if (!IsContinuationCharacter(str[1])) {
290+
*cursor += 1;
291+
return kBadChar;
292+
}
260293
}
261-
262-
// All errors have been handled, so we only have to assemble the result.
263-
switch (length) {
264-
case 1:
265-
return str[0];
266-
case 2:
267-
return ((str[0] << 6) + str[1]) - 0x00003080;
268-
case 3:
269-
return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
270-
case 4:
271-
return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
272-
0x03C82080;
294+
if (!IsContinuationCharacter(str[2])) {
295+
*cursor += 1;
296+
return kBadChar;
273297
}
274-
275-
UNREACHABLE();
276-
return kBadChar;
298+
if (!IsContinuationCharacter(str[3])) {
299+
*cursor += 1;
300+
return kBadChar;
301+
}
302+
*cursor += 4;
303+
return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
304+
0x03C82080;
277305
}
278306

279307
uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
@@ -295,10 +323,9 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
295323
// with one shift.
296324
uint8_t mask = 0x7f >> kind;
297325

298-
// Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
299-
// in 2nd nibble, and the value in the bottom three. The 2nd nibble is
300-
// intended as a counter about how many bytes are still needed.
301-
*buffer = kind << 28 | (kind - 1) << 24 | (next & mask);
326+
// Store the kind - 1 (i.e., remaining bytes) in the top byte, value
327+
// in the bottom three.
328+
*buffer = (kind - 1) << 24 | (next & mask);
302329
return kIncomplete;
303330
} else {
304331
// No buffer, and not the start of a 1-byte char (handled at the
@@ -327,19 +354,15 @@ uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
327354
// We're inside of a character, as described by buffer.
328355

329356
// How many bytes (excluding this one) do we still expect?
330-
uint8_t bytes_expected = *buffer >> 28;
331-
uint8_t bytes_left = (*buffer >> 24) & 0x0f;
332-
bytes_left--;
357+
uint8_t count = (*buffer >> 24) - 1;
333358
// Update the value.
334359
uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
335-
if (bytes_left) {
336-
*buffer = (bytes_expected << 28 | bytes_left << 24 | value);
360+
if (count) {
361+
*buffer = count << 24 | value;
337362
return kIncomplete;
338363
} else {
339364
*buffer = 0;
340-
bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) ||
341-
(bytes_expected == 3 && value < 0x800);
342-
return sequence_was_too_long ? kBadChar : value;
365+
return value;
343366
}
344367
} else {
345368
// Within a character, but not a continuation character? Then the

0 commit comments

Comments
 (0)