Skip to content

Commit b4ff955

Browse files
author
blake2-ppc
committed
std: Deny overlong encodings in UTF-8
An 'overlong encoding' is a codepoint encoded non-minimally using the UTF-8 format. Denying these ensures that each codepoint has only one valid representation in UTF-8. An example is the byte sequence 0xE0 0x80 0x80, which could be interpreted as U+0000, but it is an overlong encoding since the canonical form is just 0x00. Another example is 0xE0 0x80 0xAF, which was previously accepted and is an overlong encoding of the solidus "/". Directory traversal characters like / and . form the most compelling argument for why this commit is security critical. Factor out common UTF-8 decoding expressions as macros. This commit partly duplicates UTF-8 decoding, so it is now present in both fn is_utf8() and .char_range_at(); the latter relies on the assumption that the str is already valid.
1 parent 6dd1859 commit b4ff955

File tree

1 file changed

+45
-8
lines changed

1 file changed

+45
-8
lines changed

src/libstd/str.rs

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,18 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
564564
Section: Misc
565565
*/
566566

567+
// Return the initial codepoint accumulator for the first byte.
568+
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
569+
// for width 3, and 3 bits for width 4
570+
macro_rules! utf8_first_byte(
571+
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint)
572+
)
573+
574+
// Return the value of $ch shifted left by 6 bits and updated with the low 6 bits of continuation byte $byte
575+
macro_rules! utf8_acc_cont_byte(
576+
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint)
577+
)
578+
567579
/// Determines if a vector of bytes contains valid UTF-8
568580
pub fn is_utf8(v: &[u8]) -> bool {
569581
let mut i = 0u;
@@ -577,11 +589,26 @@ pub fn is_utf8(v: &[u8]) -> bool {
577589

578590
let nexti = i + w;
579591
if nexti > total { return false; }
592+
// 1. Make sure the correct number of continuation bytes are present
593+
// 2. Check codepoint ranges (deny overlong encodings)
594+
// 2-byte encoding is for codepoints U+0080 to U+07FF
595+
// 3-byte encoding is for codepoints U+0800 to U+FFFF
596+
// 4-byte encoding is for codepoints U+10000 to U+10FFFF
580597

598+
// 2-byte encodings are correct if the width and continuation match up
581599
if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; }
582600
if w > 2 {
601+
let mut ch;
602+
ch = utf8_first_byte!(v[i], w);
603+
ch = utf8_acc_cont_byte!(ch, v[i + 1]);
583604
if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; }
584-
if w > 3 && (v[i + 3] & 192u8 != TAG_CONT_U8) { return false; }
605+
ch = utf8_acc_cont_byte!(ch, v[i + 2]);
606+
if w == 3 && ch < MAX_TWO_B { return false; }
607+
if w > 3 {
608+
if v[i + 3] & 192u8 != TAG_CONT_U8 { return false; }
609+
ch = utf8_acc_cont_byte!(ch, v[i + 3]);
610+
if ch < MAX_THREE_B || ch >= MAX_UNICODE { return false; }
611+
}
585612
}
586613

587614
i = nexti;
@@ -738,6 +765,7 @@ static MAX_TWO_B: uint = 2048u;
738765
static TAG_THREE_B: uint = 224u;
739766
static MAX_THREE_B: uint = 65536u;
740767
static TAG_FOUR_B: uint = 240u;
768+
static MAX_UNICODE: uint = 1114112u;
741769

742770
/// Unsafe operations
743771
pub mod raw {
@@ -1665,12 +1693,10 @@ impl<'self> StrSlice<'self> for &'self str {
16651693
let w = UTF8_CHAR_WIDTH[val] as uint;
16661694
assert!((w != 0));
16671695
1668-
// First byte is special, only want bottom 5 bits for width 2, 4 bits
1669-
// for width 3, and 3 bits for width 4
1670-
val &= 0x7Fu >> w;
1671-
val = (val << 6) | (s[i + 1] & 63u8) as uint;
1672-
if w > 2 { val = (val << 6) | (s[i + 2] & 63u8) as uint; }
1673-
if w > 3 { val = (val << 6) | (s[i + 3] & 63u8) as uint; }
1696+
val = utf8_first_byte!(val, w);
1697+
val = utf8_acc_cont_byte!(val, s[i + 1]);
1698+
if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
1699+
if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
16741700
16751701
return CharRange {ch: val as char, next: i + w};
16761702
}
@@ -2035,7 +2061,7 @@ impl OwnedStr for ~str {
20352061
/// Appends a character to the back of a string
20362062
#[inline]
20372063
fn push_char(&mut self, c: char) {
2038-
assert!(c as uint <= 0x10ffff); // FIXME: #7609: should be enforced on all `char`
2064+
assert!((c as uint) < MAX_UNICODE); // FIXME: #7609: should be enforced on all `char`
20392065
unsafe {
20402066
let code = c as uint;
20412067
let nb = if code < MAX_ONE_B { 1u }
@@ -2802,6 +2828,17 @@ mod tests {
28022828
assert_eq!(ss, from_bytes(bb));
28032829
}
28042830
2831+
#[test]
2832+
fn test_is_utf8_deny_overlong() {
2833+
assert!(!is_utf8([0xc0, 0x80]));
2834+
assert!(!is_utf8([0xc0, 0xae]));
2835+
assert!(!is_utf8([0xe0, 0x80, 0x80]));
2836+
assert!(!is_utf8([0xe0, 0x80, 0xaf]));
2837+
assert!(!is_utf8([0xe0, 0x81, 0x81]));
2838+
assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
2839+
}
2840+
2841+
28052842
#[test]
28062843
#[ignore(cfg(windows))]
28072844
fn test_from_bytes_fail() {

0 commit comments

Comments
 (0)