Skip to content

Commit 65b5cba

Browse files
authored
Rollup merge of #113898 - ajtribick:encode_utf16_size_hint, r=cuviper
Fix size_hint for EncodeUtf16: use more realistic upper and lower bounds, and handle the case where the iterator is located within a surrogate pair. Resolves #113897.
2 parents 746d507 + f777339 commit 65b5cba

File tree

2 files changed

+38
-5
lines changed

2 files changed

+38
-5
lines changed

library/alloc/tests/str.rs

+22
Original file line numberDiff line numberDiff line change
@@ -1738,6 +1738,28 @@ fn test_utf16_code_units() {
17381738
assert_eq!("é\u{1F4A9}".encode_utf16().collect::<Vec<u16>>(), [0xE9, 0xD83D, 0xDCA9])
17391739
}
17401740

1741+
#[test]
1742+
fn test_utf16_size_hint() {
1743+
assert_eq!("".encode_utf16().size_hint(), (0, Some(0)));
1744+
assert_eq!("123".encode_utf16().size_hint(), (1, Some(3)));
1745+
assert_eq!("1234".encode_utf16().size_hint(), (2, Some(4)));
1746+
assert_eq!("12345678".encode_utf16().size_hint(), (3, Some(8)));
1747+
1748+
fn hint_vec(src: &str) -> Vec<(usize, Option<usize>)> {
1749+
let mut it = src.encode_utf16();
1750+
let mut result = Vec::new();
1751+
result.push(it.size_hint());
1752+
while it.next().is_some() {
1753+
result.push(it.size_hint())
1754+
}
1755+
result
1756+
}
1757+
1758+
assert_eq!(hint_vec("12"), [(1, Some(2)), (1, Some(1)), (0, Some(0))]);
1759+
assert_eq!(hint_vec("\u{101234}"), [(2, Some(4)), (1, Some(1)), (0, Some(0))]);
1760+
assert_eq!(hint_vec("\u{101234}a"), [(2, Some(5)), (2, Some(2)), (1, Some(1)), (0, Some(0))]);
1761+
}
1762+
17411763
#[test]
17421764
fn starts_with_in_unicode() {
17431765
assert!(!"├── Cargo.toml".starts_with("# "));

library/core/src/str/iter.rs

+16-5
Original file line numberDiff line numberDiff line change
@@ -1439,11 +1439,22 @@ impl<'a> Iterator for EncodeUtf16<'a> {
14391439

14401440
#[inline]
14411441
fn size_hint(&self) -> (usize, Option<usize>) {
1442-
let (low, high) = self.chars.size_hint();
1443-
// every char gets either one u16 or two u16,
1444-
// so this iterator is between 1 or 2 times as
1445-
// long as the underlying iterator.
1446-
(low, high.and_then(|n| n.checked_mul(2)))
1442+
let len = self.chars.iter.len();
1443+
// The highest bytes:code units ratio occurs for 3-byte sequences,
1444+
// since a 4-byte sequence results in 2 code units. The lower bound
1445+
// is therefore determined by assuming the remaining bytes contain as
1446+
// many 3-byte sequences as possible. The lowest bytes:code units
1447+
// ratio is for 1-byte sequences, so use this for the upper bound.
1448+
// `(len + 2)` can't overflow, because we know that the `slice::Iter`
1449+
// belongs to a slice in memory which has a maximum length of
1450+
// `isize::MAX` (that's well below `usize::MAX`)
1451+
if self.extra == 0 {
1452+
((len + 2) / 3, Some(len))
1453+
} else {
1454+
// We're in the middle of a surrogate pair, so add the remaining
1455+
// surrogate to the bounds.
1456+
((len + 2) / 3 + 1, Some(len + 1))
1457+
}
14471458
}
14481459
}
14491460

0 commit comments

Comments
 (0)