Skip to content

Commit 42f5419

Browse files
committed
Auto merge of #113954 - matthiaskrgr:rollup-e2r9suz, r=matthiaskrgr
Rollup of 6 pull requests Successful merges: - #112490 (Remove `#[cfg(all())]` workarounds from `c_char`) - #113252 (Update the tracking issue for `const_cstr_from_ptr`) - #113442 (Allow limited access to `OsString` bytes) - #113876 (fix docs & example for `std::os::unix::prelude::FileExt::write_at`) - #113898 (Fix size_hint for EncodeUtf16) - #113934 (Multibyte character removal in String::pop and String::remove doctests) r? `@ghost` `@rustbot` modify labels: rollup
2 parents dcb8104 + 37cd634 commit 42f5419

File tree

11 files changed

+177
-25
lines changed

11 files changed

+177
-25
lines changed

library/alloc/src/string.rs

+8-8
Original file line numberDiff line numberDiff line change
@@ -1290,11 +1290,11 @@ impl String {
12901290
/// Basic usage:
12911291
///
12921292
/// ```
1293-
/// let mut s = String::from("foo");
1293+
/// let mut s = String::from("abč");
12941294
///
1295-
/// assert_eq!(s.pop(), Some('o'));
1296-
/// assert_eq!(s.pop(), Some('o'));
1297-
/// assert_eq!(s.pop(), Some('f'));
1295+
/// assert_eq!(s.pop(), Some('č'));
1296+
/// assert_eq!(s.pop(), Some('b'));
1297+
/// assert_eq!(s.pop(), Some('a'));
12981298
///
12991299
/// assert_eq!(s.pop(), None);
13001300
/// ```
@@ -1324,11 +1324,11 @@ impl String {
13241324
/// Basic usage:
13251325
///
13261326
/// ```
1327-
/// let mut s = String::from("foo");
1327+
/// let mut s = String::from("abç");
13281328
///
1329-
/// assert_eq!(s.remove(0), 'f');
1330-
/// assert_eq!(s.remove(1), 'o');
1331-
/// assert_eq!(s.remove(0), 'o');
1329+
/// assert_eq!(s.remove(0), 'a');
1330+
/// assert_eq!(s.remove(1), 'ç');
1331+
/// assert_eq!(s.remove(0), 'b');
13321332
/// ```
13331333
#[inline]
13341334
#[stable(feature = "rust1", since = "1.0.0")]

library/alloc/tests/str.rs

+22
Original file line numberDiff line numberDiff line change
@@ -1738,6 +1738,28 @@ fn test_utf16_code_units() {
17381738
assert_eq!("é\u{1F4A9}".encode_utf16().collect::<Vec<u16>>(), [0xE9, 0xD83D, 0xDCA9])
17391739
}
17401740

1741+
#[test]
1742+
fn test_utf16_size_hint() {
1743+
assert_eq!("".encode_utf16().size_hint(), (0, Some(0)));
1744+
assert_eq!("123".encode_utf16().size_hint(), (1, Some(3)));
1745+
assert_eq!("1234".encode_utf16().size_hint(), (2, Some(4)));
1746+
assert_eq!("12345678".encode_utf16().size_hint(), (3, Some(8)));
1747+
1748+
fn hint_vec(src: &str) -> Vec<(usize, Option<usize>)> {
1749+
let mut it = src.encode_utf16();
1750+
let mut result = Vec::new();
1751+
result.push(it.size_hint());
1752+
while it.next().is_some() {
1753+
result.push(it.size_hint())
1754+
}
1755+
result
1756+
}
1757+
1758+
assert_eq!(hint_vec("12"), [(1, Some(2)), (1, Some(1)), (0, Some(0))]);
1759+
assert_eq!(hint_vec("\u{101234}"), [(2, Some(4)), (1, Some(1)), (0, Some(0))]);
1760+
assert_eq!(hint_vec("\u{101234}a"), [(2, Some(5)), (2, Some(2)), (1, Some(1)), (0, Some(0))]);
1761+
}
1762+
17411763
#[test]
17421764
fn starts_with_in_unicode() {
17431765
assert!(!"├── Cargo.toml".starts_with("# "));

library/core/src/ffi/c_str.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ impl CStr {
256256
#[inline]
257257
#[must_use]
258258
#[stable(feature = "rust1", since = "1.0.0")]
259-
#[rustc_const_unstable(feature = "const_cstr_from_ptr", issue = "101719")]
259+
#[rustc_const_unstable(feature = "const_cstr_from_ptr", issue = "113219")]
260260
pub const unsafe fn from_ptr<'a>(ptr: *const c_char) -> &'a CStr {
261261
// SAFETY: The caller has provided a pointer that points to a valid C
262262
// string with a NUL terminator of size less than `isize::MAX`, whose

library/core/src/ffi/mod.rs

-5
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,6 @@ macro_rules! type_alias {
5252
}
5353

5454
type_alias! { "c_char.md", c_char = c_char_definition::c_char, NonZero_c_char = c_char_definition::NonZero_c_char;
55-
// Make this type alias appear cfg-dependent so that Clippy does not suggest
56-
// replacing `0 as c_char` with `0_i8`/`0_u8`. This #[cfg(all())] can be removed
57-
// after the false positive in https://github.com/rust-lang/rust-clippy/issues/8093
58-
// is fixed.
59-
#[cfg(all())]
6055
#[doc(cfg(all()))] }
6156

6257
type_alias! { "c_schar.md", c_schar = i8, NonZero_c_schar = NonZeroI8; }

library/core/src/str/iter.rs

+16-5
Original file line numberDiff line numberDiff line change
@@ -1439,11 +1439,22 @@ impl<'a> Iterator for EncodeUtf16<'a> {
14391439

14401440
#[inline]
14411441
fn size_hint(&self) -> (usize, Option<usize>) {
1442-
let (low, high) = self.chars.size_hint();
1443-
// every char gets either one u16 or two u16,
1444-
// so this iterator is between 1 or 2 times as
1445-
// long as the underlying iterator.
1446-
(low, high.and_then(|n| n.checked_mul(2)))
1442+
let len = self.chars.iter.len();
1443+
// The highest bytes:code units ratio occurs for 3-byte sequences,
1444+
// since a 4-byte sequence results in 2 code units. The lower bound
1445+
// is therefore determined by assuming the remaining bytes contain as
1446+
// many 3-byte sequences as possible. The lowest bytes:code units
1447+
// ratio is for 1-byte sequences, so use this for the upper bound.
1448+
// `(len + 2)` can't overflow, because we know that the `slice::Iter`
1449+
// belongs to a slice in memory which has a maximum length of
1450+
// `isize::MAX` (that's well below `usize::MAX`)
1451+
if self.extra == 0 {
1452+
((len + 2) / 3, Some(len))
1453+
} else {
1454+
// We're in the middle of a surrogate pair, so add the remaining
1455+
// surrogate to the bounds.
1456+
((len + 2) / 3 + 1, Some(len + 1))
1457+
}
14471458
}
14481459
}
14491460

library/std/src/ffi/os_str.rs

+65
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,51 @@ impl OsString {
141141
OsString { inner: Buf::from_string(String::new()) }
142142
}
143143

144+
/// Converts bytes to an `OsString` without checking that the bytes contain
145+
/// valid [`OsStr`]-encoded data.
146+
///
147+
/// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
148+
/// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
149+
/// ASCII.
150+
///
151+
/// See the [module's toplevel documentation about conversions][conversions] for safe,
152+
/// cross-platform [conversions] from/to native representations.
153+
///
154+
/// # Safety
155+
///
156+
/// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of
157+
/// validated UTF-8 and bytes from [`OsStr::as_os_str_bytes`] from within the same rust version
158+
/// built for the same target platform. For example, reconstructing an `OsString` from bytes sent
159+
/// over the network or stored in a file will likely violate these safety rules.
160+
///
161+
/// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_os_str_bytes`] can be
162+
/// split either immediately before or immediately after any valid non-empty UTF-8 substring.
163+
///
164+
/// # Example
165+
///
166+
/// ```
167+
/// #![feature(os_str_bytes)]
168+
///
169+
/// use std::ffi::OsStr;
170+
///
171+
/// let os_str = OsStr::new("Mary had a little lamb");
172+
/// let bytes = os_str.as_os_str_bytes();
173+
/// let words = bytes.split(|b| *b == b' ');
174+
/// let words: Vec<&OsStr> = words.map(|word| {
175+
/// // SAFETY:
176+
/// // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
177+
/// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
178+
/// unsafe { OsStr::from_os_str_bytes_unchecked(word) }
179+
/// }).collect();
180+
/// ```
181+
///
182+
/// [conversions]: super#conversions
183+
#[inline]
184+
#[unstable(feature = "os_str_bytes", issue = "111544")]
185+
pub unsafe fn from_os_str_bytes_unchecked(bytes: Vec<u8>) -> Self {
186+
OsString { inner: Buf::from_os_str_bytes_unchecked(bytes) }
187+
}
188+
144189
/// Converts to an [`OsStr`] slice.
145190
///
146191
/// # Examples
@@ -159,6 +204,26 @@ impl OsString {
159204
self
160205
}
161206

207+
/// Converts the `OsString` into a byte vector. To convert the byte vector back into an
208+
/// `OsString`, use the [`OsString::from_os_str_bytes_unchecked`] function.
209+
///
210+
/// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
211+
/// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
212+
/// ASCII.
213+
///
214+
/// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should
215+
/// be treated as opaque and only comparable within the same rust version built for the same
216+
/// target platform. For example, sending the bytes over the network or storing it in a file
217+
/// will likely result in incompatible data. See [`OsString`] for more encoding details
218+
/// and [`std::ffi`] for platform-specific, specified conversions.
219+
///
220+
/// [`std::ffi`]: crate::ffi
221+
#[inline]
222+
#[unstable(feature = "os_str_bytes", issue = "111544")]
223+
pub fn into_os_str_bytes(self) -> Vec<u8> {
224+
self.inner.into_os_str_bytes()
225+
}
226+
162227
/// Converts the `OsString` into a [`String`] if it contains valid Unicode data.
163228
///
164229
/// On failure, ownership of the original `OsString` is returned.

library/std/src/os/raw/mod.rs

-5
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,6 @@ macro_rules! alias_core_ffi {
99
($($t:ident)*) => {$(
1010
#[stable(feature = "raw_os", since = "1.1.0")]
1111
#[doc = include_str!(concat!("../../../../core/src/ffi/", stringify!($t), ".md"))]
12-
// Make this type alias appear cfg-dependent so that Clippy does not suggest
13-
// replacing expressions like `0 as c_char` with `0_i8`/`0_u8`. This #[cfg(all())] can be
14-
// removed after the false positive in https://github.com/rust-lang/rust-clippy/issues/8093
15-
// is fixed.
16-
#[cfg(all())]
1712
#[doc(cfg(all()))]
1813
pub type $t = core::ffi::$t;
1914
)*}

library/std/src/os/unix/fs.rs

+30-1
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,36 @@ pub trait FileExt {
149149
/// Note that similar to [`File::write`], it is not an error to return a
150150
/// short write.
151151
///
152+
/// # Bug
153+
/// On some systems, `write_at` utilises [`pwrite64`] to write to files.
154+
/// However, this syscall has a [bug] where files opened with the `O_APPEND`
155+
/// flag fail to respect the offset parameter, always appending to the end
156+
/// of the file instead.
157+
///
158+
/// It is possible to inadvertently set this flag, like in the example below.
159+
/// Therefore, it is important to be vigilant while changing options to mitigate
160+
/// unexpected behaviour.
161+
///
162+
/// ```no_run
163+
/// use std::fs::File;
164+
/// use std::io;
165+
/// use std::os::unix::prelude::FileExt;
166+
///
167+
/// fn main() -> io::Result<()> {
168+
/// // Open a file with the append option (sets the `O_APPEND` flag)
169+
/// let file = File::options().append(true).open("foo.txt")?;
170+
///
171+
/// // We attempt to write at offset 10; instead, the data is appended to EOF
172+
/// file.write_at(b"sushi", 10)?;
173+
///
174+
/// // foo.txt is 5 bytes long instead of 15
175+
/// Ok(())
176+
/// }
177+
/// ```
178+
///
152179
/// [`File::write`]: fs::File::write
180+
/// [`pwrite64`]: https://man7.org/linux/man-pages/man2/pwrite.2.html
181+
/// [bug]: https://man7.org/linux/man-pages/man2/pwrite.2.html#BUGS
153182
///
154183
/// # Examples
155184
///
@@ -159,7 +188,7 @@ pub trait FileExt {
159188
/// use std::os::unix::prelude::FileExt;
160189
///
161190
/// fn main() -> io::Result<()> {
162-
/// let file = File::open("foo.txt")?;
191+
/// let file = File::create("foo.txt")?;
163192
///
164193
/// // We now write at the offset 10.
165194
/// file.write_at(b"sushi", 10)?;

library/std/src/sys/unix/os_str.rs

+10
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,16 @@ impl AsInner<[u8]> for Buf {
9696
}
9797

9898
impl Buf {
99+
#[inline]
100+
pub fn into_os_str_bytes(self) -> Vec<u8> {
101+
self.inner
102+
}
103+
104+
#[inline]
105+
pub unsafe fn from_os_str_bytes_unchecked(s: Vec<u8>) -> Self {
106+
Self { inner: s }
107+
}
108+
99109
pub fn from_string(s: String) -> Buf {
100110
Buf { inner: s.into_bytes() }
101111
}

library/std/src/sys/windows/os_str.rs

+10
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,16 @@ impl fmt::Display for Slice {
6363
}
6464

6565
impl Buf {
66+
#[inline]
67+
pub fn into_os_str_bytes(self) -> Vec<u8> {
68+
self.inner.into_bytes()
69+
}
70+
71+
#[inline]
72+
pub unsafe fn from_os_str_bytes_unchecked(s: Vec<u8>) -> Self {
73+
Self { inner: Wtf8Buf::from_bytes_unchecked(s) }
74+
}
75+
6676
pub fn with_capacity(capacity: usize) -> Buf {
6777
Buf { inner: Wtf8Buf::with_capacity(capacity) }
6878
}

library/std/src/sys_common/wtf8.rs

+15
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,15 @@ impl Wtf8Buf {
182182
Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true }
183183
}
184184

185+
/// Creates a WTF-8 string from a WTF-8 byte vec.
186+
///
187+
/// Since the byte vec is not checked for valid WTF-8, this function is
188+
/// marked unsafe.
189+
#[inline]
190+
pub unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
191+
Wtf8Buf { bytes: value, is_known_utf8: false }
192+
}
193+
185194
/// Creates a WTF-8 string from a UTF-8 `String`.
186195
///
187196
/// This takes ownership of the `String` and does not copy.
@@ -402,6 +411,12 @@ impl Wtf8Buf {
402411
self.bytes.truncate(new_len)
403412
}
404413

414+
/// Consumes the WTF-8 string and converts it to a vec of bytes.
415+
#[inline]
416+
pub fn into_bytes(self) -> Vec<u8> {
417+
self.bytes
418+
}
419+
405420
/// Consumes the WTF-8 string and tries to convert it to UTF-8.
406421
///
407422
/// This does not copy the data.

0 commit comments

Comments
 (0)