Skip to content

Commit dcc3c17

Browse files
authored
Rollup merge of rust-lang#94713 - clarfonthey:is_char_surrogate, r=scottmcm
Add `u16::is_utf16_surrogate`. Right now, there are methods in the standard library for encoding and decoding UTF-16, but at least for the moment, there aren't any methods specifically for `u16` to help work with UTF-16 data. Since the full logic already exists, this wouldn't really add any code, just expose what's already there. This method in particular is useful for working with the data returned by Windows `OsStrExt::encode_wide`. Initially, I was planning to also offer a `TryFrom<u16> for char`, but decided against it for now. There is plenty of code in rustc that could be rewritten to use this method, but I only checked within the standard library to replace them. I think that offering more UTF-16-related methods to u16 would be useful, but I think this one is a good start. For example, one useful method might be `u16::is_pattern_whitespace`, which would check if something is in the Unicode `Pattern_Whitespace` category. We can get away with this because all of the `Pattern_Whitespace` characters are in the basic multilingual plane, and hence we don't need to check for surrogates.
2 parents af11c71 + 5fea53e commit dcc3c17

File tree

3 files changed

+30
-3
lines changed

3 files changed

+30
-3
lines changed

core/src/char/decode.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ impl<I: Iterator<Item = u16>> Iterator for DecodeUtf16<I> {
9191
None => self.iter.next()?,
9292
};
9393

94-
if u < 0xD800 || 0xDFFF < u {
94+
if !u.is_utf16_surrogate() {
9595
// SAFETY: not a surrogate
9696
Some(Ok(unsafe { from_u32_unchecked(u as u32) }))
9797
} else if u >= 0xDC00 {
@@ -125,7 +125,7 @@ impl<I: Iterator<Item = u16>> Iterator for DecodeUtf16<I> {
125125
// buf is empty, no additional elements from it.
126126
None => (0, 0),
127127
// `u` is a non surrogate, so it's always an additional character.
128-
Some(u) if u < 0xD800 || 0xDFFF < u => (1, 1),
128+
Some(u) if !u.is_utf16_surrogate() => (1, 1),
129129
// `u` is a leading surrogate (it can never be a trailing surrogate and
130130
// it's a surrogate due to the previous branch) and `self.iter` is empty.
131131
//

core/src/lib.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@
9393
#![warn(missing_docs)]
9494
#![allow(explicit_outlives_requirements)]
9595
//
96-
// Library features for const fns:
96+
// Library features:
9797
#![feature(const_align_offset)]
9898
#![feature(const_align_of_val)]
9999
#![feature(const_alloc_layout)]
@@ -146,6 +146,8 @@
146146
#![feature(ptr_metadata)]
147147
#![feature(slice_ptr_get)]
148148
#![feature(str_internals)]
149+
#![feature(utf16_extra)]
150+
#![feature(utf16_extra_const)]
149151
#![feature(variant_count)]
150152
#![feature(const_array_from_ref)]
151153
#![feature(const_slice_from_ref)]

core/src/num/mod.rs

+25
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,31 @@ impl u16 {
820820
uint_impl! { u16, u16, i16, NonZeroU16, 16, 65535, 4, "0xa003", "0x3a", "0x1234", "0x3412", "0x2c48",
821821
"[0x34, 0x12]", "[0x12, 0x34]", "", "" }
822822
widening_impl! { u16, u32, 16, unsigned }
823+
824+
/// Checks if the value is a Unicode surrogate code point (`0xD800..=0xDFFF`); surrogates are disallowed values for [`char`].
825+
///
826+
/// # Examples
827+
///
828+
/// ```
829+
/// #![feature(utf16_extra)]
830+
///
831+
/// let low_non_surrogate = 0xA000u16;
832+
/// let leading_surrogate = 0xD800u16;
833+
/// let trailing_surrogate = 0xDC00u16;
834+
/// let high_non_surrogate = 0xE000u16;
835+
///
836+
/// assert!(!low_non_surrogate.is_utf16_surrogate());
837+
/// assert!(leading_surrogate.is_utf16_surrogate());
838+
/// assert!(trailing_surrogate.is_utf16_surrogate());
839+
/// assert!(!high_non_surrogate.is_utf16_surrogate());
840+
/// ```
841+
#[must_use]
842+
#[unstable(feature = "utf16_extra", issue = "94919")]
843+
#[rustc_const_unstable(feature = "utf16_extra_const", issue = "94919")]
844+
#[inline]
845+
pub const fn is_utf16_surrogate(self) -> bool {
846+
matches!(self, 0xD800..=0xDFFF)
847+
}
823848
}
824849

825850
#[lang = "u32"]

0 commit comments

Comments
 (0)