Skip to content

Commit 02e468e

Browse files
authored
Rollup merge of rust-lang#78833 - CDirkx:parse_prefix, r=dtolnay
Refactor and fix `parse_prefix` on Windows This PR is an extension of rust-lang#78692 as well as a general refactor of `parse_prefix`: **Fixes**: There are two errors in the current implementation of `parse_prefix`: Firstly, in the current implementation only `\` is recognized as a separator character in device namespace prefixes. This behavior is only correct for verbatim paths; `"\\.\C:/foo"` should be parsed as `"C:"` instead of `"C:/foo"`. Secondly, the current implementation only handles single separator characters. In non-verbatim paths a series of separator characters should be recognized as a single boundary, e.g. the UNC path `"\\localhost\\\\\\C$\foo"` should be parsed as `"\\localhost\\\\\\C$"` and then `UNC(server: "localhost", share: "C$")`, but currently it is not parsed at all, because it starts being parsed as `\\localhost\` and then has an invalid empty share location. Paths like `"\\.\C:/foo"` and `"\\localhost\\\\\\C$\foo"` are valid on Windows, they are equivalent to just `"C:\foo"`. **Refactoring**: All uses of `&[u8]` within `parse_prefix` are extracted to helper functions and`&OsStr` is used instead. This reduces the number of places unsafe is used: - `get_first_two_components` is adapted to the more general `parse_next_component` and used in more places - code for parsing drive prefixes is extracted to `parse_drive`
2 parents 7feab00 + 94d73d4 commit 02e468e

File tree

4 files changed

+141
-84
lines changed

4 files changed

+141
-84
lines changed

Diff for: library/std/src/ffi/os_str.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -667,10 +667,10 @@ impl OsStr {
667667

668668
/// Gets the underlying byte representation.
669669
///
670-
/// Note: it is *crucial* that this API is private, to avoid
670+
/// Note: it is *crucial* that this API is not externally public, to avoid
671671
/// revealing the internal, platform-specific encodings.
672672
#[inline]
673-
fn bytes(&self) -> &[u8] {
673+
pub(crate) fn bytes(&self) -> &[u8] {
674674
unsafe { &*(&self.inner as *const _ as *const [u8]) }
675675
}
676676

Diff for: library/std/src/path/tests.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -873,12 +873,12 @@ pub fn test_decompositions_windows() {
873873
);
874874

875875
t!("\\\\.\\foo/bar",
876-
iter: ["\\\\.\\foo/bar", "\\"],
876+
iter: ["\\\\.\\foo", "\\", "bar"],
877877
has_root: true,
878878
is_absolute: true,
879-
parent: None,
880-
file_name: None,
881-
file_stem: None,
879+
parent: Some("\\\\.\\foo/"),
880+
file_name: Some("bar"),
881+
file_stem: Some("bar"),
882882
extension: None
883883
);
884884

Diff for: library/std/src/sys/windows/path.rs

+104-70
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,12 @@ mod tests;
88
pub const MAIN_SEP_STR: &str = "\\";
99
pub const MAIN_SEP: char = '\\';
1010

11-
// The unsafety here stems from converting between `&OsStr` and `&[u8]`
12-
// and back. This is safe to do because (1) we only look at ASCII
13-
// contents of the encoding and (2) new &OsStr values are produced
14-
// only from ASCII-bounded slices of existing &OsStr values.
15-
fn os_str_as_u8_slice(s: &OsStr) -> &[u8] {
16-
unsafe { mem::transmute(s) }
17-
}
18-
unsafe fn u8_slice_as_os_str(s: &[u8]) -> &OsStr {
19-
mem::transmute(s)
11+
// Safety: `bytes` must be a valid wtf8 encoded slice
12+
#[inline]
13+
unsafe fn bytes_as_os_str(bytes: &[u8]) -> &OsStr {
14+
// &OsStr is layout compatible with &Slice, which is compatible with &Wtf8,
15+
// which is compatible with &[u8].
16+
mem::transmute(bytes)
2017
}
2118

2219
#[inline]
@@ -29,79 +26,116 @@ pub fn is_verbatim_sep(b: u8) -> bool {
2926
b == b'\\'
3027
}
3128

32-
// In most DOS systems, it is not possible to have more than 26 drive letters.
33-
// See <https://en.wikipedia.org/wiki/Drive_letter_assignment#Common_assignments>.
34-
pub fn is_valid_drive_letter(disk: u8) -> bool {
35-
disk.is_ascii_alphabetic()
36-
}
37-
3829
pub fn parse_prefix(path: &OsStr) -> Option<Prefix<'_>> {
3930
use Prefix::{DeviceNS, Disk, Verbatim, VerbatimDisk, VerbatimUNC, UNC};
4031

41-
let path = os_str_as_u8_slice(path);
42-
43-
// \\
44-
if let Some(path) = path.strip_prefix(br"\\") {
45-
// \\?\
46-
if let Some(path) = path.strip_prefix(br"?\") {
47-
// \\?\UNC\server\share
48-
if let Some(path) = path.strip_prefix(br"UNC\") {
49-
let (server, share) = match get_first_two_components(path, is_verbatim_sep) {
50-
Some((server, share)) => unsafe {
51-
(u8_slice_as_os_str(server), u8_slice_as_os_str(share))
52-
},
53-
None => (unsafe { u8_slice_as_os_str(path) }, OsStr::new("")),
54-
};
55-
return Some(VerbatimUNC(server, share));
32+
if let Some(path) = strip_prefix(path, r"\\") {
33+
// \\
34+
if let Some(path) = strip_prefix(path, r"?\") {
35+
// \\?\
36+
if let Some(path) = strip_prefix(path, r"UNC\") {
37+
// \\?\UNC\server\share
38+
39+
let (server, path) = parse_next_component(path, true);
40+
let (share, _) = parse_next_component(path, true);
41+
42+
Some(VerbatimUNC(server, share))
5643
} else {
57-
// \\?\path
58-
match path {
59-
// \\?\C:\path
60-
[c, b':', b'\\', ..] if is_valid_drive_letter(*c) => {
61-
return Some(VerbatimDisk(c.to_ascii_uppercase()));
62-
}
63-
// \\?\cat_pics
64-
_ => {
65-
let idx = path.iter().position(|&b| b == b'\\').unwrap_or(path.len());
66-
let slice = &path[..idx];
67-
return Some(Verbatim(unsafe { u8_slice_as_os_str(slice) }));
68-
}
44+
let (prefix, _) = parse_next_component(path, true);
45+
46+
// in verbatim paths only recognize an exact drive prefix
47+
if let Some(drive) = parse_drive_exact(prefix) {
48+
// \\?\C:
49+
Some(VerbatimDisk(drive))
50+
} else {
51+
// \\?\prefix
52+
Some(Verbatim(prefix))
6953
}
7054
}
71-
} else if let Some(path) = path.strip_prefix(b".\\") {
55+
} else if let Some(path) = strip_prefix(path, r".\") {
7256
// \\.\COM42
73-
let idx = path.iter().position(|&b| b == b'\\').unwrap_or(path.len());
74-
let slice = &path[..idx];
75-
return Some(DeviceNS(unsafe { u8_slice_as_os_str(slice) }));
76-
}
77-
match get_first_two_components(path, is_sep_byte) {
78-
Some((server, share)) if !server.is_empty() && !share.is_empty() => {
57+
let (prefix, _) = parse_next_component(path, false);
58+
Some(DeviceNS(prefix))
59+
} else {
60+
let (server, path) = parse_next_component(path, false);
61+
let (share, _) = parse_next_component(path, false);
62+
63+
if !server.is_empty() && !share.is_empty() {
7964
// \\server\share
80-
return Some(unsafe { UNC(u8_slice_as_os_str(server), u8_slice_as_os_str(share)) });
65+
Some(UNC(server, share))
66+
} else {
67+
// no valid prefix beginning with "\\" recognized
68+
None
8169
}
82-
_ => {}
8370
}
84-
} else if let [c, b':', ..] = path {
71+
} else if let Some(drive) = parse_drive(path) {
8572
// C:
86-
if is_valid_drive_letter(*c) {
87-
return Some(Disk(c.to_ascii_uppercase()));
88-
}
73+
Some(Disk(drive))
74+
} else {
75+
// no prefix
76+
None
8977
}
90-
None
9178
}
9279

93-
/// Returns the first two path components with predicate `f`.
94-
///
95-
/// The two components returned will be use by caller
96-
/// to construct `VerbatimUNC` or `UNC` Windows path prefix.
97-
///
98-
/// Returns [`None`] if there are no separators in path.
99-
fn get_first_two_components(path: &[u8], f: fn(u8) -> bool) -> Option<(&[u8], &[u8])> {
100-
let idx = path.iter().position(|&x| f(x))?;
101-
// Panic safe
102-
// The max `idx+1` is `path.len()` and `path[path.len()..]` is a valid index.
103-
let (first, path) = (&path[..idx], &path[idx + 1..]);
104-
let idx = path.iter().position(|&x| f(x)).unwrap_or(path.len());
105-
let second = &path[..idx];
106-
Some((first, second))
80+
// Parses a drive prefix, e.g. "C:" and "C:\whatever"
81+
fn parse_drive(prefix: &OsStr) -> Option<u8> {
82+
// In most DOS systems, it is not possible to have more than 26 drive letters.
83+
// See <https://en.wikipedia.org/wiki/Drive_letter_assignment#Common_assignments>.
84+
fn is_valid_drive_letter(drive: &u8) -> bool {
85+
drive.is_ascii_alphabetic()
86+
}
87+
88+
match prefix.bytes() {
89+
[drive, b':', ..] if is_valid_drive_letter(drive) => Some(drive.to_ascii_uppercase()),
90+
_ => None,
91+
}
92+
}
93+
94+
// Parses a drive prefix exactly, e.g. "C:"
95+
fn parse_drive_exact(prefix: &OsStr) -> Option<u8> {
96+
// only parse two bytes: the drive letter and the drive separator
97+
if prefix.len() == 2 { parse_drive(prefix) } else { None }
98+
}
99+
100+
fn strip_prefix<'a>(path: &'a OsStr, prefix: &str) -> Option<&'a OsStr> {
101+
// `path` and `prefix` are valid wtf8 and utf8 encoded slices respectively, `path[prefix.len()]`
102+
// is thus a code point boundary and `path[prefix.len()..]` is a valid wtf8 encoded slice.
103+
match path.bytes().strip_prefix(prefix.as_bytes()) {
104+
Some(path) => unsafe { Some(bytes_as_os_str(path)) },
105+
None => None,
106+
}
107+
}
108+
109+
// Parse the next path component.
110+
//
111+
// Returns the next component and the rest of the path excluding the component and separator.
112+
// Does not recognize `/` as a separator character if `verbatim` is true.
113+
fn parse_next_component(path: &OsStr, verbatim: bool) -> (&OsStr, &OsStr) {
114+
let separator = if verbatim { is_verbatim_sep } else { is_sep_byte };
115+
116+
match path.bytes().iter().position(|&x| separator(x)) {
117+
Some(separator_start) => {
118+
let mut separator_end = separator_start + 1;
119+
120+
// a series of multiple separator characters is treated as a single separator,
121+
// except in verbatim paths
122+
while !verbatim && separator_end < path.len() && separator(path.bytes()[separator_end])
123+
{
124+
separator_end += 1;
125+
}
126+
127+
let component = &path.bytes()[..separator_start];
128+
129+
// Panic safe
130+
// The max `separator_end` is `bytes.len()` and `bytes[bytes.len()..]` is a valid index.
131+
let path = &path.bytes()[separator_end..];
132+
133+
// Safety: `path` is a valid wtf8 encoded slice and each of the separators ('/', '\')
134+
// is encoded in a single byte, therefore `bytes[separator_start]` and
135+
// `bytes[separator_end]` must be code point boundaries and thus
136+
// `bytes[..separator_start]` and `bytes[separator_end..]` are valid wtf8 slices.
137+
unsafe { (bytes_as_os_str(component), bytes_as_os_str(path)) }
138+
}
139+
None => (path, OsStr::new("")),
140+
}
107141
}

Diff for: library/std/src/sys/windows/path/tests.rs

+31-8
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,44 @@
11
use super::*;
22

33
#[test]
4-
fn test_get_first_two_components() {
4+
fn test_parse_next_component() {
55
assert_eq!(
6-
get_first_two_components(br"server\share", is_verbatim_sep),
7-
Some((&b"server"[..], &b"share"[..])),
6+
parse_next_component(OsStr::new(r"server\share"), true),
7+
(OsStr::new(r"server"), OsStr::new(r"share"))
88
);
99

1010
assert_eq!(
11-
get_first_two_components(br"server\", is_verbatim_sep),
12-
Some((&b"server"[..], &b""[..]))
11+
parse_next_component(OsStr::new(r"server/share"), true),
12+
(OsStr::new(r"server/share"), OsStr::new(r""))
1313
);
1414

1515
assert_eq!(
16-
get_first_two_components(br"\server\", is_verbatim_sep),
17-
Some((&b""[..], &b"server"[..]))
16+
parse_next_component(OsStr::new(r"server/share"), false),
17+
(OsStr::new(r"server"), OsStr::new(r"share"))
1818
);
1919

20-
assert_eq!(get_first_two_components(br"there are no separators here", is_verbatim_sep), None,);
20+
assert_eq!(
21+
parse_next_component(OsStr::new(r"server\"), false),
22+
(OsStr::new(r"server"), OsStr::new(r""))
23+
);
24+
25+
assert_eq!(
26+
parse_next_component(OsStr::new(r"\server\"), false),
27+
(OsStr::new(r""), OsStr::new(r"server\"))
28+
);
29+
30+
assert_eq!(
31+
parse_next_component(OsStr::new(r"servershare"), false),
32+
(OsStr::new(r"servershare"), OsStr::new(""))
33+
);
34+
35+
assert_eq!(
36+
parse_next_component(OsStr::new(r"server/\//\/\\\\/////\/share"), false),
37+
(OsStr::new(r"server"), OsStr::new(r"share"))
38+
);
39+
40+
assert_eq!(
41+
parse_next_component(OsStr::new(r"server\\\\\\\\\\\\\\share"), true),
42+
(OsStr::new(r"server"), OsStr::new(r"\\\\\\\\\\\\\share"))
43+
);
2144
}

0 commit comments

Comments
 (0)