diff --git a/src/comp/driver/driver.rs b/src/comp/driver/driver.rs index 78ec0330dd96a..c138ef7085546 100644 --- a/src/comp/driver/driver.rs +++ b/src/comp/driver/driver.rs @@ -80,7 +80,7 @@ fn parse_input(sess: session, cfg: ast::crate_cfg, input: str) if !input_is_stdin(input) { parser::parse_crate_from_file(input, cfg, sess.parse_sess) } else { - let src = @str::unsafe_from_bytes(io::stdin().read_whole_stream()); + let src = @str::from_bytes(io::stdin().read_whole_stream()); parser::parse_crate_from_source_str(input, src, cfg, sess.parse_sess) } } diff --git a/src/libcore/str.rs b/src/libcore/str.rs index dad10eced7f2b..9f40dd5c3a48c 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -12,9 +12,7 @@ For some heavy-duty uses, we recommend trying std::rope. export // Creating a string from_bytes, - unsafe_from_bytes, from_byte, - unsafe_from_byte, //push_utf8_bytes, from_char, from_chars, @@ -37,7 +35,7 @@ export // Transforming strings bytes, - to_chars, + chars, substr, char_slice, slice, @@ -62,13 +60,13 @@ export hash, // Iterating through strings - loop_chars, all, any, map, bytes_iter, - iter_chars, chars_iter, + split_chars_iter, + splitn_chars_iter, words_iter, lines_iter, @@ -96,7 +94,7 @@ export utf8_char_width, char_range_at, char_at, - loop_chars_sub, + substr_all, escape_char, as_buf, //buf, @@ -120,37 +118,11 @@ Function: from_bytes Convert a vector of bytes to a UTF-8 string. Fails if invalid UTF-8. */ -fn from_bytes(vv: [u8]) -> str { +fn from_bytes(vv: [u8]) -> str unsafe { assert is_utf8(vv); - ret unsafe_from_bytes(vv); + ret unsafe::from_bytes(vv); } -/* -Function: unsafe_from_bytes - -Converts a vector of bytes to a string. Does not verify that the -vector contains valid UTF-8. - -FIXME: stop exporting -*/ -fn unsafe_from_bytes(v: [const u8]) -> str unsafe { - let vcopy: [u8] = v + [0u8]; - let scopy: str = unsafe::reinterpret_cast(vcopy); - unsafe::leak(vcopy); - ret scopy; -} - -/* -Function: unsafe_from_byte - -Converts a byte to a string. Does not verify that the byte is -valid UTF-8. - -FIXME: stop exporting -*/ -fn unsafe_from_byte(u: u8) -> str { unsafe_from_bytes([u]) } - - /* Function: from_byte @@ -370,7 +342,7 @@ fn trim_left(s: str) -> str { } ret i; } - let chars = to_chars(s); + let chars = chars(s); let whities = count_whities(chars); ret from_chars(vec::slice(chars, whities, vec::len(chars))); } @@ -389,7 +361,7 @@ fn trim_right(s: str) -> str { } ret i; } - let chars = to_chars(s); + let chars = chars(s); let whities = count_whities(chars); ret from_chars(vec::slice(chars, 0u, whities)); } @@ -413,20 +385,20 @@ Converts a string to a vector of bytes. The result vector is not null-terminated. */ fn bytes(s: str) -> [u8] unsafe { - let v = unsafe::reinterpret_cast(s); + let v = ::unsafe::reinterpret_cast(s); let vcopy = vec::slice(v, 0u, vec::len(v) - 1u); - unsafe::leak(v); + ::unsafe::leak(v); ret vcopy; } /* -Function: to_chars +Function: chars Convert a string to a vector of characters FIXME: rename to 'chars' */ -fn to_chars(s: str) -> [char] { +fn chars(s: str) -> [char] { let buf: [char] = []; let i = 0u; let len = byte_len(s); @@ -469,7 +441,7 @@ Failure: FIXME: rename to slice(), make faster by avoiding char conversion */ fn char_slice(s: str, begin: uint, end: uint) -> str { - from_chars(vec::slice(to_chars(s), begin, end)) + from_chars(vec::slice(chars(s), begin, end)) } /* @@ -492,12 +464,12 @@ fn slice(s: str, begin: uint, end: uint) -> str unsafe { assert (begin <= end); assert (end <= byte_len(s)); - let v: [u8] = unsafe::reinterpret_cast(s); + let v: [u8] = ::unsafe::reinterpret_cast(s); let v2 = vec::slice(v, begin, end); - unsafe::leak(v); + ::unsafe::leak(v); v2 += [0u8]; - let s2: str = unsafe::reinterpret_cast(v2); - unsafe::leak(v2); + let s2: str = ::unsafe::reinterpret_cast(v2); + ::unsafe::leak(v2); ret s2; } @@ -620,7 +592,7 @@ fn split_func(ss: str, sepfn: fn(cc: char)->bool) -> [str] { let accum: str = ""; let ends_with_sep: bool = false; - str::iter_chars(ss, {|cc| if sepfn(cc) { + chars_iter(ss, {|cc| if sepfn(cc) { vv += [accum]; accum = ""; ends_with_sep = true; @@ -702,30 +674,18 @@ fn windowed(nn: uint, ss: str) -> [str] { Function: to_lower Convert a string to lowercase - -FIXME: rewrite with map */ fn to_lower(s: str) -> str { - let outstr = ""; - iter_chars(s) { |c| - push_char(outstr, char::to_lower(c)); - } - ret outstr; + map(s, char::to_lower) } /* Function: to_upper Convert a string to uppercase - -FIXME: rewrite with map */ fn to_upper(s: str) -> str { - let outstr = ""; - iter_chars(s) { |c| - push_char(outstr, char::to_upper(c)); - } - ret outstr; + map(s, char::to_upper) } // FIXME: This is super-inefficient @@ -769,7 +729,7 @@ Escapes special characters inside the string, making it safe for transfer. */ fn escape(s: str) -> str { let r = ""; - loop_chars(s, { |c| r += escape_char(c); true }); + all(s, { |c| r += escape_char(c); true }); r } @@ -809,37 +769,14 @@ fn hash(&&s: str) -> uint { Section: Iterating through strings */ -/* -Function: loop_chars - -Loop through a string, char by char - -Parameters: -s - A string to traverse. It may be empty. -it - A block to execute with each consecutive character of `s`. -Return `true` to continue, `false` to stop. - -Returns: - -`true` If execution proceeded correctly, `false` if it was interrupted, -that is if `it` returned `false` at any point. - -FIXME: rename to 'chars_loop' (change? currently a synonym to 'all') - */ -fn loop_chars(s: str, it: fn(char) -> bool) -> bool{ - ret loop_chars_sub(s, 0u, byte_len(s), it); -} - /* Function: all Return true if a predicate matches all characters or if the string contains no characters - -// FIXME: a synonym to loop_chars */ -fn all(ss: str, ff: fn(char) -> bool) -> bool { - str::loop_chars(ss, ff) +fn all(s: str, it: fn(char) -> bool) -> bool{ + ret substr_all(s, 0u, byte_len(s), it); } /* @@ -860,7 +797,7 @@ Apply a function to each character fn map(ss: str, ff: fn(char) -> char) -> str { let result = ""; - str::iter_chars(ss, {|cc| + chars_iter(ss, {|cc| str::push_char(result, ff(cc)); }); @@ -871,8 +808,6 @@ fn map(ss: str, ff: fn(char) -> char) -> str { Function: bytes_iter Iterate over the bytes in a string - -FIXME: Should it really include the last byte '\0'? */ fn bytes_iter(ss: str, it: fn(u8)) { let pos = 0u; @@ -885,13 +820,11 @@ fn bytes_iter(ss: str, it: fn(u8)) { } /* -Function: iter_chars +Function: chars_iter Iterate over the characters in a string - -FIXME: rename to 'chars_iter' */ -fn iter_chars(s: str, it: fn(char)) { +fn chars_iter(s: str, it: fn(char)) { let pos = 0u, len = byte_len(s); while (pos < len) { let {ch, next} = char_range_at(s, pos); @@ -901,14 +834,25 @@ fn iter_chars(s: str, it: fn(char)) { } /* -Function: chars_iter +Function: split_chars_iter -Iterate over the characters in a string +Apply a function to each substring after splitting +by character +*/ +fn split_chars_iter(ss: str, cc: char, ff: fn(&&str)) { + vec::iter(split_char(ss, cc), ff) +} + +/* +Function: splitn_chars_iter -FIXME: A synonym to iter_chars +Apply a function to each substring after splitting +by character, up to nn times + +FIXME: make this use chars when splitn/splitn_char is fixed */ -fn chars_iter(ss: str, it: fn(char)) { - iter_chars(ss, it) +fn splitn_chars_iter(ss: str, sep: u8, count: uint, ff: fn(&&str)) { + vec::iter(splitn(ss, sep, count), ff) } /* @@ -929,9 +873,6 @@ fn lines_iter(ss: str, ff: fn(&&str)) { vec::iter(lines(ss), ff) } -// FIXME: ADD split_char_iter -// FIXME: ADD splitn_char_iter - /* Section: Searching */ @@ -1082,7 +1023,7 @@ Function: is_whitespace Returns true if the string contains only whitespace */ fn is_whitespace(s: str) -> bool { - ret loop_chars(s, char::is_whitespace); + ret all(s, char::is_whitespace); } /* @@ -1093,9 +1034,9 @@ Returns the length in bytes of a string FIXME: rename to 'len_bytes'? */ pure fn byte_len(s: str) -> uint unsafe { - let v: [u8] = unsafe::reinterpret_cast(s); + let v: [u8] = ::unsafe::reinterpret_cast(s); let vlen = vec::len(v); - unsafe::leak(v); + ::unsafe::leak(v); // There should always be a null terminator assert (vlen > 0u); ret vlen - 1u; @@ -1298,7 +1239,7 @@ Pluck a character out of a string fn char_at(s: str, i: uint) -> char { ret char_range_at(s, i).ch; } /* -Function: loop_chars_sub +Function: substr_all Loop through a substring, char by char @@ -1318,10 +1259,8 @@ Safety note: - This function does not check whether the substring is valid. - This function fails if `byte_offset` or `byte_len` do not represent valid positions inside `s` - -FIXME: rename to 'substr_all' */ -fn loop_chars_sub(s: str, byte_offset: uint, byte_len: uint, +fn substr_all(s: str, byte_offset: uint, byte_len: uint, it: fn(char) -> bool) -> bool { let i = byte_offset; let result = true; @@ -1371,7 +1310,7 @@ const tag_six_b: uint = 252u; // no guarantee that the string is rooted). Instead, use as_buf below. unsafe fn buf(s: str) -> sbuf { let saddr = ptr::addr_of(s); - let vaddr: *[u8] = unsafe::reinterpret_cast(saddr); + let vaddr: *[u8] = ::unsafe::reinterpret_cast(saddr); let buf = vec::to_ptr(*vaddr); ret buf; } @@ -1398,6 +1337,33 @@ An unsafe buffer of bytes. Corresponds to a C char pointer. */ type sbuf = *u8; +// Module: unsafe +// +// These functions may create invalid UTF-8 strings and eat your baby. +mod unsafe { + export + // UNSAFE + from_bytes, + from_byte; + + // Function: unsafe::from_bytes + // + // Converts a vector of bytes to a string. Does not verify that the + // vector contains valid UTF-8. + unsafe fn from_bytes(v: [const u8]) -> str unsafe { + let vcopy: [u8] = v + [0u8]; + let scopy: str = ::unsafe::reinterpret_cast(vcopy); + ::unsafe::leak(vcopy); + ret scopy; + } + + // Function: unsafe::from_byte + // + // Converts a byte to a string. Does not verify that the byte is + // valid UTF-8. + unsafe fn from_byte(u: u8) -> str { unsafe::from_bytes([u]) } +} + #[cfg(test)] mod tests { @@ -1605,8 +1571,9 @@ mod tests { #[test] fn test_to_upper() { - // to_upper doesn't understand unicode yet, - // but we need to at least preserve it + // char::to_upper, and hence str::to_upper + // are culturally insensitive: I'm not sure they + // really work for anything but English ASCII, but YMMV let unicode = "\u65e5\u672c"; let input = "abcDEF" + unicode + "xyz:.;"; @@ -1615,6 +1582,12 @@ mod tests { assert (eq(expected, actual)); } + #[test] + fn test_to_lower() { + assert "" == map("", char::to_lower); + assert "ymca" == map("YMCA", char::to_lower); + } + #[test] fn test_slice() { assert (eq("ab", slice("abc", 0u, 2u))); @@ -1771,9 +1744,9 @@ mod tests { } #[test] - fn test_unsafe_from_bytes() { + fn test_unsafe_from_bytes() unsafe { let a = [65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8]; - let b = unsafe_from_bytes(a); + let b = unsafe::from_bytes(a); assert (b == "AAAAAAA"); } @@ -1872,21 +1845,6 @@ mod tests { assert !contains("", "a"); } - #[test] - fn test_iter_chars() { - let i = 0; - iter_chars("x\u03c0y") {|ch| - alt i { - 0 { assert ch == 'x'; } - 1 { assert ch == '\u03c0'; } - 2 { assert ch == 'y'; } - } - i += 1; - } - - iter_chars("") {|_ch| fail; } // should not fail - } - #[test] fn test_chars_iter() { let i = 0; @@ -1918,6 +1876,41 @@ mod tests { bytes_iter("") {|bb| assert bb == 0u8; } } + #[test] + fn test_split_chars_iter() { + let data = "\nMary had a little lamb\nLittle lamb\n"; + + let ii = 0; + + split_chars_iter(data, ' ') {|xx| + alt ii { + 0 { assert "\nMary" == xx; } + 1 { assert "had" == xx; } + 2 { assert "a" == xx; } + 3 { assert "little" == xx; } + _ { () } + } + ii += 1; + } + } + + #[test] + fn test_splitn_chars_iter() { + let data = "\nMary had a little lamb\nLittle lamb\n"; + + let ii = 0; + + splitn_chars_iter(data, ' ' as u8, 2u) {|xx| + alt ii { + 0 { assert "\nMary" == xx; } + 1 { assert "had" == xx; } + 2 { assert "a little lamb\nLittle lamb\n" == xx; } + _ { () } + } + ii += 1; + } + } + #[test] fn test_words_iter() { let data = "\nMary had a little lamb\nLittle lamb\n"; @@ -2005,4 +1998,11 @@ mod tests { fn test_windowed_() { let _x = windowed(0u, "abcd"); } + + #[test] + fn test_chars() { + let ss = "ศไทย中华Việt Nam"; + assert ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'] + == chars(ss); + } } diff --git a/src/libstd/rope.rs b/src/libstd/rope.rs index 5e856ff5e0e0b..b181ba7475ca1 100644 --- a/src/libstd/rope.rs +++ b/src/libstd/rope.rs @@ -1137,7 +1137,7 @@ mod node { fn loop_chars(node: @node, it: fn(char) -> bool) -> bool { ret loop_leaves(node, {|leaf| - ret str::loop_chars_sub(*leaf.content, + ret str::substr_all(*leaf.content, leaf.byte_offset, leaf.byte_len, it) }) @@ -1494,4 +1494,4 @@ mod tests { assert eq(r, r2); } -} \ No newline at end of file +} diff --git a/src/rustdoc/unindent_pass.rs b/src/rustdoc/unindent_pass.rs index ea439fd94c470..8d6aec7330039 100644 --- a/src/rustdoc/unindent_pass.rs +++ b/src/rustdoc/unindent_pass.rs @@ -47,7 +47,7 @@ fn unindent(s: str) -> str { } else { saw_first_line = true; let spaces = 0u; - str::loop_chars(line) {|char| + str::all(line) {|char| // Only comparing against space because I wouldn't // know what to do with mixed whitespace chars if char == ' ' { @@ -117,4 +117,4 @@ fn should_not_ignore_first_line_indent_in_a_single_line_para() { let s = "line1\n\n line2"; let r = unindent(s); assert r == "line1\n\n line2"; -} \ No newline at end of file +} diff --git a/src/test/run-pass/utf8_chars.rs b/src/test/run-pass/utf8_chars.rs index 67e3dbcb46cfe..7b4e9bf6d1c4d 100644 --- a/src/test/run-pass/utf8_chars.rs +++ b/src/test/run-pass/utf8_chars.rs @@ -9,8 +9,8 @@ fn main() { assert (str::byte_len(s) == 10u); assert (str::char_len(s) == 4u); - assert (vec::len::(str::to_chars(s)) == 4u); - assert (str::eq(str::from_chars(str::to_chars(s)), s)); + assert (vec::len::(str::chars(s)) == 4u); + assert (str::eq(str::from_chars(str::chars(s)), s)); assert (str::char_at(s, 0u) == 'e'); assert (str::char_at(s, 1u) == 'é');