From 8450bc3e3b9c5cd58b7d62350a0f543edaa6f037 Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Mon, 30 Jan 2012 19:52:38 -0800 Subject: [PATCH 1/7] Rename str::from_byte(s) to str::unsafe::from_byte(s), mark them as unsafe, make comp/driver/driver.rs use str::from_bytes... --- src/comp/driver/driver.rs | 2 +- src/libcore/str.rs | 87 +++++++++++++++++++++------------------ 2 files changed, 47 insertions(+), 42 deletions(-) diff --git a/src/comp/driver/driver.rs b/src/comp/driver/driver.rs index 78ec0330dd96a..c138ef7085546 100644 --- a/src/comp/driver/driver.rs +++ b/src/comp/driver/driver.rs @@ -80,7 +80,7 @@ fn parse_input(sess: session, cfg: ast::crate_cfg, input: str) if !input_is_stdin(input) { parser::parse_crate_from_file(input, cfg, sess.parse_sess) } else { - let src = @str::unsafe_from_bytes(io::stdin().read_whole_stream()); + let src = @str::from_bytes(io::stdin().read_whole_stream()); parser::parse_crate_from_source_str(input, src, cfg, sess.parse_sess) } } diff --git a/src/libcore/str.rs b/src/libcore/str.rs index dad10eced7f2b..5ca66e427ea5b 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -12,9 +12,7 @@ For some heavy-duty uses, we recommend trying std::rope. export // Creating a string from_bytes, - unsafe_from_bytes, from_byte, - unsafe_from_byte, //push_utf8_bytes, from_char, from_chars, @@ -120,37 +118,11 @@ Function: from_bytes Convert a vector of bytes to a UTF-8 string. Fails if invalid UTF-8. */ -fn from_bytes(vv: [u8]) -> str { +fn from_bytes(vv: [u8]) -> str unsafe { assert is_utf8(vv); - ret unsafe_from_bytes(vv); + ret unsafe::from_bytes(vv); } -/* -Function: unsafe_from_bytes - -Converts a vector of bytes to a string. Does not verify that the -vector contains valid UTF-8. 
- -FIXME: stop exporting -*/ -fn unsafe_from_bytes(v: [const u8]) -> str unsafe { - let vcopy: [u8] = v + [0u8]; - let scopy: str = unsafe::reinterpret_cast(vcopy); - unsafe::leak(vcopy); - ret scopy; -} - -/* -Function: unsafe_from_byte - -Converts a byte to a string. Does not verify that the byte is -valid UTF-8. - -FIXME: stop exporting -*/ -fn unsafe_from_byte(u: u8) -> str { unsafe_from_bytes([u]) } - - /* Function: from_byte @@ -211,6 +183,7 @@ fn from_chars(chs: [char]) -> str { ret buf; } +// FIXME: not unsafe now /* Function: from_cstr @@ -413,9 +386,9 @@ Converts a string to a vector of bytes. The result vector is not null-terminated. */ fn bytes(s: str) -> [u8] unsafe { - let v = unsafe::reinterpret_cast(s); + let v = ::unsafe::reinterpret_cast(s); let vcopy = vec::slice(v, 0u, vec::len(v) - 1u); - unsafe::leak(v); + ::unsafe::leak(v); ret vcopy; } @@ -492,12 +465,12 @@ fn slice(s: str, begin: uint, end: uint) -> str unsafe { assert (begin <= end); assert (end <= byte_len(s)); - let v: [u8] = unsafe::reinterpret_cast(s); + let v: [u8] = ::unsafe::reinterpret_cast(s); let v2 = vec::slice(v, begin, end); - unsafe::leak(v); + ::unsafe::leak(v); v2 += [0u8]; - let s2: str = unsafe::reinterpret_cast(v2); - unsafe::leak(v2); + let s2: str = ::unsafe::reinterpret_cast(v2); + ::unsafe::leak(v2); ret s2; } @@ -1093,9 +1066,9 @@ Returns the length in bytes of a string FIXME: rename to 'len_bytes'? */ pure fn byte_len(s: str) -> uint unsafe { - let v: [u8] = unsafe::reinterpret_cast(s); + let v: [u8] = ::unsafe::reinterpret_cast(s); let vlen = vec::len(v); - unsafe::leak(v); + ::unsafe::leak(v); // There should always be a null terminator assert (vlen > 0u); ret vlen - 1u; @@ -1371,7 +1344,7 @@ const tag_six_b: uint = 252u; // no guarantee that the string is rooted). Instead, use as_buf below. 
unsafe fn buf(s: str) -> sbuf { let saddr = ptr::addr_of(s); - let vaddr: *[u8] = unsafe::reinterpret_cast(saddr); + let vaddr: *[u8] = ::unsafe::reinterpret_cast(saddr); let buf = vec::to_ptr(*vaddr); ret buf; } @@ -1398,6 +1371,38 @@ An unsafe buffer of bytes. Corresponds to a C char pointer. */ type sbuf = *u8; +mod unsafe { + export + // UNSAFE + from_bytes, + from_byte; + + /* + Function: unsafe::from_bytes + + Converts a vector of bytes to a string. Does not verify that the + vector contains valid UTF-8. + + FIXME: stop exporting + */ + unsafe fn from_bytes(v: [const u8]) -> str unsafe { + let vcopy: [u8] = v + [0u8]; + let scopy: str = ::unsafe::reinterpret_cast(vcopy); + ::unsafe::leak(vcopy); + ret scopy; + } + + /* + Function: unsafe::from_byte + + Converts a byte to a string. Does not verify that the byte is + valid UTF-8. + + FIXME: stop exporting + */ + unsafe fn from_byte(u: u8) -> str { unsafe::from_bytes([u]) } +} + #[cfg(test)] mod tests { @@ -1771,9 +1776,9 @@ mod tests { } #[test] - fn test_unsafe_from_bytes() { + fn test_unsafe_from_bytes() unsafe { let a = [65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8]; - let b = unsafe_from_bytes(a); + let b = unsafe::from_bytes(a); assert (b == "AAAAAAA"); } From bcbf42159649c12eb6dc6abc050d3e82da6e9cbf Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Mon, 30 Jan 2012 20:06:38 -0800 Subject: [PATCH 2/7] Comment fixes in str --- src/libcore/str.rs | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index 5ca66e427ea5b..7c588b139708c 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -183,7 +183,6 @@ fn from_chars(chs: [char]) -> str { ret buf; } -// FIXME: not unsafe now /* Function: from_cstr @@ -1371,20 +1370,19 @@ An unsafe buffer of bytes. Corresponds to a C char pointer. */ type sbuf = *u8; +// Module: unsafe +// +// These functions may create invalid UTF-8 strings and eat your baby. 
mod unsafe { export // UNSAFE from_bytes, from_byte; - /* - Function: unsafe::from_bytes - - Converts a vector of bytes to a string. Does not verify that the - vector contains valid UTF-8. - - FIXME: stop exporting - */ + // Function: unsafe::from_bytes + // + // Converts a vector of bytes to a string. Does not verify that the + // vector contains valid UTF-8. unsafe fn from_bytes(v: [const u8]) -> str unsafe { let vcopy: [u8] = v + [0u8]; let scopy: str = ::unsafe::reinterpret_cast(vcopy); @@ -1392,14 +1390,10 @@ mod unsafe { ret scopy; } - /* - Function: unsafe::from_byte - - Converts a byte to a string. Does not verify that the byte is - valid UTF-8. - - FIXME: stop exporting - */ + // Function: unsafe::from_byte + // + // Converts a byte to a string. Does not verify that the byte is + // valid UTF-8. unsafe fn from_byte(u: u8) -> str { unsafe::from_bytes([u]) } } From 9dbb1e32e476de423b71e3ab8a200bf543e09bfc Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Mon, 30 Jan 2012 20:27:16 -0800 Subject: [PATCH 3/7] Rename str::loop_chars to str::all, rename str::loop_chars_sub to str::substr_all, and propagate this change to std::rope and rustdoc's calls to these --- src/libcore/str.rs | 40 +++++++----------------------------- src/libstd/rope.rs | 4 ++-- src/rustdoc/unindent_pass.rs | 4 ++-- 3 files changed, 11 insertions(+), 37 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index 7c588b139708c..c05348b67e523 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -60,7 +60,6 @@ export hash, // Iterating through strings - loop_chars, all, any, map, @@ -94,7 +93,7 @@ export utf8_char_width, char_range_at, char_at, - loop_chars_sub, + substr_all, escape_char, as_buf, //buf, @@ -741,7 +740,7 @@ Escapes special characters inside the string, making it safe for transfer. 
*/ fn escape(s: str) -> str { let r = ""; - loop_chars(s, { |c| r += escape_char(c); true }); + all(s, { |c| r += escape_char(c); true }); r } @@ -781,37 +780,14 @@ fn hash(&&s: str) -> uint { Section: Iterating through strings */ -/* -Function: loop_chars - -Loop through a string, char by char - -Parameters: -s - A string to traverse. It may be empty. -it - A block to execute with each consecutive character of `s`. -Return `true` to continue, `false` to stop. - -Returns: - -`true` If execution proceeded correctly, `false` if it was interrupted, -that is if `it` returned `false` at any point. - -FIXME: rename to 'chars_loop' (change? currently a synonym to 'all') - */ -fn loop_chars(s: str, it: fn(char) -> bool) -> bool{ - ret loop_chars_sub(s, 0u, byte_len(s), it); -} - /* Function: all Return true if a predicate matches all characters or if the string contains no characters - -// FIXME: a synonym to loop_chars */ -fn all(ss: str, ff: fn(char) -> bool) -> bool { - str::loop_chars(ss, ff) +fn all(s: str, it: fn(char) -> bool) -> bool{ + ret substr_all(s, 0u, byte_len(s), it); } /* @@ -1054,7 +1030,7 @@ Function: is_whitespace Returns true if the string contains only whitespace */ fn is_whitespace(s: str) -> bool { - ret loop_chars(s, char::is_whitespace); + ret all(s, char::is_whitespace); } /* @@ -1270,7 +1246,7 @@ Pluck a character out of a string fn char_at(s: str, i: uint) -> char { ret char_range_at(s, i).ch; } /* -Function: loop_chars_sub +Function: substr_all Loop through a substring, char by char @@ -1290,10 +1266,8 @@ Safety note: - This function does not check whether the substring is valid. 
- This function fails if `byte_offset` or `byte_len` do not represent valid positions inside `s` - -FIXME: rename to 'substr_all' */ -fn loop_chars_sub(s: str, byte_offset: uint, byte_len: uint, +fn substr_all(s: str, byte_offset: uint, byte_len: uint, it: fn(char) -> bool) -> bool { let i = byte_offset; let result = true; diff --git a/src/libstd/rope.rs b/src/libstd/rope.rs index 5e856ff5e0e0b..b181ba7475ca1 100644 --- a/src/libstd/rope.rs +++ b/src/libstd/rope.rs @@ -1137,7 +1137,7 @@ mod node { fn loop_chars(node: @node, it: fn(char) -> bool) -> bool { ret loop_leaves(node, {|leaf| - ret str::loop_chars_sub(*leaf.content, + ret str::substr_all(*leaf.content, leaf.byte_offset, leaf.byte_len, it) }) @@ -1494,4 +1494,4 @@ mod tests { assert eq(r, r2); } -} \ No newline at end of file +} diff --git a/src/rustdoc/unindent_pass.rs b/src/rustdoc/unindent_pass.rs index ea439fd94c470..8d6aec7330039 100644 --- a/src/rustdoc/unindent_pass.rs +++ b/src/rustdoc/unindent_pass.rs @@ -47,7 +47,7 @@ fn unindent(s: str) -> str { } else { saw_first_line = true; let spaces = 0u; - str::loop_chars(line) {|char| + str::all(line) {|char| // Only comparing against space because I wouldn't // know what to do with mixed whitespace chars if char == ' ' { @@ -117,4 +117,4 @@ fn should_not_ignore_first_line_indent_in_a_single_line_para() { let s = "line1\n\n line2"; let r = unindent(s); assert r == "line1\n\n line2"; -} \ No newline at end of file +} From 833b94d018b89224ac99d77f7302ea861c42cc94 Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Mon, 30 Jan 2012 20:44:48 -0800 Subject: [PATCH 4/7] Rename str::to_chars -> str::chars --- src/libcore/str.rs | 19 +++++++++++++------ src/test/run-pass/utf8_chars.rs | 4 ++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index c05348b67e523..baf7737ca8cce 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -35,7 +35,7 @@ export // Transforming strings bytes, - to_chars, + chars, 
substr, char_slice, slice, @@ -341,7 +341,7 @@ fn trim_left(s: str) -> str { } ret i; } - let chars = to_chars(s); + let chars = chars(s); let whities = count_whities(chars); ret from_chars(vec::slice(chars, whities, vec::len(chars))); } @@ -360,7 +360,7 @@ fn trim_right(s: str) -> str { } ret i; } - let chars = to_chars(s); + let chars = chars(s); let whities = count_whities(chars); ret from_chars(vec::slice(chars, 0u, whities)); } @@ -391,13 +391,13 @@ fn bytes(s: str) -> [u8] unsafe { } /* -Function: to_chars +Function: chars Convert a string to a vector of characters FIXME: rename to 'chars' */ -fn to_chars(s: str) -> [char] { +fn chars(s: str) -> [char] { let buf: [char] = []; let i = 0u; let len = byte_len(s); @@ -440,7 +440,7 @@ Failure: FIXME: rename to slice(), make faster by avoiding char conversion */ fn char_slice(s: str, begin: uint, end: uint) -> str { - from_chars(vec::slice(to_chars(s), begin, end)) + from_chars(vec::slice(chars(s), begin, end)) } /* @@ -1978,4 +1978,11 @@ mod tests { fn test_windowed_() { let _x = windowed(0u, "abcd"); } + + #[test] + fn test_chars() { + let ss = "ศไทย中华Việt Nam"; + assert ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'] + == chars(ss); + } } diff --git a/src/test/run-pass/utf8_chars.rs b/src/test/run-pass/utf8_chars.rs index 67e3dbcb46cfe..7b4e9bf6d1c4d 100644 --- a/src/test/run-pass/utf8_chars.rs +++ b/src/test/run-pass/utf8_chars.rs @@ -9,8 +9,8 @@ fn main() { assert (str::byte_len(s) == 10u); assert (str::char_len(s) == 4u); - assert (vec::len::(str::to_chars(s)) == 4u); - assert (str::eq(str::from_chars(str::to_chars(s)), s)); + assert (vec::len::(str::chars(s)) == 4u); + assert (str::eq(str::from_chars(str::chars(s)), s)); assert (str::char_at(s, 0u) == 'e'); assert (str::char_at(s, 1u) == 'é'); From f944c8631324f2e41fd6d0d6838cbbf150496056 Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Mon, 30 Jan 2012 20:55:17 -0800 Subject: [PATCH 5/7] Rename str::iter_chars -> str::chars_iter --- 
src/libcore/str.rs | 41 ++++++----------------------------------- 1 file changed, 6 insertions(+), 35 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index baf7737ca8cce..cc5de8e91f999 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -64,7 +64,6 @@ export any, map, bytes_iter, - iter_chars, chars_iter, words_iter, lines_iter, @@ -591,7 +590,7 @@ fn split_func(ss: str, sepfn: fn(cc: char)->bool) -> [str] { let accum: str = ""; let ends_with_sep: bool = false; - str::iter_chars(ss, {|cc| if sepfn(cc) { + chars_iter(ss, {|cc| if sepfn(cc) { vv += [accum]; accum = ""; ends_with_sep = true; @@ -678,7 +677,7 @@ FIXME: rewrite with map */ fn to_lower(s: str) -> str { let outstr = ""; - iter_chars(s) { |c| + chars_iter(s) { |c| push_char(outstr, char::to_lower(c)); } ret outstr; @@ -693,7 +692,7 @@ FIXME: rewrite with map */ fn to_upper(s: str) -> str { let outstr = ""; - iter_chars(s) { |c| + chars_iter(s) { |c| push_char(outstr, char::to_upper(c)); } ret outstr; @@ -808,7 +807,7 @@ Apply a function to each character fn map(ss: str, ff: fn(char) -> char) -> str { let result = ""; - str::iter_chars(ss, {|cc| + chars_iter(ss, {|cc| str::push_char(result, ff(cc)); }); @@ -833,13 +832,11 @@ fn bytes_iter(ss: str, it: fn(u8)) { } /* -Function: iter_chars +Function: chars_iter Iterate over the characters in a string - -FIXME: rename to 'chars_iter' */ -fn iter_chars(s: str, it: fn(char)) { +fn chars_iter(s: str, it: fn(char)) { let pos = 0u, len = byte_len(s); while (pos < len) { let {ch, next} = char_range_at(s, pos); @@ -848,17 +845,6 @@ fn iter_chars(s: str, it: fn(char)) { } } -/* -Function: chars_iter - -Iterate over the characters in a string - -FIXME: A synonym to iter_chars -*/ -fn chars_iter(ss: str, it: fn(char)) { - iter_chars(ss, it) -} - /* Function: words_iter @@ -1845,21 +1831,6 @@ mod tests { assert !contains("", "a"); } - #[test] - fn test_iter_chars() { - let i = 0; - iter_chars("x\u03c0y") {|ch| - alt i { - 0 { assert ch == 'x'; } 
- 1 { assert ch == '\u03c0'; } - 2 { assert ch == 'y'; } - } - i += 1; - } - - iter_chars("") {|_ch| fail; } // should not fail - } - #[test] fn test_chars_iter() { let i = 0; From 6d670f5a30ba56895b2dcdd631453bcc7cef663b Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Mon, 30 Jan 2012 22:14:07 -0800 Subject: [PATCH 6/7] Add str::split_chars_iter and str::splitn_chars_iter --- src/libcore/str.rs | 64 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 5 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index cc5de8e91f999..1c7fe1a9ced9e 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -65,6 +65,8 @@ export map, bytes_iter, chars_iter, + split_chars_iter, + splitn_chars_iter, words_iter, lines_iter, @@ -818,8 +820,6 @@ fn map(ss: str, ff: fn(char) -> char) -> str { Function: bytes_iter Iterate over the bytes in a string - -FIXME: Should it really include the last byte '\0'? */ fn bytes_iter(ss: str, it: fn(u8)) { let pos = 0u; @@ -845,6 +845,28 @@ fn chars_iter(s: str, it: fn(char)) { } } +/* +Function: split_chars_iter + +Apply a function to each substring after splitting +by character +*/ +fn split_chars_iter(ss: str, cc: char, ff: fn(&&str)) { + vec::iter(split_char(ss, cc), ff) +} + +/* +Function: splitn_chars_iter + +Apply a function to each substring after splitting +by the byte `sep`, at most `count` times + +FIXME: make this use chars when splitn/splitn_char is fixed +*/ +fn splitn_chars_iter(ss: str, sep: u8, count: uint, ff: fn(&&str)) { + vec::iter(splitn(ss, sep, count), ff) +} + /* Function: words_iter @@ -863,9 +885,6 @@ fn lines_iter(ss: str, ff: fn(&&str)) { vec::iter(lines(ss), ff) } -// FIXME: ADD split_char_iter -// FIXME: ADD splitn_char_iter - /* Section: Searching */ @@ -1862,6 +1881,41 @@ mod tests { bytes_iter("") {|bb| assert bb == 0u8; } } + #[test] + fn test_split_chars_iter() { + let data = "\nMary had a little lamb\nLittle lamb\n"; + + let ii = 0; + + split_chars_iter(data, ' ') {|xx| + alt ii 
{ + 0 { assert "\nMary" == xx; } + 1 { assert "had" == xx; } + 2 { assert "a" == xx; } + 3 { assert "little" == xx; } + _ { () } + } + ii += 1; + } + } + + #[test] + fn test_splitn_chars_iter() { + let data = "\nMary had a little lamb\nLittle lamb\n"; + + let ii = 0; + + splitn_chars_iter(data, ' ' as u8, 2u) {|xx| + alt ii { + 0 { assert "\nMary" == xx; } + 1 { assert "had" == xx; } + 2 { assert "a little lamb\nLittle lamb\n" == xx; } + _ { () } + } + ii += 1; + } + } + #[test] fn test_words_iter() { let data = "\nMary had a little lamb\nLittle lamb\n"; From 9351e5ecf2b95ca0af885869be6b080055ddf838 Mon Sep 17 00:00:00 2001 From: Kevin Cantu Date: Mon, 30 Jan 2012 22:26:42 -0800 Subject: [PATCH 7/7] Re-implementing str::to_upper and str::to_lower using str::map --- src/libcore/str.rs | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/src/libcore/str.rs b/src/libcore/str.rs index 1c7fe1a9ced9e..9f40dd5c3a48c 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -674,30 +674,18 @@ fn windowed(nn: uint, ss: str) -> [str] { Function: to_lower Convert a string to lowercase - -FIXME: rewrite with map */ fn to_lower(s: str) -> str { - let outstr = ""; - chars_iter(s) { |c| - push_char(outstr, char::to_lower(c)); - } - ret outstr; + map(s, char::to_lower) } /* Function: to_upper Convert a string to uppercase - -FIXME: rewrite with map */ fn to_upper(s: str) -> str { - let outstr = ""; - chars_iter(s) { |c| - push_char(outstr, char::to_upper(c)); - } - ret outstr; + map(s, char::to_upper) } // FIXME: This is super-inefficient @@ -1583,8 +1571,9 @@ mod tests { #[test] fn test_to_upper() { - // to_upper doesn't understand unicode yet, - // but we need to at least preserve it + // char::to_upper, and hence str::to_upper + // are culturally insensitive: I'm not sure they + // really work for anything but English ASCII, but YMMV let unicode = "\u65e5\u672c"; let input = "abcDEF" + unicode + "xyz:.;"; @@ -1593,6 +1582,12 @@ mod 
tests { assert (eq(expected, actual)); } + #[test] + fn test_to_lower() { // call str::to_lower itself so the map-based rewrite is actually covered + assert "" == to_lower(""); + assert "ymca" == to_lower("YMCA"); + } + #[test] fn test_slice() { assert (eq("ab", slice("abc", 0u, 2u)));