Skip to content

Commit 39cb5b1

Browse files
author
Jonathan S
committed
Switched to the two-way algorithm for string searching
test str::bench::bench_contains_bad_naive ... bench: 300 ns/iter (+/- 12) from 1309 ns/iter (+/- 36)
test str::bench::bench_contains_equal ... bench: 154 ns/iter (+/- 7) from 137 ns/iter (+/- 2)
test str::bench::bench_contains_short_long ... bench: 2998 ns/iter (+/- 74) from 5473 ns/iter (+/- 14)
test str::bench::bench_contains_short_short ... bench: 65 ns/iter (+/- 2) from 57 ns/iter (+/- 6)
1 parent 8a32a2a commit 39cb5b1

File tree

1 file changed

+206
-26
lines changed

1 file changed

+206
-26
lines changed

src/libcore/str.rs

Lines changed: 206 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,19 @@
1515
use mem;
1616
use char;
1717
use clone::Clone;
18+
use cmp;
1819
use cmp::{Eq, TotalEq};
1920
use container::Container;
2021
use default::Default;
2122
use iter::{Filter, Map, Iterator};
2223
use iter::{Rev, DoubleEndedIterator, ExactSize};
24+
use iter::range;
2325
use num::Saturating;
2426
use option::{None, Option, Some};
2527
use raw::Repr;
2628
use slice::{ImmutableVector, Vector};
2729
use slice;
30+
use uint;
2831

2932
/*
3033
Section: Creating a string
@@ -316,13 +319,207 @@ impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
316319
}
317320
}
318321

322+
/// The internal state of an iterator that searches for matches of a substring
323+
/// within a larger string using naive search
324+
#[deriving(Clone)]
325+
struct NaiveSearcher {
326+
position: uint
327+
}
328+
329+
impl NaiveSearcher {
330+
fn new() -> NaiveSearcher {
331+
NaiveSearcher { position: 0 }
332+
}
333+
334+
fn next(&mut self, haystack: &[u8], needle: &[u8]) -> Option<(uint, uint)> {
335+
while self.position + needle.len() <= haystack.len() {
336+
if haystack.slice(self.position, self.position + needle.len()) == needle {
337+
let matchPos = self.position;
338+
self.position += needle.len(); // add 1 for all matches
339+
return Some((matchPos, matchPos + needle.len()));
340+
} else {
341+
self.position += 1;
342+
}
343+
}
344+
None
345+
}
346+
}
347+
348+
/// The internal state of an iterator that searches for matches of a substring
349+
/// within a larger string using two-way search
350+
#[deriving(Clone)]
351+
struct TwoWaySearcher {
352+
// constants
353+
critPos: uint,
354+
period: uint,
355+
byteset: u64,
356+
357+
// variables
358+
position: uint,
359+
memory: uint
360+
}
361+
362+
impl TwoWaySearcher {
363+
fn new(needle: &[u8]) -> TwoWaySearcher {
364+
let (critPos1, period1) = TwoWaySearcher::maximal_suffix(needle, false);
365+
let (critPos2, period2) = TwoWaySearcher::maximal_suffix(needle, true);
366+
367+
let critPos;
368+
let period;
369+
if critPos1 > critPos2 {
370+
critPos = critPos1;
371+
period = period1;
372+
} else {
373+
critPos = critPos2;
374+
period = period2;
375+
}
376+
377+
let byteset = needle.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a);
378+
379+
if needle.slice_to(critPos) == needle.slice_from(needle.len() - critPos) {
380+
TwoWaySearcher {
381+
critPos: critPos,
382+
period: period,
383+
byteset: byteset,
384+
385+
position: 0,
386+
memory: 0
387+
}
388+
} else {
389+
TwoWaySearcher {
390+
critPos: critPos,
391+
period: cmp::max(critPos, needle.len() - critPos) + 1,
392+
byteset: byteset,
393+
394+
position: 0,
395+
memory: uint::MAX // Dummy value to signify that the period is long
396+
}
397+
}
398+
}
399+
400+
#[inline]
401+
fn next(&mut self, haystack: &[u8], needle: &[u8], longPeriod: bool) -> Option<(uint, uint)> {
402+
'search: loop {
403+
// Check that we have room to search in
404+
if self.position + needle.len() > haystack.len() {
405+
return None;
406+
}
407+
408+
// Quickly skip by large portions unrelated to our substring
409+
if (self.byteset >> (haystack[self.position + needle.len() - 1] & 0x3f)) & 1 == 0 {
410+
self.position += needle.len();
411+
continue 'search;
412+
}
413+
414+
// See if the right part of the needle matches
415+
let start = if longPeriod { self.critPos } else { cmp::max(self.critPos, self.memory) };
416+
for i in range(start, needle.len()) {
417+
if needle[i] != haystack[self.position + i] {
418+
self.position += i - self.critPos + 1;
419+
if !longPeriod {
420+
self.memory = 0;
421+
}
422+
continue 'search;
423+
}
424+
}
425+
426+
// See if the left part of the needle matches
427+
let start = if longPeriod { 0 } else { self.memory };
428+
for i in range(start, self.critPos).rev() {
429+
if needle[i] != haystack[self.position + i] {
430+
self.position += self.period;
431+
if !longPeriod {
432+
self.memory = needle.len() - self.period;
433+
}
434+
continue 'search;
435+
}
436+
}
437+
438+
// We have found a match!
439+
let matchPos = self.position;
440+
self.position += needle.len(); // add self.period for all matches
441+
if !longPeriod {
442+
self.memory = 0; // set to needle.len() - self.period for all matches
443+
}
444+
return Some((matchPos, matchPos + needle.len()));
445+
}
446+
}
447+
448+
#[inline]
449+
fn maximal_suffix(arr: &[u8], reversed: bool) -> (uint, uint) {
450+
let mut left = -1; // Corresponds to i in the paper
451+
let mut right = 0; // Corresponds to j in the paper
452+
let mut offset = 1; // Corresponds to k in the paper
453+
let mut period = 1; // Corresponds to p in the paper
454+
455+
while right + offset < arr.len() {
456+
let a;
457+
let b;
458+
if reversed {
459+
a = arr[left + offset];
460+
b = arr[right + offset];
461+
} else {
462+
a = arr[right + offset];
463+
b = arr[left + offset];
464+
}
465+
if a < b {
466+
// Suffix is smaller, period is entire prefix so far.
467+
right += offset;
468+
offset = 1;
469+
period = right - left;
470+
} else if a == b {
471+
// Advance through repetition of the current period.
472+
if offset == period {
473+
right += offset;
474+
offset = 1;
475+
} else {
476+
offset += 1;
477+
}
478+
} else {
479+
// Suffix is larger, start over from current location.
480+
left = right;
481+
right += 1;
482+
offset = 1;
483+
period = 1;
484+
}
485+
}
486+
(left + 1, period)
487+
}
488+
}
489+
490+
/// The internal state of an iterator that searches for matches of a substring
491+
/// within a larger string using a dynamically chosed search algorithm
492+
#[deriving(Clone)]
493+
enum Searcher {
494+
Naive(NaiveSearcher),
495+
TwoWay(TwoWaySearcher),
496+
TwoWayLong(TwoWaySearcher)
497+
}
498+
499+
impl Searcher {
500+
fn new(haystack: &[u8], needle: &[u8]) -> Searcher {
501+
// FIXME: Tune this.
502+
if needle.len() > haystack.len() - 20 {
503+
Naive(NaiveSearcher::new())
504+
} else {
505+
let searcher = TwoWaySearcher::new(needle);
506+
if searcher.memory == uint::MAX { // If the period is long
507+
TwoWayLong(searcher)
508+
} else {
509+
TwoWay(searcher)
510+
}
511+
}
512+
}
513+
}
514+
319515
/// An iterator over the start and end indices of the matches of a
320516
/// substring within a larger string
321517
#[deriving(Clone)]
322518
pub struct MatchIndices<'a> {
519+
// constants
323520
haystack: &'a str,
324521
needle: &'a str,
325-
position: uint,
522+
searcher: Searcher
326523
}
327524

328525
/// An iterator over the substrings of a string separated by a given
@@ -337,31 +534,14 @@ pub struct StrSplits<'a> {
337534
impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
338535
#[inline]
339536
fn next(&mut self) -> Option<(uint, uint)> {
340-
// See Issue #1932 for why this is a naive search
341-
let (h_len, n_len) = (self.haystack.len(), self.needle.len());
342-
let mut match_start = 0;
343-
let mut match_i = 0;
344-
345-
while self.position < h_len {
346-
if self.haystack[self.position] == self.needle[match_i] {
347-
if match_i == 0 { match_start = self.position; }
348-
match_i += 1;
349-
self.position += 1;
350-
351-
if match_i == n_len {
352-
// found a match!
353-
return Some((match_start, self.position));
354-
}
355-
} else {
356-
// failed match, backtrack
357-
if match_i > 0 {
358-
match_i = 0;
359-
self.position = match_start;
360-
}
361-
self.position += 1;
362-
}
537+
match self.searcher {
538+
Naive(ref mut searcher)
539+
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes()),
540+
TwoWay(ref mut searcher)
541+
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), false),
542+
TwoWayLong(ref mut searcher)
543+
=> searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), true)
363544
}
364-
None
365545
}
366546
}
367547

@@ -1581,7 +1761,7 @@ impl<'a> StrSlice<'a> for &'a str {
15811761
MatchIndices {
15821762
haystack: *self,
15831763
needle: sep,
1584-
position: 0
1764+
searcher: Searcher::new(self.as_bytes(), sep.as_bytes())
15851765
}
15861766
}
15871767

0 commit comments

Comments
 (0)