Skip to content

Commit 7ebae85

Browse files
committed
StrSearcher: Implement the full two way algorithm in reverse for rfind
Fix quadratic behavior in StrSearcher in reverse search with periodic needles.

This commit adds the missing pieces for the "short period" case in reverse search. The short case will show up when the needle is literally periodic, for example "abababab".

Two way uses a "critical factorization" of the needle: x = u v.

Searching matches v first, if mismatch at character k, skip k forward. Matching u, if mismatch, skip period(x) forward.

To avoid O(mn) behavior after mismatch in u, memorize the already matched prefix.

The short period case requires that |u| < period(x).

For the reverse search we need to compute a different critical factorization x = u' v' where |v'| < period(x), because we are searching for the reversed needle. A short v' also benefits the algorithm in general.

The reverse critical factorization is computed quickly by using the same maximal suffix algorithm, but terminating as soon as we have a location with local period equal to period(x).

This adds extra fields crit_pos_back and memory_back for the reverse case. The new overhead for TwoWaySearcher::new is low, and additionally I think the "short period" case is uncommon in many applications of string search.

The maximal_suffix methods were updated in documentation and the algorithms updated to not use !0 and wrapping add, variable left is now 1 larger, offset 1 smaller.

Use periodicity when computing byteset: in the periodic case, just iterate over one period instead of the whole needle.

Example before (rfind) after (twoway_rfind) benchmark shows the removal of quadratic behavior.

needle: "ab" * 100, haystack: ("bb" + "ab" * 100) * 100

```
test periodic::rfind ... bench: 1,926,595 ns/iter (+/- 11,390) = 10 MB/s
test periodic::twoway_rfind ... bench: 51,740 ns/iter (+/- 66) = 386 MB/s
```
1 parent c5a1d8c commit 7ebae85

File tree

1 file changed

+158
-49
lines changed

1 file changed

+158
-49
lines changed

src/libcore/str/pattern.rs

Lines changed: 158 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -676,8 +676,10 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
676676
if searcher.end == 0 {
677677
return SearchStep::Done;
678678
}
679+
let is_long = searcher.memory == usize::MAX;
679680
match searcher.next_back::<RejectAndMatch>(self.haystack.as_bytes(),
680-
self.needle.as_bytes())
681+
self.needle.as_bytes(),
682+
is_long)
681683
{
682684
SearchStep::Reject(mut a, b) => {
683685
// skip to next char boundary
@@ -706,8 +708,16 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
706708
}
707709
}
708710
StrSearcherImpl::TwoWay(ref mut searcher) => {
709-
searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
710-
self.needle.as_bytes())
711+
let is_long = searcher.memory == usize::MAX;
712+
if is_long {
713+
searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
714+
self.needle.as_bytes(),
715+
true)
716+
} else {
717+
searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
718+
self.needle.as_bytes(),
719+
false)
720+
}
711721
}
712722
}
713723
}
@@ -718,14 +728,21 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
718728
#[derive(Clone, Debug)]
719729
struct TwoWaySearcher {
720730
// constants
731+
/// critical factorization index
721732
crit_pos: usize,
733+
/// critical factorization index for reversed needle
734+
crit_pos_back: usize,
722735
period: usize,
736+
/// `byteset` is an extension (not part of the two way algorithm);
737+
/// it's a 64-bit "fingerprint" where each set bit `j` corresponds
738+
/// to a (byte & 63) == j present in the needle.
723739
byteset: u64,
724740

725741
// variables
726742
position: usize,
727743
end: usize,
728-
memory: usize
744+
memory: usize,
745+
memory_back: usize,
729746
}
730747

731748
/*
@@ -797,6 +814,9 @@ struct TwoWaySearcher {
797814
798815
The purpose of maximal_suffix is to find such a critical factorization.
799816
817+
If the period is short, compute another factorization x = u' v' to use
818+
for reverse search, chosen instead so that |v'| < period(x).
819+
800820
*/
801821
impl TwoWaySearcher {
802822
fn new(needle: &[u8], end: usize) -> TwoWaySearcher {
@@ -810,10 +830,6 @@ impl TwoWaySearcher {
810830
(crit_pos_true, period_true)
811831
};
812832

813-
// This isn't in the original algorithm, as far as I'm aware.
814-
let byteset = needle.iter()
815-
.fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a);
816-
817833
// A particularly readable explanation of what's going on here can be found
818834
// in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
819835
// see the code for "Algorithm CP" on p. 323.
@@ -824,27 +840,51 @@ impl TwoWaySearcher {
824840
// "Algorithm CP2", which is optimized for when the period of the needle
825841
// is large.
826842
if &needle[..crit_pos] == &needle[period.. period + crit_pos] {
827-
// short period case
843+
// short period case -- the period is exact
844+
let byteset = needle[..period].iter()
845+
.fold(0, |a, &b| (1 << (b & 0x3f)) | a);
846+
847+
// compute a separate critical factorization for the reversed needle
848+
// x = u' v' where |v'| < period(x).
849+
//
850+
// This is sped up by the period being known already.
851+
// Note that a case like x = "acba" may be factored exactly forwards
852+
// (crit_pos = 1, period = 3) while being factored with approximate
853+
// period in reverse (crit_pos = 2, period = 2). We use the given
854+
// reverse factorization but keep the exact period.
855+
let crit_pos_back = needle.len() - cmp::max(
856+
TwoWaySearcher::reverse_maximal_suffix(needle, period, false),
857+
TwoWaySearcher::reverse_maximal_suffix(needle, period, true));
858+
828859
TwoWaySearcher {
829860
crit_pos: crit_pos,
861+
crit_pos_back: crit_pos_back,
830862
period: period,
831863
byteset: byteset,
832864

833865
position: 0,
834866
end: end,
835-
memory: 0
867+
memory: 0,
868+
// memory_back: index into the needle after which we have already matched
869+
memory_back: needle.len(),
836870
}
837871
} else {
838-
// long period case
839-
// we have an approximation to the actual period, and don't use memory.
872+
// long period case -- we have an approximation to the actual period,
873+
// and don't use memorization.
874+
875+
let byteset = needle.iter()
876+
.fold(0, |a, &b| (1 << (b & 0x3f)) | a);
877+
840878
TwoWaySearcher {
841879
crit_pos: crit_pos,
880+
crit_pos_back: crit_pos,
842881
period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
843882
byteset: byteset,
844883

845884
position: 0,
846885
end: end,
847-
memory: usize::MAX // Dummy value to signify that the period is long
886+
memory: usize::MAX, // Dummy value to signify that the period is long
887+
memory_back: usize::MAX,
848888
}
849889
}
850890
}
@@ -926,19 +966,18 @@ impl TwoWaySearcher {
926966

927967
// Follows the ideas in `next()`.
928968
//
929-
// All the definitions are completely symmetrical, with period(x) = period(reverse(x))
969+
// The definitions are symmetrical, with period(x) = period(reverse(x))
930970
// and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
931-
// is a critical factorization, so is (reverse(v), reverse(u)). Similarly,
932-
// the "period" stored in self.period is the real period if long_period is
933-
// false, and so is still valid for a reversed needle, and if long_period is
934-
// true, all the algorithm requires is that self.period is less than or
935-
// equal to the real period, which must be true for the forward case anyway.
971+
// is a critical factorization, so is (reverse(v), reverse(u)).
972+
//
973+
// For the short period case, using memorization, we rely on |u| < period(x).
974+
// For this case we have computed a critical factorization x = u' v'
975+
// where |v'| < period(x) instead (field `crit_pos_back`).
936976
//
937977
// To search in reverse through the haystack, we search forward through
938-
// a reversed haystack with a reversed needle, and the above paragraph shows
939-
// that the precomputed parameters can be left alone.
978+
// a reversed haystack with a reversed needle, matching first u' and then v'.
940979
#[inline]
941-
fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8])
980+
fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool)
942981
-> S::Output
943982
where S: TwoWayStrategy
944983
{
@@ -959,21 +998,34 @@ impl TwoWaySearcher {
959998
// Quickly skip by large portions unrelated to our substring
960999
if !self.byteset_contains(haystack[self.end - needle.len()]) {
9611000
self.end -= needle.len();
1001+
if !long_period {
1002+
self.memory_back = needle.len();
1003+
}
9621004
continue 'search;
9631005
}
9641006

9651007
// See if the left part of the needle matches
966-
for i in (0..self.crit_pos).rev() {
1008+
let crit = if long_period { self.crit_pos_back }
1009+
else { cmp::min(self.crit_pos_back, self.memory_back) };
1010+
for i in (0..crit).rev() {
9671011
if needle[i] != haystack[self.end - needle.len() + i] {
968-
self.end -= self.crit_pos - i;
1012+
self.end -= self.crit_pos_back - i;
1013+
if !long_period {
1014+
self.memory_back = needle.len();
1015+
}
9691016
continue 'search;
9701017
}
9711018
}
9721019

9731020
// See if the right part of the needle matches
974-
for i in self.crit_pos..needle.len() {
1021+
let needle_end = if long_period { needle.len() }
1022+
else { self.memory_back };
1023+
for i in self.crit_pos_back..needle_end {
9751024
if needle[i] != haystack[self.end - needle.len() + i] {
9761025
self.end -= self.period;
1026+
if !long_period {
1027+
self.memory_back = self.period;
1028+
}
9771029
continue 'search;
9781030
}
9791031
}
@@ -982,53 +1034,110 @@ impl TwoWaySearcher {
9821034
let match_pos = self.end - needle.len();
9831035
// Note: sub self.period instead of needle.len() to have overlapping matches
9841036
self.end -= needle.len();
1037+
if !long_period {
1038+
self.memory_back = needle.len();
1039+
}
9851040

9861041
return S::matching(match_pos, match_pos + needle.len());
9871042
}
9881043
}
9891044

990-
// Computes a critical factorization (u, v) of `arr`.
991-
// Specifically, returns (i, p), where i is the starting index of v in some
992-
// critical factorization (u, v) and p = period(v)
1045+
// Compute the maximal suffix of `arr`.
1046+
//
1047+
// The maximal suffix is a possible critical factorization (u, v) of `arr`.
1048+
//
1049+
// Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
1050+
// period of v.
1051+
//
1052+
// `order_greater` determines if lexical order is `<` or `>`. Both
1053+
// orders must be computed -- the ordering with the largest `i` gives
1054+
// a critical factorization.
1055+
//
1056+
// For long period cases, the resulting period is not exact (it is too short).
9931057
#[inline]
994-
fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) {
995-
let mut left: usize = !0; // Corresponds to i in the paper
996-
let mut right = 0; // Corresponds to j in the paper
997-
let mut offset = 1; // Corresponds to k in the paper
1058+
fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) {
1059+
let mut left = 0; // Corresponds to i in the paper
1060+
let mut right = 1; // Corresponds to j in the paper
1061+
let mut offset = 0; // Corresponds to k in the paper
9981062
let mut period = 1; // Corresponds to p in the paper
9991063

1000-
while right + offset < arr.len() {
1001-
let a;
1002-
let b;
1003-
if reversed {
1004-
a = arr[left.wrapping_add(offset)];
1005-
b = arr[right + offset];
1064+
while let Some(&a) = arr.get(right + offset) {
1065+
// `left` will be in bounds when `right` is.
1066+
let b = arr[left + offset];
1067+
if (a < b && !order_greater) || (a > b && order_greater) {
1068+
// Suffix is smaller, period is entire prefix so far.
1069+
right += offset + 1;
1070+
offset = 0;
1071+
period = right - left;
1072+
} else if a == b {
1073+
// Advance through repetition of the current period.
1074+
if offset + 1 == period {
1075+
right += offset + 1;
1076+
offset = 0;
1077+
} else {
1078+
offset += 1;
1079+
}
10061080
} else {
1007-
a = arr[right + offset];
1008-
b = arr[left.wrapping_add(offset)];
1081+
// Suffix is larger, start over from current location.
1082+
left = right;
1083+
right += 1;
1084+
offset = 0;
1085+
period = 1;
10091086
}
1010-
if a < b {
1087+
}
1088+
(left, period)
1089+
}
1090+
1091+
// Compute the maximal suffix of the reverse of `arr`.
1092+
//
1093+
// The maximal suffix is a possible critical factorization (u', v') of `arr`.
1094+
//
1095+
// Returns `i` where `i` is the starting index of v', from the back;
1096+
// returns immediately when a period of `known_period` is reached.
1097+
//
1098+
// `order_greater` determines if lexical order is `<` or `>`. Both
1099+
// orders must be computed -- the ordering with the largest `i` gives
1100+
// a critical factorization.
1101+
//
1102+
// For long period cases, the resulting period is not exact (it is too short).
1103+
fn reverse_maximal_suffix(arr: &[u8], known_period: usize,
1104+
order_greater: bool) -> usize
1105+
{
1106+
let mut left = 0; // Corresponds to i in the paper
1107+
let mut right = 1; // Corresponds to j in the paper
1108+
let mut offset = 0; // Corresponds to k in the paper
1109+
let mut period = 1; // Corresponds to p in the paper
1110+
let n = arr.len();
1111+
1112+
while right + offset < n {
1113+
let a = arr[n - (1 + right + offset)];
1114+
let b = arr[n - (1 + left + offset)];
1115+
if (a < b && !order_greater) || (a > b && order_greater) {
10111116
// Suffix is smaller, period is entire prefix so far.
1012-
right += offset;
1013-
offset = 1;
1014-
period = right.wrapping_sub(left);
1117+
right += offset + 1;
1118+
offset = 0;
1119+
period = right - left;
10151120
} else if a == b {
10161121
// Advance through repetition of the current period.
1017-
if offset == period {
1018-
right += offset;
1019-
offset = 1;
1122+
if offset + 1 == period {
1123+
right += offset + 1;
1124+
offset = 0;
10201125
} else {
10211126
offset += 1;
10221127
}
10231128
} else {
10241129
// Suffix is larger, start over from current location.
10251130
left = right;
10261131
right += 1;
1027-
offset = 1;
1132+
offset = 0;
10281133
period = 1;
10291134
}
1135+
if period == known_period {
1136+
break;
1137+
}
10301138
}
1031-
(left.wrapping_add(1), period)
1139+
debug_assert!(period <= known_period);
1140+
left
10321141
}
10331142
}
10341143

0 commit comments

Comments
 (0)