@@ -676,8 +676,10 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
676
676
if searcher. end == 0 {
677
677
return SearchStep :: Done ;
678
678
}
679
+ let is_long = searcher. memory == usize:: MAX ;
679
680
match searcher. next_back :: < RejectAndMatch > ( self . haystack . as_bytes ( ) ,
680
- self . needle . as_bytes ( ) )
681
+ self . needle . as_bytes ( ) ,
682
+ is_long)
681
683
{
682
684
SearchStep :: Reject ( mut a, b) => {
683
685
// skip to next char boundary
@@ -706,8 +708,16 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
706
708
}
707
709
}
708
710
StrSearcherImpl :: TwoWay ( ref mut searcher) => {
709
- searcher. next_back :: < MatchOnly > ( self . haystack . as_bytes ( ) ,
710
- self . needle . as_bytes ( ) )
711
+ let is_long = searcher. memory == usize:: MAX ;
712
+ if is_long {
713
+ searcher. next_back :: < MatchOnly > ( self . haystack . as_bytes ( ) ,
714
+ self . needle . as_bytes ( ) ,
715
+ true )
716
+ } else {
717
+ searcher. next_back :: < MatchOnly > ( self . haystack . as_bytes ( ) ,
718
+ self . needle . as_bytes ( ) ,
719
+ false )
720
+ }
711
721
}
712
722
}
713
723
}
@@ -718,14 +728,21 @@ unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
718
728
#[ derive( Clone , Debug ) ]
719
729
struct TwoWaySearcher {
720
730
// constants
731
+ /// critical factorization index
721
732
crit_pos : usize ,
733
+ /// critical factorization index for reversed needle
734
+ crit_pos_back : usize ,
722
735
period : usize ,
736
+ /// `byteset` is an extension (not part of the two way algorithm);
737
+ /// it's a 64-bit "fingerprint" where each set bit `j` corresponds
738
+ /// to a (byte & 63) == j present in the needle.
723
739
byteset : u64 ,
724
740
725
741
// variables
726
742
position : usize ,
727
743
end : usize ,
728
- memory : usize
744
+ memory : usize ,
745
+ memory_back : usize ,
729
746
}
730
747
731
748
/*
@@ -797,6 +814,9 @@ struct TwoWaySearcher {
797
814
798
815
The purpose of maximal_suffix is to find such a critical factorization.
799
816
817
+ If the period is short, compute another factorization x = u' v' to use
818
+ for reverse search, chosen instead so that |v'| < period(x).
819
+
800
820
*/
801
821
impl TwoWaySearcher {
802
822
fn new ( needle : & [ u8 ] , end : usize ) -> TwoWaySearcher {
@@ -810,10 +830,6 @@ impl TwoWaySearcher {
810
830
( crit_pos_true, period_true)
811
831
} ;
812
832
813
- // This isn't in the original algorithm, as far as I'm aware.
814
- let byteset = needle. iter ( )
815
- . fold ( 0 , |a, & b| ( 1 << ( ( b & 0x3f ) as usize ) ) | a) ;
816
-
817
833
// A particularly readable explanation of what's going on here can be found
818
834
// in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
819
835
// see the code for "Algorithm CP" on p. 323.
@@ -824,27 +840,51 @@ impl TwoWaySearcher {
824
840
// "Algorithm CP2", which is optimized for when the period of the needle
825
841
// is large.
826
842
if & needle[ ..crit_pos] == & needle[ period.. period + crit_pos] {
827
- // short period case
843
+ // short period case -- the period is exact
844
+ let byteset = needle[ ..period] . iter ( )
845
+ . fold ( 0 , |a, & b| ( 1 << ( b & 0x3f ) ) | a) ;
846
+
847
+ // compute a separate critical factorization for the reversed needle
848
+ // x = u' v' where |v'| < period(x).
849
+ //
850
+ // This is sped up by the period being known already.
851
+ // Note that a case like x = "acba" may be factored exactly forwards
852
+ // (crit_pos = 1, period = 3) while being factored with approximate
853
+ // period in reverse (crit_pos = 2, period = 2). We use the given
854
+ // reverse factorization but keep the exact period.
855
+ let crit_pos_back = needle. len ( ) - cmp:: max (
856
+ TwoWaySearcher :: reverse_maximal_suffix ( needle, period, false ) ,
857
+ TwoWaySearcher :: reverse_maximal_suffix ( needle, period, true ) ) ;
858
+
828
859
TwoWaySearcher {
829
860
crit_pos : crit_pos,
861
+ crit_pos_back : crit_pos_back,
830
862
period : period,
831
863
byteset : byteset,
832
864
833
865
position : 0 ,
834
866
end : end,
835
- memory : 0
867
+ memory : 0 ,
868
+ // `memory_back` is the needle index from which onward the bytes are already known to match
869
+ memory_back : needle. len ( ) ,
836
870
}
837
871
} else {
838
- // long period case
839
- // we have an approximation to the actual period, and don't use memory.
872
+ // long period case -- we have an approximation to the actual period,
873
+ // and don't use memorization.
874
+
875
+ let byteset = needle. iter ( )
876
+ . fold ( 0 , |a, & b| ( 1 << ( b & 0x3f ) ) | a) ;
877
+
840
878
TwoWaySearcher {
841
879
crit_pos : crit_pos,
880
+ crit_pos_back : crit_pos,
842
881
period : cmp:: max ( crit_pos, needle. len ( ) - crit_pos) + 1 ,
843
882
byteset : byteset,
844
883
845
884
position : 0 ,
846
885
end : end,
847
- memory : usize:: MAX // Dummy value to signify that the period is long
886
+ memory : usize:: MAX , // Dummy value to signify that the period is long
887
+ memory_back : usize:: MAX ,
848
888
}
849
889
}
850
890
}
@@ -926,19 +966,18 @@ impl TwoWaySearcher {
926
966
927
967
// Follows the ideas in `next()`.
928
968
//
929
- // All the definitions are completely symmetrical, with period(x) = period(reverse(x))
969
+ // The definitions are symmetrical, with period(x) = period(reverse(x))
930
970
// and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
931
- // is a critical factorization, so is (reverse(v), reverse(u)). Similarly,
932
- // the "period" stored in self.period is the real period if long_period is
933
- // false, and so is still valid for a reversed needle, and if long_period is
934
- // true, all the algorithm requires is that self.period is less than or
935
- // equal to the real period, which must be true for the forward case anyway .
971
+ // is a critical factorization, so is (reverse(v), reverse(u)).
972
+ //
973
+ // For the short period case, using memorization, we rely on |u| < period(x).
974
+ // For this case we have computed a critical factorization x = u' v'
975
+ // where |v'| < period(x) instead (field `crit_pos_back`) .
936
976
//
937
977
// To search in reverse through the haystack, we search forward through
938
- // a reversed haystack with a reversed needle, and the above paragraph shows
939
- // that the precomputed parameters can be left alone.
978
+ // a reversed haystack with a reversed needle, matching first u' and then v'.
940
979
#[ inline]
941
- fn next_back < S > ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] )
980
+ fn next_back < S > ( & mut self , haystack : & [ u8 ] , needle : & [ u8 ] , long_period : bool )
942
981
-> S :: Output
943
982
where S : TwoWayStrategy
944
983
{
@@ -959,21 +998,34 @@ impl TwoWaySearcher {
959
998
// Quickly skip by large portions unrelated to our substring
960
999
if !self . byteset_contains ( haystack[ self . end - needle. len ( ) ] ) {
961
1000
self . end -= needle. len ( ) ;
1001
+ if !long_period {
1002
+ self . memory_back = needle. len ( ) ;
1003
+ }
962
1004
continue ' search;
963
1005
}
964
1006
965
1007
// See if the left part of the needle matches
966
- for i in ( 0 ..self . crit_pos ) . rev ( ) {
1008
+ let crit = if long_period { self . crit_pos_back }
1009
+ else { cmp:: min ( self . crit_pos_back , self . memory_back ) } ;
1010
+ for i in ( 0 ..crit) . rev ( ) {
967
1011
if needle[ i] != haystack[ self . end - needle. len ( ) + i] {
968
- self . end -= self . crit_pos - i;
1012
+ self . end -= self . crit_pos_back - i;
1013
+ if !long_period {
1014
+ self . memory_back = needle. len ( ) ;
1015
+ }
969
1016
continue ' search;
970
1017
}
971
1018
}
972
1019
973
1020
// See if the right part of the needle matches
974
- for i in self . crit_pos ..needle. len ( ) {
1021
+ let needle_end = if long_period { needle. len ( ) }
1022
+ else { self . memory_back } ;
1023
+ for i in self . crit_pos_back ..needle_end {
975
1024
if needle[ i] != haystack[ self . end - needle. len ( ) + i] {
976
1025
self . end -= self . period ;
1026
+ if !long_period {
1027
+ self . memory_back = self . period ;
1028
+ }
977
1029
continue ' search;
978
1030
}
979
1031
}
@@ -982,53 +1034,110 @@ impl TwoWaySearcher {
982
1034
let match_pos = self . end - needle. len ( ) ;
983
1035
// Note: sub self.period instead of needle.len() to have overlapping matches
984
1036
self . end -= needle. len ( ) ;
1037
+ if !long_period {
1038
+ self . memory_back = needle. len ( ) ;
1039
+ }
985
1040
986
1041
return S :: matching ( match_pos, match_pos + needle. len ( ) ) ;
987
1042
}
988
1043
}
989
1044
990
- // Computes a critical factorization (u, v) of `arr`.
991
- // Specifically, returns (i, p), where i is the starting index of v in some
992
- // critical factorization (u, v) and p = period(v)
1045
+ // Compute the maximal suffix of `arr`.
1046
+ //
1047
+ // The maximal suffix is a possible critical factorization (u, v) of `arr`.
1048
+ //
1049
+ // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
1050
+ // period of v.
1051
+ //
1052
+ // `order_greater` determines if lexical order is `<` or `>`. Both
1053
+ // orders must be computed -- the ordering with the largest `i` gives
1054
+ // a critical factorization.
1055
+ //
1056
+ // For long period cases, the resulting period is not exact (it is too short).
993
1057
#[ inline]
994
- fn maximal_suffix ( arr : & [ u8 ] , reversed : bool ) -> ( usize , usize ) {
995
- let mut left: usize = ! 0 ; // Corresponds to i in the paper
996
- let mut right = 0 ; // Corresponds to j in the paper
997
- let mut offset = 1 ; // Corresponds to k in the paper
1058
+ fn maximal_suffix ( arr : & [ u8 ] , order_greater : bool ) -> ( usize , usize ) {
1059
+ let mut left = 0 ; // Corresponds to i in the paper
1060
+ let mut right = 1 ; // Corresponds to j in the paper
1061
+ let mut offset = 0 ; // Corresponds to k in the paper
998
1062
let mut period = 1 ; // Corresponds to p in the paper
999
1063
1000
- while right + offset < arr. len ( ) {
1001
- let a;
1002
- let b;
1003
- if reversed {
1004
- a = arr[ left. wrapping_add ( offset) ] ;
1005
- b = arr[ right + offset] ;
1064
+ while let Some ( & a) = arr. get ( right + offset) {
1065
+ // `left` will be inbounds when `right` is.
1066
+ let b = arr[ left + offset] ;
1067
+ if ( a < b && !order_greater) || ( a > b && order_greater) {
1068
+ // Suffix is smaller, period is entire prefix so far.
1069
+ right += offset + 1 ;
1070
+ offset = 0 ;
1071
+ period = right - left;
1072
+ } else if a == b {
1073
+ // Advance through repetition of the current period.
1074
+ if offset + 1 == period {
1075
+ right += offset + 1 ;
1076
+ offset = 0 ;
1077
+ } else {
1078
+ offset += 1 ;
1079
+ }
1006
1080
} else {
1007
- a = arr[ right + offset] ;
1008
- b = arr[ left. wrapping_add ( offset) ] ;
1081
+ // Suffix is larger, start over from current location.
1082
+ left = right;
1083
+ right += 1 ;
1084
+ offset = 0 ;
1085
+ period = 1 ;
1009
1086
}
1010
- if a < b {
1087
+ }
1088
+ ( left, period)
1089
+ }
1090
+
1091
+ // Compute the maximal suffix of the reverse of `arr`.
1092
+ //
1093
+ // The maximal suffix is a possible critical factorization (u', v') of `arr`.
1094
+ //
1095
+ // Returns `i` where `i` is the starting index of v', from the back;
1096
+ // returns immediately when a period of `known_period` is reached.
1097
+ //
1098
+ // `order_greater` determines if lexical order is `<` or `>`. Both
1099
+ // orders must be computed -- the ordering with the largest `i` gives
1100
+ // a critical factorization.
1101
+ //
1102
+ // For long period cases, the resulting period is not exact (it is too short).
1103
+ fn reverse_maximal_suffix ( arr : & [ u8 ] , known_period : usize ,
1104
+ order_greater : bool ) -> usize
1105
+ {
1106
+ let mut left = 0 ; // Corresponds to i in the paper
1107
+ let mut right = 1 ; // Corresponds to j in the paper
1108
+ let mut offset = 0 ; // Corresponds to k in the paper
1109
+ let mut period = 1 ; // Corresponds to p in the paper
1110
+ let n = arr. len ( ) ;
1111
+
1112
+ while right + offset < n {
1113
+ let a = arr[ n - ( 1 + right + offset) ] ;
1114
+ let b = arr[ n - ( 1 + left + offset) ] ;
1115
+ if ( a < b && !order_greater) || ( a > b && order_greater) {
1011
1116
// Suffix is smaller, period is entire prefix so far.
1012
- right += offset;
1013
- offset = 1 ;
1014
- period = right. wrapping_sub ( left) ;
1117
+ right += offset + 1 ;
1118
+ offset = 0 ;
1119
+ period = right - left;
1015
1120
} else if a == b {
1016
1121
// Advance through repetition of the current period.
1017
- if offset == period {
1018
- right += offset;
1019
- offset = 1 ;
1122
+ if offset + 1 == period {
1123
+ right += offset + 1 ;
1124
+ offset = 0 ;
1020
1125
} else {
1021
1126
offset += 1 ;
1022
1127
}
1023
1128
} else {
1024
1129
// Suffix is larger, start over from current location.
1025
1130
left = right;
1026
1131
right += 1 ;
1027
- offset = 1 ;
1132
+ offset = 0 ;
1028
1133
period = 1 ;
1029
1134
}
1135
+ if period == known_period {
1136
+ break ;
1137
+ }
1030
1138
}
1031
- ( left. wrapping_add ( 1 ) , period)
1139
+ debug_assert ! ( period <= known_period) ;
1140
+ left
1032
1141
}
1033
1142
}
1034
1143
0 commit comments