|
1 | 1 | """
|
2 |
| -This is a pure Python implementation |
| 2 | +This is a pure Python implementation |
3 | 3 | of the Commentz-Walter algorithm
|
4 | 4 | for searching multiple patterns in a single text.
|
5 | 5 |
|
6 |
| -The algorithm combines Boyer-Moore's and |
| 6 | +The algorithm combines Boyer-Moore's and |
7 | 7 | Aho-Corasick's techniques for
|
8 |
| -efficiently searching multiple patterns |
| 8 | +efficiently searching multiple patterns |
9 | 9 | by using pattern shifts and suffix automata.
|
10 | 10 |
|
11 | 11 | For doctests run:
|
|
18 | 18 |
|
19 | 19 | from typing import List, Dict, Set, Tuple
|
20 | 20 | from collections import defaultdict
|
| 21 | + |
| 22 | + |
21 | 23 | class CommentzWalter:
|
22 | 24 | """
|
23 |
| - Class to represent the Commentz-Walter algorithm |
| 25 | + Class to represent the Commentz-Walter algorithm |
24 | 26 | for multi-pattern string searching.
|
25 | 27 |
|
26 | 28 | Attributes:
|
27 | 29 | patterns (List[str]): List of patterns to search for.
|
28 | 30 | alphabet (Set[str]): Unique characters in the patterns.
|
29 |
| - shift_table (Dict[str, int]): Table to store |
| 31 | + shift_table (Dict[str, int]): Table to store |
30 | 32 | the shift values for characters.
|
31 |
| - automaton (Dict[int, Dict[str, int]]): |
| 33 | + automaton (Dict[int, Dict[str, int]]): |
32 | 34 | Automaton used for state transitions.
|
33 | 35 |
|
34 | 36 | Methods:
|
35 |
| - preprocess(): Builds the shift table |
| 37 | + preprocess(): Builds the shift table |
36 | 38 | and automaton for pattern matching.
|
37 |
| - search(text: str) -> List[Tuple[int, str]]: |
| 39 | + search(text: str) -> List[Tuple[int, str]]: |
38 | 40 | Searches patterns in the given text.
|
39 | 41 |
|
40 | 42 | Examples:
|
41 | 43 | >>> cw = CommentzWalter(["he", "she", "his", "hers"])
|
42 | 44 | >>> cw.search("ahishers")
|
43 | 45 | [(1, 'his'), (4, 'she'), (5, 'hers')]
|
44 | 46 | """
|
| 47 | + |
45 | 48 | def __init__(self, patterns: List[str]) -> None:
|
46 | 49 | self.patterns = patterns
|
47 | 50 | self.alphabet: Set[str] = set("".join(patterns))
|
48 | 51 | self.shift_table: Dict[str, int] = {}
|
49 | 52 | self.automaton: Dict[int, Dict[str, int]] = {}
|
50 | 53 | self.preprocess()
|
| 54 | + |
51 | 55 | def preprocess(self) -> None:
|
52 | 56 | """
|
53 |
| - Builds the shift table and automaton required |
| 57 | + Builds the shift table and automaton required |
54 | 58 | for the Commentz-Walter algorithm.
|
55 | 59 | """
|
56 | 60 | # Build the shift table for the rightmost occurrence of characters in patterns
|
@@ -102,14 +106,17 @@ def search(self, text: str) -> List[Tuple[int, str]]:
|
102 | 106 | break
|
103 | 107 | else:
|
104 | 108 | for pattern in self.patterns:
|
105 |
| - if text[i:i + len(pattern)] == pattern: |
| 109 | + if text[i : i + len(pattern)] == pattern: |
106 | 110 | results.append((i, pattern))
|
107 | 111 | i += self.shift_table.get(text[i + m - 1], m)
|
108 | 112 | else:
|
109 | 113 | i += self.shift_table.get(text[i + j], m)
|
110 | 114 | return results
|
| 115 | + |
| 116 | + |
111 | 117 | if __name__ == "__main__":
|
112 | 118 | import doctest
|
| 119 | + |
113 | 120 | doctest.testmod()
|
114 | 121 | # Example usage for manual testing
|
115 | 122 | patterns = ["abc", "bcd", "cde"]
|
|
0 commit comments