Skip to content

Commit 8200a79

Browse files
Add AhoCorasick (TheAlgorithms#4465)
* Added code to find Articulation Points and Bridges * tried to solve clang-format test * removed new line at EOF to get lint to pass * feature: Added AhoCorasick Algorithm * fixed lint using clang-format * removed datastructures/graphs/ArticulationPointsAndBridge.java from this branch * removed main, since test-file is added. Also modified and renamed a few functions. * Added test-file for AhoCorasick Algorithm * Modified some comments in test-file * Modified some comments in AhoCorasick.java * lint fix * added a few more test cases * Modified some comments * Change all class fields to private, added initializeSuffixLinksForChildNodesOfTheRoot() method, hashmap string search position (also has previous index based search), removed java.util.* * Added Missing Test-Cases and more * minor text changes * added direct test check i.e. defining a variable expected and just checking if res and expected are equal. * Created New Class Trie, merged 'buildTrie and buildSuffixAndOutputLinks' with 'Trie constructor'. Merged setUpStartPoints with searchIn. Now AhoCorasick contains -> inner class: Trie, Node. Methods: search and convert. Trie has -> Methods : constructor and searchIn * Updated TestFile according to the updated AhoCorasick Class. 
Added a few more test cases * updated - broke the constructor down into relevant parts, made string final, made res local to searchIn(), doxygen-like style * lint fix clang * Updated Tests Files * Added final field to Node class setters and Trie Constructor arguments, removed getTrieRoot() and some unnecessary comments, renamed [old -> new]: res -> positionByStringIndexValue, removed if condition from setupStartPoints() * updated test file * lint fix clang * minor change - 'removed a comment' * added final fields to some arguments, class and variables, added a method initializePositionByStringIndexValue() * updated to remove * inclusion and added the required modules only * Implemented a new class PatternPositionRecorder to wrap up the position recording in searchIn() * Added final fields to PatternPositionRecorder Class * style: mark default constructor of `AhoCorasick` as `private` * style: remove redundant `public` --------- Co-authored-by: Piotr Idzik <[email protected]>
1 parent 06aa834 commit 8200a79

File tree

2 files changed

+369
-0
lines changed

2 files changed

+369
-0
lines changed
Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
/*
2+
* Aho-Corasick String Matching Algorithm Implementation
3+
*
4+
* This code implements the Aho-Corasick algorithm, which is used for efficient
5+
* string matching in a given text. It can find multiple patterns simultaneously
6+
* and records their positions in the text.
7+
*
8+
* Author: Prabhat-Kumar-42
9+
* GitHub: https://github.com/Prabhat-Kumar-42
10+
*/
11+
12+
package com.thealgorithms.strings;
13+
14+
import java.util.ArrayList;
15+
import java.util.HashMap;
16+
import java.util.LinkedList;
17+
import java.util.Map;
18+
import java.util.Queue;
19+
20+
/**
 * Aho-Corasick multi-pattern string matching.
 *
 * <p>A trie is built from the patterns and augmented with suffix (failure) links and
 * output links, so that one left-to-right scan of the text reports every occurrence of
 * every pattern. Reported positions are zero-based start indices in the text. Matching
 * is exact, i.e. case-sensitive.
 *
 * <p>Empty patterns never match: their position list is always empty.
 */
public final class AhoCorasick {
    /** Value of {@code patternInd} for a node at which no pattern ends. */
    private static final int NO_PATTERN = -1;

    private AhoCorasick() {
    }

    /** A single trie node, i.e. one state of the matching automaton. */
    private static class Node {
        private final Map<Character, Node> child = new HashMap<>(); // outgoing edges, keyed by character
        private Node suffixLink; // node for the longest proper suffix of this node's string that is in the trie
        private Node outputLink; // nearest node on the suffix-link chain at which a pattern ends, or null
        private int patternInd = NO_PATTERN; // index of the pattern ending here, or NO_PATTERN

        Map<Character, Node> getChild() {
            return child;
        }

        Node getSuffixLink() {
            return suffixLink;
        }

        void setSuffixLink(final Node suffixLink) {
            this.suffixLink = suffixLink;
        }

        Node getOutputLink() {
            return outputLink;
        }

        void setOutputLink(final Node outputLink) {
            this.outputLink = outputLink;
        }

        int getPatternInd() {
            return patternInd;
        }

        void setPatternInd(final int patternInd) {
            this.patternInd = patternInd;
        }
    }

    /** Trie with suffix and output links, built once for a fixed set of patterns. */
    public static class Trie {

        private final Node root = new Node(); // Root node of the trie
        private final String[] patterns; // private copy of the patterns the trie was built from
        // representative[i] is the index of the first pattern equal to patterns[i] (the index stored
        // in the trie), or NO_PATTERN when patterns[i] is empty and therefore not stored at all.
        private final int[] representative;

        /**
         * Builds the trie and its suffix/output links for the given patterns.
         *
         * @param patterns patterns to search for; duplicates and empty strings are allowed
         * @throws NullPointerException if {@code patterns} or one of its elements is {@code null}
         */
        public Trie(final String[] patterns) {
            if (patterns == null) {
                throw new NullPointerException("patterns must not be null");
            }
            this.patterns = patterns.clone(); // later changes to the caller's array must not affect the trie
            this.representative = new int[this.patterns.length];
            buildTrie();
            buildSuffixAndOutputLinks();
        }

        // Inserts every non-empty pattern into the trie and fills in `representative`.
        private void buildTrie() {
            for (int i = 0; i < patterns.length; i++) {
                final String pattern = patterns[i];
                if (pattern == null) {
                    throw new NullPointerException("patterns[" + i + "] must not be null");
                }
                if (pattern.isEmpty()) {
                    // Marking the root as a pattern end would make every other match also
                    // report the empty pattern at a meaningless position, so it is skipped.
                    representative[i] = NO_PATTERN;
                    continue;
                }

                Node curr = root;
                for (int j = 0; j < pattern.length(); j++) {
                    curr = curr.getChild().computeIfAbsent(pattern.charAt(j), key -> new Node());
                }

                // Keep the first index for duplicated patterns instead of overwriting it;
                // searchIn() copies the positions to the other duplicates afterwards.
                if (curr.getPatternInd() == NO_PATTERN) {
                    curr.setPatternInd(i);
                }
                representative[i] = curr.getPatternInd();
            }
        }

        // Breadth-first traversal: a node's suffix link always points to a shallower node,
        // so that node's links are already final when the deeper node is processed.
        private void buildSuffixAndOutputLinks() {
            root.setSuffixLink(root);
            final Queue<Node> queue = new LinkedList<>();

            for (final Node firstLevel : root.getChild().values()) {
                firstLevel.setSuffixLink(root);
                queue.add(firstLevel);
            }

            while (!queue.isEmpty()) {
                final Node state = queue.poll();

                for (final Map.Entry<Character, Node> edge : state.getChild().entrySet()) {
                    final Character symbol = edge.getKey();
                    final Node next = edge.getValue();

                    // Walk the parent's suffix chain until a node with an edge for `symbol` is found.
                    Node fallback = state.getSuffixLink();
                    while (fallback != root && !fallback.getChild().containsKey(symbol)) {
                        fallback = fallback.getSuffixLink();
                    }

                    final Node target = fallback.getChild().get(symbol);
                    next.setSuffixLink(target != null ? target : root);
                    queue.add(next);
                }

                // Output link: nearest pattern-ending node reachable through suffix links.
                final Node suffix = state.getSuffixLink();
                state.setOutputLink(suffix.getPatternInd() != NO_PATTERN ? suffix : suffix.getOutputLink());
            }
        }

        // Creates one empty position list per pattern.
        private ArrayList<ArrayList<Integer>> initializePositionByStringIndexValue() {
            final ArrayList<ArrayList<Integer>> positionByStringIndexValue = new ArrayList<>(patterns.length);
            for (int i = 0; i < patterns.length; i++) {
                positionByStringIndexValue.add(new ArrayList<>());
            }
            return positionByStringIndexValue;
        }

        /**
         * Finds all occurrences of the trie's patterns in {@code text}.
         *
         * @param text the text to scan
         * @return one list per pattern, in the order the patterns were passed to the constructor;
         *         element {@code i} holds the zero-based start indices of pattern {@code i} in
         *         ascending order. Duplicated patterns each receive the same positions; empty
         *         patterns receive an empty list.
         * @throws NullPointerException if {@code text} is {@code null}
         */
        public ArrayList<ArrayList<Integer>> searchIn(final String text) {
            if (text == null) {
                throw new NullPointerException("text must not be null");
            }
            final var positionByStringIndexValue = initializePositionByStringIndexValue();
            final PatternPositionRecorder positionRecorder = new PatternPositionRecorder(positionByStringIndexValue);

            Node state = root;
            for (int i = 0; i < text.length(); i++) {
                final char ch = text.charAt(i);

                // On a mismatch fall back along suffix links until some state accepts `ch`
                // or the root is reached.
                while (state != root && !state.getChild().containsKey(ch)) {
                    state = state.getSuffixLink();
                }

                final Node next = state.getChild().get(ch);
                if (next != null) {
                    state = next;
                    positionRecorder.recordPatternPositions(state, i);
                }
                // Otherwise we are at the root with no edge for `ch`: nothing ends here.
            }

            setUpStartPoints(positionByStringIndexValue);
            return positionByStringIndexValue;
        }

        // The recorder stores end positions for representative patterns only. This converts
        // them to start positions and copies the result to duplicated patterns.
        private void setUpStartPoints(final ArrayList<ArrayList<Integer>> positionByStringIndexValue) {
            for (int i = 0; i < patterns.length; i++) {
                final ArrayList<Integer> positions = positionByStringIndexValue.get(i);
                if (representative[i] == i) {
                    final int shift = patterns[i].length() - 1;
                    positions.replaceAll(endpoint -> endpoint - shift);
                } else if (representative[i] != NO_PATTERN) {
                    // The representative has a smaller index, so its list is already converted.
                    positions.addAll(positionByStringIndexValue.get(representative[i]));
                }
            }
        }
    }

    /** Collects the end positions of matched patterns during a scan. */
    private static class PatternPositionRecorder {
        private final ArrayList<ArrayList<Integer>> positionByStringIndexValue;

        // Constructor to initialize the recorder with the position list
        PatternPositionRecorder(final ArrayList<ArrayList<Integer>> positionByStringIndexValue) {
            this.positionByStringIndexValue = positionByStringIndexValue;
        }

        /**
         * Records {@code currentPosition} for the pattern ending at {@code parent} (if any) and
         * for every pattern reachable from it through output links.
         *
         * @param parent The trie node reached after consuming the current character.
         * @param currentPosition The index of that character in the input text.
         */
        void recordPatternPositions(final Node parent, final int currentPosition) {
            if (parent.getPatternInd() != NO_PATTERN) {
                positionByStringIndexValue.get(parent.getPatternInd()).add(currentPosition);
            }

            // Output links only ever point at nodes where a pattern ends.
            for (Node link = parent.getOutputLink(); link != null; link = link.getOutputLink()) {
                positionByStringIndexValue.get(link.getPatternInd()).add(currentPosition);
            }
        }
    }

    /**
     * Searches {@code text} for all {@code patterns} at once.
     *
     * @param text the text to scan
     * @param patterns the patterns to look for
     * @return a map from each pattern to the zero-based start indices of its occurrences in
     *         {@code text}; patterns that do not occur (and empty patterns) map to an empty list.
     *         Duplicated patterns share a single map entry.
     * @throws NullPointerException if {@code text}, {@code patterns} or a pattern is {@code null}
     */
    public static Map<String, ArrayList<Integer>> search(final String text, final String[] patterns) {
        final var trie = new Trie(patterns);
        final var positionByStringIndexValue = trie.searchIn(text);
        return convert(positionByStringIndexValue, patterns);
    }

    // Converts the index-based result into a map keyed by the pattern itself.
    private static Map<String, ArrayList<Integer>> convert(final ArrayList<ArrayList<Integer>> positionByStringIndexValue, final String[] patterns) {
        final Map<String, ArrayList<Integer>> positionByString = new HashMap<>();
        for (int i = 0; i < patterns.length; i++) {
            positionByString.put(patterns[i], new ArrayList<>(positionByStringIndexValue.get(i)));
        }
        return positionByString;
    }
}
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/*
2+
* Tests For Aho-Corasick String Matching Algorithm
3+
*
4+
* Author: Prabhat-Kumar-42
5+
* GitHub: https://github.com/Prabhat-Kumar-42
6+
*/
7+
8+
package com.thealgorithms.strings;
9+
10+
import static org.junit.jupiter.api.Assertions.assertEquals;
11+
import static org.junit.jupiter.api.Assertions.assertTrue;
12+
13+
import java.util.ArrayList;
14+
import java.util.Arrays;
15+
import java.util.Map;
16+
import org.junit.jupiter.api.BeforeEach;
17+
import org.junit.jupiter.api.Test;
18+
19+
/**
20+
* This class contains test cases for the Aho-Corasick String Matching Algorithm.
21+
* The Aho-Corasick algorithm is used to efficiently find all occurrences of multiple
22+
* patterns in a given text.
23+
*/
24+
class AhoCorasickTest {
25+
private String[] patterns; // The array of patterns to search for
26+
private String text; // The input text to search within
27+
28+
/**
29+
* This method sets up the test environment before each test case.
30+
* It initializes the patterns and text to be used for testing.
31+
*/
32+
@BeforeEach
33+
void setUp() {
34+
patterns = new String[] {"ACC", "ATC", "CAT", "GCG", "C", "T"};
35+
text = "GCATCG";
36+
}
37+
38+
/**
39+
* Test searching for multiple patterns in the input text.
40+
* The expected results are defined for each pattern.
41+
*/
42+
@Test
43+
void testSearch() {
44+
// Define the expected results for each pattern
45+
final var expected = Map.of("ACC", new ArrayList<>(Arrays.asList()), "ATC", new ArrayList<>(Arrays.asList(2)), "CAT", new ArrayList<>(Arrays.asList(1)), "GCG", new ArrayList<>(Arrays.asList()), "C", new ArrayList<>(Arrays.asList(1, 4)), "T", new ArrayList<>(Arrays.asList(3)));
46+
assertEquals(expected, AhoCorasick.search(text, patterns));
47+
}
48+
49+
/**
50+
* Test searching with an empty pattern array.
51+
* The result should be an empty map.
52+
*/
53+
@Test
54+
void testEmptyPatterns() {
55+
// Define an empty pattern array
56+
final var emptyPatterns = new String[] {};
57+
assertTrue(AhoCorasick.search(text, emptyPatterns).isEmpty());
58+
}
59+
60+
/**
61+
* Test searching for patterns that are not present in the input text.
62+
* The result should be an empty list for each pattern.
63+
*/
64+
@Test
65+
void testPatternNotFound() {
66+
// Define patterns that are not present in the text
67+
final var searchPatterns = new String[] {"XYZ", "123"};
68+
final var expected = Map.of("XYZ", new ArrayList<Integer>(), "123", new ArrayList<Integer>());
69+
assertEquals(expected, AhoCorasick.search(text, searchPatterns));
70+
}
71+
72+
/**
73+
* Test searching for patterns that start at the beginning of the input text.
74+
* The expected position for each pattern is 0.
75+
*/
76+
@Test
77+
void testPatternAtBeginning() {
78+
// Define patterns that start at the beginning of the text
79+
final var searchPatterns = new String[] {"GC", "GCA", "GCAT"};
80+
final var expected = Map.of("GC", new ArrayList<Integer>(Arrays.asList(0)), "GCA", new ArrayList<Integer>(Arrays.asList(0)), "GCAT", new ArrayList<Integer>(Arrays.asList(0)));
81+
assertEquals(expected, AhoCorasick.search(text, searchPatterns));
82+
}
83+
84+
/**
85+
* Test searching for patterns that end at the end of the input text.
86+
* The expected positions are 4, 3, and 2 for the patterns.
87+
*/
88+
@Test
89+
void testPatternAtEnd() {
90+
// Define patterns that end at the end of the text
91+
final var searchPatterns = new String[] {"CG", "TCG", "ATCG"};
92+
final var expected = Map.of("CG", new ArrayList<Integer>(Arrays.asList(4)), "TCG", new ArrayList<Integer>(Arrays.asList(3)), "ATCG", new ArrayList<Integer>(Arrays.asList(2)));
93+
assertEquals(expected, AhoCorasick.search(text, searchPatterns));
94+
}
95+
96+
/**
97+
* Test searching for patterns with multiple occurrences in the input text.
98+
* The expected sizes are 1 and 1, and the expected positions are 2 and 3
99+
* for the patterns "AT" and "T" respectively.
100+
*/
101+
@Test
102+
void testMultipleOccurrencesOfPattern() {
103+
// Define patterns with multiple occurrences in the text
104+
final var searchPatterns = new String[] {"AT", "T"};
105+
final var expected = Map.of("AT", new ArrayList<Integer>(Arrays.asList(2)), "T", new ArrayList<Integer>(Arrays.asList(3)));
106+
assertEquals(expected, AhoCorasick.search(text, searchPatterns));
107+
}
108+
109+
/**
110+
* Test searching for patterns in a case-insensitive manner.
111+
* The search should consider patterns regardless of their case.
112+
*/
113+
@Test
114+
void testCaseInsensitiveSearch() {
115+
// Define patterns with different cases
116+
final var searchPatterns = new String[] {"gca", "aTc", "C"};
117+
final var expected = Map.of("gca", new ArrayList<Integer>(), "aTc", new ArrayList<Integer>(), "C", new ArrayList<Integer>(Arrays.asList(1, 4)));
118+
assertEquals(expected, AhoCorasick.search(text, searchPatterns));
119+
}
120+
}

0 commit comments

Comments
 (0)