import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Pattern;

/**
 * A movie document stored in {@link BM25InvertedIndex}.
 */
class Movie {
    /**
     * Token separator: one or more non-word characters. Without the
     * UNICODE_CHARACTER_CLASS flag, Java's {@code \W} treats everything outside
     * {@code [a-zA-Z_0-9]} as a separator, so accented letters also split words.
     */
    private static final Pattern NON_WORD = Pattern.compile("\\W+");

    final int docId; // Unique identifier for the movie
    final String name; // Movie name
    final double imdbRating; // IMDb rating of the movie
    final int releaseYear; // Year the movie was released
    final String content; // Full text content (could be the description or script)

    /**
     * Creates a movie document.
     * @param docId Unique identifier for the movie.
     * @param name Name of the movie.
     * @param imdbRating IMDb rating of the movie.
     * @param releaseYear Release year of the movie.
     * @param content Content or description of the movie.
     */
    Movie(int docId, String name, double imdbRating, int releaseYear, String content) {
        this.docId = docId;
        this.name = name;
        this.imdbRating = imdbRating;
        this.releaseYear = releaseYear;
        this.content = content;
    }

    /**
     * Tokenizes the movie's name and content.
     * The text is lower-cased with {@link Locale#ROOT} (so the result does not depend on the
     * JVM's default locale) and split on runs of non-word characters. Empty tokens, which
     * {@code split} produces when the text starts with a separator, are dropped. A
     * {@code null} name or content is treated as empty text.
     * @return The non-empty, lower-cased words of the name followed by those of the content.
     */
    public String[] getWords() {
        String text = Objects.toString(name, "") + " " + Objects.toString(content, "");
        return Arrays.stream(NON_WORD.split(text.toLowerCase(Locale.ROOT)))
            .filter(word -> !word.isEmpty())
            .toArray(String[]::new);
    }

    @Override
    public String toString() {
        return "Movie{"
            + "docId=" + docId + ", name='" + name + '\'' + ", imdbRating=" + imdbRating + ", releaseYear=" + releaseYear + '}';
    }
}

/**
 * One hit returned by {@link BM25InvertedIndex#search(String)}: a document id and its score.
 */
class SearchResult {
    final int docId; // Unique identifier of the movie document
    final double relevanceScore; // Relevance score based on the BM25 algorithm

    /**
     * Creates a search result.
     * @param docId Document ID (movie) for this search result.
     * @param relevanceScore The relevance score based on BM25 scoring.
     */
    SearchResult(int docId, double relevanceScore) {
        this.docId = docId;
        this.relevanceScore = relevanceScore;
    }

    public int getDocId() {
        return docId;
    }

    public double getRelevanceScore() {
        return relevanceScore;
    }

    @Override
    public String toString() {
        return "SearchResult{"
            + "docId=" + docId + ", relevanceScore=" + relevanceScore + '}';
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        SearchResult that = (SearchResult) o;
        return docId == that.docId && Double.compare(that.relevanceScore, relevanceScore) == 0;
    }

    @Override
    public int hashCode() {
        return Objects.hash(docId, relevanceScore);
    }
}

/**
 * Inverted Index implementation with BM25 Scoring for movie search.
 * This class supports adding movie documents and searching for a single term
 * within those documents using the BM25 algorithm.
 * Instances are not synchronized.
 * @author Prayas Kumar (https://github.com/prayas7102)
 */
public final class BM25InvertedIndex {
    private static final double K = 1.5; // BM25 tuning parameter, controls term frequency saturation
    private static final double B = 0.75; // BM25 tuning parameter, controls length normalization

    private final Map<String, Map<Integer, Integer>> index = new HashMap<>(); // term -> (document id -> term frequency)
    private final Map<Integer, Movie> movies = new HashMap<>(); // document id -> movie
    private final Map<Integer, Integer> docLengths = new HashMap<>(); // document id -> number of tokens
    private long totalLength; // Sum of all document lengths; the average is derived from it on demand

    /**
     * Creates an empty index.
     */
    BM25InvertedIndex() {
    }

    /**
     * Add a movie to the index.
     * If a movie with the same {@code docId} is already indexed it is replaced: its old
     * postings and length are removed first, so document counts, the average document
     * length and term frequencies stay consistent.
     * @param docId Unique identifier for the movie.
     * @param name Name of the movie.
     * @param imdbRating IMDb rating of the movie.
     * @param releaseYear Release year of the movie.
     * @param content Content or description of the movie.
     */
    public void addMovie(int docId, String name, double imdbRating, int releaseYear, String content) {
        Movie previous = movies.get(docId);
        if (previous != null) {
            unindex(previous);
        }

        Movie movie = new Movie(docId, name, imdbRating, releaseYear, content);
        String[] terms = movie.getWords();

        movies.put(docId, movie);
        docLengths.put(docId, terms.length);
        totalLength += terms.length;

        // One posting per (term, document); repeated terms bump the stored frequency.
        for (String term : terms) {
            index.computeIfAbsent(term, key -> new HashMap<>()).merge(docId, 1, Integer::sum);
        }
    }

    /**
     * Removes every trace of a previously indexed movie: postings, length and the movie itself.
     */
    private void unindex(Movie movie) {
        for (String term : movie.getWords()) {
            Map<Integer, Integer> postings = index.get(term);
            if (postings == null) {
                continue; // Already removed by an earlier occurrence of the same term
            }
            postings.remove(movie.docId);
            if (postings.isEmpty()) {
                index.remove(term);
            }
        }
        Integer length = docLengths.remove(movie.docId);
        if (length != null) {
            totalLength -= length;
        }
        movies.remove(movie.docId);
    }

    /**
     * @return The number of movies currently indexed.
     */
    public int getMoviesLength() {
        return movies.size();
    }

    /**
     * Search for documents containing a term using BM25 scoring.
     * The term is stripped of surrounding whitespace and lower-cased before lookup; it is
     * matched as a single token (it is not split into several words).
     * @param term The search term; {@code null} yields an empty result.
     * @return A new mutable list of results sorted by descending relevance score, ties broken
     *         by ascending document id; empty if no document contains the term.
     */
    public List<SearchResult> search(String term) {
        if (term == null) {
            return new ArrayList<>();
        }
        Map<Integer, Integer> postings = index.get(term.strip().toLowerCase(Locale.ROOT));
        if (postings == null) {
            return new ArrayList<>();
        }

        double idf = computeIDF(postings.size());
        List<SearchResult> results = new ArrayList<>(postings.size());
        for (Map.Entry<Integer, Integer> posting : postings.entrySet()) {
            int docId = posting.getKey();
            // Lengths are cached at insertion time, so no re-tokenization is needed here.
            double score = computeBM25Score(posting.getValue(), docLengths.get(docId), idf);
            results.add(new SearchResult(docId, score));
        }

        // Highest score first; the docId tie-break makes the order independent of HashMap iteration.
        results.sort((left, right) -> {
            int byScore = Double.compare(right.relevanceScore, left.relevanceScore);
            return byScore != 0 ? byScore : Integer.compare(left.docId, right.docId);
        });
        return results;
    }

    /**
     * Compute the BM25 score for a given term and document.
     * @param termFrequency The frequency of the term in the document.
     * @param docLength The length of the document in tokens.
     * @param idf The inverse document frequency of the term.
     * @return The BM25 relevance score for the term in the document.
     */
    private double computeBM25Score(int termFrequency, double docLength, double idf) {
        // Only called for documents that contain the term, so at least one document has
        // length >= 1 and the average is strictly positive.
        double avgDocumentLength = (double) totalLength / movies.size();
        double numerator = termFrequency * (K + 1);
        double denominator = termFrequency + K * (1 - B + B * (docLength / avgDocumentLength));
        return idf * (numerator / denominator);
    }

    /**
     * Compute the inverse document frequency (IDF) of a term.
     * Uses the non-negative form {@code ln(1 + (N - n + 0.5) / (n + 0.5))}. The plain
     * {@code ln((N - n + 0.5) / (n + 0.5))} turns negative once a term occurs in more than
     * half of the documents, which flips the sign of every score and inverts the ranking.
     * @param docFrequency The number of documents that contain the term.
     * @return The inverse document frequency (IDF) value, always greater than zero.
     */
    private double computeIDF(int docFrequency) {
        int totalDocuments = movies.size();
        return Math.log(1.0 + (totalDocuments - docFrequency + 0.5) / (docFrequency + 0.5));
    }
}