Add BM25 Inverted Index Search Algorithm (#5615)

prayas7102 · web-flow · commit 732f7c845866 · 2024-10-07T20:30:46.000+03:00
diff --git a/src/main/java/com/thealgorithms/searches/BM25InvertedIndex.java b/src/main/java/com/thealgorithms/searches/BM25InvertedIndex.java
@@ -0,0 +1,220 @@
+package com.thealgorithms.searches;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+/**
+ * Inverted Index implementation with BM25 Scoring for movie search.
+ * This class supports adding movie documents and searching for terms
+ * within those documents using the BM25 algorithm.
+ * @author Prayas Kumar (https://github.com/prayas7102)
+ */
+
+class Movie {
+    int docId; // Unique identifier for the movie
+    String name; // Movie name
+    double imdbRating; // IMDb rating of the movie
+    int releaseYear; // Year the movie was released
+    String content; // Full text content (could be the description or script)
+
+    /**
+     * Constructor for the Movie class.
+     * @param docId Unique identifier for the movie.
+     * @param name Name of the movie.
+     * @param imdbRating IMDb rating of the movie.
+     * @param releaseYear Release year of the movie.
+     * @param content Content or description of the movie.
+     */
+    Movie(int docId, String name, double imdbRating, int releaseYear, String content) {
+        this.docId = docId;
+        this.name = name;
+        this.imdbRating = imdbRating;
+        this.releaseYear = releaseYear;
+        this.content = content;
+    }
+
+    /**
+     * Get all the words from the movie's name and content.
+     * Converts the name and content to lowercase and splits on non-word characters.
+     * @return Array of words from the movie name and content.
+     */
+    public String[] getWords() {
+        return (name + " " + content).toLowerCase().split("\\W+");
+    }
+
+    @Override
+    public String toString() {
+        return "Movie{"
+            + "docId=" + docId + ", name='" + name + '\'' + ", imdbRating=" + imdbRating + ", releaseYear=" + releaseYear + '}';
+    }
+}
+
+class SearchResult {
+    int docId; // ID of the movie document this result refers to
+    double relevanceScore; // BM25 relevance of that document for the searched term
+
+    /**
+     * Creates a search result pairing a document with its score.
+     * @param docId Document ID (movie) for this search result.
+     * @param relevanceScore The relevance score based on BM25 scoring.
+     */
+    SearchResult(int docId, double relevanceScore) {
+        this.relevanceScore = relevanceScore;
+        this.docId = docId;
+    }
+
+    /** @return ID of the matched movie document. */
+    public int getDocId() {
+        return this.docId;
+    }
+
+    /** @return BM25 relevance score of the matched document. */
+    public double getRelevanceScore() {
+        return relevanceScore;
+    }
+
+    /** Results are equal when they are of the same class and agree on document ID and score. */
+    @Override
+    public boolean equals(Object o) {
+        if (o == null || o.getClass() != getClass()) {
+            return false;
+        }
+        final SearchResult other = (SearchResult) o;
+        return other.docId == docId && Double.compare(relevanceScore, other.relevanceScore) == 0;
+    }
+
+    /** Hash built from the same two fields that equals compares. */
+    @Override
+    public int hashCode() {
+        return Objects.hash(docId, relevanceScore);
+    }
+
+    @Override
+    public String toString() {
+        return "SearchResult{docId=" + docId + ", relevanceScore=" + relevanceScore + '}';
+    }
+}
+
+/**
+ * Inverted index over movie documents with single-term search ranked by the BM25 formula.
+ * Instances are mutable and backed by plain HashMaps, so they are not safe for concurrent use.
+ */
+public final class BM25InvertedIndex {
+    private static final double K = 1.5; // BM25 tuning parameter, controls term frequency saturation
+    private static final double B = 0.75; // BM25 tuning parameter, controls length normalization
+
+    private final Map<String, Map<Integer, Integer>> index; // Inverted index: term -> (document id -> term frequency)
+    private final Map<Integer, Movie> movies; // Mapping of movie document IDs to Movie objects
+    private final Map<Integer, Integer> docLengths; // Number of words of each document, cached at indexing time
+    private long totalLength; // Sum of all document lengths, used to derive the average length
+
+    /**
+     * Constructor for BM25InvertedIndex.
+     * Initializes the inverted index and movie storage.
+     */
+    BM25InvertedIndex() {
+        index = new HashMap<>();
+        movies = new HashMap<>();
+        docLengths = new HashMap<>();
+        totalLength = 0;
+    }
+
+    /**
+     * Add a movie to the index.
+     * Adding a docId that is already present replaces the earlier movie, so document
+     * counts, lengths and term frequencies never include stale data.
+     * @param docId Unique identifier for the movie.
+     * @param name Name of the movie.
+     * @param imdbRating IMDb rating of the movie.
+     * @param releaseYear Release year of the movie.
+     * @param content Content or description of the movie.
+     */
+    public void addMovie(int docId, String name, double imdbRating, int releaseYear, String content) {
+        if (movies.containsKey(docId)) {
+            // Forget the old version first: drop its length, its postings, and any term left without documents
+            totalLength -= docLengths.remove(docId);
+            index.values().removeIf(postings -> postings.remove(docId) != null && postings.isEmpty());
+        }
+        Movie movie = new Movie(docId, name, imdbRating, releaseYear, content);
+        movies.put(docId, movie);
+
+        // Get words (terms) from the movie's name and content
+        String[] terms = movie.getWords();
+        docLengths.put(docId, terms.length);
+        totalLength += terms.length;
+
+        // Update the inverted index: bump this document's frequency once per term occurrence
+        for (String term : terms) {
+            index.computeIfAbsent(term, unused -> new HashMap<>()).merge(docId, 1, Integer::sum);
+        }
+    }
+
+    /**
+     * Get the number of movies currently stored in the index.
+     * @return Number of distinct document IDs that have been added.
+     */
+    public int getMoviesLength() {
+        return movies.size();
+    }
+
+    /**
+     * Search for documents containing a term using BM25 scoring.
+     * The lookup is case-insensitive and matches one indexed word exactly.
+     * @param term The search term; null yields no results.
+     * @return A list of search results sorted by descending relevance score (ties by ascending docId).
+     */
+    public List<SearchResult> search(String term) {
+        List<SearchResult> results = new ArrayList<>();
+        Map<Integer, Integer> termDocs = term == null ? null : index.get(term.toLowerCase());
+        if (termDocs == null) {
+            return results; // Return empty list if term is null or not found
+        }
+
+        // Compute IDF for the search term
+        double idf = computeIDF(termDocs.size());
+
+        // Calculate relevance scores for all documents containing the term
+        for (Map.Entry<Integer, Integer> entry : termDocs.entrySet()) {
+            int docId = entry.getKey();
+            double score = computeBM25Score(entry.getValue(), docLengths.get(docId), idf);
+            results.add(new SearchResult(docId, score));
+        }
+
+        // Sort by relevance score in descending order; break ties by docId for a deterministic result
+        results.sort((r1, r2) -> {
+            int byScore = Double.compare(r2.relevanceScore, r1.relevanceScore);
+            return byScore != 0 ? byScore : Integer.compare(r1.docId, r2.docId);
+        });
+        return results;
+    }
+
+    /**
+     * Compute the BM25 score for a given term and document.
+     * @param termFrequency The frequency of the term in the document.
+     * @param docLength The length of the document.
+     * @param idf The inverse document frequency of the term.
+     * @return The BM25 relevance score for the term in the document.
+     */
+    private double computeBM25Score(int termFrequency, double docLength, double idf) {
+        double avgDocumentLength = (double) totalLength / movies.size();
+        double numerator = termFrequency * (K + 1);
+        double denominator = termFrequency + K * (1 - B + B * (docLength / avgDocumentLength));
+        return idf * (numerator / denominator);
+    }
+
+    /**
+     * Compute the inverse document frequency (IDF) of a term.
+     * Uses log(1 + (N - n + 0.5) / (n + 0.5)); the "1 +" keeps the value positive even for
+     * terms found in more than half of the documents, where the plain formula turns negative
+     * and would rank the best matches last.
+     * @param docFrequency The number of documents that contain the term.
+     * @return The inverse document frequency (IDF) value, always greater than zero.
+     */
+    private double computeIDF(int docFrequency) {
+        int totalDocuments = movies.size();
+        return Math.log(1.0 + (totalDocuments - docFrequency + 0.5) / (docFrequency + 0.5));
+    }
+}
diff --git a/src/test/java/com/thealgorithms/searches/BM25InvertedIndexTest.java b/src/test/java/com/thealgorithms/searches/BM25InvertedIndexTest.java
@@ -0,0 +1,110 @@
+package com.thealgorithms.searches;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.List;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Test Cases for Inverted Index with BM25
+ * @author Prayas Kumar (https://github.com/prayas7102)
+ */
+
+class BM25InvertedIndexTest {
+
+    private static BM25InvertedIndex index;
+
+    @BeforeAll
+    static void setUp() {
+        index = new BM25InvertedIndex();
+        index.addMovie(1, "The Shawshank Redemption", 9.3, 1994, "Hope is a good thing. Maybe the best of things. And no good thing ever dies.");
+        index.addMovie(2, "The Godfather", 9.2, 1972, "I'm gonna make him an offer he can't refuse.");
+        index.addMovie(3, "The Dark Knight", 9.0, 2008, "You either die a hero or live long enough to see yourself become the villain.");
+        index.addMovie(4, "Pulp Fiction", 8.9, 1994, "You know what they call a Quarter Pounder with Cheese in Paris? They call it a Royale with Cheese.");
+        index.addMovie(5, "Good Will Hunting", 8.3, 1997, "Will Hunting is a genius and he has a good heart. The best of his abilities is yet to be explored.");
+        index.addMovie(6, "It's a Wonderful Life", 8.6, 1946, "Each man's life touches so many other lives. If he wasn't around, it would leave an awfully good hole.");
+        index.addMovie(7, "The Pursuit of Happyness", 8.0, 2006, "It was the pursuit of a better life, and a good opportunity to change things for the better.");
+        index.addMovie(8, "A Few Good Men", 7.7, 1992, "You can't handle the truth! This movie has a lot of good moments and intense drama.");
+    }
+
+    @Test
+    void testAddMovie() {
+        // Check that the index contains the correct number of movies
+        int moviesLength = index.getMoviesLength();
+        assertEquals(8, moviesLength);
+    }
+
+    @Test
+    void testReAddingMovieReplacesIt() {
+        // Uses its own index so the shared fixture stays untouched
+        BM25InvertedIndex local = new BM25InvertedIndex();
+        local.addMovie(1, "Alpha", 7.0, 2000, "first draft");
+        local.addMovie(1, "Alpha", 7.0, 2000, "final cut");
+        assertEquals(1, local.getMoviesLength());
+        assertTrue(local.search("draft").isEmpty());
+        assertEquals(1, local.search("final").size());
+    }
+
+    @Test
+    void testSearchForTermFound() {
+        int expected = 1;
+        List<SearchResult> result = index.search("hope");
+        int actual = result.getFirst().getDocId();
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    void testSearchRanking() {
+        // Perform search for the term "good" (present in 5 of the 8 movies)
+        List<SearchResult> results = index.search("good");
+        assertEquals(5, results.size());
+
+        // Movies mentioning "good" twice outrank those mentioning it once; shorter documents win within each group
+        assertEquals(1, results.get(0).getDocId()); // The Shawshank Redemption: 2 occurrences, 19 words
+        assertEquals(8, results.get(1).getDocId()); // A Few Good Men: 2 occurrences, 21 words
+        assertEquals(5, results.get(2).getDocId()); // Good Will Hunting: 2 occurrences, 24 words
+        assertEquals(7, results.get(3).getDocId()); // The Pursuit of Happyness: 1 occurrence, 22 words
+        assertEquals(6, results.get(4).getDocId()); // It's a Wonderful Life: 1 occurrence, 26 words
+
+        // Scores must be strictly descending and positive, even though "good" occurs in most documents
+        for (int i = 0; i < results.size() - 1; i++) {
+            assertTrue(results.get(i).getRelevanceScore() > results.get(i + 1).getRelevanceScore());
+        }
+        assertTrue(results.getLast().getRelevanceScore() > 0);
+    }
+
+    @Test
+    void testSearchForTermNotFound() {
+        List<SearchResult> results = index.search("nonexistent");
+        assertTrue(results.isEmpty());
+    }
+
+    @Test
+    void testSearchForNullTerm() {
+        assertTrue(index.search(null).isEmpty());
+    }
+
+    @Test
+    void testSearchForCommonTerm() {
+        List<SearchResult> results = index.search("the");
+        assertFalse(results.isEmpty());
+        assertTrue(results.size() > 1);
+    }
+
+    @Test
+    void testBM25ScoreCalculation() {
+        List<SearchResult> results = index.search("cheese");
+        assertEquals(1, results.size());
+        assertEquals(4, results.getFirst().docId); // Pulp Fiction should have the highest score
+    }
+
+    @Test
+    void testCaseInsensitivity() {
+        List<SearchResult> resultsLowerCase = index.search("hope");
+        List<SearchResult> resultsUpperCase = index.search("HOPE");
+        assertEquals(resultsLowerCase, resultsUpperCase);
+    }
+}