Skip to content

Commit 8b41518

Browse files
author
prayas7102
committed
Inverted Index implementation with BM25 Scoring for movie search.
1 parent be8df21 commit 8b41518

File tree

2 files changed

+312
-0
lines changed

2 files changed

+312
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
package com.thealgorithms.searches;
2+
3+
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
8+
9+
/**
10+
* Inverted Index implementation with BM25 Scoring for movie search.
11+
* This class supports adding movie documents and searching for terms
12+
* within those documents using the BM25 algorithm.
13+
* @author Prayas Kumar (https://github.com/prayas7102)
14+
*/
15+
16+
/**
 * A single movie document: an identifier, some metadata, and the free text
 * (name plus content) that gets tokenized for searching.
 */
class Movie {
    int docId; // Unique identifier for the movie
    String name; // Movie name
    double imdbRating; // IMDb rating of the movie
    int releaseYear; // Year the movie was released
    String content; // Full text content (could be the description or script)

    /**
     * Constructor for the Movie class.
     *
     * @param docId Unique identifier for the movie.
     * @param name Name of the movie.
     * @param imdbRating IMDb rating of the movie.
     * @param releaseYear Release year of the movie.
     * @param content Content or description of the movie.
     */
    public Movie(int docId, String name, double imdbRating, int releaseYear, String content) {
        this.docId = docId;
        this.name = name;
        this.imdbRating = imdbRating;
        this.releaseYear = releaseYear;
        this.content = content;
    }

    /**
     * Get all the words from the movie's name and content.
     *
     * <p>The name and content are joined with a space, lowercased with
     * {@link Locale#ROOT} (so the result does not depend on the JVM's default
     * locale) and split on runs of non-word characters. A {@code null} name or
     * content is treated as empty text. The returned array never contains
     * empty strings.
     *
     * @return Array of lowercase words from the movie name and content; empty
     *         if there are no word characters at all.
     */
    public String[] getWords() {
        String text = Objects.toString(name, "") + " " + Objects.toString(content, "");
        String[] tokens = text.toLowerCase(Locale.ROOT).split("\\W+");

        // split() discards trailing empty strings but keeps one leading empty
        // string when the text starts with a separator, e.g. "(500) Days".
        if (tokens.length > 0 && tokens[0].isEmpty()) {
            String[] withoutLeadingBlank = new String[tokens.length - 1];
            System.arraycopy(tokens, 1, withoutLeadingBlank, 0, withoutLeadingBlank.length);
            return withoutLeadingBlank;
        }
        return tokens;
    }

    @Override
    public String toString() {
        return "Movie{"
            + "docId=" + docId + ", name='" + name + '\'' + ", imdbRating=" + imdbRating + ", releaseYear=" + releaseYear + '}';
    }
}
54+
55+
class SearchResult {
56+
int docId; // Unique identifier of the movie document
57+
double relevanceScore; // Relevance score based on the BM25 algorithm
58+
59+
/**
60+
* Constructor for SearchResult class.
61+
* @param docId Document ID (movie) for this search result.
62+
* @param relevanceScore The relevance score based on BM25 scoring.
63+
*/
64+
public SearchResult(int docId, double relevanceScore) {
65+
this.docId = docId;
66+
this.relevanceScore = relevanceScore;
67+
}
68+
69+
public int getDocId() {
70+
return docId;
71+
}
72+
73+
@Override
74+
public String toString() {
75+
return "SearchResult{"
76+
+ "docId=" + docId + ", relevanceScore=" + relevanceScore + '}';
77+
}
78+
79+
@Override
80+
public boolean equals(Object o) {
81+
if (this == o) return true;
82+
if (o == null || getClass() != o.getClass()) return false;
83+
SearchResult that = (SearchResult) o;
84+
return docId == that.docId && Double.compare(that.relevanceScore, relevanceScore) == 0;
85+
}
86+
87+
@Override
88+
public int hashCode() {
89+
return Objects.hash(docId, relevanceScore);
90+
}
91+
92+
public double getRelevanceScore() {
93+
return this.relevanceScore;
94+
}
95+
}
96+
97+
public final class BM25InvertedIndex {
98+
private Map<String, Map<Integer, Integer>> index; // Inverted index mapping terms to document frequencies
99+
private Map<Integer, Movie> movies; // Mapping of movie document IDs to Movie objects
100+
private int totalDocuments; // Total number of movies/documents
101+
private double avgDocumentLength; // Average length of documents (number of words)
102+
private static final double k = 1.5; // BM25 tuning parameter, controls term frequency saturation
103+
private static final double b = 0.75; // BM25 tuning parameter, controls length normalization
104+
105+
/**
106+
* Constructor for BM25InvertedIndex.
107+
* Initializes the inverted index and movie storage.
108+
*/
109+
public BM25InvertedIndex() {
110+
index = new HashMap<>();
111+
movies = new HashMap<>();
112+
totalDocuments = 0;
113+
avgDocumentLength = 0.0;
114+
}
115+
116+
/**
117+
* Add a movie to the index.
118+
* @param docId Unique identifier for the movie.
119+
* @param name Name of the movie.
120+
* @param imdbRating IMDb rating of the movie.
121+
* @param releaseYear Release year of the movie.
122+
* @param content Content or description of the movie.
123+
*/
124+
public void addMovie(int docId, String name, double imdbRating, int releaseYear, String content) {
125+
Movie movie = new Movie(docId, name, imdbRating, releaseYear, content);
126+
movies.put(docId, movie);
127+
totalDocuments++;
128+
129+
// Get words (terms) from the movie's name and content
130+
String[] terms = movie.getWords();
131+
int docLength = terms.length;
132+
133+
// Update the average document length
134+
avgDocumentLength = (avgDocumentLength * (totalDocuments - 1) + docLength) / totalDocuments;
135+
136+
// Update the inverted index
137+
for (String term : terms) {
138+
// Create a new entry if the term is not yet in the index
139+
index.putIfAbsent(term, new HashMap<>());
140+
141+
// Get the list of documents containing the term
142+
Map<Integer, Integer> docList = index.get(term);
143+
144+
// Increment the term frequency in this document
145+
docList.put(docId, docList.getOrDefault(docId, 0) + 1);
146+
}
147+
}
148+
149+
public int getMoviesLength() {
150+
return movies.size();
151+
}
152+
153+
/**
154+
* Search for documents containing a term using BM25 scoring.
155+
* @param term The search term.
156+
* @return A list of search results sorted by relevance score.
157+
*/
158+
public List<SearchResult> search(String term) {
159+
term = term.toLowerCase(); // Normalize search term
160+
if (!index.containsKey(term)) {
161+
return new ArrayList<>(); // Return empty list if term not found
162+
}
163+
164+
Map<Integer, Integer> termDocs = index.get(term); // Documents containing the term
165+
List<SearchResult> results = new ArrayList<>();
166+
167+
// Compute IDF for the search term
168+
double idf = computeIDF(termDocs.size());
169+
170+
// Calculate relevance scores for all documents containing the term
171+
for (Map.Entry<Integer, Integer> entry : termDocs.entrySet()) {
172+
int docId = entry.getKey();
173+
int termFrequency = entry.getValue();
174+
Movie movie = movies.get(docId);
175+
double docLength = movie.getWords().length;
176+
177+
// Compute BM25 relevance score
178+
double score = computeBM25Score(termFrequency, docLength, idf);
179+
results.add(new SearchResult(docId, score));
180+
}
181+
182+
// Sort the results by relevance score in descending order
183+
results.sort((r1, r2) -> Double.compare(r2.relevanceScore, r1.relevanceScore));
184+
return results;
185+
}
186+
187+
/**
188+
* Compute the BM25 score for a given term and document.
189+
* @param termFrequency The frequency of the term in the document.
190+
* @param docLength The length of the document.
191+
* @param idf The inverse document frequency of the term.
192+
* @return The BM25 relevance score for the term in the document.
193+
*/
194+
private double computeBM25Score(int termFrequency, double docLength, double idf) {
195+
double numerator = termFrequency * (k + 1);
196+
double denominator = termFrequency + k * (1 - b + b * (docLength / avgDocumentLength));
197+
return idf * (numerator / denominator);
198+
}
199+
200+
/**
201+
* Compute the inverse document frequency (IDF) of a term.
202+
* The IDF measures the importance of a term across the entire document set.
203+
* @param docFrequency The number of documents that contain the term.
204+
* @return The inverse document frequency (IDF) value.
205+
*/
206+
private double computeIDF(int docFrequency) {
207+
// Total number of documents in the index
208+
return Math.log((totalDocuments - docFrequency + 0.5) / (docFrequency + 0.5));
209+
}
210+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
package com.thealgorithms.searches;
2+
3+
import org.junit.jupiter.api.BeforeAll;
4+
import org.junit.jupiter.api.Test;
5+
6+
import java.util.List;
7+
8+
import static org.junit.jupiter.api.Assertions.assertEquals;
9+
import static org.junit.jupiter.api.Assertions.assertTrue;
10+
import static org.junit.jupiter.api.Assertions.assertFalse;
11+
12+
/**
13+
* Test Cases for Inverted Index with BM25
14+
* @author Prayas Kumar (https://github.com/prayas7102)
15+
*/
16+
17+
class BM25InvertedIndexTest {
18+
19+
private static BM25InvertedIndex index;
20+
21+
@BeforeAll
22+
static void setUp() {
23+
index = new BM25InvertedIndex();
24+
index.addMovie(1, "The Shawshank Redemption", 9.3, 1994,
25+
"Hope is a good thing. Maybe the best of things. And no good thing ever dies.");
26+
index.addMovie(2, "The Godfather", 9.2, 1972,
27+
"I'm gonna make him an offer he can't refuse.");
28+
index.addMovie(3, "The Dark Knight", 9.0, 2008,
29+
"You either die a hero or live long enough to see yourself become the villain.");
30+
index.addMovie(4, "Pulp Fiction", 8.9, 1994,
31+
"You know what they call a Quarter Pounder with Cheese in Paris? They call it a Royale with Cheese.");
32+
index.addMovie(5, "Good Will Hunting", 8.3, 1997,
33+
"Will Hunting is a genius and he has a good heart. The best of his abilities is yet to be explored.");
34+
index.addMovie(6, "It's a Wonderful Life", 8.6, 1946,
35+
"Each man's life touches so many other lives. If he wasn't around, it would leave an awfully good hole.");
36+
index.addMovie(7, "The Pursuit of Happyness", 8.0, 2006,
37+
"It was the pursuit of a better life, and a good opportunity to change things for the better.");
38+
index.addMovie(8, "A Few Good Men", 7.7, 1992,
39+
"You can't handle the truth! This movie has a lot of good moments and intense drama.");
40+
}
41+
42+
@Test
43+
void testAddMovie() {
44+
// Check that the index contains the correct number of movies
45+
int moviesLength = index.getMoviesLength();
46+
assertEquals(8, moviesLength);
47+
}
48+
49+
@Test
50+
void testSearchForTermFound() {
51+
int expected = 1;
52+
List<SearchResult> result = index.search("hope");
53+
int actual = result.getFirst().getDocId();
54+
assertEquals(expected, actual);
55+
}
56+
57+
@Test
58+
void testSearchRanking() {
59+
// Perform search for the term "good"
60+
List<SearchResult> results = index.search("good");
61+
assertFalse(results.isEmpty());
62+
63+
// Validate the ranking based on the provided relevance scores
64+
assertEquals(6, results.get(0).getDocId()); // It's a Wonderful Life should be ranked 1st
65+
assertEquals(7, results.get(1).getDocId()); // The Pursuit of Happyness should be ranked 2nd
66+
assertEquals(5, results.get(2).getDocId()); // Good Will Hunting should be ranked 3rd
67+
assertEquals(8, results.get(3).getDocId()); // A Few Good Men should be ranked 4th
68+
assertEquals(1, results.get(4).getDocId()); // The Shawshank Redemption should be ranked 5th
69+
70+
// Ensure the relevance scores are in descending order
71+
for (int i = 0; i < results.size() - 1; i++) {
72+
assertTrue(results.get(i).getRelevanceScore() > results.get(i + 1).getRelevanceScore());
73+
}
74+
}
75+
76+
@Test
77+
void testSearchForTermNotFound() {
78+
List<SearchResult> results = index.search("nonexistent");
79+
assertTrue(results.isEmpty());
80+
}
81+
82+
@Test
83+
void testSearchForCommonTerm() {
84+
List<SearchResult> results = index.search("the");
85+
assertFalse(results.isEmpty());
86+
assertTrue(results.size() > 1);
87+
}
88+
89+
@Test
90+
void testBM25ScoreCalculation() {
91+
List<SearchResult> results = index.search("cheese");
92+
assertEquals(1, results.size());
93+
assertEquals(4, results.getFirst().docId); // Pulp Fiction should have the highest score
94+
}
95+
96+
@Test
97+
void testCaseInsensitivity() {
98+
List<SearchResult> resultsLowerCase = index.search("hope");
99+
List<SearchResult> resultsUpperCase = index.search("HOPE");
100+
assertEquals(resultsLowerCase, resultsUpperCase);
101+
}
102+
}

0 commit comments

Comments
 (0)