Skip to content

Implement BloomFilter class #4524

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Jan 20, 2023
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
import java.security.NoSuchAlgorithmException;

public class BloomFilter {
private final int size;
private final int bitCount;
private final byte[] bitmap;
private final int hashCount;
private static MessageDigest md5HashMessageDigest;
private final MessageDigest md5HashMessageDigest;

public BloomFilter(@NonNull byte[] bitmap, int padding, int hashCount) {
if (bitmap == null) {
Expand All @@ -52,38 +52,33 @@ public BloomFilter(@NonNull byte[] bitmap, int padding, int hashCount) {
}
this.bitmap = bitmap;
this.hashCount = hashCount;
this.size = bitmap.length * 8 - padding;
this.md5HashMessageDigest = getMd5HashMessageDigest();
this.bitCount = bitmap.length * 8 - padding;
this.md5HashMessageDigest = createMd5HashMessageDigest();
}

private boolean isEmpty() {
return this.size == 0;
}

/** Returns the number of bits in the bloom filter. */
@VisibleForTesting
int getSize() {
return this.size;
int getBitCount() {
return this.bitCount;
}

/**
* Check whether the given string is a possible member of the bloom filter. It might return false
* positive result, ie, the given string is not a member of the bloom filter, but the method
* returned true.
*
* @param value the string to be tested membership.
* @param value the string to be tested for membership.
* @return true if the given string might be contained in the bloom filter.
*/
public boolean mightContain(@NonNull String value) {
// Empty bitmap or empty value should always return false on membership check.
if (this.isEmpty() || value.isEmpty()) {
if (this.bitCount == 0 || value.isEmpty()) {
return false;
}

byte[] hashedValue = md5HashDigest(value);
if (hashedValue.length != 16) {
throw new RuntimeException(
"Invalid md5HashedValue.length: " + hashedValue.length + " (expected 16)");
"Invalid md5 hash array length: " + hashedValue.length + " (expected 16)");
}

long hash1 = getLongLittleEndian(hashedValue, 0);
Expand All @@ -100,19 +95,17 @@ public boolean mightContain(@NonNull String value) {

/** Hash a string using md5 hashing algorithm, and return an array of 16 bytes. */
@NonNull
private static byte[] md5HashDigest(@NonNull String value) {
private byte[] md5HashDigest(@NonNull String value) {
return md5HashMessageDigest.digest(value.getBytes(StandardCharsets.UTF_8));
}

@NonNull
private static MessageDigest getMd5HashMessageDigest() {
MessageDigest digest;
private MessageDigest createMd5HashMessageDigest() {
try {
digest = MessageDigest.getInstance("MD5");
return MessageDigest.getInstance("MD5");
} catch (NoSuchAlgorithmException e) {
throw new RuntimeException("Missing MD5 MessageDigest provider.", e);
}
return digest;
}

/** Interpret 8 bytes into a long, using little endian 2’s complement. */
Expand All @@ -125,14 +118,18 @@ private static long getLongLittleEndian(@NonNull byte[] bytes, int offset) {
}

/**
* Calculate the ith hash value based on the hashed 64bit integers, and calculate its
* Calculate the ith hash value based on the hashed 64 bit unsigned integers, and calculate its
* corresponding bit index in the bitmap to be checked.
*/
private int getBitIndex(long hash1, long hash2, int index) {
// Calculate hashed value h(i) = h1 + (i * h2).
/**
* Calculate hashed value h(i) = h1 + (i * h2).
* Even though we are interpreting hash1 and hash2 as unsigned, the addition and multiplication
* operators still perform the correct operation and give the desired overflow behavior.
*/
long combinedHash = hash1 + (hash2 * index);
long mod = unsignedRemainder(combinedHash, this.size);
return (int) mod;
long modulo = unsignedRemainder(combinedHash, this.bitCount);
return (int) modulo;
}

/**
Expand All @@ -141,7 +138,6 @@ private int getBitIndex(long hash1, long hash2, int index) {
* <p>The implementation is taken from <a
* href="https://github.com/google/guava/blob/553037486901cc60820ab7dcb38a25b6f34eba43/android/guava/src/com/google/common/primitives/UnsignedLongs.java">Guava</a>,
* simplified to our needs.
*
* <p>
*/
private static long unsignedRemainder(long dividend, long divisor) {
Expand All @@ -164,7 +160,7 @@ public String toString() {
+ "hashCount="
+ hashCount
+ ", size="
+ size
+ bitCount
+ ", bitmap=\""
+ Base64.encodeToString(bitmap, Base64.NO_WRAP)
+ "\"}";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,15 @@ public class BloomFilterTest {
@Test
public void instantiateEmptyBloomFilter() {
BloomFilter bloomFilter = new BloomFilter(new byte[0], 0, 0);
assertEquals(bloomFilter.getSize(), 0);
assertEquals(bloomFilter.getBitCount(), 0);
}

@Test
public void instantiateNonEmptyBloomFilter() {
BloomFilter bloomFilter1 = new BloomFilter(new byte[] {1}, 0, 1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For tests like instantiateNonEmptyBloomFilter(), which effectively test more than one thing, separate each test into its own logic scope using { and }. This provides several benefits, including

  1. It logically groups the two tests together, improving readability.
  2. It prevents accidental usage of variables from one test in another, improving robustness.
  3. Variable names can be re-used, increasing the symmetry between the tests, making it easier for a human to see what's the same and what's different between the cases.

For example, the body of instantiateNonEmptyBloomFilter() would become this:

{
  BloomFilter bloomFilter = new BloomFilter(new byte[] {1}, 0, 1);
  assertEquals(bloomFilter.getBitCount(), 8);
}
{
  BloomFilter bloomFilter = new BloomFilter(new byte[] {1}, 7, 1);
  assertEquals(bloomFilter.getBitCount(), 1);
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please make this change in constructorShouldThrowNPEOnNullBitmap(), constructorShouldThrowIAEOnNegativePadding(), constructorShouldThrowIAEOnNegativeHashCount(), mightContainWithEmptyStringMightReturnFalsePositiveResult(), and bloomFilterToString() too.

assertEquals(bloomFilter1.getSize(), 8);
assertEquals(bloomFilter1.getBitCount(), 8);
BloomFilter bloomFilter2 = new BloomFilter(new byte[] {1}, 7, 1);
assertEquals(bloomFilter2.getSize(), 1);
assertEquals(bloomFilter2.getBitCount(), 1);
}

@Test
Expand Down Expand Up @@ -89,7 +89,7 @@ public void constructorShouldThrowIAEOnNegativePadding() {
}

@Test
public void constructorShouldThrowIAEOnNegativeHashValue() {
public void constructorShouldThrowIAEOnNegativeHashCount() {
IllegalArgumentException emptyBloomFilterException =
assertThrows(IllegalArgumentException.class, () -> new BloomFilter(new byte[0], 0, -1));
assertThat(emptyBloomFilterException).hasMessageThat().contains("Invalid hash count: -1");
Expand Down