Skip to content

Commit 8ed3e47

Browse files
author
cureprotocols
committed
Add Reservoir Sampling algorithm for streaming data
1 parent 213ff55 commit 8ed3e47

File tree

1 file changed

+48
-0
lines changed

1 file changed

+48
-0
lines changed

Diff for: searches/reservoir_sampling.py

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""
2+
Reservoir Sampling Algorithm
3+
4+
Use Case:
5+
Efficient for selecting k random items from a data stream of unknown size,
6+
or when the entire dataset cannot fit into memory.
7+
8+
Complexity:
9+
- Time: O(n), where n is the total number of items in the stream
10+
- Space: O(k), where k is the number of items kept in the sample
11+
12+
Author: Michael Alexander Montoya
13+
"""
14+
15+
import random
16+
17+
def reservoir_sampling(stream, k):
    """
    Draw a uniform random sample of up to k items from an iterable in one pass.

    The first k items fill the reservoir. Each later item, at zero-based
    position i, overwrites a reservoir slot with probability k / (i + 1).
    Only the reservoir is held in memory, so the stream may be a generator
    whose length is not known in advance.

    Args:
        stream: An iterable data stream; it is iterated over exactly once.
        k: Number of items to sample.

    Returns:
        A list of sampled items. It holds k items when the stream yields at
        least k items; otherwise it holds every item of the stream in arrival
        order. It is empty when k is zero or negative.

    >>> reservoir_sampling([1, 2, 3], 5)
    [1, 2, 3]
    >>> reservoir_sampling([], 3)
    []
    >>> reservoir_sampling(range(100), 0)
    []
    >>> len(reservoir_sampling(range(1000), 10))
    10
    """
    # Nothing can be selected, so avoid consuming the stream (which may be
    # unbounded) and drawing random numbers that would all be discarded.
    if k <= 0:
        return []

    reservoir = []

    for i, item in enumerate(stream):
        if i < k:
            # Fill phase: keep the first k items unconditionally.
            reservoir.append(item)
            continue

        # j is uniform over the i + 1 items seen so far (randint is
        # inclusive on both ends); the new item is kept only if j lands
        # inside the reservoir, in which case it evicts slot j.
        j = random.randint(0, i)
        if j < k:
            reservoir[j] = item

    return reservoir
40+
41+
42+
# Example usage: sample 10 values from a simulated stream and print them.
if __name__ == "__main__":
    # Simulate a stream of numbers from 1 to 1000
    stream_data = range(1, 1001)
    sample_size = 10

    sample = reservoir_sampling(stream_data, sample_size)
    print(f"Random sample of {sample_size} items from stream: {sample}")

0 commit comments

Comments
 (0)