Skip to content

Commit c070a7e

Browse files
authored
Merge pull request aws#215 from awslabs/edoliberty_streaming_median
Added: Streaming median content from webinar
2 parents 5ba283e + 4f1bbdd commit c070a7e

File tree

4 files changed

+479
-0
lines changed

4 files changed

+479
-0
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ These examples provide quick walkthroughs to get you up and running with Amazon
3636

3737
These examples provide more thorough mathematical treatment on a select group of algorithms.
3838

39+
- [Streaming Median](scientific_details_of_algorithms/streaming_median) sequentially introduces concepts used in streaming algorithms, which many SageMaker algorithms rely on to deliver speed and scalability.
3940
- [Latent Dirichlet Allocation (LDA)](scientific_details_of_algorithms/lda_topic_modeling) dives into Amazon SageMaker's spectral decomposition approach to LDA.
4041

4142
### Advanced Amazon SageMaker Functionality

scientific_details_of_algorithms/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44

55
These examples provide more thorough mathematical treatment on a select group of algorithms.
66

7+
- [Streaming Median](streaming_median) sequentially introduces concepts used in streaming algorithms, which many SageMaker algorithms rely on to deliver speed and scalability.
78
- [Latent Dirichlet Allocation (LDA)](lda_topic_modeling) dives into Amazon SageMaker's spectral decomposition approach to LDA.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/python
2+
3+
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the License. A copy of the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
10+
11+
import sys
12+
from random import randint
13+
14+
class KLL300:
    """Streaming quantile sketch made of a fixed stack of compactors.

    New items are appended to level 0.  When the total number of stored
    items reaches ``maxSize``, the lowest level that has reached its
    capacity is compacted and the items it emits are moved one level up.
    An item stored at level ``h`` is counted with weight ``2**h`` when the
    CDF is estimated.

    NOTE: the compaction loop only visits levels ``0 .. H - 2``, so the top
    level is never compacted.  If the sketch reaches ``maxSize`` while every
    lower level is below its capacity, no compaction happens and the
    assertion at the end of ``update`` fails.
    """

    def __init__(self):
        self.maxSize = 300  # total item count that triggers a compaction
        self.size = 0  # running count of items stored across all levels
        # Per-level capacity, indexed by level (level 0 first).
        self.capacities = [2, 2, 4, 6, 10, 18, 28, 44, 70, 112]
        self.H = len(self.capacities)
        self.compactors = [Compactor() for _ in range(self.H)]

    def sizef(self):
        """Return the number of items currently stored across all levels."""
        return sum(len(c) for c in self.compactors)

    def update(self, item):
        """Insert ``item`` into the sketch, compacting one level if full.

        Raises:
            AssertionError: if the sketch is still at or above ``maxSize``
                after the compaction attempt.
        """
        self.compactors[0].append(item)
        self.size += 1
        if self.size >= self.maxSize:
            # Compact only the lowest level that has reached its capacity.
            for h in range(self.H - 1):
                if len(self.compactors[h]) >= self.capacities[h]:
                    newItems = self.compactors[h].compact()
                    # extend() drains the generator, which also empties
                    # the compacted level.
                    self.compactors[h + 1].extend(newItems)
                    break
            self.size = self.sizef()
            assert self.size < self.maxSize

    def cdf(self):
        """Return the estimated cumulative distribution of the stream.

        Returns:
            A pair ``(items, fractions)``: ``items`` is the sorted list of
            stored items and ``fractions[i]`` is the cumulative weight up to
            and including ``items[i]`` divided by the total weight.  Both
            lists are empty when the sketch holds no items.
        """
        weighted = []
        for h, items in enumerate(self.compactors):
            weighted.extend((item, 2 ** h) for item in items)
        if not weighted:
            # Nothing stored yet: avoid indexing into an empty list below.
            return [], []
        weighted.sort()
        items = [item for item, _ in weighted]
        running = 0
        cumulative = []
        for _, weight in weighted:
            running += weight
            cumulative.append(running)
        total = cumulative[-1]
        return items, [w / total for w in cumulative]
48+
49+
class Compactor(list):
    """A list that can thin itself out to every other sorted item."""

    def compact(self):
        """Lazily yield half of the sorted contents, then empty the list.

        This is a generator: nothing is sorted, yielded or cleared until the
        caller iterates it, and the list is emptied only once the generator
        has been run to exhaustion.

        Yields:
            Every second item of the sorted contents, starting at index 0
            or 1 chosen at random.
        """
        self.sort()
        # The slice is a copy, so clearing the list later cannot disturb it.
        survivors = self[randint(0, 1)::2]
        yield from survivors
        self.clear()

0 commit comments

Comments
 (0)