Commit 4f1bbdd

Added: Streaming median content from webinar
1 parent a240855 commit 4f1bbdd

File tree: 4 files changed, +479 -0 lines changed

README.md

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ These examples provide quick walkthroughs to get you up and running with Amazon

These examples provide more thorough mathematical treatment on a select group of algorithms.

+- [Streaming Median](scientific_details_of_algorithms/streaming_median) sequentially introduces concepts used in streaming algorithms, which many SageMaker algorithms rely on to deliver speed and scalability.
- [Latent Dirichlet Allocation (LDA)](scientific_details_of_algorithms/lda_topic_modeling) dives into Amazon SageMaker's spectral decomposition approach to LDA.

### Advanced Amazon SageMaker Functionality

scientific_details_of_algorithms/README.md

Lines changed: 1 addition & 0 deletions
@@ -4,4 +4,5 @@

These examples provide more thorough mathematical treatment on a select group of algorithms.

+- [Streaming Median](streaming_median) sequentially introduces concepts used in streaming algorithms, which many SageMaker algorithms rely on to deliver speed and scalability.
- [Latent Dirichlet Allocation (LDA)](lda_topic_modeling) dives into Amazon SageMaker's spectral decomposition approach to LDA.

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@

#!/usr/bin/python3
# NOTE: requires Python 3 (uses list.clear() and true division).

# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not
# use this file except in compliance with the License. A copy of the License
# is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

import sys
from random import randint


class KLL300:
    """A fixed-size KLL (Karnin-Lang-Liberty) sketch: holds at most ~300
    items and answers approximate rank/CDF queries over a data stream."""

    def __init__(self):
        self.maxSize = 300
        self.size = 0
        # Capacity of each level; an item stored at level h carries weight 2**h.
        self.capacities = [2, 2, 4, 6, 10, 18, 28, 44, 70, 112]
        self.H = len(self.capacities)
        self.compactors = [Compactor() for _ in range(self.H)]

    def sizef(self):
        # Actual number of items currently held across all levels.
        return sum(len(c) for c in self.compactors)

    def update(self, item):
        # New items enter at level 0 with weight 1.
        self.compactors[0].append(item)
        self.size += 1
        if self.size >= self.maxSize:
            # Compact the lowest over-capacity level: half of its items
            # (chosen by random parity) move up one level with doubled
            # weight, the other half are discarded.
            for h in range(self.H - 1):
                if len(self.compactors[h]) >= self.capacities[h]:
                    newItems = self.compactors[h].compact()
                    self.compactors[h + 1].extend(newItems)
                    break
            self.size = self.sizef()
            assert self.size < self.maxSize

    def cdf(self):
        # Gather every stored item together with its weight 2**h ...
        itemsAndWeights = []
        for (h, items) in enumerate(self.compactors):
            itemsAndWeights.extend((item, 2 ** h) for item in items)
        itemsAndWeights.sort()
        items = [t[0] for t in itemsAndWeights]
        weights = [t[1] for t in itemsAndWeights]
        # ... accumulate the weights into a running sum ...
        for i in range(len(weights) - 1):
            weights[i + 1] += weights[i]
        totWeight = weights[-1]
        # ... and normalize: returns (sorted items, estimated CDF values).
        return items, [w / totWeight for w in weights]


class Compactor(list):
    def compact(self):
        # Sort the buffer, yield every other item starting at a random
        # offset (these survivors move up a level), then clear the buffer
        # once the caller has exhausted the generator.
        self.sort()
        offset = randint(0, 1)
        for item in self[offset::2]:
            yield item
        self.clear()
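
For reference, a minimal usage sketch (not part of the committed file; the approx_quantile helper and the uniform test stream are illustrative) showing how the KLL300 sketch above can be queried for an approximate median via its cdf() output:

# Illustrative usage; assumes the KLL300 class above is in scope or importable.
# approx_quantile is a hypothetical helper, not part of the committed file.
from random import random

def approx_quantile(sketch, q):
    # cdf() returns sorted items with their estimated cumulative probabilities;
    # return the first item whose cumulative probability reaches q.
    items, probs = sketch.cdf()
    for item, p in zip(items, probs):
        if p >= q:
            return item
    return items[-1]

if __name__ == "__main__":
    sketch = KLL300()
    for _ in range(10000):
        sketch.update(random())          # stream of Uniform(0, 1) samples
    print(approx_quantile(sketch, 0.5))  # approximate median, roughly 0.5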
