Skip to content

Commit 4c1f876

Browse files
cclauss, aparibocci, and github-actions
authored
Solving the Top k most frequent words problem using a max-heap (TheAlgorithms#8685)
* Solving the `Top k most frequent words` problem using a max-heap
* Mentioning Python standard library solution in `Top k most frequent words` docstring
* ruff --fix .
* updating DIRECTORY.md

---------

Co-authored-by: Amos Paribocci <[email protected]>
Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
1 parent c1b3ea5 commit 4c1f876

File tree

4 files changed

+128
-7
lines changed

4 files changed

+128
-7
lines changed

DIRECTORY.md

+1
Original file line number | Diff line number | Diff line change
@@ -1167,6 +1167,7 @@
11671167
* [Snake Case To Camel Pascal Case](strings/snake_case_to_camel_pascal_case.py)
11681168
* [Split](strings/split.py)
11691169
* [Text Justification](strings/text_justification.py)
1170+
* [Top K Frequent Words](strings/top_k_frequent_words.py)
11701171
* [Upper](strings/upper.py)
11711172
* [Wave](strings/wave.py)
11721173
* [Wildcard Pattern Matching](strings/wildcard_pattern_matching.py)

data_structures/heap/heap.py

+25-6
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,28 @@
11
from __future__ import annotations
22

3+
from abc import abstractmethod
34
from collections.abc import Iterable
5+
from typing import Generic, Protocol, TypeVar
46

57

6-
class Heap:
8+
class Comparable(Protocol):
    """Structural type for values that support ``<``, ``>`` and ``==``.

    Used as the upper bound of the type variable ``T`` so that generic
    containers can require their elements to be mutually comparable.
    """

    @abstractmethod
    def __lt__(self: T, other: T) -> bool:
        """Return whether ``self`` compares strictly less than ``other``."""

    @abstractmethod
    def __gt__(self: T, other: T) -> bool:
        """Return whether ``self`` compares strictly greater than ``other``."""

    @abstractmethod
    def __eq__(self: T, other: object) -> bool:
        """Return whether ``self`` compares equal to ``other``."""
20+
21+
22+
T = TypeVar("T", bound=Comparable)
23+
24+
25+
class Heap(Generic[T]):
726
"""A Max Heap Implementation
827
928
>>> unsorted = [103, 9, 1, 7, 11, 15, 25, 201, 209, 107, 5]
@@ -27,7 +46,7 @@ class Heap:
2746
"""
2847

2948
def __init__(self) -> None:
30-
self.h: list[float] = []
49+
self.h: list[T] = []
3150
self.heap_size: int = 0
3251

3352
def __repr__(self) -> str:
@@ -79,7 +98,7 @@ def max_heapify(self, index: int) -> None:
7998
# fix the subsequent violation recursively if any
8099
self.max_heapify(violation)
81100

82-
def build_max_heap(self, collection: Iterable[float]) -> None:
101+
def build_max_heap(self, collection: Iterable[T]) -> None:
83102
"""build max heap from an unsorted array"""
84103
self.h = list(collection)
85104
self.heap_size = len(self.h)
@@ -88,7 +107,7 @@ def build_max_heap(self, collection: Iterable[float]) -> None:
88107
for i in range(self.heap_size // 2 - 1, -1, -1):
89108
self.max_heapify(i)
90109

91-
def extract_max(self) -> float:
110+
def extract_max(self) -> T:
92111
"""get and remove max from heap"""
93112
if self.heap_size >= 2:
94113
me = self.h[0]
@@ -102,7 +121,7 @@ def extract_max(self) -> float:
102121
else:
103122
raise Exception("Empty heap")
104123

105-
def insert(self, value: float) -> None:
124+
def insert(self, value: T) -> None:
106125
"""insert a new value into the max heap"""
107126
self.h.append(value)
108127
idx = (self.heap_size - 1) // 2
@@ -144,7 +163,7 @@ def heap_sort(self) -> None:
144163
]:
145164
print(f"unsorted array: {unsorted}")
146165

147-
heap = Heap()
166+
heap: Heap[int] = Heap()
148167
heap.build_max_heap(unsorted)
149168
print(f"after build heap: {heap}")
150169

machine_learning/linear_discriminant_analysis.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -399,7 +399,7 @@ def main():
399399
if input("Press any key to restart or 'q' for quit: ").strip().lower() == "q":
400400
print("\n" + "GoodBye!".center(100, "-") + "\n")
401401
break
402-
system("clear" if name == "posix" else "cls") # noqa: S605
402+
system("cls" if name == "nt" else "clear") # noqa: S605
403403

404404

405405
if __name__ == "__main__":

strings/top_k_frequent_words.py

+101
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,101 @@
1+
"""
2+
Finds the top K most frequent words from the provided word list.
3+
4+
This implementation aims to show how to solve the problem using the Heap class
5+
already present in this repository.
6+
Computing order statistics is, in fact, a typical usage of heaps.
7+
8+
This is mostly shown for educational purposes, since the problem can be solved
9+
in a few lines using collections.Counter from the Python standard library:
10+
11+
from collections import Counter
12+
def top_k_frequent_words(words, k_value):
13+
return [x[0] for x in Counter(words).most_common(k_value)]
14+
"""
15+
16+
17+
from collections import Counter
18+
from functools import total_ordering
19+
20+
from data_structures.heap.heap import Heap
21+
22+
23+
@total_ordering
class WordCount:
    """
    Pair a word with its number of occurrences, compared by the count alone.

    Equality and ordering look only at ``count`` and ignore ``word``, so two
    different words with the same count compare equal.  ``total_ordering``
    derives ``<=``, ``>`` and ``>=`` from ``__eq__`` and ``__lt__``.

    Since ``__eq__`` is defined without ``__hash__``, instances are unhashable.

    >>> WordCount('a', 2) > WordCount('b', 1)
    True
    >>> WordCount('a', 1) >= WordCount('b', 1)
    True
    """

    def __init__(self, word: str, count: int) -> None:
        self.word = word
        self.count = count

    def __repr__(self) -> str:
        """
        Show both fields so instances are readable when printed or logged.

        >>> WordCount('a', 1)
        WordCount(word='a', count=1)
        """
        return f"{type(self).__name__}(word={self.word!r}, count={self.count!r})"

    def __eq__(self, other: object) -> bool:
        """
        Compare by ``count`` only; defer to the other operand for foreign types.

        >>> WordCount('a', 1).__eq__(WordCount('b', 1))
        True
        >>> WordCount('a', 1).__eq__(WordCount('a', 1))
        True
        >>> WordCount('a', 1).__eq__(WordCount('a', 2))
        False
        >>> WordCount('a', 1).__eq__(WordCount('b', 2))
        False
        >>> WordCount('a', 1).__eq__(1)
        NotImplemented
        """
        if not isinstance(other, WordCount):
            # Let Python try the reflected operation instead of answering False.
            return NotImplemented
        return self.count == other.count

    def __lt__(self, other: object) -> bool:
        """
        Order by ``count`` only; defer to the other operand for foreign types.

        >>> WordCount('a', 1).__lt__(WordCount('b', 1))
        False
        >>> WordCount('a', 1).__lt__(WordCount('a', 1))
        False
        >>> WordCount('a', 1).__lt__(WordCount('a', 2))
        True
        >>> WordCount('a', 1).__lt__(WordCount('b', 2))
        True
        >>> WordCount('a', 2).__lt__(WordCount('a', 1))
        False
        >>> WordCount('a', 2).__lt__(WordCount('b', 1))
        False
        >>> WordCount('a', 1).__lt__(1)
        NotImplemented
        """
        if not isinstance(other, WordCount):
            # Let Python try the reflected operation instead of raising here.
            return NotImplemented
        return self.count < other.count
66+
67+
68+
def top_k_frequent_words(words: list[str], k_value: int) -> list[str]:
    """
    Return the `k_value` most common words, most frequent first.

    Each element of `words` counts as one occurrence of a word.  The distinct
    words are counted, loaded into a max-heap keyed on their count, and the
    maximum is extracted repeatedly.  The relative order of words with equal
    counts is whatever the heap yields and is not specified here.

    If `k_value` exceeds the number of distinct words, every distinct word is
    returned instead.

    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 3)
    ['c', 'a', 'b']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 2)
    ['c', 'a']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 1)
    ['c']
    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 0)
    []
    >>> top_k_frequent_words([], 1)
    []
    >>> top_k_frequent_words(['a', 'a'], 2)
    ['a']
    """
    word_heap: Heap[WordCount] = Heap()
    frequencies = Counter(words)
    entries = [WordCount(token, seen) for token, seen in frequencies.items()]
    word_heap.build_max_heap(entries)

    # Never extract more items than the heap holds.
    result_size = min(k_value, len(frequencies))
    most_frequent = []
    for _ in range(result_size):
        most_frequent.append(word_heap.extract_max().word)
    return most_frequent
96+
97+
98+
if __name__ == "__main__":
    # Run the doctests embedded in this module when executed as a script.
    from doctest import testmod

    testmod()

0 commit comments

Comments
 (0)