Solving the Top k most frequent words problem using a max-heap (TheAlgorithms#8685)

cclauss · aparibocci · github-actions · web-flow · commit 4c1f87656767 · 2023-04-27T23:02:07.000+05:30
* Solving the `Top k most frequent words` problem using a max-heap

* Mentioning Python standard library solution in `Top k most frequent words` docstring

* ruff --fix .

* updating DIRECTORY.md

---------

Co-authored-by: Amos Paribocci <aparibocci@gmail.com>
Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
diff --git a/DIRECTORY.md b/DIRECTORY.md
@@ -1167,6 +1167,7 @@
   * [Snake Case To Camel Pascal Case](strings/snake_case_to_camel_pascal_case.py)
   * [Split](strings/split.py)
   * [Text Justification](strings/text_justification.py)
+  * [Top K Frequent Words](strings/top_k_frequent_words.py)
   * [Upper](strings/upper.py)
   * [Wave](strings/wave.py)
   * [Wildcard Pattern Matching](strings/wildcard_pattern_matching.py)
diff --git a/data_structures/heap/heap.py b/data_structures/heap/heap.py
@@ -1,9 +1,28 @@
 from __future__ import annotations
 
+from abc import abstractmethod
 from collections.abc import Iterable
+from typing import Generic, Protocol, TypeVar
 
 
-class Heap:
+class Comparable(Protocol):
+    """
+    Structural type for values that support ``<``, ``>`` and ``==``.
+
+    Used as the upper bound of the type variable ``T`` declared below, so the
+    annotations here refer to ``T`` lazily (``from __future__ import annotations``).
+    """
+
+    @abstractmethod
+    def __lt__(self: T, other: T) -> bool:
+        """Return whether ``self`` is strictly less than ``other``."""
+
+    @abstractmethod
+    def __gt__(self: T, other: T) -> bool:
+        """Return whether ``self`` is strictly greater than ``other``."""
+
+    @abstractmethod
+    def __eq__(self: T, other: object) -> bool:
+        """Return whether ``self`` is equal to ``other``."""
+
+
+T = TypeVar("T", bound=Comparable)
+
+
+class Heap(Generic[T]):
     """A Max Heap Implementation
 
     >>> unsorted = [103, 9, 1, 7, 11, 15, 25, 201, 209, 107, 5]
@@ -27,7 +46,7 @@ class Heap:
     """
 
     def __init__(self) -> None:
-        self.h: list[float] = []
+        self.h: list[T] = []
         self.heap_size: int = 0
 
     def __repr__(self) -> str:
@@ -79,7 +98,7 @@ def max_heapify(self, index: int) -> None:
                 # fix the subsequent violation recursively if any
                 self.max_heapify(violation)
 
-    def build_max_heap(self, collection: Iterable[float]) -> None:
+    def build_max_heap(self, collection: Iterable[T]) -> None:
         """build max heap from an unsorted array"""
         self.h = list(collection)
         self.heap_size = len(self.h)
@@ -88,7 +107,7 @@ def build_max_heap(self, collection: Iterable[float]) -> None:
             for i in range(self.heap_size // 2 - 1, -1, -1):
                 self.max_heapify(i)
 
-    def extract_max(self) -> float:
+    def extract_max(self) -> T:
         """get and remove max from heap"""
         if self.heap_size >= 2:
             me = self.h[0]
@@ -102,7 +121,7 @@ def extract_max(self) -> float:
         else:
             raise Exception("Empty heap")
 
-    def insert(self, value: float) -> None:
+    def insert(self, value: T) -> None:
         """insert a new value into the max heap"""
         self.h.append(value)
         idx = (self.heap_size - 1) // 2
@@ -144,7 +163,7 @@ def heap_sort(self) -> None:
     ]:
         print(f"unsorted array: {unsorted}")
 
-        heap = Heap()
+        heap: Heap[int] = Heap()
         heap.build_max_heap(unsorted)
         print(f"after build heap: {heap}")
 
diff --git a/machine_learning/linear_discriminant_analysis.py b/machine_learning/linear_discriminant_analysis.py
@@ -399,7 +399,7 @@ def main():
         if input("Press any key to restart or 'q' for quit: ").strip().lower() == "q":
             print("\n" + "GoodBye!".center(100, "-") + "\n")
             break
-        system("clear" if name == "posix" else "cls")  # noqa: S605
+        system("cls" if name == "nt" else "clear")  # noqa: S605
 
 
 if __name__ == "__main__":
diff --git a/strings/top_k_frequent_words.py b/strings/top_k_frequent_words.py
@@ -0,0 +1,101 @@
+"""
+Finds the top K most frequent words from the provided word list.
+
+This implementation aims to show how to solve the problem using the Heap class
+already present in this repository.
+Computing order statistics is, in fact, a typical usage of heaps.
+
+This is mostly shown for educational purposes, since the problem can be solved
+in a few lines using collections.Counter from the Python standard library:
+
+from collections import Counter
+def top_k_frequent_words(words, k_value):
+    return [x[0] for x in Counter(words).most_common(k_value)]
+"""
+
+
+from collections import Counter
+from functools import total_ordering
+
+from data_structures.heap.heap import Heap
+
+
+@total_ordering
+class WordCount:
+    """
+    Pairs a word with the number of times it occurs.
+
+    Comparisons look only at ``count``; ``word`` is ignored, so two entries
+    holding different words but the same count compare equal.
+    ``total_ordering`` derives the remaining rich comparisons from the
+    ``__eq__`` and ``__lt__`` defined here.
+    """
+
+    def __init__(self, word: str, count: int) -> None:
+        self.word, self.count = word, count
+
+    def __eq__(self, other: object) -> bool:
+        """
+        Equal when both counts match, whatever the words are.
+        Anything that is not a WordCount yields NotImplemented.
+
+        >>> WordCount('a', 1).__eq__(WordCount('b', 1))
+        True
+        >>> WordCount('a', 1).__eq__(WordCount('a', 1))
+        True
+        >>> WordCount('a', 1).__eq__(WordCount('a', 2))
+        False
+        >>> WordCount('a', 1).__eq__(WordCount('b', 2))
+        False
+        >>> WordCount('a', 1).__eq__(1)
+        NotImplemented
+        """
+        if isinstance(other, WordCount):
+            return self.count == other.count
+        return NotImplemented
+
+    def __lt__(self, other: object) -> bool:
+        """
+        Smaller when this count is strictly below the other count.
+        Anything that is not a WordCount yields NotImplemented.
+
+        >>> WordCount('a', 1).__lt__(WordCount('b', 1))
+        False
+        >>> WordCount('a', 1).__lt__(WordCount('a', 1))
+        False
+        >>> WordCount('a', 1).__lt__(WordCount('a', 2))
+        True
+        >>> WordCount('a', 1).__lt__(WordCount('b', 2))
+        True
+        >>> WordCount('a', 2).__lt__(WordCount('a', 1))
+        False
+        >>> WordCount('a', 2).__lt__(WordCount('b', 1))
+        False
+        >>> WordCount('a', 1).__lt__(1)
+        NotImplemented
+        """
+        if isinstance(other, WordCount):
+            return self.count < other.count
+        return NotImplemented
+
+
+def top_k_frequent_words(words: list[str], k_value: int) -> list[str]:
+    """
+    Returns up to `k_value` words from `words`, most frequent first
+    (non-increasing order of occurrence).
+    Every element of the provided list counts as one word.
+
+    When `k_value` is larger than the number of distinct words, every distinct
+    word is returned instead.
+
+    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 3)
+    ['c', 'a', 'b']
+    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 2)
+    ['c', 'a']
+    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 1)
+    ['c']
+    >>> top_k_frequent_words(['a', 'b', 'c', 'a', 'c', 'c'], 0)
+    []
+    >>> top_k_frequent_words([], 1)
+    []
+    >>> top_k_frequent_words(['a', 'a'], 2)
+    ['a']
+    """
+    frequencies = Counter(words)
+    # Never pop more entries than there are distinct words in the heap.
+    how_many = min(k_value, len(frequencies))
+
+    max_heap: Heap[WordCount] = Heap()
+    entries = [WordCount(*item) for item in frequencies.items()]
+    max_heap.build_max_heap(entries)
+
+    return [max_heap.extract_max().word for _ in range(how_many)]
+
+
+if __name__ == "__main__":
+    # Running the module directly executes the doctests it contains.
+    from doctest import testmod
+
+    testmod()