From 17d6427ea4828c689097be8413e23c7d817c9ae8 Mon Sep 17 00:00:00 2001
From: thor-harsh <105957576+thor-harsh@users.noreply.github.com>
Date: Sat, 19 Aug 2023 19:17:54 +0530
Subject: [PATCH 1/2] Fix typos in k_means_clust.py docstrings and comments

---
 machine_learning/k_means_clust.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py
index 7c8142aab878..c4dbf0a1971f 100644
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@@ -11,10 +11,10 @@
   - initial_centroids , initial centroid values generated by utility function(mentioned
     in usage).
   - maxiter , maximum number of iterations to process.
-  - heterogeneity , empty list that will be filled with hetrogeneity values if passed
+  - heterogeneity , empty list that will be filled with heterogeneity values if passed
     to kmeans func.
 Usage:
-  1. define 'k' value, 'X' features array and 'hetrogeneity' empty list
+  1. define 'k' value, 'X' features array and 'heterogeneity' empty list
   2. create initial_centroids,
         initial_centroids = get_initial_centroids(
             X,
@@ -31,8 +31,8 @@
             record_heterogeneity=heterogeneity,
             verbose=True # whether to print logs in console or not.(default=False)
             )
-  4. Plot the loss function, hetrogeneity values for every iteration saved in
-     hetrogeneity list.
+  4. Plot the loss function, and heterogeneity values for every iteration saved in
+     heterogeneity list.
         plot_heterogeneity(
             heterogeneity,
             k
@@ -198,10 +198,10 @@ def report_generator(
     df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
 ) -> pd.DataFrame:
     """
-    Function generates easy-erading clustering report. It takes 2 arguments as an input:
-        DataFrame - dataframe with predicted cluester column;
+    Function generates an easy-erading clustering report. It takes 2 arguments as an input:
+        DataFrame - dataframe with predicted cluster column;
         FillMissingReport - dictionary of rules how we are going to fill missing
-        values of for final report generate (not included in modeling);
+        values of for final report generated (not included in modelling);
     in order to run the function following libraries must be imported:
         import pandas as pd
         import numpy as np
@@ -309,7 +309,7 @@ def report_generator(
     )  # drop count values except cluster size
     report = pd.concat(
         [report, a, clustersize, clusterproportion], axis=0
-    )  # concat report with clustert size and nan values
+    )  # concat report with cluster size and nan values
     report["Mark"] = report["Features"].isin(clustering_variables)
     cols = report.columns.tolist()
     cols = cols[0:2] + cols[-1:] + cols[2:-1]

From f06bee6109c68238d1fcc780498ed64d5ed56dab Mon Sep 17 00:00:00 2001
From: Tianyi Zheng <tianyizheng02@gmail.com>
Date: Wed, 27 Sep 2023 02:20:49 -0400
Subject: [PATCH 2/2] Apply suggestions from code review

---
 machine_learning/k_means_clust.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py
index c4dbf0a1971f..d93c5addf2ee 100644
--- a/machine_learning/k_means_clust.py
+++ b/machine_learning/k_means_clust.py
@@ -31,7 +31,7 @@
             record_heterogeneity=heterogeneity,
             verbose=True # whether to print logs in console or not.(default=False)
             )
-  4. Plot the loss function, and heterogeneity values for every iteration saved in
+  4. Plot the loss function, i.e. the heterogeneity values for every iteration saved in
      heterogeneity list.
         plot_heterogeneity(
             heterogeneity,
@@ -198,13 +198,10 @@ def report_generator(
     df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
 ) -> pd.DataFrame:
     """
-    Function generates an easy-erading clustering report. It takes 2 arguments as an input:
-        DataFrame - dataframe with predicted cluster column;
-        FillMissingReport - dictionary of rules how we are going to fill missing
-        values of for final report generated (not included in modelling);
-    in order to run the function following libraries must be imported:
-        import pandas as pd
-        import numpy as np
+    Generates a clustering report. This function takes 2 arguments as input:
+        df - dataframe with predicted cluster column;
+        fill_missing_report - dictionary of rules for filling in missing values
+        in the final generated report (not included in modelling).
     >>> data = pd.DataFrame()
     >>> data['numbers'] = [1, 2, 3]
     >>> data['col1'] = [0.5, 2.5, 4.5]
@@ -306,7 +303,7 @@ def report_generator(
     a.columns = report.columns  # rename columns to match report
     report = report.drop(
         report[report.Type == "count"].index
-    )  # drop count values except cluster size
+    )  # drop count values except for cluster size
     report = pd.concat(
         [report, a, clustersize, clusterproportion], axis=0
     )  # concat report with cluster size and nan values