From 17d6427ea4828c689097be8413e23c7d817c9ae8 Mon Sep 17 00:00:00 2001 From: thor-harsh <105957576+thor-harsh@users.noreply.github.com> Date: Sat, 19 Aug 2023 19:17:54 +0530 Subject: [PATCH 1/2] Update k_means_clust.py --- machine_learning/k_means_clust.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py index 7c8142aab878..c4dbf0a1971f 100644 --- a/machine_learning/k_means_clust.py +++ b/machine_learning/k_means_clust.py @@ -11,10 +11,10 @@ - initial_centroids , initial centroid values generated by utility function(mentioned in usage). - maxiter , maximum number of iterations to process. - - heterogeneity , empty list that will be filled with hetrogeneity values if passed + - heterogeneity , empty list that will be filled with heterogeneity values if passed to kmeans func. Usage: - 1. define 'k' value, 'X' features array and 'hetrogeneity' empty list + 1. define 'k' value, 'X' features array and 'heterogeneity' empty list 2. create initial_centroids, initial_centroids = get_initial_centroids( X, @@ -31,8 +31,8 @@ record_heterogeneity=heterogeneity, verbose=True # whether to print logs in console or not.(default=False) ) - 4. Plot the loss function, hetrogeneity values for every iteration saved in - hetrogeneity list. + 4. Plot the loss function, and heterogeneity values for every iteration saved in + heterogeneity list. plot_heterogeneity( heterogeneity, k @@ -198,10 +198,10 @@ def report_generator( df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None ) -> pd.DataFrame: """ - Function generates easy-erading clustering report. It takes 2 arguments as an input: - DataFrame - dataframe with predicted cluester column; + Function generates an easy-erading clustering report. 
It takes 2 arguments as an input: + DataFrame - dataframe with predicted cluster column; FillMissingReport - dictionary of rules how we are going to fill missing - values of for final report generate (not included in modeling); + values of for final report generated (not included in modelling); in order to run the function following libraries must be imported: import pandas as pd import numpy as np @@ -309,7 +309,7 @@ def report_generator( ) # drop count values except cluster size report = pd.concat( [report, a, clustersize, clusterproportion], axis=0 - ) # concat report with clustert size and nan values + ) # concat report with cluster size and nan values report["Mark"] = report["Features"].isin(clustering_variables) cols = report.columns.tolist() cols = cols[0:2] + cols[-1:] + cols[2:-1] From f06bee6109c68238d1fcc780498ed64d5ed56dab Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Wed, 27 Sep 2023 02:20:49 -0400 Subject: [PATCH 2/2] Apply suggestions from code review --- machine_learning/k_means_clust.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/machine_learning/k_means_clust.py b/machine_learning/k_means_clust.py index c4dbf0a1971f..d93c5addf2ee 100644 --- a/machine_learning/k_means_clust.py +++ b/machine_learning/k_means_clust.py @@ -31,7 +31,7 @@ record_heterogeneity=heterogeneity, verbose=True # whether to print logs in console or not.(default=False) ) - 4. Plot the loss function, and heterogeneity values for every iteration saved in + 4. Plot the loss function and heterogeneity values for every iteration saved in heterogeneity list. plot_heterogeneity( heterogeneity, @@ -198,13 +198,10 @@ def report_generator( df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None ) -> pd.DataFrame: """ - Function generates an easy-erading clustering report. 
It takes 2 arguments as an input: - DataFrame - dataframe with predicted cluster column; - FillMissingReport - dictionary of rules how we are going to fill missing - values of for final report generated (not included in modelling); - in order to run the function following libraries must be imported: - import pandas as pd - import numpy as np + Generates a clustering report. This function takes 2 arguments as input: + df - dataframe with predicted cluster column + fill_missing_report - dictionary of rules on how we are going to fill in missing + values for the final generated report (not included in modelling) >>> data = pd.DataFrame() >>> data['numbers'] = [1, 2, 3] >>> data['col1'] = [0.5, 2.5, 4.5] @@ -306,7 +303,7 @@ def report_generator( a.columns = report.columns # rename columns to match report report = report.drop( report[report.Type == "count"].index - ) # drop count values except cluster size + ) # drop count values except for cluster size report = pd.concat( [report, a, clustersize, clusterproportion], axis=0 ) # concat report with cluster size and nan values