Skip to content

Commit d034add

Browse files
beqakdcclauss
andauthored
add visualization of k means clustering as excel format (#2104)
* add visualization of kmneas clust as excel format * style changes * style changes * Add doctest and typehint! * style change * Update machine_learning/k_means_clust.py Co-authored-by: Christian Clauss <[email protected]> * Update machine_learning/k_means_clust.py Co-authored-by: Christian Clauss <[email protected]> Co-authored-by: Christian Clauss <[email protected]>
1 parent b9e7c89 commit d034add

File tree

1 file changed

+162
-1
lines changed

1 file changed

+162
-1
lines changed

Diff for: machine_learning/k_means_clust.py

+162-1
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,18 @@
4747
k
4848
)
4949
50-
5. Have fun..
50+
5. Transfers Dataframe into excel format; it must have a feature called
51+
'Cluster' with k-means clustering numbers in it.
52+
5153
5254
"""
5355
import numpy as np
56+
import pandas as pd
5457
from matplotlib import pyplot as plt
5558
from sklearn.metrics import pairwise_distances
59+
import warnings
60+
61+
# NOTE(review): called without a category or module filter, this suppresses
# every warning raised anywhere in the process once this module is imported.
warnings.filterwarnings("ignore")
5662

5763
TAG = "K-MEANS-CLUST/ "
5864

@@ -202,3 +208,158 @@ def kmeans(
202208
verbose=True,
203209
)
204210
plot_heterogeneity(heterogeneity, k)
211+
212+
213+
def _interquartile_mean(values):
    """Return the mean of the sorted values between the 25% and 75% positions.

    NaN entries inside the selected slice are counted as zero.
    """
    ordered = sorted(values)
    size = len(ordered)
    lower = round(size * 25 / 100)
    upper = round(size * 75 / 100)
    return np.mean(np.nan_to_num(ordered[lower:upper]))


def _first_mode(values):
    """Return the first mode reported by ``Series.mode``, or NaN if there is none."""
    modes = values.mode()
    # ``mode`` drops NaN, so a group made only of missing values yields an
    # empty result; indexing it blindly would raise.
    return modes.iloc[0] if len(modes) else np.nan


def ReportGenerator(
    df: pd.DataFrame, ClusteringVariables: np.array, FillMissingReport=None
) -> pd.DataFrame:
    """
    Generate an easy-to-read clustering report.

    For every numeric column of ``df`` a set of per-cluster statistics is
    computed (sum, several means, quantiles, min/max, stdev, mode, median,
    number of positive values and number of missing values).  Two summary
    rows, the size and the proportion of each cluster, are added on top.

    The input frame is never modified; all work happens on a copy.

    Arguments:
        df - dataframe holding the data and a ``Cluster`` column with the
            predicted cluster of every row;
        ClusteringVariables - collection of column names that were used for
            clustering; the matching rows get ``Mark == True``;
        FillMissingReport - value (scalar or per-column mapping) passed to
            ``DataFrame.fillna`` before the report is built.  ``None``
            (default) leaves missing values untouched; any other value,
            including ``0``, is applied.

    Returns:
        DataFrame with the columns ``Features``, ``Type``, ``Mark`` followed
        by one column per cluster.

    >>> data = pd.DataFrame()
    >>> data['numbers'] = [1, 2, 3]
    >>> data['col1'] = [0.5, 2.5, 4.5]
    >>> data['col2'] = [100, 200, 300]
    >>> data['col3'] = [10, 20, 30]
    >>> data['Cluster'] = [1, 1, 2]
    >>> ReportGenerator(data, ['col1', 'col2'], 0)  # doctest: +NORMALIZE_WHITESPACE
    Features Type Mark 1 2
    0 # of Customers ClusterSize False 2.000000 1.000000
    1 % of Customers ClusterProportion False 0.666667 0.333333
    2 col1 mean_with_zeros True 1.500000 4.500000
    3 col2 mean_with_zeros True 150.000000 300.000000
    4 numbers mean_with_zeros False 1.500000 3.000000
    .. ... ... ... ... ...
    99 dummy 5% False 1.000000 1.000000
    100 dummy 95% False 1.000000 1.000000
    101 dummy stdev False 0.000000 NaN
    102 dummy mode False 1.000000 1.000000
    103 dummy median False 1.000000 1.000000
    <BLANKLINE>
    [104 rows x 5 columns]
    """
    # Work on a copy so the caller's frame neither gains the helper column
    # nor has its missing values overwritten.
    df = df.copy()
    # Compare against None: a fill value of 0 is falsy but perfectly valid.
    if FillMissingReport is not None:
        df = df.fillna(value=FillMissingReport)
    # Constant column whose per-cluster count equals the cluster size.
    df["dummy"] = 1
    numeric_cols = df.select_dtypes(np.number).columns.tolist()

    aggregations = [
        ("sum", "sum"),
        ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
        ("mean_without_zeros", lambda x: x.replace(0, np.nan).mean()),
        ("mean_25-75", _interquartile_mean),
        ("mean_with_na", "mean"),
        ("min", lambda x: x.min()),
        ("5%", lambda x: x.quantile(0.05)),
        ("25%", lambda x: x.quantile(0.25)),
        ("50%", lambda x: x.quantile(0.50)),
        ("75%", lambda x: x.quantile(0.75)),
        ("95%", lambda x: x.quantile(0.95)),
        ("max", lambda x: x.max()),
        ("count", lambda x: x.count()),
        ("stdev", lambda x: x.std()),
        ("mode", _first_mode),
        ("median", lambda x: x.median()),
        ("# > 0", lambda x: (x > 0).sum()),
    ]
    # One row per (feature, statistic), one column per cluster.
    report = (
        df.groupby(["Cluster"])[numeric_cols]
        .agg(aggregations)
        .T.reset_index()
        .rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
    )

    is_count = report["Type"] == "count"
    count_rows = report[is_count]

    # Cluster size = number of rows per cluster (count of the constant column).
    # ``.copy()`` gives an independent frame, so relabelling it is safe.
    clustersize = report[(report["Features"] == "dummy") & is_count].copy()
    clustersize["Type"] = "ClusterSize"
    clustersize["Features"] = "# of Customers"
    size_values = clustersize.iloc[:, 2:].values

    # Share of all rows that falls into each cluster.
    clusterproportion = pd.DataFrame(size_values / size_values.sum())
    clusterproportion.insert(0, "Type", "ClusterProportion")
    clusterproportion.insert(0, "Features", "% of Customers")
    clusterproportion.columns = report.columns

    # Missing values per feature = cluster size minus the non-null count.
    nan_counts = pd.DataFrame(abs(count_rows.iloc[:, 2:].values - size_values))
    nan_counts.insert(0, "Type", "# of nan")
    nan_counts.insert(0, "Features", count_rows["Features"].tolist())
    nan_counts.columns = report.columns

    # The raw counts are replaced by the "# of nan" rows and the size rows.
    report = report.drop(count_rows.index)
    report = pd.concat(
        [report, nan_counts, clustersize, clusterproportion], axis=0
    )

    # Flag the features that took part in the clustering and move the flag
    # right behind the two label columns.
    report["Mark"] = report["Features"].isin(ClusteringVariables)
    cols = report.columns.tolist()
    report = report[cols[:2] + cols[-1:] + cols[2:-1]]

    # Display priority of the statistic types (higher comes first); types not
    # listed here end up after the listed ones.
    # NOTE(review): the aggregate above is named "sum", so "sum_with_na" never
    # matches a row - kept as is to preserve the existing row order.
    sorter1 = {
        "ClusterSize": 9,
        "ClusterProportion": 8,
        "mean_with_zeros": 7,
        "mean_with_na": 6,
        "max": 5,
        "50%": 4,
        "min": 3,
        "25%": 2,
        "75%": 1,
        "# of nan": 0,
        "# > 0": -1,
        "sum_with_na": -2,
    }
    report = (
        report.assign(
            Sorter1=lambda x: x["Type"].map(sorter1),
            # Descending position counter: keeps the current order as the
            # final tie-breaker under the descending sort below.
            Sorter2=lambda x: list(reversed(range(len(x)))),
        )
        .sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
        .drop(["Sorter1", "Sorter2"], axis=1)
    )
    report.columns.name = ""
    return report.reset_index(drop=True)
360+
361+
362+
if __name__ == "__main__":
    # Execute the examples embedded in this module's docstrings as tests.
    from doctest import testmod

    testmod()

0 commit comments

Comments
 (0)