Skip to content

add visualization of k means clustering as excel format #2104

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jun 19, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 162 additions & 1 deletion machine_learning/k_means_clust.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,18 @@
k
)

5. Have fun..
5. Transfers Dataframe into excel format it must have feature called
'Clust' with k means clustering numbers in it.


"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import pairwise_distances
import warnings

warnings.filterwarnings("ignore")

TAG = "K-MEANS-CLUST/ "

Expand Down Expand Up @@ -202,3 +208,158 @@ def kmeans(
verbose=True,
)
plot_heterogeneity(heterogeneity, k)


def ReportGenerator(
df: pd.DataFrame, ClusteringVariables: np.array, FillMissingReport=None
) -> pd.DataFrame:
"""
Function generates easy-erading clustering report. It takes 2 arguments as an input:
DataFrame - dataframe with predicted cluester column;
FillMissingReport - dictionary of rules how we are going to fill missing
values of for final report generate (not included in modeling);
in order to run the function following libraries must be imported:
import pandas as pd
import numpy as np

>>> data = pd.DataFrame()
>>> data['numbers'] = [1, 2, 3]
>>> data['col1'] = [0.5, 2.5, 4.5]
>>> data['col2'] = [100, 200, 300]
>>> data['col3'] = [10, 20, 30]
>>> data['Cluster'] = [1, 1, 2]
>>> ReportGenerator(data, ['col1', 'col2'], 0)
Features Type Mark 1 2
0 # of Customers ClusterSize False 2.000000 1.000000
1 % of Customers ClusterProportion False 0.666667 0.333333
2 col1 mean_with_zeros True 1.500000 4.500000
3 col2 mean_with_zeros True 150.000000 300.000000
4 numbers mean_with_zeros False 1.500000 3.000000
.. ... ... ... ... ...
99 dummy 5% False 1.000000 1.000000
100 dummy 95% False 1.000000 1.000000
101 dummy stdev False 0.000000 NaN
102 dummy mode False 1.000000 1.000000
103 dummy median False 1.000000 1.000000
<BLANKLINE>
[104 rows x 5 columns]
"""
# Fill missing values with given rules
if FillMissingReport:
df.fillna(value=FillMissingReport, inplace=True)
df["dummy"] = 1
numeric_cols = df.select_dtypes(np.number).columns
report = (
df.groupby(["Cluster"])[ # constract report dataframe
numeric_cols
] # group by cluster number
.agg(
[
("sum", np.sum),
("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
("mean_without_zeros", lambda x: x.replace(0, np.NaN).mean()),
(
"mean_25-75",
lambda x: np.mean(
np.nan_to_num(
sorted(x)[
round((len(x) * 25 / 100)) : round(len(x) * 75 / 100)
]
)
),
),
("mean_with_na", np.mean),
("min", lambda x: x.min()),
("5%", lambda x: x.quantile(0.05)),
("25%", lambda x: x.quantile(0.25)),
("50%", lambda x: x.quantile(0.50)),
("75%", lambda x: x.quantile(0.75)),
("95%", lambda x: x.quantile(0.95)),
("max", lambda x: x.max()),
("count", lambda x: x.count()),
("stdev", lambda x: x.std()),
("mode", lambda x: x.mode()[0]),
("median", lambda x: x.median()),
("# > 0", lambda x: (x > 0).sum()),
]
)
.T.reset_index()
.rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
) # rename columns

clustersize = report[
(report["Features"] == "dummy") & (report["Type"] == "count")
] # caclulating size of cluster(count of clientID's)
clustersize.Type = (
"ClusterSize" # rename created cluster df to match report column names
)
clustersize.Features = "# of Customers"
clusterproportion = pd.DataFrame(
clustersize.iloc[:, 2:].values
/ clustersize.iloc[:, 2:].values.sum() # caclulating proportion of cluster
)
clusterproportion[
"Type"
] = "% of Customers" # rename created cluster df to match report column names
clusterproportion["Features"] = "ClusterProportion"
cols = clusterproportion.columns.tolist()
cols = cols[-2:] + cols[:-2]
clusterproportion = clusterproportion[cols] # rearrange columns to match report
clusterproportion.columns = report.columns
a = pd.DataFrame(
abs(
report[report["Type"] == "count"].iloc[:, 2:].values
- clustersize.iloc[:, 2:].values
)
) # generating df with count of nan values
a["Features"] = 0
a["Type"] = "# of nan"
a.Features = report[
report["Type"] == "count"
].Features.tolist() # filling values in order to match report
cols = a.columns.tolist()
cols = cols[-2:] + cols[:-2]
a = a[cols] # rearrange columns to match report
a.columns = report.columns # rename columns to match report
report = report.drop(
report[report.Type == "count"].index
) # drop count values except cluster size
report = pd.concat(
[report, a, clustersize, clusterproportion], axis=0
) # concat report with clustert size and nan values
report["Mark"] = report["Features"].isin(ClusteringVariables)
cols = report.columns.tolist()
cols = cols[0:2] + cols[-1:] + cols[2:-1]
report = report[cols]
sorter1 = {
"ClusterSize": 9,
"ClusterProportion": 8,
"mean_with_zeros": 7,
"mean_with_na": 6,
"max": 5,
"50%": 4,
"min": 3,
"25%": 2,
"75%": 1,
"# of nan": 0,
"# > 0": -1,
"sum_with_na": -2,
}
report = (
report.assign(
Sorter1=lambda x: x.Type.map(sorter1),
Sorter2=lambda x: list(reversed(range(len(x)))),
)
.sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
.drop(["Sorter1", "Sorter2"], axis=1)
)
report.columns.name = ""
report = report.reset_index()
report.drop(columns=["index"], inplace=True)
return report


if __name__ == "__main__":
import doctest

doctest.testmod()