|
47 | 47 | k
|
48 | 48 | )
|
49 | 49 |
|
50 |
| - 5. Have fun.. |
| 50 | + 5. Transfers the DataFrame into Excel format; it must have a feature called |
| 51 | + 'Clust' with the k-means clustering numbers in it. |
| 52 | +
|
51 | 53 |
|
52 | 54 | """
|
53 | 55 | import numpy as np
|
| 56 | +import pandas as pd |
54 | 57 | from matplotlib import pyplot as plt
|
55 | 58 | from sklearn.metrics import pairwise_distances
|
| 59 | +import warnings |
| 60 | + |
| 61 | +warnings.filterwarnings("ignore") |
56 | 62 |
|
57 | 63 | TAG = "K-MEANS-CLUST/ "
|
58 | 64 |
|
@@ -202,3 +208,158 @@ def kmeans(
|
202 | 208 | verbose=True,
|
203 | 209 | )
|
204 | 210 | plot_heterogeneity(heterogeneity, k)
|
| 211 | + |
| 212 | + |
def ReportGenerator(
    df: pd.DataFrame, ClusteringVariables: np.ndarray, FillMissingReport=None
) -> pd.DataFrame:
    """
    Generate an easy-reading clustering report.

    Parameters
    ----------
    df : pd.DataFrame
        Data with a predicted cluster column named 'Cluster'.
    ClusteringVariables : sequence of str
        Names of the features that were used for clustering; these are
        flagged with ``Mark == True`` in the report.
    FillMissingReport : dict, optional
        Rules passed to ``DataFrame.fillna`` to fill missing values before
        the report is built.  A falsy value (``None``, ``0``, ``{}``)
        deliberately skips the fill step (the doctest below passes 0).

    Returns
    -------
    pd.DataFrame
        One row per (feature, statistic) pair and one column per cluster,
        plus 'Features', 'Type' and 'Mark' columns.

    >>> data = pd.DataFrame()
    >>> data['numbers'] = [1, 2, 3]
    >>> data['col1'] = [0.5, 2.5, 4.5]
    >>> data['col2'] = [100, 200, 300]
    >>> data['col3'] = [10, 20, 30]
    >>> data['Cluster'] = [1, 1, 2]
    >>> ReportGenerator(data, ['col1', 'col2'], 0)
                   Features               Type        Mark           1           2
    0        # of Customers        ClusterSize       False    2.000000    1.000000
    1        % of Customers  ClusterProportion       False    0.666667    0.333333
    2                  col1    mean_with_zeros        True    1.500000    4.500000
    3                  col2    mean_with_zeros        True  150.000000  300.000000
    4               numbers    mean_with_zeros       False    1.500000    3.000000
    ..                  ...                ...         ...         ...         ...
    99                dummy                 5%       False    1.000000    1.000000
    100               dummy                95%       False    1.000000    1.000000
    101               dummy              stdev       False    0.000000         NaN
    102               dummy               mode       False    1.000000    1.000000
    103               dummy             median       False    1.000000    1.000000
    <BLANKLINE>
    [104 rows x 5 columns]
    """
    # Work on a copy so the caller's DataFrame is never mutated (the
    # original version filled NaNs and added a 'dummy' column in place).
    df = df.copy()

    # Fill missing values with the given rules; a falsy argument skips it.
    if FillMissingReport:
        df = df.fillna(value=FillMissingReport)

    # Constant helper column: its per-cluster 'count' yields the cluster size.
    df["dummy"] = 1
    numeric_cols = df.select_dtypes(np.number).columns

    # Construct the raw report: group by cluster number, then compute one
    # aggregate per (feature, statistic) pair and transpose so statistics
    # become rows and clusters become columns.
    report = (
        df.groupby(["Cluster"])[numeric_cols]
        .agg(
            [
                ("sum", lambda x: x.sum()),
                ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
                # np.nan (np.NaN was removed in NumPy 2.0)
                ("mean_without_zeros", lambda x: x.replace(0, np.nan).mean()),
                (
                    "mean_25-75",
                    lambda x: np.mean(
                        np.nan_to_num(
                            sorted(x)[
                                round((len(x) * 25 / 100)) : round(len(x) * 75 / 100)
                            ]
                        )
                    ),
                ),
                ("mean_with_na", lambda x: x.mean()),
                ("min", lambda x: x.min()),
                ("5%", lambda x: x.quantile(0.05)),
                ("25%", lambda x: x.quantile(0.25)),
                ("50%", lambda x: x.quantile(0.50)),
                ("75%", lambda x: x.quantile(0.75)),
                ("95%", lambda x: x.quantile(0.95)),
                ("max", lambda x: x.max()),
                ("count", lambda x: x.count()),
                ("stdev", lambda x: x.std()),
                ("mode", lambda x: x.mode()[0]),
                ("median", lambda x: x.median()),
                ("# > 0", lambda x: (x > 0).sum()),
            ]
        )
        .T.reset_index()
        .rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
    )

    # Cluster sizes: the non-null 'count' of the constant dummy column.
    # .copy() avoids SettingWithCopy warnings on the relabelling below.
    clustersize = report[
        (report["Features"] == "dummy") & (report["Type"] == "count")
    ].copy()
    clustersize.Type = "ClusterSize"
    clustersize.Features = "# of Customers"

    # Cluster proportions; in the final table the 'Features' cell reads
    # '% of Customers' and 'Type' reads 'ClusterProportion'.
    clusterproportion = pd.DataFrame(
        clustersize.iloc[:, 2:].values / clustersize.iloc[:, 2:].values.sum()
    )
    clusterproportion.insert(0, "Features", "% of Customers")
    clusterproportion.insert(1, "Type", "ClusterProportion")
    clusterproportion.columns = report.columns  # align with report for concat

    # Per-feature NaN counts: cluster size minus the non-null count.
    nan_counts = pd.DataFrame(
        abs(
            report[report["Type"] == "count"].iloc[:, 2:].values
            - clustersize.iloc[:, 2:].values
        )
    )
    nan_counts.insert(
        0, "Features", report[report["Type"] == "count"].Features.tolist()
    )
    nan_counts.insert(1, "Type", "# of nan")
    nan_counts.columns = report.columns  # align with report for concat

    # Replace the raw 'count' rows with the derived rows built above.
    report = report.drop(report[report.Type == "count"].index)
    report = pd.concat([report, nan_counts, clustersize, clusterproportion], axis=0)

    # Flag features used for clustering and move 'Mark' next to 'Type'.
    report["Mark"] = report["Features"].isin(ClusteringVariables)
    cols = report.columns.tolist()
    report = report[cols[0:2] + cols[-1:] + cols[2:-1]]

    # Display priority per statistic; statistics absent from this map get
    # NaN from .map and therefore sort to the bottom.
    sorter1 = {
        "ClusterSize": 9,
        "ClusterProportion": 8,
        "mean_with_zeros": 7,
        "mean_with_na": 6,
        "max": 5,
        "50%": 4,
        "min": 3,
        "25%": 2,
        "75%": 1,
        "# of nan": 0,
        "# > 0": -1,
        # NOTE(review): no statistic is actually named 'sum_with_na' (the agg
        # step produces 'sum'), so this entry never matches; kept as-is to
        # preserve the original row ordering.
        "sum_with_na": -2,
    }
    report = (
        report.assign(
            Sorter1=lambda x: x.Type.map(sorter1),
            Sorter2=lambda x: list(reversed(range(len(x)))),
        )
        .sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
        .drop(["Sorter1", "Sorter2"], axis=1)
    )
    report.columns.name = ""
    return report.reset_index(drop=True)
| 360 | + |
| 361 | + |
if __name__ == "__main__":
    # Run this module's doctests when executed as a script.
    from doctest import testmod

    testmod()
0 commit comments