|
47 | 47 | k
|
48 | 48 | )
|
49 | 49 |
|
50 |
| - 5. Have fun.. |
| 50 | + 5. Transfers the DataFrame into Excel format; it must have a feature called |
| 51 | + 'Clust' with the k-means clustering numbers in it. |
| 52 | +
|
51 | 53 |
|
52 | 54 | """
|
53 | 55 | import numpy as np
|
| 56 | +import pandas as pd |
54 | 57 | from matplotlib import pyplot as plt
|
55 | 58 | from sklearn.metrics import pairwise_distances
|
| 59 | +import warnings |
| 60 | + |
| 61 | +warnings.filterwarnings("ignore") |
56 | 62 |
|
57 | 63 | TAG = "K-MEANS-CLUST/ "
|
58 | 64 |
|
@@ -202,3 +208,158 @@ def kmeans(
|
202 | 208 | verbose=True,
|
203 | 209 | )
|
204 | 210 | plot_heterogeneity(heterogeneity, k)
|
| 211 | + |
| 212 | + |
def ReportGenerator(
    df: pd.DataFrame, ClusteringVariables: np.ndarray, FillMissingReport=None
) -> pd.DataFrame:
    """
    Generate an easy-reading clustering report.

    Parameters
    ----------
    df : pd.DataFrame
        Data with a predicted cluster column named 'Cluster'.
    ClusteringVariables : sequence of str
        Names of the features that were used for clustering; these are
        flagged with ``Mark == True`` in the report.
    FillMissingReport : dict, optional
        Rules passed to ``DataFrame.fillna`` to fill missing values before
        the report is built.  A falsy value (``None``, ``0``, ``{}``)
        deliberately skips the fill step (the doctest below passes 0).

    Returns
    -------
    pd.DataFrame
        One row per (feature, statistic) pair and one column per cluster,
        plus 'Features', 'Type' and 'Mark' columns.

    >>> data = pd.DataFrame()
    >>> data['numbers'] = [1, 2, 3]
    >>> data['col1'] = [0.5, 2.5, 4.5]
    >>> data['col2'] = [100, 200, 300]
    >>> data['col3'] = [10, 20, 30]
    >>> data['Cluster'] = [1, 1, 2]
    >>> ReportGenerator(data, ['col1', 'col2'], 0)
                   Features               Type        Mark           1           2
    0        # of Customers        ClusterSize       False    2.000000    1.000000
    1        % of Customers  ClusterProportion       False    0.666667    0.333333
    2                  col1    mean_with_zeros        True    1.500000    4.500000
    3                  col2    mean_with_zeros        True  150.000000  300.000000
    4               numbers    mean_with_zeros       False    1.500000    3.000000
    ..                  ...                ...         ...         ...         ...
    99                dummy                 5%       False    1.000000    1.000000
    100               dummy                95%       False    1.000000    1.000000
    101               dummy              stdev       False    0.000000         NaN
    102               dummy               mode       False    1.000000    1.000000
    103               dummy             median       False    1.000000    1.000000
    <BLANKLINE>
    [104 rows x 5 columns]
    """
    # Work on a copy so the caller's DataFrame is never mutated (the
    # original version filled NaNs and added a 'dummy' column in place).
    df = df.copy()

    # Fill missing values with the given rules; a falsy argument skips it.
    if FillMissingReport:
        df = df.fillna(value=FillMissingReport)

    # Constant helper column: its per-cluster 'count' yields the cluster size.
    df["dummy"] = 1
    numeric_cols = df.select_dtypes(np.number).columns

    # Construct the raw report: group by cluster number, then compute one
    # aggregate per (feature, statistic) pair and transpose so statistics
    # become rows and clusters become columns.
    report = (
        df.groupby(["Cluster"])[numeric_cols]
        .agg(
            [
                ("sum", lambda x: x.sum()),
                ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
                # np.nan (np.NaN was removed in NumPy 2.0)
                ("mean_without_zeros", lambda x: x.replace(0, np.nan).mean()),
                (
                    "mean_25-75",
                    lambda x: np.mean(
                        np.nan_to_num(
                            sorted(x)[
                                round((len(x) * 25 / 100)) : round(len(x) * 75 / 100)
                            ]
                        )
                    ),
                ),
                ("mean_with_na", lambda x: x.mean()),
                ("min", lambda x: x.min()),
                ("5%", lambda x: x.quantile(0.05)),
                ("25%", lambda x: x.quantile(0.25)),
                ("50%", lambda x: x.quantile(0.50)),
                ("75%", lambda x: x.quantile(0.75)),
                ("95%", lambda x: x.quantile(0.95)),
                ("max", lambda x: x.max()),
                ("count", lambda x: x.count()),
                ("stdev", lambda x: x.std()),
                ("mode", lambda x: x.mode()[0]),
                ("median", lambda x: x.median()),
                ("# > 0", lambda x: (x > 0).sum()),
            ]
        )
        .T.reset_index()
        .rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
    )

    # Cluster sizes: the non-null 'count' of the constant dummy column.
    # .copy() avoids SettingWithCopy warnings on the relabelling below.
    clustersize = report[
        (report["Features"] == "dummy") & (report["Type"] == "count")
    ].copy()
    clustersize.Type = "ClusterSize"
    clustersize.Features = "# of Customers"

    # Cluster proportions; in the final table the 'Features' cell reads
    # '% of Customers' and 'Type' reads 'ClusterProportion'.
    clusterproportion = pd.DataFrame(
        clustersize.iloc[:, 2:].values / clustersize.iloc[:, 2:].values.sum()
    )
    clusterproportion.insert(0, "Features", "% of Customers")
    clusterproportion.insert(1, "Type", "ClusterProportion")
    clusterproportion.columns = report.columns  # align with report for concat

    # Per-feature NaN counts: cluster size minus the non-null count.
    nan_counts = pd.DataFrame(
        abs(
            report[report["Type"] == "count"].iloc[:, 2:].values
            - clustersize.iloc[:, 2:].values
        )
    )
    nan_counts.insert(
        0, "Features", report[report["Type"] == "count"].Features.tolist()
    )
    nan_counts.insert(1, "Type", "# of nan")
    nan_counts.columns = report.columns  # align with report for concat

    # Replace the raw 'count' rows with the derived rows built above.
    report = report.drop(report[report.Type == "count"].index)
    report = pd.concat([report, nan_counts, clustersize, clusterproportion], axis=0)

    # Flag features used for clustering and move 'Mark' next to 'Type'.
    report["Mark"] = report["Features"].isin(ClusteringVariables)
    cols = report.columns.tolist()
    report = report[cols[0:2] + cols[-1:] + cols[2:-1]]

    # Display priority per statistic; statistics absent from this map get
    # NaN from .map and therefore sort to the bottom.
    sorter1 = {
        "ClusterSize": 9,
        "ClusterProportion": 8,
        "mean_with_zeros": 7,
        "mean_with_na": 6,
        "max": 5,
        "50%": 4,
        "min": 3,
        "25%": 2,
        "75%": 1,
        "# of nan": 0,
        "# > 0": -1,
        # NOTE(review): no statistic is actually named 'sum_with_na' (the agg
        # step produces 'sum'), so this entry never matches; kept as-is to
        # preserve the original row ordering.
        "sum_with_na": -2,
    }
    report = (
        report.assign(
            Sorter1=lambda x: x.Type.map(sorter1),
            Sorter2=lambda x: list(reversed(range(len(x)))),
        )
        .sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
        .drop(["Sorter1", "Sorter2"], axis=1)
    )
    report.columns.name = ""
    return report.reset_index(drop=True)
| 360 | + |
| 361 | + |
if __name__ == "__main__":
    # Run this module's doctests when executed as a script.
    from doctest import testmod

    testmod()
0 commit comments