Handle categorical values

adamhooper · adamhooper · commit 6421b0b327be · 2019-09-26T12:25:12.000-04:00
Fixes old errors on production
diff --git a/groupby.py b/groupby.py
@@ -294,6 +294,16 @@ def groupby(
         # (hopefully) the least computationally-intense.
         agg_sets = "size"
 
+    # Got categoricals? Order the categories, so min/max work
+    category_colnames = {
+        agg.colname
+        for agg in aggregations
+        if agg.operation in {Operation.MIN, Operation.MAX}
+        and hasattr(table[colname], "cat")
+    }
+    for colname in category_colnames:
+        table[colname] = table[colname].cat.as_ordered()
+
     if group_specs:
         # aggs: DataFrame indexed by group
         # out: just the group colnames, no values yet (we'll add them later)
@@ -336,6 +346,14 @@ def groupby(
             except AttributeError:
                 out[outname] = series
 
+    # Remember those category colnames we converted to ordered? Now we need to
+    # undo that (and remove newly-unused categories).
+    for colname in out.columns:
+        column = out[colname]
+        if hasattr(column, "cat") and column.cat.ordered:
+            column.cat.remove_unused_categories(inplace=True)
+            column.cat.as_unordered(inplace=True)
+
     return out
 
 
diff --git a/test_groupby.py b/test_groupby.py
@@ -341,7 +341,7 @@ def test_aggregate_numbers(self):
             ),
         )
 
-    def test_aggregate_strings(self):
+    def test_aggregate_text_values(self):
         result = groupby(
             pd.DataFrame({"A": [1, 1, 1], "B": ["a", "b", "a"]}),
             [Group("A", None)],
@@ -367,6 +367,60 @@ def test_aggregate_strings(self):
             ),
         )
 
+    def test_aggregate_text_category_values(self):
+        result = groupby(  # "B" holds text stored with category dtype
+            pd.DataFrame(
+                {"A": [1, 1, 1], "B": pd.Series(["a", "b", "a"], dtype="category")}
+            ),
+            [Group("A", None)],
+            [
+                Aggregation(Operation.SIZE, "B", "size"),
+                Aggregation(Operation.NUNIQUE, "B", "nunique"),
+                Aggregation(Operation.MIN, "B", "min"),
+                Aggregation(Operation.MAX, "B", "max"),
+                Aggregation(Operation.FIRST, "B", "first"),
+            ],
+        )
+        assert_frame_equal(  # single group A=1; text outputs stay category dtype
+            result,
+            pd.DataFrame(
+                {
+                    "A": [1],
+                    "size": [3],
+                    "nunique": [2],
+                    "min": pd.Series(["a"], dtype="category"),  # unordered, ["a"] only
+                    "max": pd.Series(["b"], dtype="category"),  # unordered, ["b"] only
+                    "first": pd.Series(["a"], dtype="category"),
+                }
+            ),
+        )
+
+    def test_aggregate_text_category_values_empty_still_has_object_dtype(self):
+        result = groupby(  # the only value in category column "A" is None
+            pd.DataFrame({"A": [None]}, dtype=str).astype("category"),
+            [Group("A", None)],
+            [
+                Aggregation(Operation.SIZE, "A", "size"),
+                Aggregation(Operation.NUNIQUE, "A", "nunique"),
+                Aggregation(Operation.MIN, "A", "min"),
+                Aggregation(Operation.MAX, "A", "max"),
+                Aggregation(Operation.FIRST, "A", "first"),
+            ],
+        )
+        assert_frame_equal(  # every expected column below has zero rows
+            result,
+            pd.DataFrame(
+                {
+                    # NOTE(review): "object dtype" in the test name presumably
+                    # refers to the categories of these empty category columns
+                    "A": pd.Series([], dtype=str).astype("category"),
+                    "size": pd.Series([], dtype=int),
+                    "nunique": pd.Series([], dtype=int),
+                    "min": pd.Series([], dtype=str).astype("category"),
+                    "max": pd.Series([], dtype=str).astype("category"),
+                    "first": pd.Series([], dtype=str).astype("category"),
+                }
+            ),
+        )
+
     def test_aggregate_datetime_no_granularity(self):
         result = groupby(
             pd.DataFrame({"A": [dt(2018, 1, 4), dt(2018, 1, 5), dt(2018, 1, 4)]}),