diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 7a10447e3ad40..ce85ad94af8b3 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -176,7 +176,7 @@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - -- +- Bug in :meth:`DataFrameGroupBy.indices` raising an exception when grouping on multiple columns and one of them is categorical with datetime-like values (:issue:`26860`) - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) Reshaping diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 5db31fe6664ea..3491054cfbde1 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -305,6 +305,8 @@ def get_flattened_iterator(comp_ids, ngroups, levels, labels): def get_indexer_dict(label_list, keys): """ return a diction of {labels} -> {indexers} """ + # address GH 26860 + keys = [np.asarray(key) for key in keys] shape = list(map(len, keys)) group_index = get_group_index(label_list, shape, sort=True, xnull=True) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4556b22b57279..eb2021031d5e5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -8,6 +8,8 @@ from pandas.errors import PerformanceWarning +from pandas.core.dtypes.common import is_categorical_dtype, is_datetime64_any_dtype + import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv import pandas.core.common as com @@ -351,6 +353,114 @@ def f3(x): df2.groupby("a").apply(f3) +def test_groupby_indices_error(): + # GH 26860 + # Test if DataFrame Groupby builds gb.indices + dt = pd.to_datetime(["2018-01-01", "2018-02-01", "2018-03-01"]) + df = pd.DataFrame( + { + "a": pd.Series(list("abc")), + "b": pd.Series(dt, dtype="category"), + "c": pd.Categorical.from_codes([-1, 0, 1], categories=[0, 1]), + } + ) + + df.groupby(["a", "b"]).indices + + 
+@pytest.mark.parametrize( + "gb_cols", + [ + "int_series", + "int_series_cat", + "float_series", + "float_series_cat", + "dt_series", + "dt_series_cat", + "period_series", + "period_series_cat", + [ + "int_series", + "int_series_cat", + "float_series", + "float_series_cat", + "dt_series", + "dt_series_cat", + "period_series", + "period_series_cat", + ], + ], +) +def test_groupby_indices_output(gb_cols): + # GH 26860 + # Test if DataFrame Groupby builds gb.indices correctly. + if isinstance(gb_cols, str): + gb_cols = [gb_cols] + + cols = [ + "int_series", + "int_series_cat", + "float_series", + "float_series_cat", + "dt_series", + "dt_series_cat", + "period_series", + "period_series_cat", + ] + + int_series = pd.Series([1, 2, 3]) + dt_series = pd.to_datetime(["2018Q1", "2018Q2", "2018Q3"]) + df = pd.DataFrame( + data={ + "int_series": int_series, + "int_series_cat": int_series.astype("category"), + "float_series": int_series.astype("float"), + "float_series_cat": int_series.astype("float").astype("category"), + "dt_series": dt_series, + "dt_series_cat": dt_series.astype("category"), + "period_series": dt_series.to_period("Q"), + "period_series_cat": dt_series.to_period("Q").astype("category"), + }, + columns=cols, + ) + + def dt_to_ts(elems): + return [pd.Timestamp(el) for el in elems] + + def ts_to_dt(elems): + return [el.to_datetime64() for el in elems] + + num_gb_cols = len(gb_cols) + + if num_gb_cols == 1: + s = df[gb_cols[0]] + col_vals = list(s.unique()) + + if is_datetime64_any_dtype(s): + col_vals = dt_to_ts(col_vals) + + target = {key: np.array([i]) for i, key in enumerate(col_vals)} + else: + col_vals = {col: list(df[col].unique()) for col in gb_cols} + + for col in gb_cols: + is_dt = is_datetime64_any_dtype(df[col]) + is_cat_dt = is_categorical_dtype(df[col]) and is_datetime64_any_dtype( + df[col].cat.categories + ) + if is_dt or is_cat_dt: + col_vals[col] = ts_to_dt(dt_to_ts(col_vals[col])) + + it = zip(*(col_vals[col] for col in gb_cols)) + target = 
{key: np.array([i]) for i, key in enumerate(it)} + + indices = df.groupby(gb_cols).indices + + assert set(target.keys()) == set(indices.keys()) + for key in target.keys(): + assert pd.core.dtypes.missing.array_equivalent(target[key], indices[key]) + + def test_attr_wrapper(ts): grouped = ts.groupby(lambda x: x.weekday())