
fix an indices bug for categorical-datetime columns #26860

Closed · wants to merge 20 commits
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.0.0.rst
@@ -176,7 +176,7 @@ Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^

-
-
- Bug in :meth:`DataFrameGroupBy.indices` raises exception when grouping on multiple columns and one is a categorical with datetime values. (:issue:`26860`)
Contributor:

"raises" -> "raising an"

"and one" -> "when one"

- Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)

Reshaping
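For context, a minimal reproduction of the bug described in the whatsnew entry above, adapted from the regression test added further down in this PR (before this change, accessing .indices raised an exception):

import pandas as pd

# One plain column plus one categorical column whose categories are datetimes.
dt = pd.to_datetime(["2018-01-01", "2018-02-01", "2018-03-01"])
df = pd.DataFrame({"a": list("abc"), "b": pd.Series(dt, dtype="category")})

# Grouping on both columns and asking for the positional indices previously raised.
df.groupby(["a", "b"]).indices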
2 changes: 2 additions & 0 deletions pandas/core/sorting.py
@@ -305,6 +305,8 @@ def get_flattened_iterator(comp_ids, ngroups, levels, labels):

def get_indexer_dict(label_list, keys):
""" return a diction of {labels} -> {indexers} """
# GH 26860: coerce keys (possibly Categorical, e.g. with datetime categories) to ndarrays
keys = [np.asarray(key) for key in keys]
Contributor:

What are the types on key here? Series, Index, Array?

I worry a bit about doing this on a DatetimeIndex with tz. That will emit a warning, since we're changing how we handle datetimes in np.asarray.

Author:

Honestly, I'm not all that sure what goes into get_indexer_dict, which is why I put the fix under the indices property; it was more about fixing that particular input.

shape = list(map(len, keys))

group_index = get_group_index(label_list, shape, sort=True, xnull=True)
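The added line relies on np.asarray densifying each key. A rough sketch of what that conversion does for the problematic input, a Categorical whose categories are datetimes (illustration only; as the review thread notes, the picture is less clear for tz-aware datetimes, where np.asarray behaviour was in flux at the time):

import numpy as np
import pandas as pd

cat = pd.Categorical(pd.to_datetime(["2018-01-01", "2018-02-01"]))
np.asarray(cat)
# -> a plain datetime64[ns] ndarray of the category values, rather than a Categorical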
110 changes: 110 additions & 0 deletions pandas/tests/groupby/test_groupby.py
@@ -8,6 +8,8 @@

from pandas.errors import PerformanceWarning

from pandas.core.dtypes.common import is_categorical_dtype, is_datetime64_any_dtype

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv
import pandas.core.common as com
@@ -351,6 +353,114 @@ def f3(x):
df2.groupby("a").apply(f3)


def test_groupby_indices_error():
# GH 26860
# Test if DataFrame Groupby builds gb.indices
dt = pd.to_datetime(["2018-01-01", "2018-02-01", "2018-03-01"])
df = pd.DataFrame(
{
"a": pd.Series(list("abc")),
"b": pd.Series(dt, dtype="category"),
"c": pd.Categorical.from_codes([-1, 0, 1], categories=[0, 1]),
}
)

df.groupby(["a", "b"]).indices


@pytest.mark.parametrize(
"gb_cols",
[
"int_series",
"int_series_cat",
"float_series",
"float_series_cat",
"dt_series",
"dt_series_cat",
"period_series",
"period_series_cat",
[
"int_series",
"int_series_cat",
"float_series",
"float_series_cat",
"dt_series",
"dt_series_cat",
"period_series",
"period_series_cat",
],
],
)
def test_groupby_indices_output(gb_cols):
# GH 26860
# Test if DataFrame Groupby builds gb.indices correctly.
if isinstance(gb_cols, str):
gb_cols = [gb_cols]

cols = [
"int_series",
"int_series_cat",
"float_series",
"float_series_cat",
"dt_series",
"dt_series_cat",
"period_series",
"period_series_cat",
]

int_series = pd.Series([1, 2, 3])
dt_series = pd.to_datetime(["2018Q1", "2018Q2", "2018Q3"])
df = pd.DataFrame(
data={
"int_series": int_series,
"int_series_cat": int_series.astype("category"),
"float_series": int_series.astype("float"),
"float_series_cat": int_series.astype("float").astype("category"),
"dt_series": dt_series,
"dt_series_cat": dt_series.astype("category"),
"period_series": dt_series.to_period("Q"),
"period_series_cat": dt_series.to_period("Q").astype("category"),
},
columns=cols,
)

def dt_to_ts(elems):
return [pd.Timestamp(el) for el in elems]

def ts_to_dt(elems):
return [el.to_datetime64() for el in elems]

num_gb_cols = len(gb_cols)

if num_gb_cols == 1:
s = df[gb_cols[0]]
col_vals = list(s.unique())

if is_datetime64_any_dtype(s):
col_vals = dt_to_ts(col_vals)

target = {key: np.array([i]) for i, key in enumerate(col_vals)}
else:
col_vals = {col: list(df[col].unique()) for col in gb_cols}

for col in gb_cols:
is_dt = is_datetime64_any_dtype(df[col])
is_cat_dt = is_categorical_dtype(df[col]) and is_datetime64_any_dtype(
df[col].cat.categories
)
if is_dt or is_cat_dt:
col_vals[col] = ts_to_dt(dt_to_ts(col_vals[col]))

it = zip(*(col_vals[col] for col in gb_cols))
target = {key: np.array([i]) for i, key in enumerate(it)}

indices = df.groupby(gb_cols).indices

assert set(target.keys()) == set(indices.keys())
for key in target.keys():
assert pd.core.dtypes.missing.array_equivalent(target[key], indices[key])


def test_attr_wrapper(ts):
grouped = ts.groupby(lambda x: x.weekday())

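For reference, the shape of the mapping these tests check: GroupBy.indices maps each observed group key to an integer array of row positions; with a single grouping column the key is the scalar group value, with several columns it is a tuple. A hedged illustration mirroring the expectations encoded in test_groupby_indices_output above (exact key types, Timestamp versus datetime64, may differ between pandas versions):

import pandas as pd

dt = pd.to_datetime(["2018Q1", "2018Q2", "2018Q3"])
df = pd.DataFrame({"int_series": [1, 2, 3], "dt_series": dt})

df.groupby("dt_series").indices
# roughly {Timestamp('2018-01-01 00:00:00'): array([0]), ...}

df.groupby(["int_series", "dt_series"]).indices
# roughly {(1, numpy.datetime64('2018-01-01T00:00:00.000000000')): array([0]), ...}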