fix an indices bug for categorical-datetime columns #26860

Closed
wants to merge 20 commits
Changes from 1 commit
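
For context: GH 26860 reports that DataFrameGroupBy.indices, the mapping from each group key to the integer positions of that group's rows, was built incorrectly when the grouping involved a categorical column whose categories are datetimes. A minimal sketch of the reproduction, adapted from the regression test added in this diff (the column names are the test's own):

import pandas as pd

dt = pd.to_datetime(['2018-01-01', '2018-02-01', '2018-03-01'])
df = pd.DataFrame({
    'a': list('abc'),
    'b': pd.Series(dt, dtype='category'),  # categorical-datetime column
})

# Before the fix, building the group indices here could fail; with the
# fix it returns a dict of group key -> np.ndarray of row positions.
df.groupby(['a', 'b']).indices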
122 changes: 60 additions & 62 deletions pandas/tests/groupby/test_groupby.py
@@ -351,27 +351,27 @@ def f3(x):
df2.groupby("a").apply(f3)


def _all_combinations(elems):
from itertools import chain, combinations
def test_groupby_indices_error():
# GH 26860
# Test if DataFrame Groupby builds gb.indices
dt = pd.to_datetime(['2018-01-01', '2018-02-01', '2018-03-01'])
df = pd.DataFrame({
'a': pd.Series(list('abc')),
'b': pd.Series(dt, dtype='category'),
'c': pd.Categorical.from_codes([-1, 0, 1], categories=[0, 1])
})

out = chain.from_iterable(
combinations(elems, n + 1) for n in range(len(elems))
)
return list(out)
df.groupby(['a', 'b']).indices


@pytest.mark.parametrize(
'gb_cols', _all_combinations([
'int_series', 'int_series_cat', 'float_series', 'float_series_cat',
'dt_series', 'dt_series_cat', 'period_series', 'period_series_cat'
]),
ids=lambda cols: ",".join(cols)
)
def test_groupby_indices(gb_cols):
def test_groupby_indices_output():
Member: is parametrizing no longer viable?

Author: Yea, I can revert it or find a middle ground so the parameterization isn't overkill. Thoughts on that?

Author: I simplified the test and parametrized it.
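
For reference, the parametrization under discussion enumerates every non-empty subset of the eight grouping columns, i.e. 2^8 - 1 = 255 test cases, which is the "overkill" concern. A quick demonstration of what the _all_combinations helper from the removed code produces (the demo call and its inputs are hypothetical):

from itertools import chain, combinations

def _all_combinations(elems):
    # Every non-empty subset of elems, in increasing size order.
    return list(chain.from_iterable(
        combinations(elems, n + 1) for n in range(len(elems))
    ))

print(_all_combinations(['a', 'b', 'c']))
# [('a',), ('b',), ('c',), ('a', 'b'), ('a', 'c'), ('b', 'c'), ('a', 'b', 'c')]

print(len(_all_combinations(list('abcdefgh'))))  # 255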

     # GH 26860
     # Test if DataFrame Groupby builds gb.indices correctly.

-    gb_cols = list(gb_cols)
+    cols = [
+        'int_series', 'int_series_cat', 'float_series', 'float_series_cat',
+        'dt_series', 'dt_series_cat', 'period_series', 'period_series_cat'
+    ]

     int_series = pd.Series([1, 2, 3])
     dt_series = pd.to_datetime(['2018Q1', '2018Q2', '2018Q3'])
@@ -386,63 +386,61 @@ def test_groupby_indices(gb_cols):
             'period_series': dt_series.to_period('Q'),
             'period_series_cat': dt_series.to_period('Q').astype('category')
         },
-        columns=[
-            'int_series',
-            'int_series_cat',
-            'float_series',
-            'float_series_cat',
-            'dt_series',
-            'dt_series_cat',
-            'period_series',
-            'period_series_cat'
-        ]
+        columns=cols
     )

-    num_gb_cols = len(gb_cols)
-
-    if num_gb_cols == 1:
-        s = df[gb_cols[0]]
-        col_vals = list(s.unique())
-
-        if pd.api.types.is_datetime64_any_dtype(s):
-            col_vals = list(map(pd.Timestamp, col_vals))
-
-        target = {
-            key: np.array([i])
-            for i, key in enumerate(col_vals)
-        }
-    else:
-        col_vals = {
-            col: list(df[col].unique())
-            for col in gb_cols
-        }
-
-        def to_dt(elems):
-            elems = map(pd.Timestamp, elems)
-            elems = map(lambda dt: dt.to_datetime64(), elems)
-            elems = list(elems)
-            return elems
-
-        for col in gb_cols:
-            if pd.api.types.is_datetime64_any_dtype(df[col]):
-                col_vals[col] = to_dt(col_vals[col])
-
-            elif pd.api.types.is_categorical_dtype(df[col]):
-                if pd.api.types.is_datetime64_any_dtype(df[col].cat.categories):
-                    col_vals[col] = to_dt(col_vals[col])
-
-        it = zip(*(col_vals[col] for col in col_vals.keys()))
-        target = {
-            key: np.array([i])
-            for i, key in enumerate(it)
-        }
-
-    indices = df.groupby(gb_cols).indices
-
-    assert set(target.keys()) == set(indices.keys())
-    for key in target.keys():
-        assert pd.core.dtypes.missing.array_equivalent(
-            target[key], indices[key])
+    from itertools import chain, combinations
+
+    gb_cols_it = chain.from_iterable(
+        combinations(cols, n + 1) for n in range(len(cols))
+    )
+    for gb_cols in gb_cols_it:
+        gb_cols = list(gb_cols)
+        num_gb_cols = len(gb_cols)
+
+        if num_gb_cols == 1:
+            s = df[gb_cols[0]]
+            col_vals = list(s.unique())
+
+            if pd.api.types.is_datetime64_any_dtype(s):
+                col_vals = list(map(pd.Timestamp, col_vals))
+
+            target = {
+                key: np.array([i])
+                for i, key in enumerate(col_vals)
+            }
+        else:
+            col_vals = {
+                col: list(df[col].unique())
+                for col in gb_cols
+            }
+
+            def to_dt(elems):
+                elems = map(pd.Timestamp, elems)
+                elems = map(lambda dt: dt.to_datetime64(), elems)
+                elems = list(elems)
+                return elems
+
+            for col in gb_cols:
+                if pd.api.types.is_datetime64_any_dtype(df[col]):
+                    col_vals[col] = to_dt(col_vals[col])
+
+                elif pd.api.types.is_categorical_dtype(df[col]):
+                    if pd.api.types.is_datetime64_any_dtype(df[col].cat.categories):
+                        col_vals[col] = to_dt(col_vals[col])
+
+            it = zip(*(col_vals[col] for col in gb_cols))
+            target = {
+                key: np.array([i])
+                for i, key in enumerate(it)
+            }
+
+        indices = df.groupby(gb_cols).indices
+
+        assert set(target.keys()) == set(indices.keys())
+        for key in target.keys():
+            assert pd.core.dtypes.missing.array_equivalent(
+                target[key], indices[key])


 def test_attr_wrapper(ts):
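
For readers unfamiliar with the attribute under test: DataFrameGroupBy.indices returns a dict mapping each group key (a scalar when grouping by one column, a tuple when grouping by several) to a NumPy array of the integer row positions in that group. A small illustration with hypothetical data, not taken from the PR (exact reprs may vary slightly by pandas/NumPy version):

import pandas as pd

df = pd.DataFrame({'a': ['x', 'x', 'y'], 'v': [1, 2, 3]})

print(df.groupby('a').indices)
# {'x': array([0, 1]), 'y': array([2])}

print(df.groupby(['a', 'v']).indices)
# {('x', 1): array([0]), ('x', 2): array([1]), ('y', 3): array([2])}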