-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
fix an indices bug for categorical-datetime columns #26860
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
6f8fdc0
cfa0fef
806f980
27086ba
0d5385f
3820a94
51bda3a
b6ba161
f833ece
650a3ec
88630ce
c926c06
39f394e
182de89
7b5b370
c700c1a
5543e0d
4ae7db8
abcfaff
85b3b1a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -305,6 +305,8 @@ def get_flattened_iterator(comp_ids, ngroups, levels, labels): | |
|
||
def get_indexer_dict(label_list, keys): | ||
""" return a diction of {labels} -> {indexers} """ | ||
# address GH 26860 | ||
keys = [np.asarray(key) for key in keys] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What are the types on this? I worry a bit about doing this on a DatetimeIndex with tz. That will emit a warning, since we're changing how we handle datetimes in There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Honestly, I'm not all that sure what is going into |
||
shape = list(map(len, keys)) | ||
|
||
group_index = get_group_index(label_list, shape, sort=True, xnull=True) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -351,6 +351,98 @@ def f3(x): | |
df2.groupby("a").apply(f3) | ||
|
||
|
||
def test_groupby_indices_error(): | ||
# GH 26860 | ||
# Test if DataFrame Groupby builds gb.indices | ||
dt = pd.to_datetime(['2018-01-01', '2018-02-01', '2018-03-01']) | ||
df = pd.DataFrame({ | ||
'a': pd.Series(list('abc')), | ||
'b': pd.Series(dt, dtype='category'), | ||
'c': pd.Categorical.from_codes([-1, 0, 1], categories=[0, 1]) | ||
}) | ||
|
||
df.groupby(['a', 'b']).indices | ||
|
||
|
||
def test_groupby_indices_output(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is parametrizing no longer viable? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yea, I can revert it or find a middle ground so the parameterization isn't overkill. Thoughts on that? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I simplified the test and parametrized it. |
||
# GH 26860 | ||
# Test if DataFrame Groupby builds gb.indices correctly. | ||
|
||
cols = [ | ||
'int_series', 'int_series_cat', 'float_series', 'float_series_cat', | ||
'dt_series', 'dt_series_cat', 'period_series', 'period_series_cat' | ||
] | ||
|
||
int_series = pd.Series([1, 2, 3]) | ||
dt_series = pd.to_datetime(['2018Q1', '2018Q2', '2018Q3']) | ||
df = pd.DataFrame( | ||
data={ | ||
'int_series': int_series, | ||
'int_series_cat': int_series.astype('category'), | ||
'float_series': int_series.astype('float'), | ||
'float_series_cat': int_series.astype('float').astype('category'), | ||
'dt_series': dt_series, | ||
'dt_series_cat': dt_series.astype('category'), | ||
'period_series': dt_series.to_period('Q'), | ||
'period_series_cat': dt_series.to_period('Q').astype('category') | ||
}, | ||
columns=cols | ||
) | ||
|
||
from itertools import chain, combinations | ||
|
||
gb_cols_it = chain.from_iterable( | ||
combinations(cols, n + 1) for n in range(len(cols)) | ||
) | ||
for gb_cols in gb_cols_it: | ||
gb_cols = list(gb_cols) | ||
num_gb_cols = len(gb_cols) | ||
|
||
if num_gb_cols == 1: | ||
s = df[gb_cols[0]] | ||
col_vals = list(s.unique()) | ||
|
||
if pd.api.types.is_datetime64_any_dtype(s): | ||
col_vals = list(map(pd.Timestamp, col_vals)) | ||
|
||
target = { | ||
key: np.array([i]) | ||
for i, key in enumerate(col_vals) | ||
} | ||
else: | ||
col_vals = { | ||
col: list(df[col].unique()) | ||
for col in gb_cols | ||
} | ||
|
||
def to_dt(elems): | ||
elems = map(pd.Timestamp, elems) | ||
elems = map(lambda dt: dt.to_datetime64(), elems) | ||
elems = list(elems) | ||
return elems | ||
alexifm marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
for col in gb_cols: | ||
if pd.api.types.is_datetime64_any_dtype(df[col]): | ||
col_vals[col] = to_dt(col_vals[col]) | ||
|
||
elif pd.api.types.is_categorical_dtype(df[col]): | ||
if pd.api.types.is_datetime64_any_dtype(df[col].cat.categories): | ||
col_vals[col] = to_dt(col_vals[col]) | ||
alexifm marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
it = zip(*(col_vals[col] for col in gb_cols)) | ||
target = { | ||
key: np.array([i]) | ||
for i, key in enumerate(it) | ||
} | ||
|
||
indices = df.groupby(gb_cols).indices | ||
|
||
assert set(target.keys()) == set(indices.keys()) | ||
for key in target.keys(): | ||
assert pd.core.dtypes.missing.array_equivalent( | ||
target[key], indices[key]) | ||
|
||
|
||
def test_attr_wrapper(ts): | ||
grouped = ts.groupby(lambda x: x.weekday()) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this is a public function. Can you rephrase this to make sense for an end user?
And can you move the release note to the 1.0.0 whatsnew?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it make more sense to put it in terms of
gb.indices
which was where the problem originally came about?