fix an indices bug for categorical-datetime columns #26860
@@ -305,6 +305,8 @@ def get_flattened_iterator(comp_ids, ngroups, levels, labels):


def get_indexer_dict(label_list, keys):
    """ return a diction of {labels} -> {indexers} """
    # address GH 26860
    keys = [np.asarray(key) for key in keys]
Review comment: What are the types on keys? I worry a bit about doing this on a DatetimeIndex with tz. That will emit a warning, since we're changing how we handle datetimes in np.asarray.

Reply: Honestly, I'm not all that sure what is going into keys.

    shape = list(map(len, keys))

    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
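A minimal sketch of the tz concern raised in the thread above (not part of this PR's diff; the exact behavior depends on the pandas version):

import numpy as np
import pandas as pd

# np.asarray on a tz-aware DatetimeIndex discards the timezone: 0.25-era
# pandas returns tz-naive datetime64[ns] values (converted to UTC) and emits
# a FutureWarning that this will change, while later versions return
# object-dtype Timestamps instead.
idx = pd.DatetimeIndex(['2019-01-01', '2019-01-02'], tz='US/Eastern')
arr = np.asarray(idx)
print(arr.dtype)  # datetime64[ns] or object, depending on the pandas version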
@@ -351,6 +351,100 @@ def f3(x):
    df2.groupby("a").apply(f3)

def _all_combinations(elems):
    from itertools import chain, combinations

    out = chain.from_iterable(
        combinations(elems, n + 1) for n in range(len(elems))
    )
    return list(out)
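For readability of the parametrization below: this helper yields every non-empty subset of the listed column names, e.g.

>>> _all_combinations(['a', 'b'])
[('a',), ('b',), ('a', 'b')]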

@pytest.mark.parametrize(
    'gb_cols', _all_combinations([
        'int_series', 'int_series_cat', 'float_series', 'float_series_cat',
        'dt_series', 'dt_series_cat', 'period_series', 'period_series_cat'
    ]),
    ids=lambda cols: ",".join(cols)
)
def test_groupby_indices(gb_cols):
Review comment: This test seems very complicated. I haven't gone through it yet, but I would appreciate at least one test as simple as the example from the issue.

Reply: Yea, I think it makes sense to simplify. My idea was that …
    # GH 26860
    # Test if DataFrame Groupby builds gb.indices correctly.

    gb_cols = list(gb_cols)

    int_series = pd.Series([1, 2, 3])
    dt_series = pd.to_datetime(['2018Q1', '2018Q2', '2018Q3'])
    df = pd.DataFrame(
        data={
            'int_series': int_series,
            'int_series_cat': int_series.astype('category'),
            'float_series': int_series.astype('float'),
            'float_series_cat': int_series.astype('float').astype('category'),
            'dt_series': dt_series,
            'dt_series_cat': dt_series.astype('category'),
            'period_series': dt_series.to_period('Q'),
            'period_series_cat': dt_series.to_period('Q').astype('category')
        },
        columns=[
            'int_series',
            'int_series_cat',
            'float_series',
            'float_series_cat',
            'dt_series',
            'dt_series_cat',
            'period_series',
            'period_series_cat'
        ]
    )

    num_gb_cols = len(gb_cols)

    if num_gb_cols == 1:
        s = df[gb_cols[0]]
        col_vals = list(s.unique())

        if pd.api.types.is_datetime64_any_dtype(s):
            col_vals = list(map(pd.Timestamp, col_vals))

        target = {
            key: np.array([i])
            for i, key in enumerate(col_vals)
        }
    else:
        col_vals = {
            col: list(df[col].unique())
            for col in gb_cols
        }

        def to_dt(elems):
            elems = map(pd.Timestamp, elems)
            elems = map(lambda dt: dt.to_datetime64(), elems)
            elems = list(elems)
            return elems

        for col in gb_cols:
            if pd.api.types.is_datetime64_any_dtype(df[col]):
                col_vals[col] = to_dt(col_vals[col])

            elif pd.api.types.is_categorical_dtype(df[col]):
                if pd.api.types.is_datetime64_any_dtype(df[col].cat.categories):
                    col_vals[col] = to_dt(col_vals[col])

        it = zip(*(col_vals[col] for col in col_vals.keys()))
        target = {
            key: np.array([i])
            for i, key in enumerate(it)
        }

    indices = df.groupby(gb_cols).indices

    assert set(target.keys()) == set(indices.keys())
    for key in target.keys():
        assert pd.core.dtypes.missing.array_equivalent(
            target[key], indices[key])


def test_attr_wrapper(ts):
    grouped = ts.groupby(lambda x: x.weekday())
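Following up on the request above for a test as simple as the issue's example, a hedged sketch of what such a minimal case might look like (the column names and data are illustrative assumptions, not taken verbatim from GH 26860; it relies on the test module's existing pd import and only checks coarse properties of .indices):

def test_groupby_indices_categorical_datetime_minimal():
    # GH 26860: .indices built incorrectly when one of several group keys
    # is a categorical column whose categories are datetimes
    dt = pd.to_datetime(['2018-01-01', '2018-04-01', '2018-07-01'])
    df = pd.DataFrame({
        'a': [1, 2, 3],
        'b': pd.Series(dt).astype('category'),  # categorical of datetimes
    })

    indices = df.groupby(['a', 'b']).indices

    # each of the three rows forms its own (a, b) group, so there are three
    # entries, each holding a single row position
    assert len(indices) == 3
    for positions in indices.values():
        assert len(positions) == 1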
Review comment: I don't think this is a public function. Can you rephrase this to make sense for an end user? And can you move the release note to the 1.0.0 whatsnew?

Reply: Would it make more sense to put it in terms of gb.indices, which was where the problem originally came about?