From 72fd66dc6d0a67fbab59a2c284d87642cf9a3bb7 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 9 Aug 2021 12:39:27 +0100 Subject: [PATCH 01/73] ENH: A new GroupBy method to slice rows preserving index and order GH#42864 Groupby.iloc is added to perform most grouped slices All the slices preserve the initial order and grouping -ve slice step is not currently handled, and would have to reverse the order. There is no plan to implement Integer lists since these do not preserve the order at all and cannot be easily vectorized. --- pandas/core/groupby/groupby.py | 5 +- pandas/core/groupby/groupbyindexing.py | 198 ++++++++++++++++++++++ pandas/tests/groupby/test_groupby_iloc.py | 134 +++++++++++++++ 3 files changed, 336 insertions(+), 1 deletion(-) create mode 100644 pandas/core/groupby/groupbyindexing.py create mode 100644 pandas/tests/groupby/test_groupby_iloc.py diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5f9b1dec062f8..593ca3a34e174 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -109,6 +109,8 @@ class providing the base-class of operations. maybe_use_numba, ) +from pandas.core.groupby.groupbyindexing import GroupByIndexingMixin + _common_see_also = """ See Also -------- @@ -565,7 +567,7 @@ def group_selection_context(groupby: GroupBy) -> Iterator[GroupBy]: ] -class BaseGroupBy(PandasObject, SelectionMixin[FrameOrSeries]): +class BaseGroupBy(PandasObject, SelectionMixin[FrameOrSeries], GroupByIndexingMixin): _group_selection: IndexLabel | None = None _apply_allowlist: frozenset[str] = frozenset() _hidden_attrs = PandasObject._hidden_attrs | { @@ -3412,3 +3414,4 @@ def get_groupby( mutated=mutated, dropna=dropna, ) + diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py new file mode 100644 index 0000000000000..22a64454f0a2b --- /dev/null +++ b/pandas/core/groupby/groupbyindexing.py @@ -0,0 +1,198 @@ +from __future__ import annotations +from unittest.mock import PropertyMock + +from pandas.util._decorators import doc +import numpy as np + +class GroupByIndexingMixin: + """ + Mixin for adding .iloc to GroupBy. + """ + + @property + def iloc(self) -> _ilocGroupByIndexer: + """ + Integer location-based indexing for selection by position per group. + + Similar to ``.apply(lambda x: x.iloc[i:j, k:l])``, but much faster and returns a subset of rows + from the original DataFrame with the original index and order preserved. + + The output is compatible with head() and tail() + The output is different from take() and nth() which do not preserve the index or order + + Allowed inputs for the first index are: + + - An integer, e.g. ``5``. + - A list or array of integers, e.g. ``[4, 3, 0]``. + - A slice object with ints, e.g. ``1:7``. + + Allowed inputs for the second index are as for DataFrame.iloc, namely: + + - An integer, e.g. ``5``. + - A list or array of integers, e.g. ``[4, 3, 0]``. + - A slice object with ints, e.g. ``1:7``. + - A boolean array. + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above). + + + Returns + ------- + Series or DataFrame + + Use Case + -------- + Supose that we have a multi-indexed DataFrame with a large primary index and a secondary sorted + to a different order for each primary. + To reduce the DataFrame to a middle slice of each secondary, group by the primary and then use iloc. + This preserves the original DataFrame's order and indexing. 
+ (See tests/groupby/test_groupby_iloc) + + Examples + -------- + >>> df = pd.DataFrame([['a', 1], ['a', 2], ['a', 3], ['b', 4], ['b', 5]], + ... columns=['A', 'B']) + >>> df.groupby('A').iloc[1:2] + A B + 1 a 2 + 4 b 5 + >>> df.groupby('A').iloc[:-1, -1:] + B + 0 1 + 1 2 + 3 4 + """ + return _ilocGroupByIndexer(self) + +@doc(GroupByIndexingMixin.iloc) +class _ilocGroupByIndexer: + def __init__(self, grouped): + self.grouped = grouped + self.reversed = False + self._cached_ascending_count = None + self._cached_descending_count = None + + def __getitem__(self, arg): + self.reversed = False + + if type(arg) == tuple: + return self._handle_item(arg[0], arg[1]) + + else: + return self._handle_item(arg, None) + + def _handle_item(self, arg0, arg1): + typeof_arg = type(arg0) + + if typeof_arg == slice: + start = arg0.start + stop = arg0.stop + step = arg0.step + + if not step is None and step < 0: + raise ValueError( + f'GroupBy.iloc row slice step must be positive. Slice was {start}:{stop}:{step}' + ) + # self.reversed = True + # start = None if start is None else -start - 1 + # stop = None if stop is None else -stop - 1 + # step = -step + + return self._handle_slice(start, stop, step, arg1) + + elif typeof_arg == int: + return self._handle_slice(arg0, arg0 + 1, 1, arg1) + + else: + raise ValueError( + f'GroupBy.iloc row must be an integer or a slice, not a {typeof_arg}' + ) + + + def _handle_slice(self, start, stop, step, arg1): + + mask = None + if step == None: + step = 1 + + self.grouped._reset_group_selection() + + if start is None: + if step > 1: + mask = self._ascending_count % step == 0 + + else: + if start >= 0: + mask = self._ascending_count >= start + + if step > 1: + mask &= (self._ascending_count - start) % step == 0 + + else: + mask = self._descending_count < -start + + if step > 1: + # + # if start is -ve and -start excedes the length of a group then step must count from the + # first row of that group rather than the calculated offset + # + # count_array + reverse_array gives the length of the current group enabling to switch between + # the offset_array and the count_array depening on whether -start excedes the group size + # + offset_array = self._descending_count + start + 1 + limit_array = (self._ascending_count + self._descending_count + (start + 1)) < 0 + offset_array = np.where(limit_array, self._ascending_count, offset_array) + + mask &= offset_array % step == 0 + + if not stop is None: + if stop >= 0: + if mask is None: + mask = self._ascending_count < stop + + else: + mask &= self._ascending_count < stop + else: + if mask is None: + mask = self._descending_count >= -stop + + else: + mask &= self._descending_count >= -stop + + if mask is None: + arg0 = slice(None) + + else: + arg0 = mask + + if arg1 is None: + return self._selected_obj.iloc[arg0] + + else: + return self._selected_obj.iloc[arg0, arg1] + + @property + def _ascending_count(self): + if self._cached_ascending_count is None: + self._cached_ascending_count = self.grouped._cumcount_array() + if self.reversed: + self._cached_ascending_count = self._cached_ascending_count[::-1] + + return self._cached_ascending_count + + @property + def _descending_count(self): + if self._cached_descending_count is None: + self._cached_descending_count = self.grouped._cumcount_array(ascending=False) + if self.reversed: + self._cached_descending_count = self._cached_descending_count[::-1] + + return self._cached_descending_count + + @property + def _selected_obj(self): + if self.reversed: + return 
self.grouped._selected_obj.iloc[::-1] + + else: + return self.grouped._selected_obj diff --git a/pandas/tests/groupby/test_groupby_iloc.py b/pandas/tests/groupby/test_groupby_iloc.py new file mode 100644 index 0000000000000..b8568d27d2575 --- /dev/null +++ b/pandas/tests/groupby/test_groupby_iloc.py @@ -0,0 +1,134 @@ +""" Test positional grouped indexing with iloc GH#42864""" + +import pandas as pd +import pandas._testing as tm +import random + +def test_doc_examples(): + """ Test the examples in the documentation""" + + df = pd.DataFrame([['a', 1], ['a', 2], ['a', 3], ['b', 4], ['b', 5]], columns=['A', 'B']) + + grouped = df.groupby('A') + result = grouped.iloc[1:2, :] + expected = pd.DataFrame([['a', 2], ['b', 5]], columns=['A', 'B'], index=[1, 4]) + + tm.assert_frame_equal(result, expected) + + result = grouped.iloc[:-1, -1:] + expected = pd.DataFrame([1, 2, 4], columns=['B'], index=[0, 1, 3]) + + tm.assert_frame_equal(result, expected) + +def test_multiindex(): + """ Test the multiindex mentioned as the use-case in the documentation """ + + def make_df_from_data(data): + rows = {} + for date in dates: + for level in data[date]: + rows[(date, level[0])] = {'A':level[1], 'B':level[2]} + + df = pd.DataFrame.from_dict(rows, orient='index') + df.index.names=('Date','Item') + return df + + ndates = 1000 + nitems = 40 + dates = pd.date_range("20130101", periods=ndates, freq='D') + items = [f'item {i}' for i in range(nitems)] + + data = {} + for date in dates: + levels = [(item, random.randint(0, 10000)/100, random.randint(0, 10000)/100) for item in items] + levels.sort(key=lambda x: x[1]) + data[date] = levels + + df = make_df_from_data(data) + result = df.groupby('Date').iloc[3:7] + + sliced = {date: data[date][3:7] for date in dates} + expected = make_df_from_data(sliced) + + tm.assert_frame_equal(result, expected) + +def test_against_head_and_tail(): + """ Test gives the same results as grouped head and tail""" + + n_groups = 100 + n_rows_per_group = 30 + + data = { + 'group': [f'group {g}' for j in range(n_rows_per_group) for g in range(n_groups)], + 'value': [random.randint(0, 10000)/100 for j in range(n_rows_per_group) for g in range(n_groups)] + } + df = pd.DataFrame(data) + grouped = df.groupby('group') + + for i in [1, 5, 29, 30, 31, 1000]: + result = grouped.iloc[:i, :] + expected = grouped.head(i) + + tm.assert_frame_equal(result, expected) + + result = grouped.iloc[-i:, :] + expected = grouped.tail(i) + + tm.assert_frame_equal(result, expected) + +def test_against_df_iloc(): + """ Test that a single group gives the same results as DataFame.iloc""" + + n_rows_per_group = 30 + + data = { + 'group': [f'group 0' for j in range(n_rows_per_group)], + 'value': [random.randint(0, 10000)/100 for j in range(n_rows_per_group)] + } + df = pd.DataFrame(data) + grouped = df.groupby('group') + + for start in [None, 0, 1, 10, 29, 30, 1000, -1, -10, -29, -30, -1000]: + for stop in [None, 0, 1, 10, 29, 30, 1000, -1, -10, -29, -30, -1000]: + ### Work in progress! for step in [None, 1, 2, 3, 10, 29, 30, 100, -1, -2, -3, -10, -29, -30. 
-100]: + for step in [None, 1, 2, 3, 10, 29, 30, 100]: + result = grouped.iloc[start:stop:step, :] + expected = df.iloc[start:stop:step, :] + + tm.assert_frame_equal(result, expected) + +def test_series(): + """ Test grouped Series""" + + ser = pd.Series([1, 2, 3, 4, 5], index=['a', 'a', 'a', 'b', 'b']) + grouped = ser.groupby(level=0) + result = grouped.iloc[1:2] + expected = pd.Series([2, 5], index=['a', 'b']) + + tm.assert_series_equal(result, expected) + +def test_step(): + """ Test grouped slice with step""" + + data = [['x', f'x{i}'] for i in range(5)] + data += [['y', f'y{i}'] for i in range(4)] + data += [['z', f'z{i}'] for i in range(3)] + df = pd.DataFrame(data, columns=['A', 'B']) + + grouped = df.groupby('A') + + for step in [1, 2, 3, 4, 5]: + result = grouped.iloc[::step, :] + + data = [['x', f'x{i}'] for i in range(0, 5, step)] + data += [['y', f'y{i}'] for i in range(0, 4, step)] + data += [['z', f'z{i}'] for i in range(0, 3, step)] + + index = [i for i in range(0, 5, step)] + index += [5 + i for i in range(0, 4, step)] + index += [9 + i for i in range(0, 3, step)] + + expected = pd.DataFrame(data, columns=['A', 'B'], index=index) + + tm.assert_frame_equal(result, expected) + \ No newline at end of file From d0ebbeb8e8a6e858906d5e49df4df72db8553fa6 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 9 Aug 2021 13:44:14 +0100 Subject: [PATCH 02/73] Formatting --- pandas/core/groupby/groupby.py | 1 - pandas/core/groupby/groupbyindexing.py | 17 ++++++++++------- pandas/tests/groupby/test_groupby_iloc.py | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 593ca3a34e174..9fd052d7ee272 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3414,4 +3414,3 @@ def get_groupby( mutated=mutated, dropna=dropna, ) - diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py index 22a64454f0a2b..6a570f35cd6c8 100644 --- a/pandas/core/groupby/groupbyindexing.py +++ b/pandas/core/groupby/groupbyindexing.py @@ -4,6 +4,7 @@ from pandas.util._decorators import doc import numpy as np + class GroupByIndexingMixin: """ Mixin for adding .iloc to GroupBy. @@ -14,12 +15,13 @@ def iloc(self) -> _ilocGroupByIndexer: """ Integer location-based indexing for selection by position per group. - Similar to ``.apply(lambda x: x.iloc[i:j, k:l])``, but much faster and returns a subset of rows - from the original DataFrame with the original index and order preserved. - + Similar to ``.apply(lambda x: x.iloc[i:j, k:l])``, but much faster and returns + a subset of rows from the original DataFrame with the original index and order + preserved. + The output is compatible with head() and tail() The output is different from take() and nth() which do not preserve the index or order - + Allowed inputs for the first index are: - An integer, e.g. ``5``. @@ -43,11 +45,12 @@ def iloc(self) -> _ilocGroupByIndexer: Use Case -------- Supose that we have a multi-indexed DataFrame with a large primary index and a secondary sorted - to a different order for each primary. - To reduce the DataFrame to a middle slice of each secondary, group by the primary and then use iloc. + to a different order for each primary. + To reduce the DataFrame to a middle slice of each secondary, group by the primary and then + use iloc. This preserves the original DataFrame's order and indexing. 
(See tests/groupby/test_groupby_iloc) - + Examples -------- >>> df = pd.DataFrame([['a', 1], ['a', 2], ['a', 3], ['b', 4], ['b', 5]], diff --git a/pandas/tests/groupby/test_groupby_iloc.py b/pandas/tests/groupby/test_groupby_iloc.py index b8568d27d2575..9fa2a0f2d657a 100644 --- a/pandas/tests/groupby/test_groupby_iloc.py +++ b/pandas/tests/groupby/test_groupby_iloc.py @@ -40,7 +40,7 @@ def make_df_from_data(data): data = {} for date in dates: - levels = [(item, random.randint(0, 10000)/100, random.randint(0, 10000)/100) for item in items] + levels = [(item, random.randint(0, 10000) / 100, random.randint(0, 10000) / 100) for item in items] levels.sort(key=lambda x: x[1]) data[date] = levels @@ -76,6 +76,7 @@ def test_against_head_and_tail(): tm.assert_frame_equal(result, expected) + def test_against_df_iloc(): """ Test that a single group gives the same results as DataFame.iloc""" @@ -87,10 +88,9 @@ def test_against_df_iloc(): } df = pd.DataFrame(data) grouped = df.groupby('group') - + for start in [None, 0, 1, 10, 29, 30, 1000, -1, -10, -29, -30, -1000]: for stop in [None, 0, 1, 10, 29, 30, 1000, -1, -10, -29, -30, -1000]: - ### Work in progress! for step in [None, 1, 2, 3, 10, 29, 30, 100, -1, -2, -3, -10, -29, -30. -100]: for step in [None, 1, 2, 3, 10, 29, 30, 100]: result = grouped.iloc[start:stop:step, :] expected = df.iloc[start:stop:step, :] From 33d7992b013fad7d2f62c191f9a17d1cfeb61382 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 9 Aug 2021 14:23:30 +0100 Subject: [PATCH 03/73] Formatting --- pandas/core/groupby/groupbyindexing.py | 54 +++++++++++++---------- pandas/tests/groupby/test_groupby_iloc.py | 36 +++++++++------ 2 files changed, 54 insertions(+), 36 deletions(-) diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py index 6a570f35cd6c8..bd300f3608ab6 100644 --- a/pandas/core/groupby/groupbyindexing.py +++ b/pandas/core/groupby/groupbyindexing.py @@ -27,9 +27,9 @@ def iloc(self) -> _ilocGroupByIndexer: - An integer, e.g. ``5``. - A list or array of integers, e.g. ``[4, 3, 0]``. - A slice object with ints, e.g. ``1:7``. - + Allowed inputs for the second index are as for DataFrame.iloc, namely: - + - An integer, e.g. ``5``. - A list or array of integers, e.g. ``[4, 3, 0]``. - A slice object with ints, e.g. ``1:7``. @@ -67,22 +67,23 @@ def iloc(self) -> _ilocGroupByIndexer: """ return _ilocGroupByIndexer(self) + @doc(GroupByIndexingMixin.iloc) class _ilocGroupByIndexer: def __init__(self, grouped): - self.grouped = grouped - self.reversed = False - self._cached_ascending_count = None - self._cached_descending_count = None + self.grouped = grouped + self.reversed = False + self._cached_ascending_count = None + self._cached_descending_count = None def __getitem__(self, arg): self.reversed = False if type(arg) == tuple: - return self._handle_item(arg[0], arg[1]) + return self._handle_item(arg[0], arg[1]) else: - return self._handle_item(arg, None) + return self._handle_item(arg, None) def _handle_item(self, arg0, arg1): typeof_arg = type(arg0) @@ -92,10 +93,10 @@ def _handle_item(self, arg0, arg1): stop = arg0.stop step = arg0.step - if not step is None and step < 0: + if step is not None and step < 0: raise ValueError( - f'GroupBy.iloc row slice step must be positive. Slice was {start}:{stop}:{step}' - ) + f'GroupBy.iloc row slice step must be positive. 
Slice was {start}:{stop}:{step}' + ) # self.reversed = True # start = None if start is None else -start - 1 # stop = None if stop is None else -stop - 1 @@ -115,8 +116,8 @@ def _handle_item(self, arg0, arg1): def _handle_slice(self, start, stop, step, arg1): mask = None - if step == None: - step = 1 + if step is None: + step = 1 self.grouped._reset_group_selection() @@ -127,7 +128,7 @@ def _handle_slice(self, start, stop, step, arg1): else: if start >= 0: mask = self._ascending_count >= start - + if step > 1: mask &= (self._ascending_count - start) % step == 0 @@ -136,29 +137,34 @@ def _handle_slice(self, start, stop, step, arg1): if step > 1: # - # if start is -ve and -start excedes the length of a group then step must count from the + # if start is -ve and -start excedes the length of a group + # then step must count from the # first row of that group rather than the calculated offset # - # count_array + reverse_array gives the length of the current group enabling to switch between - # the offset_array and the count_array depening on whether -start excedes the group size + # count_array + reverse_array gives the length of the + # current group enabling to switch between + # the offset_array and the count_array depening on whether + # -start excedes the group size # offset_array = self._descending_count + start + 1 limit_array = (self._ascending_count + self._descending_count + (start + 1)) < 0 - offset_array = np.where(limit_array, self._ascending_count, offset_array) + offset_array = np.where( + limit_array, self._ascending_count, offset_array + ) mask &= offset_array % step == 0 - if not stop is None: + if stop is not None: if stop >= 0: if mask is None: mask = self._ascending_count < stop - + else: mask &= self._ascending_count < stop else: if mask is None: mask = self._descending_count >= -stop - + else: mask &= self._descending_count >= -stop @@ -186,7 +192,9 @@ def _ascending_count(self): @property def _descending_count(self): if self._cached_descending_count is None: - self._cached_descending_count = self.grouped._cumcount_array(ascending=False) + self._cached_descending_count = self.grouped._cumcount_array( + ascending=False + ) if self.reversed: self._cached_descending_count = self._cached_descending_count[::-1] @@ -198,4 +206,4 @@ def _selected_obj(self): return self.grouped._selected_obj.iloc[::-1] else: - return self.grouped._selected_obj + return self.grouped._selected_obj \ No newline at end of file diff --git a/pandas/tests/groupby/test_groupby_iloc.py b/pandas/tests/groupby/test_groupby_iloc.py index 9fa2a0f2d657a..211c43bebb538 100644 --- a/pandas/tests/groupby/test_groupby_iloc.py +++ b/pandas/tests/groupby/test_groupby_iloc.py @@ -4,22 +4,26 @@ import pandas._testing as tm import random + def test_doc_examples(): """ Test the examples in the documentation""" - df = pd.DataFrame([['a', 1], ['a', 2], ['a', 3], ['b', 4], ['b', 5]], columns=['A', 'B']) + df = pd.DataFrame( + [['a', 1], ['a', 2], ['a', 3], ['b', 4], ['b', 5]], columns=['A', 'B'] + ) grouped = df.groupby('A') result = grouped.iloc[1:2, :] - expected = pd.DataFrame([['a', 2], ['b', 5]], columns=['A', 'B'], index=[1, 4]) + expected = pd.DataFrame([['a', 2], ['b', 5]], columns=['A', 'B'], index=[1, 4]) tm.assert_frame_equal(result, expected) result = grouped.iloc[:-1, -1:] - expected = pd.DataFrame([1, 2, 4], columns=['B'], index=[0, 1, 3]) + expected = pd.DataFrame([1, 2, 4], columns=['B'], index=[0, 1, 3]) tm.assert_frame_equal(result, expected) + def test_multiindex(): """ Test the multiindex mentioned as 
the use-case in the documentation """ @@ -27,10 +31,10 @@ def make_df_from_data(data): rows = {} for date in dates: for level in data[date]: - rows[(date, level[0])] = {'A':level[1], 'B':level[2]} + rows[(date, level[0])] = {'A': level[1], 'B': level[2]} df = pd.DataFrame.from_dict(rows, orient='index') - df.index.names=('Date','Item') + df.index.names = ('Date', 'Item') return df ndates = 1000 @@ -40,7 +44,9 @@ def make_df_from_data(data): data = {} for date in dates: - levels = [(item, random.randint(0, 10000) / 100, random.randint(0, 10000) / 100) for item in items] + levels = [ + (item, random.randint(0, 10000) / 100, random.randint(0, 10000) / 100) for item in items + ] levels.sort(key=lambda x: x[1]) data[date] = levels @@ -52,19 +58,22 @@ def make_df_from_data(data): tm.assert_frame_equal(result, expected) + def test_against_head_and_tail(): """ Test gives the same results as grouped head and tail""" - + n_groups = 100 n_rows_per_group = 30 data = { 'group': [f'group {g}' for j in range(n_rows_per_group) for g in range(n_groups)], - 'value': [random.randint(0, 10000)/100 for j in range(n_rows_per_group) for g in range(n_groups)] + 'value': [ + random.randint(0, 10000)/100 for j in range(n_rows_per_group) for g in range(n_groups) + ] } df = pd.DataFrame(data) grouped = df.groupby('group') - + for i in [1, 5, 29, 30, 31, 1000]: result = grouped.iloc[:i, :] expected = grouped.head(i) @@ -84,7 +93,7 @@ def test_against_df_iloc(): data = { 'group': [f'group 0' for j in range(n_rows_per_group)], - 'value': [random.randint(0, 10000)/100 for j in range(n_rows_per_group)] + 'value': [random.randint(0, 10000) / 100 for j in range(n_rows_per_group)] } df = pd.DataFrame(data) grouped = df.groupby('group') @@ -97,16 +106,18 @@ def test_against_df_iloc(): tm.assert_frame_equal(result, expected) + def test_series(): """ Test grouped Series""" - ser = pd.Series([1, 2, 3, 4, 5], index=['a', 'a', 'a', 'b', 'b']) + ser = pd.Series([1, 2, 3, 4, 5], index=['a', 'a', 'a', 'b', 'b']) grouped = ser.groupby(level=0) result = grouped.iloc[1:2] expected = pd.Series([2, 5], index=['a', 'b']) tm.assert_series_equal(result, expected) + def test_step(): """ Test grouped slice with step""" @@ -130,5 +141,4 @@ def test_step(): expected = pd.DataFrame(data, columns=['A', 'B'], index=index) - tm.assert_frame_equal(result, expected) - \ No newline at end of file + tm.assert_frame_equal(result, expected) \ No newline at end of file From 78e9ced49c21ca08d9e2bd0818495d5fd02b1838 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 9 Aug 2021 14:28:52 +0100 Subject: [PATCH 04/73] Formatting --- pandas/core/groupby/groupbyindexing.py | 17 ++++++++--------- pandas/tests/groupby/test_groupby_iloc.py | 5 +++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py index bd300f3608ab6..6a310b6c10f50 100644 --- a/pandas/core/groupby/groupbyindexing.py +++ b/pandas/core/groupby/groupbyindexing.py @@ -27,9 +27,9 @@ def iloc(self) -> _ilocGroupByIndexer: - An integer, e.g. ``5``. - A list or array of integers, e.g. ``[4, 3, 0]``. - A slice object with ints, e.g. ``1:7``. - + Allowed inputs for the second index are as for DataFrame.iloc, namely: - + - An integer, e.g. ``5``. - A list or array of integers, e.g. ``[4, 3, 0]``. - A slice object with ints, e.g. ``1:7``. 
@@ -47,7 +47,7 @@ def iloc(self) -> _ilocGroupByIndexer: Supose that we have a multi-indexed DataFrame with a large primary index and a secondary sorted to a different order for each primary. To reduce the DataFrame to a middle slice of each secondary, group by the primary and then - use iloc. + use iloc. This preserves the original DataFrame's order and indexing. (See tests/groupby/test_groupby_iloc) @@ -81,10 +81,10 @@ def __getitem__(self, arg): if type(arg) == tuple: return self._handle_item(arg[0], arg[1]) - + else: return self._handle_item(arg, None) - + def _handle_item(self, arg0, arg1): typeof_arg = type(arg0) @@ -114,7 +114,6 @@ def _handle_item(self, arg0, arg1): def _handle_slice(self, start, stop, step, arg1): - mask = None if step is None: step = 1 @@ -137,7 +136,7 @@ def _handle_slice(self, start, stop, step, arg1): if step > 1: # - # if start is -ve and -start excedes the length of a group + # if start is -ve and -start excedes the length of a group # then step must count from the # first row of that group rather than the calculated offset # @@ -147,7 +146,7 @@ def _handle_slice(self, start, stop, step, arg1): # -start excedes the group size # offset_array = self._descending_count + start + 1 - limit_array = (self._ascending_count + self._descending_count + (start + 1)) < 0 + limit_array = (self._ascending_count + self._descending_count + (start + 1)) < 0 offset_array = np.where( limit_array, self._ascending_count, offset_array ) @@ -206,4 +205,4 @@ def _selected_obj(self): return self.grouped._selected_obj.iloc[::-1] else: - return self.grouped._selected_obj \ No newline at end of file + return self.grouped._selected_obj diff --git a/pandas/tests/groupby/test_groupby_iloc.py b/pandas/tests/groupby/test_groupby_iloc.py index 211c43bebb538..bdbc41324d33a 100644 --- a/pandas/tests/groupby/test_groupby_iloc.py +++ b/pandas/tests/groupby/test_groupby_iloc.py @@ -68,7 +68,7 @@ def test_against_head_and_tail(): data = { 'group': [f'group {g}' for j in range(n_rows_per_group) for g in range(n_groups)], 'value': [ - random.randint(0, 10000)/100 for j in range(n_rows_per_group) for g in range(n_groups) + random.randint(0, 10000) / 100 for j in range(n_rows_per_group) for g in range(n_groups) ] } df = pd.DataFrame(data) @@ -141,4 +141,5 @@ def test_step(): expected = pd.DataFrame(data, columns=['A', 'B'], index=index) - tm.assert_frame_equal(result, expected) \ No newline at end of file + tm.assert_frame_equal(result, expected) + \ No newline at end of file From 4d098cd0a8d3dffb00b5eb7d1c7e896431dd8ba6 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 9 Aug 2021 14:30:36 +0100 Subject: [PATCH 05/73] Formatting --- pandas/core/groupby/groupbyindexing.py | 1 - pandas/tests/groupby/test_groupby_iloc.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py index 6a310b6c10f50..8d2c9a76be53f 100644 --- a/pandas/core/groupby/groupbyindexing.py +++ b/pandas/core/groupby/groupbyindexing.py @@ -112,7 +112,6 @@ def _handle_item(self, arg0, arg1): f'GroupBy.iloc row must be an integer or a slice, not a {typeof_arg}' ) - def _handle_slice(self, start, stop, step, arg1): mask = None if step is None: diff --git a/pandas/tests/groupby/test_groupby_iloc.py b/pandas/tests/groupby/test_groupby_iloc.py index bdbc41324d33a..36ce4fab3ceb6 100644 --- a/pandas/tests/groupby/test_groupby_iloc.py +++ b/pandas/tests/groupby/test_groupby_iloc.py @@ -142,4 +142,4 @@ def test_step(): expected = pd.DataFrame(data, 
columns=['A', 'B'], index=index) tm.assert_frame_equal(result, expected) - \ No newline at end of file + \ No newline at end of file From f84c36560a6718cf11c7a37986f95b59c424d472 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 9 Aug 2021 14:32:00 +0100 Subject: [PATCH 06/73] Formatting --- pandas/tests/groupby/test_groupby_iloc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby_iloc.py b/pandas/tests/groupby/test_groupby_iloc.py index 36ce4fab3ceb6..8e314afed64c6 100644 --- a/pandas/tests/groupby/test_groupby_iloc.py +++ b/pandas/tests/groupby/test_groupby_iloc.py @@ -142,4 +142,3 @@ def test_step(): expected = pd.DataFrame(data, columns=['A', 'B'], index=index) tm.assert_frame_equal(result, expected) - \ No newline at end of file From d93775781458e0fc005493ddfa179376d23252f4 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 9 Aug 2021 17:09:37 +0100 Subject: [PATCH 07/73] Add iloc to test_tab_completion --- pandas/tests/groupby/test_allowlist.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py index 8be721c13eea8..a0c09b6c47cf9 100644 --- a/pandas/tests/groupby/test_allowlist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -309,6 +309,7 @@ def test_tab_completion(mframe): "rank", "cumprod", "tail", + "iloc", "resample", "cummin", "fillna", From e2069122649432ecec5ec774beb5310fc94e3ba0 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 9 Aug 2021 17:49:21 +0100 Subject: [PATCH 08/73] Add iloc to groupby/base.py --- pandas/core/groupby/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index d4e042122a9c3..fd0267147620a 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -125,6 +125,7 @@ "groups", "head", "hist", + "iloc", "indices", "ndim", "ngroups", From 1788f1bc27fe93b67b04fc132c61491e4c249a27 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 10 Aug 2021 12:33:49 +0100 Subject: [PATCH 09/73] Documentation --- pandas/core/groupby/groupbyindexing.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py index 8d2c9a76be53f..d903bdfcfa69b 100644 --- a/pandas/core/groupby/groupbyindexing.py +++ b/pandas/core/groupby/groupbyindexing.py @@ -22,11 +22,12 @@ def iloc(self) -> _ilocGroupByIndexer: The output is compatible with head() and tail() The output is different from take() and nth() which do not preserve the index or order + Inputs + ------ Allowed inputs for the first index are: - An integer, e.g. ``5``. - - A list or array of integers, e.g. ``[4, 3, 0]``. - - A slice object with ints, e.g. ``1:7``. + - A slice object with ints and positive step, e.g. ``1:``, ``4:-3:2``. Allowed inputs for the second index are as for DataFrame.iloc, namely: @@ -37,11 +38,16 @@ def iloc(self) -> _ilocGroupByIndexer: - A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above). - Returns ------- Series or DataFrame + Note + ---- + Neither GroupBy.nth() nor GroupBy.take() take a slice argument and + neither of them preserve the original DataFrame order and index. + They are both slow for large integer lists and take() is very slow for large group counts. 
+ Use Case -------- Supose that we have a multi-indexed DataFrame with a large primary index and a secondary sorted From f6977fa3b342847b534b55d08d5529e744ca9676 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 10 Aug 2021 15:09:11 +0100 Subject: [PATCH 10/73] Cosmetics to make pre-commit happy --- pandas/core/groupby/groupbyindexing.py | 27 +++++---- pandas/tests/groupby/test_groupby_iloc.py | 72 ++++++++++++----------- 2 files changed, 51 insertions(+), 48 deletions(-) diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py index d903bdfcfa69b..30a0ab10be282 100644 --- a/pandas/core/groupby/groupbyindexing.py +++ b/pandas/core/groupby/groupbyindexing.py @@ -1,5 +1,4 @@ from __future__ import annotations -from unittest.mock import PropertyMock from pandas.util._decorators import doc import numpy as np @@ -50,22 +49,22 @@ def iloc(self) -> _ilocGroupByIndexer: Use Case -------- - Supose that we have a multi-indexed DataFrame with a large primary index and a secondary sorted + Suppose that we have a multi-indexed DataFrame with a large primary index and a secondary sorted to a different order for each primary. To reduce the DataFrame to a middle slice of each secondary, group by the primary and then use iloc. - This preserves the original DataFrame's order and indexing. + This preserves the original DataFrame"s order and indexing. (See tests/groupby/test_groupby_iloc) Examples -------- - >>> df = pd.DataFrame([['a', 1], ['a', 2], ['a', 3], ['b', 4], ['b', 5]], - ... columns=['A', 'B']) - >>> df.groupby('A').iloc[1:2] + >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], + ... columns=["A", "B"]) + >>> df.groupby("A").iloc[1:2] A B 1 a 2 4 b 5 - >>> df.groupby('A').iloc[:-1, -1:] + >>> df.groupby("A").iloc[:-1, -1:] B 0 1 1 2 @@ -101,7 +100,7 @@ def _handle_item(self, arg0, arg1): if step is not None and step < 0: raise ValueError( - f'GroupBy.iloc row slice step must be positive. Slice was {start}:{stop}:{step}' + f"GroupBy.iloc row slice step must be positive. 
Slice was {start}:{stop}:{step}" ) # self.reversed = True # start = None if start is None else -start - 1 @@ -115,7 +114,7 @@ def _handle_item(self, arg0, arg1): else: raise ValueError( - f'GroupBy.iloc row must be an integer or a slice, not a {typeof_arg}' + f"GroupBy.iloc row must be an integer or a slice, not a {typeof_arg}" ) def _handle_slice(self, start, stop, step, arg1): @@ -141,17 +140,19 @@ def _handle_slice(self, start, stop, step, arg1): if step > 1: # - # if start is -ve and -start excedes the length of a group + # if start is -ve and -start exceedes the length of a group # then step must count from the # first row of that group rather than the calculated offset # # count_array + reverse_array gives the length of the # current group enabling to switch between - # the offset_array and the count_array depening on whether - # -start excedes the group size + # the offset_array and the count_array depending on whether + # -start exceedes the group size # offset_array = self._descending_count + start + 1 - limit_array = (self._ascending_count + self._descending_count + (start + 1)) < 0 + limit_array = ( + self._ascending_count + self._descending_count + (start + 1) + ) < 0 offset_array = np.where( limit_array, self._ascending_count, offset_array ) diff --git a/pandas/tests/groupby/test_groupby_iloc.py b/pandas/tests/groupby/test_groupby_iloc.py index 8e314afed64c6..bca61c3db33ab 100644 --- a/pandas/tests/groupby/test_groupby_iloc.py +++ b/pandas/tests/groupby/test_groupby_iloc.py @@ -6,41 +6,41 @@ def test_doc_examples(): - """ Test the examples in the documentation""" + """Test the examples in the documentation""" df = pd.DataFrame( - [['a', 1], ['a', 2], ['a', 3], ['b', 4], ['b', 5]], columns=['A', 'B'] + [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"] ) - grouped = df.groupby('A') + grouped = df.groupby("A") result = grouped.iloc[1:2, :] - expected = pd.DataFrame([['a', 2], ['b', 5]], columns=['A', 'B'], index=[1, 4]) + expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4]) tm.assert_frame_equal(result, expected) result = grouped.iloc[:-1, -1:] - expected = pd.DataFrame([1, 2, 4], columns=['B'], index=[0, 1, 3]) + expected = pd.DataFrame([1, 2, 4], columns=["B"], index=[0, 1, 3]) tm.assert_frame_equal(result, expected) def test_multiindex(): - """ Test the multiindex mentioned as the use-case in the documentation """ + """Test the multiindex mentioned as the use-case in the documentation""" def make_df_from_data(data): rows = {} for date in dates: for level in data[date]: - rows[(date, level[0])] = {'A': level[1], 'B': level[2]} + rows[(date, level[0])] = {"A": level[1], "B": level[2]} - df = pd.DataFrame.from_dict(rows, orient='index') - df.index.names = ('Date', 'Item') + df = pd.DataFrame.from_dict(rows, orient="index") + df.index.names = ("Date", "Item") return df ndates = 1000 nitems = 40 - dates = pd.date_range("20130101", periods=ndates, freq='D') - items = [f'item {i}' for i in range(nitems)] + dates = pd.date_range("20130101", periods=ndates, freq="D") + items = [f"item {i}" for i in range(nitems)] data = {} for date in dates: @@ -51,7 +51,7 @@ def make_df_from_data(data): data[date] = levels df = make_df_from_data(data) - result = df.groupby('Date').iloc[3:7] + result = df.groupby("Date").iloc[3:7] sliced = {date: data[date][3:7] for date in dates} expected = make_df_from_data(sliced) @@ -60,19 +60,21 @@ def make_df_from_data(data): def test_against_head_and_tail(): - """ Test gives the same results as grouped head and 
tail""" + """Test gives the same results as grouped head and tail""" n_groups = 100 n_rows_per_group = 30 data = { - 'group': [f'group {g}' for j in range(n_rows_per_group) for g in range(n_groups)], - 'value': [ - random.randint(0, 10000) / 100 for j in range(n_rows_per_group) for g in range(n_groups) + "group": [f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups)], + "value": [ + random.randint(0, 10000) / 100 + for j in range(n_rows_per_group) + for g in range(n_groups) ] } df = pd.DataFrame(data) - grouped = df.groupby('group') + grouped = df.groupby("group") for i in [1, 5, 29, 30, 31, 1000]: result = grouped.iloc[:i, :] @@ -87,16 +89,16 @@ def test_against_head_and_tail(): def test_against_df_iloc(): - """ Test that a single group gives the same results as DataFame.iloc""" + """Test that a single group gives the same results as DataFame.iloc""" n_rows_per_group = 30 data = { - 'group': [f'group 0' for j in range(n_rows_per_group)], - 'value': [random.randint(0, 10000) / 100 for j in range(n_rows_per_group)] + "group": ["group 0" for j in range(n_rows_per_group)], + "value": [random.randint(0, 10000) / 100 for j in range(n_rows_per_group)] } df = pd.DataFrame(data) - grouped = df.groupby('group') + grouped = df.groupby("group") for start in [None, 0, 1, 10, 29, 30, 1000, -1, -10, -29, -30, -1000]: for stop in [None, 0, 1, 10, 29, 30, 1000, -1, -10, -29, -30, -1000]: @@ -108,37 +110,37 @@ def test_against_df_iloc(): def test_series(): - """ Test grouped Series""" + """Test grouped Series""" - ser = pd.Series([1, 2, 3, 4, 5], index=['a', 'a', 'a', 'b', 'b']) + ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"]) grouped = ser.groupby(level=0) result = grouped.iloc[1:2] - expected = pd.Series([2, 5], index=['a', 'b']) + expected = pd.Series([2, 5], index=["a", "b"]) tm.assert_series_equal(result, expected) def test_step(): - """ Test grouped slice with step""" + """Test grouped slice with step""" - data = [['x', f'x{i}'] for i in range(5)] - data += [['y', f'y{i}'] for i in range(4)] - data += [['z', f'z{i}'] for i in range(3)] - df = pd.DataFrame(data, columns=['A', 'B']) + data = [["x", f"x{i}"] for i in range(5)] + data += [["y", f"y{i}"] for i in range(4)] + data += [["z", f"z{i}"] for i in range(3)] + df = pd.DataFrame(data, columns=["A", "B"]) - grouped = df.groupby('A') + grouped = df.groupby("A") for step in [1, 2, 3, 4, 5]: result = grouped.iloc[::step, :] - data = [['x', f'x{i}'] for i in range(0, 5, step)] - data += [['y', f'y{i}'] for i in range(0, 4, step)] - data += [['z', f'z{i}'] for i in range(0, 3, step)] + data = [["x", f"x{i}"] for i in range(0, 5, step)] + data += [["y", f"y{i}"] for i in range(0, 4, step)] + data += [["z", f"z{i}"] for i in range(0, 3, step)] - index = [i for i in range(0, 5, step)] + index = [0 + i for i in range(0, 5, step)] index += [5 + i for i in range(0, 4, step)] index += [9 + i for i in range(0, 3, step)] - expected = pd.DataFrame(data, columns=['A', 'B'], index=index) + expected = pd.DataFrame(data, columns=["A", "B"], index=index) tm.assert_frame_equal(result, expected) From bca4fddaba072730dc52b94fd46834cd62f43346 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 11 Aug 2021 11:25:39 +0100 Subject: [PATCH 11/73] Improve docstring --- pandas/core/groupby/a.md | 86 +++++++++++++++++++++ pandas/core/groupby/groupbyindexing.py | 100 ++++++++++++++++--------- 2 files changed, 152 insertions(+), 34 deletions(-) create mode 100644 pandas/core/groupby/a.md diff --git a/pandas/core/groupby/a.md 
b/pandas/core/groupby/a.md new file mode 100644 index 0000000000000..77f416d8a575a --- /dev/null +++ b/pandas/core/groupby/a.md @@ -0,0 +1,86 @@ +Purely integer-location based indexing for selection by position per group. + +``.iloc[]`` is primarily integer position based (from ``0`` to +``length-1`` of the axis), + +Allowed inputs for the first index are: + +- An integer, e.g. ``5``. +- A slice object with ints and positive step, e.g. ``1:``, ``4:-3:2``. + +Allowed inputs for the second index are as for DataFrame.iloc, namely: + +- An integer, e.g. ``5``. +- A list or array of integers, e.g. ``[4, 3, 0]``. +- A slice object with ints, e.g. ``1:7``. +- A boolean array. +- A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above). + +The output format is the same as GroupBy.head and GroupBy.tail, namely a subset +of the original DataFrame or Series with the index and order preserved. + +The effect of ``grouped.iloc[i:j, k:l]`` is similar to + + grouped.apply(lambda x: x.iloc[i:j, k:l]) + +but very much faster and preserving the original index and order. + +The behaviour is different from GroupBy.take: +- Input to iloc is a slice of indexes rather than a list of indexes. +- Output from iloc is: + - In the same order as the original grouped DataFrame or Series. + - Has the same index columns as the original grouped DataFrame or Series. + (GroupBy.take introduces an additional index) +- GroupBy.take is extremely slow when there is a high group count. + +The behaviour is different from GroupBy.nth: +- Input to iloc is a slice of indexes rather than a list of indexes. +- Output from iloc is: + - In the same order as the original grouped DataFrame or Series. + - Has the same index columns as the original grouped DataFrame or Series. + (nth removes the grouped index) +- GroupBy.nth is quite fast for a high group count but the processing time + grows with the length of the list of indexes. + +Since GroupBy.take and GroupBy.nth only accept a list of individual indexes +it is not possible to define a slice that ends relative to the last row of +each group. + +An important use case for GroupBy.iloc is a multi-indexed DataFrame with a +large primary index (Date, say) and a secondary index sorted to a different +order for each Date. +To reduce the DataFrame to a middle slice of each Date: + + df.groupby("Date").iloc[5:-5] + +This returns a subset of df containing just the middle rows for each Date +and with its original order and indexing preserved. +(See test_multiindex() in tests/groupby/test_groupby_iloc.py) + +Returns +------- +Series or DataFrame + +See Also +-------- +DataFrame.iloc : Purely integer-location based indexing for selection by position. +GroupBy.head : Return first n rows of each group. +GroupBy.tail : Return last n rows of each group. +GroupBy.nth : Take the nth row from each group if n is an int, or a subset of rows +if n is a list of ints. +GroupBy.take : Return the elements in the given positional indices along an axis. + +Examples +-------- + >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], + ... 
columns=["A", "B"]) + >>> df.groupby("A").iloc[1:2] + A B + 1 a 2 + 4 b 5 + >>> df.groupby("A").iloc[:-1, -1:] + B + 0 1 + 1 2 + 3 4 \ No newline at end of file diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py index 30a0ab10be282..9eeb6838b193a 100644 --- a/pandas/core/groupby/groupbyindexing.py +++ b/pandas/core/groupby/groupbyindexing.py @@ -12,17 +12,11 @@ class GroupByIndexingMixin: @property def iloc(self) -> _ilocGroupByIndexer: """ - Integer location-based indexing for selection by position per group. + Purely integer-location based indexing for selection by position per group. - Similar to ``.apply(lambda x: x.iloc[i:j, k:l])``, but much faster and returns - a subset of rows from the original DataFrame with the original index and order - preserved. + ``.iloc[]`` is primarily integer position based (from ``0`` to + ``length-1`` of the axis), - The output is compatible with head() and tail() - The output is different from take() and nth() which do not preserve the index or order - - Inputs - ------ Allowed inputs for the first index are: - An integer, e.g. ``5``. @@ -37,38 +31,76 @@ def iloc(self) -> _ilocGroupByIndexer: - A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above). + The output format is the same as GroupBy.head and GroupBy.tail, namely a subset + of the original DataFrame or Series with the index and order preserved. + + The effect of ``grouped.iloc[i:j, k:l]`` is similar to + + grouped.apply(lambda x: x.iloc[i:j, k:l]) + + but very much faster and preserving the original index and order. + + The behaviour is different from GroupBy.take: + - Input to iloc is a slice of indexes rather than a list of indexes. + - Output from iloc is: + - In the same order as the original grouped DataFrame or Series. + - Has the same index columns as the original grouped DataFrame or Series. + (GroupBy.take introduces an additional index) + - GroupBy.take is extremely slow when there is a high group count. + + The behaviour is different from GroupBy.nth: + - Input to iloc is a slice of indexes rather than a list of indexes. + - Output from iloc is: + - In the same order as the original grouped DataFrame or Series. + - Has the same index columns as the original grouped DataFrame or Series. + (nth behaves like an aggregator and removes the non-grouped indexes) + - GroupBy.nth is quite fast for a high group count but slower than head, + tail and iloc. + + Since GroupBy.take and GroupBy.nth only accept a list of individual indexes + it is not possible to define a slice that ends relative to the last row of + each group. + + An important use case for GroupBy.iloc is a multi-indexed DataFrame with a + large primary index (Date, say) and a secondary index sorted to a different + order for each Date. + To reduce the DataFrame to a middle slice of each Date: + + df.groupby("Date").iloc[5:-5] + + This returns a subset of df containing just the middle rows for each Date + and with its original order and indexing preserved. + (See test_multiindex() in tests/groupby/test_groupby_iloc.py) + Returns ------- - Series or DataFrame - - Note - ---- - Neither GroupBy.nth() nor GroupBy.take() take a slice argument and - neither of them preserve the original DataFrame order and index. - They are both slow for large integer lists and take() is very slow for large group counts. + Series + The filtered subset of the original grouped Series. 
+ DataFrame + The filtered subset of the original grouped DataFrame. - Use Case + See Also -------- - Suppose that we have a multi-indexed DataFrame with a large primary index and a secondary sorted - to a different order for each primary. - To reduce the DataFrame to a middle slice of each secondary, group by the primary and then - use iloc. - This preserves the original DataFrame"s order and indexing. - (See tests/groupby/test_groupby_iloc) + DataFrame.iloc : Purely integer-location based indexing for selection by position. + GroupBy.head : Return first n rows of each group. + GroupBy.tail : Return last n rows of each group. + GroupBy.nth : Take the nth row from each group if n is an int, or a subset of rows + if n is a list of ints. + GroupBy.take : Return the elements in the given positional indices along an axis. Examples -------- - >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], - ... columns=["A", "B"]) - >>> df.groupby("A").iloc[1:2] - A B - 1 a 2 - 4 b 5 - >>> df.groupby("A").iloc[:-1, -1:] - B - 0 1 - 1 2 - 3 4 + >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], + ... columns=["A", "B"]) + >>> df.groupby("A").iloc[1:2] + A B + 1 a 2 + 4 b 5 + >>> df.groupby("A").iloc[:-1, -1:] + B + 0 1 + 1 2 + 3 4 """ return _ilocGroupByIndexer(self) From 66536b13ad95e50ca8a42e15cc25ef4db979bf2c Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 11 Aug 2021 11:27:18 +0100 Subject: [PATCH 12/73] Delete a.md --- pandas/core/groupby/a.md | 86 ---------------------------------------- 1 file changed, 86 deletions(-) delete mode 100644 pandas/core/groupby/a.md diff --git a/pandas/core/groupby/a.md b/pandas/core/groupby/a.md deleted file mode 100644 index 77f416d8a575a..0000000000000 --- a/pandas/core/groupby/a.md +++ /dev/null @@ -1,86 +0,0 @@ -Purely integer-location based indexing for selection by position per group. - -``.iloc[]`` is primarily integer position based (from ``0`` to -``length-1`` of the axis), - -Allowed inputs for the first index are: - -- An integer, e.g. ``5``. -- A slice object with ints and positive step, e.g. ``1:``, ``4:-3:2``. - -Allowed inputs for the second index are as for DataFrame.iloc, namely: - -- An integer, e.g. ``5``. -- A list or array of integers, e.g. ``[4, 3, 0]``. -- A slice object with ints, e.g. ``1:7``. -- A boolean array. -- A ``callable`` function with one argument (the calling Series or - DataFrame) and that returns valid output for indexing (one of the above). - -The output format is the same as GroupBy.head and GroupBy.tail, namely a subset -of the original DataFrame or Series with the index and order preserved. - -The effect of ``grouped.iloc[i:j, k:l]`` is similar to - - grouped.apply(lambda x: x.iloc[i:j, k:l]) - -but very much faster and preserving the original index and order. - -The behaviour is different from GroupBy.take: -- Input to iloc is a slice of indexes rather than a list of indexes. -- Output from iloc is: - - In the same order as the original grouped DataFrame or Series. - - Has the same index columns as the original grouped DataFrame or Series. - (GroupBy.take introduces an additional index) -- GroupBy.take is extremely slow when there is a high group count. - -The behaviour is different from GroupBy.nth: -- Input to iloc is a slice of indexes rather than a list of indexes. -- Output from iloc is: - - In the same order as the original grouped DataFrame or Series. - - Has the same index columns as the original grouped DataFrame or Series. 
- (nth removes the grouped index) -- GroupBy.nth is quite fast for a high group count but the processing time - grows with the length of the list of indexes. - -Since GroupBy.take and GroupBy.nth only accept a list of individual indexes -it is not possible to define a slice that ends relative to the last row of -each group. - -An important use case for GroupBy.iloc is a multi-indexed DataFrame with a -large primary index (Date, say) and a secondary index sorted to a different -order for each Date. -To reduce the DataFrame to a middle slice of each Date: - - df.groupby("Date").iloc[5:-5] - -This returns a subset of df containing just the middle rows for each Date -and with its original order and indexing preserved. -(See test_multiindex() in tests/groupby/test_groupby_iloc.py) - -Returns -------- -Series or DataFrame - -See Also --------- -DataFrame.iloc : Purely integer-location based indexing for selection by position. -GroupBy.head : Return first n rows of each group. -GroupBy.tail : Return last n rows of each group. -GroupBy.nth : Take the nth row from each group if n is an int, or a subset of rows -if n is a list of ints. -GroupBy.take : Return the elements in the given positional indices along an axis. - -Examples --------- - >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], - ... columns=["A", "B"]) - >>> df.groupby("A").iloc[1:2] - A B - 1 a 2 - 4 b 5 - >>> df.groupby("A").iloc[:-1, -1:] - B - 0 1 - 1 2 - 3 4 \ No newline at end of file From d075c67f8ff196f2f1f1996191106164520f765e Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 19 Aug 2021 16:41:05 +0100 Subject: [PATCH 13/73] Add to doc and improve test --- doc/source/reference/groupby.rst | 1 + pandas/core/groupby/groupbyindexing.py | 4 ++-- pandas/tests/groupby/test_groupby_iloc.py | 11 ++++++----- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index ccf130d03418c..da89c27e43bf8 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -67,6 +67,7 @@ Computations / descriptive stats GroupBy.min GroupBy.ngroup GroupBy.nth + GroupBy.iloc GroupBy.ohlc GroupBy.pad GroupBy.prod diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py index 9eeb6838b193a..90e3c81955353 100644 --- a/pandas/core/groupby/groupbyindexing.py +++ b/pandas/core/groupby/groupbyindexing.py @@ -56,7 +56,7 @@ def iloc(self) -> _ilocGroupByIndexer: (nth behaves like an aggregator and removes the non-grouped indexes) - GroupBy.nth is quite fast for a high group count but slower than head, tail and iloc. - + Since GroupBy.take and GroupBy.nth only accept a list of individual indexes it is not possible to define a slice that ends relative to the last row of each group. @@ -86,7 +86,7 @@ def iloc(self) -> _ilocGroupByIndexer: GroupBy.tail : Return last n rows of each group. GroupBy.nth : Take the nth row from each group if n is an int, or a subset of rows if n is a list of ints. - GroupBy.take : Return the elements in the given positional indices along an axis. + DataFrameGroupBy.take : Return the elements in the given positional indices along an axis. 
Examples -------- diff --git a/pandas/tests/groupby/test_groupby_iloc.py b/pandas/tests/groupby/test_groupby_iloc.py index bca61c3db33ab..7a6951d9d597d 100644 --- a/pandas/tests/groupby/test_groupby_iloc.py +++ b/pandas/tests/groupby/test_groupby_iloc.py @@ -37,23 +37,24 @@ def make_df_from_data(data): df.index.names = ("Date", "Item") return df - ndates = 1000 - nitems = 40 + ndates = 100 + nitems = 20 dates = pd.date_range("20130101", periods=ndates, freq="D") items = [f"item {i}" for i in range(nitems)] data = {} for date in dates: + nitems_for_date = nitems - random.randint(0, 12) levels = [ - (item, random.randint(0, 10000) / 100, random.randint(0, 10000) / 100) for item in items + (item, random.randint(0, 10000) / 100, random.randint(0, 10000) / 100) for item in items[:nitems_for_date] ] levels.sort(key=lambda x: x[1]) data[date] = levels df = make_df_from_data(data) - result = df.groupby("Date").iloc[3:7] + result = df.groupby("Date").iloc[3:-3] - sliced = {date: data[date][3:7] for date in dates} + sliced = {date: data[date][3:-3] for date in dates} expected = make_df_from_data(sliced) tm.assert_frame_equal(result, expected) From df1a767873077b278c7c07c1e6fb92980d062215 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 19 Aug 2021 17:46:13 +0100 Subject: [PATCH 14/73] Tidy-up for pre-commit --- pandas/core/groupby/groupby.py | 3 +-- pandas/core/groupby/groupbyindexing.py | 13 ++++++++----- pandas/tests/groupby/test_groupby_iloc.py | 17 ++++++++++++----- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9fd052d7ee272..cef0b7aae322d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -95,6 +95,7 @@ class providing the base-class of operations. numba_, ops, ) +from pandas.core.groupby.groupbyindexing import GroupByIndexingMixin from pandas.core.indexes.api import ( CategoricalIndex, Index, @@ -109,8 +110,6 @@ class providing the base-class of operations. maybe_use_numba, ) -from pandas.core.groupby.groupbyindexing import GroupByIndexingMixin - _common_see_also = """ See Also -------- diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py index 90e3c81955353..1fa7d46e271af 100644 --- a/pandas/core/groupby/groupbyindexing.py +++ b/pandas/core/groupby/groupbyindexing.py @@ -1,8 +1,9 @@ from __future__ import annotations -from pandas.util._decorators import doc import numpy as np +from pandas.util._decorators import doc + class GroupByIndexingMixin: """ @@ -81,12 +82,14 @@ def iloc(self) -> _ilocGroupByIndexer: See Also -------- - DataFrame.iloc : Purely integer-location based indexing for selection by position. + DataFrame.iloc : Purely integer-location based indexing for selection by + position. GroupBy.head : Return first n rows of each group. GroupBy.tail : Return last n rows of each group. - GroupBy.nth : Take the nth row from each group if n is an int, or a subset of rows - if n is a list of ints. - DataFrameGroupBy.take : Return the elements in the given positional indices along an axis. + GroupBy.nth : Take the nth row from each group if n is an int, or a + subset of rows, if n is a list of ints. + DataFrameGroupBy.take : Return the elements in the given positional indices along + an axis. 
Examples -------- diff --git a/pandas/tests/groupby/test_groupby_iloc.py b/pandas/tests/groupby/test_groupby_iloc.py index 7a6951d9d597d..8ba971c7da13d 100644 --- a/pandas/tests/groupby/test_groupby_iloc.py +++ b/pandas/tests/groupby/test_groupby_iloc.py @@ -1,8 +1,9 @@ """ Test positional grouped indexing with iloc GH#42864""" +import random + import pandas as pd import pandas._testing as tm -import random def test_doc_examples(): @@ -67,12 +68,14 @@ def test_against_head_and_tail(): n_rows_per_group = 30 data = { - "group": [f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups)], + "group": [ + f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups) + ], "value": [ random.randint(0, 10000) / 100 for j in range(n_rows_per_group) for g in range(n_groups) - ] + ], } df = pd.DataFrame(data) grouped = df.groupby("group") @@ -95,8 +98,12 @@ def test_against_df_iloc(): n_rows_per_group = 30 data = { - "group": ["group 0" for j in range(n_rows_per_group)], - "value": [random.randint(0, 10000) / 100 for j in range(n_rows_per_group)] + "group": [ + "group 0" for j in range(n_rows_per_group) + ], + "value": [ + random.randint(0, 10000) / 100 for j in range(n_rows_per_group) + ], } df = pd.DataFrame(data) grouped = df.groupby("group") From f2e9f79f238ef6df117be5142bd348b3ebad93e4 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 19 Aug 2021 23:47:24 +0100 Subject: [PATCH 15/73] Update groupbyindexing.py Docstring changes for Sphinx --- pandas/core/groupby/groupbyindexing.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py index 1fa7d46e271af..1fe56c06bfd20 100644 --- a/pandas/core/groupby/groupbyindexing.py +++ b/pandas/core/groupby/groupbyindexing.py @@ -45,16 +45,17 @@ def iloc(self) -> _ilocGroupByIndexer: - Input to iloc is a slice of indexes rather than a list of indexes. - Output from iloc is: - In the same order as the original grouped DataFrame or Series. - - Has the same index columns as the original grouped DataFrame or Series. - (GroupBy.take introduces an additional index) + - Has the same index columns as the original grouped DataFrame or + Series. (GroupBy.take introduces an additional index) - GroupBy.take is extremely slow when there is a high group count. The behaviour is different from GroupBy.nth: - Input to iloc is a slice of indexes rather than a list of indexes. - Output from iloc is: - In the same order as the original grouped DataFrame or Series. - - Has the same index columns as the original grouped DataFrame or Series. - (nth behaves like an aggregator and removes the non-grouped indexes) + - Has the same index columns as the original grouped DataFrame or + Series. (nth behaves like an aggregator and removes the non-grouped + indexes) - GroupBy.nth is quite fast for a high group count but slower than head, tail and iloc. 
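A minimal usage sketch of the behaviour described in the docstring patches above. It assumes a pandas build with this patch series applied (the accessor is spelled ``GroupBy.iloc`` at this point in the series and is reworked in PATCH 17/73); the commented result is taken from the docstring example introduced in PATCH 01/73 and refined in PATCH 11/73, not from a released pandas, and the variable names are illustrative only.

    import pandas as pd

    df = pd.DataFrame(
        [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"]
    )
    grouped = df.groupby("A")

    # With the patches applied, the second row of each group is selected with
    # the original index and order preserved (docstring example):
    #     grouped.iloc[1:2]
    #       A  B
    #     1  a  2
    #     4  b  5

    # Closest stock-pandas equivalent: much slower, and with the default
    # group_keys=True it adds the group key as an outer index level.
    via_apply = grouped.apply(lambda x: x.iloc[1:2])

    # head()/tail() already return a subset of the original frame with index and
    # order preserved, which is why the docstring describes the new output as
    # "compatible with head() and tail()".
    first_two_per_group = grouped.head(2)  # index [0, 1, 3, 4]

The same frame and grouping back the examples in pandas/tests/groupby/test_groupby_iloc.py::test_doc_examples, so the sketch can be cross-checked against that test once the series is applied.
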
From a9f9848203cadd5ad29a622452ec953239b3f2a8 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 20 Aug 2021 11:23:01 +0100 Subject: [PATCH 16/73] Split a long line --- pandas/core/groupby/groupbyindexing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py index 1fe56c06bfd20..2066927a2feeb 100644 --- a/pandas/core/groupby/groupbyindexing.py +++ b/pandas/core/groupby/groupbyindexing.py @@ -136,7 +136,8 @@ def _handle_item(self, arg0, arg1): if step is not None and step < 0: raise ValueError( - f"GroupBy.iloc row slice step must be positive. Slice was {start}:{stop}:{step}" + f"GroupBy.iloc row slice step must be positive." + " Slice was {start}:{stop}:{step}" ) # self.reversed = True # start = None if start is None else -start - 1 From e42c86d3375f19468209cfa640298818496d1c8c Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 2 Sep 2021 08:19:19 +0100 Subject: [PATCH 17/73] GroupBy.rows implementation --- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/groupbyindexing.py | 251 ------------------- pandas/core/groupby/indexing.py | 241 ++++++++++++++++++ pandas/tests/groupby/test_groupby_iloc.py | 154 ------------ pandas/tests/groupby/test_rows.py | 290 ++++++++++++++++++++++ 5 files changed, 532 insertions(+), 406 deletions(-) delete mode 100644 pandas/core/groupby/groupbyindexing.py create mode 100644 pandas/core/groupby/indexing.py delete mode 100644 pandas/tests/groupby/test_groupby_iloc.py create mode 100644 pandas/tests/groupby/test_rows.py diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index cef0b7aae322d..dc0f94301eae4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -95,7 +95,7 @@ class providing the base-class of operations. numba_, ops, ) -from pandas.core.groupby.groupbyindexing import GroupByIndexingMixin +from pandas.core.groupby.indexing import GroupByIndexingMixin from pandas.core.indexes.api import ( CategoricalIndex, Index, diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py deleted file mode 100644 index 2066927a2feeb..0000000000000 --- a/pandas/core/groupby/groupbyindexing.py +++ /dev/null @@ -1,251 +0,0 @@ -from __future__ import annotations - -import numpy as np - -from pandas.util._decorators import doc - - -class GroupByIndexingMixin: - """ - Mixin for adding .iloc to GroupBy. - """ - - @property - def iloc(self) -> _ilocGroupByIndexer: - """ - Purely integer-location based indexing for selection by position per group. - - ``.iloc[]`` is primarily integer position based (from ``0`` to - ``length-1`` of the axis), - - Allowed inputs for the first index are: - - - An integer, e.g. ``5``. - - A slice object with ints and positive step, e.g. ``1:``, ``4:-3:2``. - - Allowed inputs for the second index are as for DataFrame.iloc, namely: - - - An integer, e.g. ``5``. - - A list or array of integers, e.g. ``[4, 3, 0]``. - - A slice object with ints, e.g. ``1:7``. - - A boolean array. - - A ``callable`` function with one argument (the calling Series or - DataFrame) and that returns valid output for indexing (one of the above). - - The output format is the same as GroupBy.head and GroupBy.tail, namely a subset - of the original DataFrame or Series with the index and order preserved. - - The effect of ``grouped.iloc[i:j, k:l]`` is similar to - - grouped.apply(lambda x: x.iloc[i:j, k:l]) - - but very much faster and preserving the original index and order. 
- - The behaviour is different from GroupBy.take: - - Input to iloc is a slice of indexes rather than a list of indexes. - - Output from iloc is: - - In the same order as the original grouped DataFrame or Series. - - Has the same index columns as the original grouped DataFrame or - Series. (GroupBy.take introduces an additional index) - - GroupBy.take is extremely slow when there is a high group count. - - The behaviour is different from GroupBy.nth: - - Input to iloc is a slice of indexes rather than a list of indexes. - - Output from iloc is: - - In the same order as the original grouped DataFrame or Series. - - Has the same index columns as the original grouped DataFrame or - Series. (nth behaves like an aggregator and removes the non-grouped - indexes) - - GroupBy.nth is quite fast for a high group count but slower than head, - tail and iloc. - - Since GroupBy.take and GroupBy.nth only accept a list of individual indexes - it is not possible to define a slice that ends relative to the last row of - each group. - - An important use case for GroupBy.iloc is a multi-indexed DataFrame with a - large primary index (Date, say) and a secondary index sorted to a different - order for each Date. - To reduce the DataFrame to a middle slice of each Date: - - df.groupby("Date").iloc[5:-5] - - This returns a subset of df containing just the middle rows for each Date - and with its original order and indexing preserved. - (See test_multiindex() in tests/groupby/test_groupby_iloc.py) - - Returns - ------- - Series - The filtered subset of the original grouped Series. - DataFrame - The filtered subset of the original grouped DataFrame. - - See Also - -------- - DataFrame.iloc : Purely integer-location based indexing for selection by - position. - GroupBy.head : Return first n rows of each group. - GroupBy.tail : Return last n rows of each group. - GroupBy.nth : Take the nth row from each group if n is an int, or a - subset of rows, if n is a list of ints. - DataFrameGroupBy.take : Return the elements in the given positional indices along - an axis. - - Examples - -------- - >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], - ... columns=["A", "B"]) - >>> df.groupby("A").iloc[1:2] - A B - 1 a 2 - 4 b 5 - >>> df.groupby("A").iloc[:-1, -1:] - B - 0 1 - 1 2 - 3 4 - """ - return _ilocGroupByIndexer(self) - - -@doc(GroupByIndexingMixin.iloc) -class _ilocGroupByIndexer: - def __init__(self, grouped): - self.grouped = grouped - self.reversed = False - self._cached_ascending_count = None - self._cached_descending_count = None - - def __getitem__(self, arg): - self.reversed = False - - if type(arg) == tuple: - return self._handle_item(arg[0], arg[1]) - - else: - return self._handle_item(arg, None) - - def _handle_item(self, arg0, arg1): - typeof_arg = type(arg0) - - if typeof_arg == slice: - start = arg0.start - stop = arg0.stop - step = arg0.step - - if step is not None and step < 0: - raise ValueError( - f"GroupBy.iloc row slice step must be positive." 
- " Slice was {start}:{stop}:{step}" - ) - # self.reversed = True - # start = None if start is None else -start - 1 - # stop = None if stop is None else -stop - 1 - # step = -step - - return self._handle_slice(start, stop, step, arg1) - - elif typeof_arg == int: - return self._handle_slice(arg0, arg0 + 1, 1, arg1) - - else: - raise ValueError( - f"GroupBy.iloc row must be an integer or a slice, not a {typeof_arg}" - ) - - def _handle_slice(self, start, stop, step, arg1): - mask = None - if step is None: - step = 1 - - self.grouped._reset_group_selection() - - if start is None: - if step > 1: - mask = self._ascending_count % step == 0 - - else: - if start >= 0: - mask = self._ascending_count >= start - - if step > 1: - mask &= (self._ascending_count - start) % step == 0 - - else: - mask = self._descending_count < -start - - if step > 1: - # - # if start is -ve and -start exceedes the length of a group - # then step must count from the - # first row of that group rather than the calculated offset - # - # count_array + reverse_array gives the length of the - # current group enabling to switch between - # the offset_array and the count_array depending on whether - # -start exceedes the group size - # - offset_array = self._descending_count + start + 1 - limit_array = ( - self._ascending_count + self._descending_count + (start + 1) - ) < 0 - offset_array = np.where( - limit_array, self._ascending_count, offset_array - ) - - mask &= offset_array % step == 0 - - if stop is not None: - if stop >= 0: - if mask is None: - mask = self._ascending_count < stop - - else: - mask &= self._ascending_count < stop - else: - if mask is None: - mask = self._descending_count >= -stop - - else: - mask &= self._descending_count >= -stop - - if mask is None: - arg0 = slice(None) - - else: - arg0 = mask - - if arg1 is None: - return self._selected_obj.iloc[arg0] - - else: - return self._selected_obj.iloc[arg0, arg1] - - @property - def _ascending_count(self): - if self._cached_ascending_count is None: - self._cached_ascending_count = self.grouped._cumcount_array() - if self.reversed: - self._cached_ascending_count = self._cached_ascending_count[::-1] - - return self._cached_ascending_count - - @property - def _descending_count(self): - if self._cached_descending_count is None: - self._cached_descending_count = self.grouped._cumcount_array( - ascending=False - ) - if self.reversed: - self._cached_descending_count = self._cached_descending_count[::-1] - - return self._cached_descending_count - - @property - def _selected_obj(self): - if self.reversed: - return self.grouped._selected_obj.iloc[::-1] - - else: - return self.grouped._selected_obj diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py new file mode 100644 index 0000000000000..044f666a183b8 --- /dev/null +++ b/pandas/core/groupby/indexing.py @@ -0,0 +1,241 @@ +from __future__ import annotations + +import numpy as np + +from pandas.util._decorators import doc + + +class GroupByIndexingMixin: + """ + Mixin for adding .rows to GroupBy. + """ + + @property + def rows(self) -> _rowsGroupByIndexer: + """ + Purely integer-location based indexing for selection by position per group. + + ``.rows[]`` is primarily integer position based (from ``0`` to + ``length-1`` of the axis), + + Allowed inputs for the index are: + + - An integer valued iterable, e.g. ``range(2, 4)``. + - A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``. + + Note: the slice step cannot be negative. 
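As a quick sketch of the negative-step restriction noted above (frame and expected output assumed from the docstring examples; the error text is the one raised by ``_handle_slice`` further down):

    >>> import pandas as pd
    >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]],
    ...                   columns=["A", "B"])
    >>> df.groupby("A").rows[::2]   # positive steps are accepted
       A  B
    0  a  1
    2  a  3
    3  b  4
    >>> df.groupby("A").rows[::-1]  # a negative step is rejected
    Traceback (most recent call last):
        ...
    ValueError: Invalid step -1. Must be non-negative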
+ + The output format is the same as GroupBy.head and GroupBy.tail, namely a subset + of the original DataFrame or Series with the index and order preserved. + + The effect of ``grouped.rows[i:j]`` is similar to + + grouped.apply(lambda x: x.iloc[i:j]) + + but very much faster and preserving the original index and order. + + The behaviour is different from GroupBy.nth: + - Input to rows can include one or more slices whereas nth just handles + a list of indexes. + - Output from rows is: + - In the same order as the original grouped DataFrame or Series. + - Has the same index columns as the original grouped DataFrame or + Series. (nth behaves like an aggregator and removes the non-grouped + indexes) + - GroupBy.rows can define a slice relative to the last row of each group. + - GroupBy.rows is faster than nth. + - GroupBy.rows does not handle dropna. + + An important use case for GroupBy.rows is a multi-indexed DataFrame with a + large primary index (Date, say) and a secondary index sorted to a different + order for each Date. + To reduce the DataFrame to a middle slice of each Date: + + df.groupby("Date").rows[5:-5] + + This returns a subset of df containing just the middle rows for each Date + and with its original order and indexing preserved. + + To reduce the DataFrame to the remaining rows: + + df.groupby("Date").rows[:5, -5:] + + Returns + ------- + Series + The filtered subset of the original grouped Series. + DataFrame + The filtered subset of the original grouped DataFrame. + + See Also + -------- + DataFrame.iloc : Purely integer-location based indexing for selection by + position. + GroupBy.head : Return first n rows of each group. + GroupBy.tail : Return last n rows of each group. + GroupBy.nth : Take the nth row from each group if n is an int, or a + subset of rows, if n is a list of ints. + + Examples + -------- + >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], + ... columns=["A", "B"]) + >>> df.groupby("A").rows[1:2] + A B + 1 a 2 + 4 b 5 + >>> df.groupby("A").rows[1, -1] + A B + 1 a 2 + 2 a 3 + 4 b 5 + """ + return _rowsGroupByIndexer(self) + + +@doc(GroupByIndexingMixin.rows) +class _rowsGroupByIndexer: + def __init__(self, grouped): + self.grouped = grouped + + def __getitem__(self, arg): + self.grouped._reset_group_selection() + self._cached_ascending_count = None + self._cached_descending_count = None + + if isinstance(arg, tuple): + mask = self._handle_tuple(arg) + + elif isinstance(arg, slice): + mask = self._handle_slice(arg) + + elif isinstance(arg, int): + mask = self._handle_int(arg) + + elif isinstance(arg, list): + mask = self._handle_list(arg) + + else: + try: + list_arg = list(arg) + + except TypeError: + raise ValueError( + f"Invalid index {type(arg)}. 
Must be iterable or a list of " + "integers and slices" + ) + + mask = self._handle_list(list(arg)) + + return self.grouped._selected_obj.iloc[slice(None) if mask is None else mask] + + def _handle_int(self, arg): + if arg >= 0: + return self._ascending_count == arg + + else: + return self._descending_count == (-arg - 1) + + def _handle_list(self, args): + positive = [arg for arg in args if arg >= 0] + negative = [-arg - 1 for arg in args if arg < 0] + + if positive: + mask = np.isin(self._ascending_count, positive) + + else: + mask = False + + if negative: + mask |= np.isin(self._descending_count, negative) + + return mask + + def _handle_tuple(self, args): + mask = False + + for arg in args: + if isinstance(arg, int): + mask |= self._handle_int(arg) + + elif isinstance(arg, slice): + mask |= self._handle_slice(arg) + + else: + raise ValueError( + f"Invalid argument {type(arg)}. " + "Should be int or slice." + ) + + return mask + + def _handle_slice(self, arg): + start = arg.start + stop = arg.stop + step = arg.step + + if step is not None and step < 0: + raise ValueError( + f"Invalid step {step}. Must be non-negative" + ) + + mask = None + if step is None: + step = 1 + + if start is None: + if step > 1: + mask = self._ascending_count % step == 0 + + else: + if start >= 0: + mask = self._ascending_count >= start + + if step > 1: + mask &= (self._ascending_count - start) % step == 0 + + else: + mask = self._descending_count < -start + + offset_array = self._descending_count + start + 1 + limit_array = ( + self._ascending_count + self._descending_count + (start + 1) + ) < 0 + offset_array = np.where( + limit_array, self._ascending_count, offset_array + ) + + mask &= offset_array % step == 0 + + if stop is not None: + if stop >= 0: + if mask is None: + mask = self._ascending_count < stop + + else: + mask &= self._ascending_count < stop + + else: + if mask is None: + mask = self._descending_count >= -stop + + else: + mask &= self._descending_count >= -stop + + return mask + + @property + def _ascending_count(self): + if self._cached_ascending_count is None: + self._cached_ascending_count = self.grouped._cumcount_array() + + return self._cached_ascending_count + + @property + def _descending_count(self): + if self._cached_descending_count is None: + self._cached_descending_count = self.grouped._cumcount_array( + ascending=False + ) + + return self._cached_descending_count diff --git a/pandas/tests/groupby/test_groupby_iloc.py b/pandas/tests/groupby/test_groupby_iloc.py deleted file mode 100644 index 8ba971c7da13d..0000000000000 --- a/pandas/tests/groupby/test_groupby_iloc.py +++ /dev/null @@ -1,154 +0,0 @@ -""" Test positional grouped indexing with iloc GH#42864""" - -import random - -import pandas as pd -import pandas._testing as tm - - -def test_doc_examples(): - """Test the examples in the documentation""" - - df = pd.DataFrame( - [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"] - ) - - grouped = df.groupby("A") - result = grouped.iloc[1:2, :] - expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4]) - - tm.assert_frame_equal(result, expected) - - result = grouped.iloc[:-1, -1:] - expected = pd.DataFrame([1, 2, 4], columns=["B"], index=[0, 1, 3]) - - tm.assert_frame_equal(result, expected) - - -def test_multiindex(): - """Test the multiindex mentioned as the use-case in the documentation""" - - def make_df_from_data(data): - rows = {} - for date in dates: - for level in data[date]: - rows[(date, level[0])] = {"A": level[1], "B": level[2]} - - df 
= pd.DataFrame.from_dict(rows, orient="index") - df.index.names = ("Date", "Item") - return df - - ndates = 100 - nitems = 20 - dates = pd.date_range("20130101", periods=ndates, freq="D") - items = [f"item {i}" for i in range(nitems)] - - data = {} - for date in dates: - nitems_for_date = nitems - random.randint(0, 12) - levels = [ - (item, random.randint(0, 10000) / 100, random.randint(0, 10000) / 100) for item in items[:nitems_for_date] - ] - levels.sort(key=lambda x: x[1]) - data[date] = levels - - df = make_df_from_data(data) - result = df.groupby("Date").iloc[3:-3] - - sliced = {date: data[date][3:-3] for date in dates} - expected = make_df_from_data(sliced) - - tm.assert_frame_equal(result, expected) - - -def test_against_head_and_tail(): - """Test gives the same results as grouped head and tail""" - - n_groups = 100 - n_rows_per_group = 30 - - data = { - "group": [ - f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups) - ], - "value": [ - random.randint(0, 10000) / 100 - for j in range(n_rows_per_group) - for g in range(n_groups) - ], - } - df = pd.DataFrame(data) - grouped = df.groupby("group") - - for i in [1, 5, 29, 30, 31, 1000]: - result = grouped.iloc[:i, :] - expected = grouped.head(i) - - tm.assert_frame_equal(result, expected) - - result = grouped.iloc[-i:, :] - expected = grouped.tail(i) - - tm.assert_frame_equal(result, expected) - - -def test_against_df_iloc(): - """Test that a single group gives the same results as DataFame.iloc""" - - n_rows_per_group = 30 - - data = { - "group": [ - "group 0" for j in range(n_rows_per_group) - ], - "value": [ - random.randint(0, 10000) / 100 for j in range(n_rows_per_group) - ], - } - df = pd.DataFrame(data) - grouped = df.groupby("group") - - for start in [None, 0, 1, 10, 29, 30, 1000, -1, -10, -29, -30, -1000]: - for stop in [None, 0, 1, 10, 29, 30, 1000, -1, -10, -29, -30, -1000]: - for step in [None, 1, 2, 3, 10, 29, 30, 100]: - result = grouped.iloc[start:stop:step, :] - expected = df.iloc[start:stop:step, :] - - tm.assert_frame_equal(result, expected) - - -def test_series(): - """Test grouped Series""" - - ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"]) - grouped = ser.groupby(level=0) - result = grouped.iloc[1:2] - expected = pd.Series([2, 5], index=["a", "b"]) - - tm.assert_series_equal(result, expected) - - -def test_step(): - """Test grouped slice with step""" - - data = [["x", f"x{i}"] for i in range(5)] - data += [["y", f"y{i}"] for i in range(4)] - data += [["z", f"z{i}"] for i in range(3)] - df = pd.DataFrame(data, columns=["A", "B"]) - - grouped = df.groupby("A") - - for step in [1, 2, 3, 4, 5]: - result = grouped.iloc[::step, :] - - data = [["x", f"x{i}"] for i in range(0, 5, step)] - data += [["y", f"y{i}"] for i in range(0, 4, step)] - data += [["z", f"z{i}"] for i in range(0, 3, step)] - - index = [0 + i for i in range(0, 5, step)] - index += [5 + i for i in range(0, 4, step)] - index += [9 + i for i in range(0, 3, step)] - - expected = pd.DataFrame(data, columns=["A", "B"], index=index) - - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_rows.py b/pandas/tests/groupby/test_rows.py new file mode 100644 index 0000000000000..b4a65405d0401 --- /dev/null +++ b/pandas/tests/groupby/test_rows.py @@ -0,0 +1,290 @@ +""" Test GroupBy.rows positional grouped indexing GH#42864""" + +import random + +import pytest +import pandas as pd +import pandas._testing as tm + +@pytest.fixture() +def small_df(): + data = [ + [0, "a", "a0_at_0"], + [1, "b", "b0_at_1"], + 
[2, "a", "a1_at_2"], + [3, "b", "b1_at_3"], + [4, "c", "c0_at_4"], + [5, "a", "a2_at_5"], + [6, "a", "a3_at_6"], + [7, "a", "a4_at_7"], + ] + df = pd.DataFrame(data, columns=["Index", "Category", "Value"]) + return df.set_index("Index") + + +@pytest.mark.parametrize( + "arg, expected_rows", + [ + [0, [0, 1, 4]], + [2, [5]], + [5, []], + [-1, [3, 4, 7]], + [-2, [1, 6]], + [-6, []], + ] +) +def test_int(small_df, arg, expected_rows): + """Test single integer""" + + result = small_df.groupby("Category").rows[arg] + expected = small_df.iloc[expected_rows] + + tm.assert_frame_equal(result, expected) + + +def test_slice(small_df): + """Test single slice""" + + result = small_df.groupby("Category").rows[0:3:2] + expected = small_df.iloc[[0, 1, 4, 5]] + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "arg, expected_rows", + [ + [[0,2], [0, 1, 4, 5]], + [[0, 2, -1], [0, 1, 3, 4, 5, 7]], + [range(0, 3, 2), [0, 1, 4, 5]], + [{0, 2}, [0, 1, 4, 5]], + ], + ids=[ + "list", + "negative", + "range", + "set", + ] +) +def test_list(small_df, arg, expected_rows): + """Test lists of integers and integer valued iterables""" + + result = small_df.groupby("Category").rows[arg] + expected = small_df.iloc[expected_rows] + + tm.assert_frame_equal(result, expected) + + +def test_ints(small_df): + """Test tuple of ints""" + + result = small_df.groupby("Category").rows[0, 2, -1] + expected = small_df.iloc[[0, 1, 3, 4, 5, 7]] + + tm.assert_frame_equal(result, expected) + + +def test_slices(small_df): + """Test tuple of slices""" + + result = small_df.groupby("Category").rows[:2, -2:] + expected = small_df.iloc[[0, 1, 2, 3, 4, 6, 7]] + + tm.assert_frame_equal(result, expected) + + +def test_mix(small_df): + """Test mixed tuple of ints and slices""" + + result = small_df.groupby("Category").rows[0, 1, -2:] + expected = small_df.iloc[[0, 1, 2, 3, 4, 6, 7]] + + tm.assert_frame_equal(result, expected) + + +def test_doc_examples(): + """Test the examples in the documentation""" + + df = pd.DataFrame( + [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"] + ) + + grouped = df.groupby("A") + + result = grouped.rows[1:2] + expected = pd.DataFrame( + [["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4] + ) + + tm.assert_frame_equal(result, expected) + + result = grouped.rows[1, -1] + expected = pd.DataFrame( + [["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4] + ) + + tm.assert_frame_equal(result, expected) + + +def test_multiindex(): + """Test the multiindex mentioned as the use-case in the documentation""" + + def make_df_from_data(data): + rows = {} + for date in dates: + for level in data[date]: + rows[(date, level[0])] = {"A": level[1], "B": level[2]} + + df = pd.DataFrame.from_dict(rows, orient="index") + df.index.names = ("Date", "Item") + return df + + ndates = 100 + nitems = 20 + dates = pd.date_range("20130101", periods=ndates, freq="D") + items = [f"item {i}" for i in range(nitems)] + + data = {} + for date in dates: + nitems_for_date = nitems - random.randint(0, 12) + levels = [ + (item, random.randint(0, 10000) / 100, random.randint(0, 10000) / 100) + for item in items[:nitems_for_date] + ] + levels.sort(key=lambda x: x[1]) + data[date] = levels + + df = make_df_from_data(data) + result = df.groupby("Date").rows[3:-3] + + sliced = {date: data[date][3:-3] for date in dates} + expected = make_df_from_data(sliced) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "arg", [1, 5, 30, 1000] +) +@pytest.mark.parametrize( + "method", 
["head", "tail"] +) +@pytest.mark.parametrize( + "simulated", [True, False] +) +def test_against_head_and_tail(arg, method, simulated): + """Test gives the same results as grouped head and tail""" + + n_groups = 100 + n_rows_per_group = 30 + + data = { + "group": [ + f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups) + ], + "value": [ + f"group {g} row {j}" + for j in range(n_rows_per_group) + for g in range(n_groups) + ], + } + df = pd.DataFrame(data) + grouped = df.groupby("group") + + if method == "head": + result = grouped.rows[:arg] + + if simulated: + indices = [] + for j in range(arg): + for i in range(n_groups): + if j * n_groups + i < n_groups * n_rows_per_group: + indices.append(j * n_groups + i) + + expected = df.iloc[indices] + + else: + expected = grouped.head(arg) + + else: + result = grouped.rows[-arg:] + + if simulated: + indices = [] + for j in range(arg): + for i in range(n_groups): + if (n_rows_per_group + j - arg) * n_groups + i >= 0: + indices.append((n_rows_per_group + j - arg) * n_groups + i) + + expected = df.iloc[indices] + + else: + expected = grouped.tail(arg) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "start", [None, 0, 1, 10, -1, -10] +) +@pytest.mark.parametrize( + "stop", [None, 0, 1, 10, -1, -10] +) +@pytest.mark.parametrize( + "step", [None, 1, 5] +) +def test_against_df_iloc(start, stop, step): + """Test that a single group gives the same results as DataFame.iloc""" + + n_rows = 30 + + data = { + "group": ["group 0"] * n_rows, + "value": list(range(n_rows)), + } + df = pd.DataFrame(data) + grouped = df.groupby("group") + + result = grouped.rows[start:stop:step] + expected = df.iloc[start:stop:step, :] + + tm.assert_frame_equal(result, expected) + + +def test_series(): + """Test grouped Series""" + + ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"]) + grouped = ser.groupby(level=0) + result = grouped.rows[1:2] + expected = pd.Series([2, 5], index=["a", "b"]) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "step", [1, 2, 3, 4, 5] +) +def test_step(step): + """Test slice with various step values""" + + data = [["x", f"x{i}"] for i in range(5)] + data += [["y", f"y{i}"] for i in range(4)] + data += [["z", f"z{i}"] for i in range(3)] + df = pd.DataFrame(data, columns=["A", "B"]) + + grouped = df.groupby("A") + + result = grouped.rows[::step] + + data = [["x", f"x{i}"] for i in range(0, 5, step)] + data += [["y", f"y{i}"] for i in range(0, 4, step)] + data += [["z", f"z{i}"] for i in range(0, 3, step)] + + index = [0 + i for i in range(0, 5, step)] + index += [5 + i for i in range(0, 4, step)] + index += [9 + i for i in range(0, 3, step)] + + expected = pd.DataFrame(data, columns=["A", "B"], index=index) + + tm.assert_frame_equal(result, expected) From bab88c90a3e4da973101b4a0998003101e0c0b22 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 2 Sep 2021 09:43:07 +0100 Subject: [PATCH 18/73] Add rows to rst file --- doc/source/reference/groupby.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index da89c27e43bf8..ce82d7e9c482f 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -67,12 +67,12 @@ Computations / descriptive stats GroupBy.min GroupBy.ngroup GroupBy.nth - GroupBy.iloc GroupBy.ohlc GroupBy.pad GroupBy.prod GroupBy.rank GroupBy.pct_change + GroupBy.rows GroupBy.size GroupBy.sem GroupBy.std From 
c77de1dbd129dab6878399bcbfd46d85b8a15dbc Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 2 Sep 2021 10:55:41 +0100 Subject: [PATCH 19/73] Change iloc to rows in test_allowlist.py --- pandas/tests/groupby/test_allowlist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py index a0c09b6c47cf9..aa690dd75eb7d 100644 --- a/pandas/tests/groupby/test_allowlist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -309,7 +309,7 @@ def test_tab_completion(mframe): "rank", "cumprod", "tail", - "iloc", + "rows", "resample", "cummin", "fillna", From e952c25ca2875bdf12b2909fe8127c44a5de8d83 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 2 Sep 2021 12:06:38 +0100 Subject: [PATCH 20/73] Add to base.py --- pandas/core/groupby/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 4a1d71e452713..ec52f90caa420 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -131,7 +131,6 @@ class OutputKey: "groups", "head", "hist", - "iloc", "indices", "ndim", "ngroups", @@ -140,6 +139,7 @@ class OutputKey: "plot", "resample", "rolling", + "rows", "tail", "take", "transform", From 2a6aafc2a6332e8d854d3f28041de5788394c29d Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 2 Sep 2021 15:42:46 +0100 Subject: [PATCH 21/73] Tidy some whitespace for pep8speaks --- pandas/tests/groupby/test_rows.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_rows.py b/pandas/tests/groupby/test_rows.py index b4a65405d0401..062d4acba5082 100644 --- a/pandas/tests/groupby/test_rows.py +++ b/pandas/tests/groupby/test_rows.py @@ -6,6 +6,7 @@ import pandas as pd import pandas._testing as tm + @pytest.fixture() def small_df(): data = [ @@ -54,7 +55,7 @@ def test_slice(small_df): @pytest.mark.parametrize( "arg, expected_rows", [ - [[0,2], [0, 1, 4, 5]], + [[0, 2], [0, 1, 4, 5]], [[0, 2, -1], [0, 1, 3, 4, 5, 7]], [range(0, 3, 2), [0, 1, 4, 5]], [{0, 2}, [0, 1, 4, 5]], From b7f8bfe867861db1205b4ed1739742e1da6f9807 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 4 Sep 2021 09:15:13 +0100 Subject: [PATCH 22/73] Tidied mask code --- pandas/core/groupby/indexing.py | 49 +++++++++++++++------------------ 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 044f666a183b8..1a610cf601250 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -30,18 +30,19 @@ def rows(self) -> _rowsGroupByIndexer: The effect of ``grouped.rows[i:j]`` is similar to - grouped.apply(lambda x: x.iloc[i:j]) + ``grouped.apply(lambda x: x.iloc[i:j])`` but very much faster and preserving the original index and order. The behaviour is different from GroupBy.nth: + - Input to rows can include one or more slices whereas nth just handles - a list of indexes. - - Output from rows is: - - In the same order as the original grouped DataFrame or Series. - - Has the same index columns as the original grouped DataFrame or - Series. (nth behaves like an aggregator and removes the non-grouped - indexes) + a list of indexes. + - Output from rows is in the same order as the original grouped DataFrame + or Series. + - Output from rows has the same index columns as the original grouped DataFrame + or Series. (nth behaves like an aggregator and removes the non-grouped + indexes.) 
- GroupBy.rows can define a slice relative to the last row of each group. - GroupBy.rows is faster than nth. - GroupBy.rows does not handle dropna. @@ -51,14 +52,14 @@ def rows(self) -> _rowsGroupByIndexer: order for each Date. To reduce the DataFrame to a middle slice of each Date: - df.groupby("Date").rows[5:-5] + ``df.groupby("Date").rows[5:-5]`` This returns a subset of df containing just the middle rows for each Date and with its original order and indexing preserved. To reduce the DataFrame to the remaining rows: - df.groupby("Date").rows[:5, -5:] + ``df.groupby("Date").rows[:5, -5:]`` Returns ------- @@ -84,11 +85,13 @@ def rows(self) -> _rowsGroupByIndexer: A B 1 a 2 4 b 5 + >>> df.groupby("A").rows[1, -1] A B 1 a 2 2 a 3 4 b 5 + """ return _rowsGroupByIndexer(self) @@ -125,9 +128,12 @@ def __getitem__(self, arg): "integers and slices" ) - mask = self._handle_list(list(arg)) + mask = self._handle_list(list_arg) + + if mask is None or mask is True: + mask = slice(None) - return self.grouped._selected_obj.iloc[slice(None) if mask is None else mask] + return self.grouped._selected_obj.iloc[mask] def _handle_int(self, arg): if arg >= 0: @@ -163,8 +169,7 @@ def _handle_tuple(self, args): else: raise ValueError( - f"Invalid argument {type(arg)}. " - "Should be int or slice." + f"Invalid argument {type(arg)}. Should be int or slice." ) return mask @@ -175,11 +180,9 @@ def _handle_slice(self, arg): step = arg.step if step is not None and step < 0: - raise ValueError( - f"Invalid step {step}. Must be non-negative" - ) + raise ValueError(f"Invalid step {step}. Must be non-negative") - mask = None + mask = True if step is None: step = 1 @@ -209,18 +212,10 @@ def _handle_slice(self, arg): if stop is not None: if stop >= 0: - if mask is None: - mask = self._ascending_count < stop - - else: - mask &= self._ascending_count < stop + mask &= self._ascending_count < stop else: - if mask is None: - mask = self._descending_count >= -stop - - else: - mask &= self._descending_count >= -stop + mask &= self._descending_count >= -stop return mask From 86e0c2e9b8a03b59bc7d87724e9d912cd7e4da75 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 4 Sep 2021 10:22:29 +0100 Subject: [PATCH 23/73] test_rows.py formatting --- pandas/tests/groupby/test_rows.py | 37 +++++++++---------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/pandas/tests/groupby/test_rows.py b/pandas/tests/groupby/test_rows.py index 062d4acba5082..a5f3807e974f8 100644 --- a/pandas/tests/groupby/test_rows.py +++ b/pandas/tests/groupby/test_rows.py @@ -3,6 +3,7 @@ import random import pytest + import pandas as pd import pandas._testing as tm @@ -32,7 +33,7 @@ def small_df(): [-1, [3, 4, 7]], [-2, [1, 6]], [-6, []], - ] + ], ) def test_int(small_df, arg, expected_rows): """Test single integer""" @@ -65,7 +66,7 @@ def test_slice(small_df): "negative", "range", "set", - ] + ], ) def test_list(small_df, arg, expected_rows): """Test lists of integers and integer valued iterables""" @@ -113,9 +114,7 @@ def test_doc_examples(): grouped = df.groupby("A") result = grouped.rows[1:2] - expected = pd.DataFrame( - [["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4] - ) + expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4]) tm.assert_frame_equal(result, expected) @@ -164,15 +163,9 @@ def make_df_from_data(data): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "arg", [1, 5, 30, 1000] -) -@pytest.mark.parametrize( - "method", ["head", "tail"] -) -@pytest.mark.parametrize( - 
"simulated", [True, False] -) +@pytest.mark.parametrize("arg", [1, 5, 30, 1000]) +@pytest.mark.parametrize("method", ["head", "tail"]) +@pytest.mark.parametrize("simulated", [True, False]) def test_against_head_and_tail(arg, method, simulated): """Test gives the same results as grouped head and tail""" @@ -225,15 +218,9 @@ def test_against_head_and_tail(arg, method, simulated): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "start", [None, 0, 1, 10, -1, -10] -) -@pytest.mark.parametrize( - "stop", [None, 0, 1, 10, -1, -10] -) -@pytest.mark.parametrize( - "step", [None, 1, 5] -) +@pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10]) +@pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10]) +@pytest.mark.parametrize("step", [None, 1, 5]) def test_against_df_iloc(start, stop, step): """Test that a single group gives the same results as DataFame.iloc""" @@ -263,9 +250,7 @@ def test_series(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "step", [1, 2, 3, 4, 5] -) +@pytest.mark.parametrize("step", [1, 2, 3, 4, 5]) def test_step(step): """Test slice with various step values""" From 6f7550293a1e4077d317fb1f53a03914b0ab9555 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 4 Sep 2021 11:07:03 +0100 Subject: [PATCH 24/73] Correct docstring bullet format --- pandas/core/groupby/indexing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 1a610cf601250..9d2a7d97fde8b 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -37,12 +37,12 @@ def rows(self) -> _rowsGroupByIndexer: The behaviour is different from GroupBy.nth: - Input to rows can include one or more slices whereas nth just handles - a list of indexes. + a list of indexes. - Output from rows is in the same order as the original grouped DataFrame - or Series. + or Series. - Output from rows has the same index columns as the original grouped DataFrame - or Series. (nth behaves like an aggregator and removes the non-grouped - indexes.) + or Series. (nth behaves like an aggregator and removes the non-grouped + indexes.) - GroupBy.rows can define a slice relative to the last row of each group. - GroupBy.rows is faster than nth. - GroupBy.rows does not handle dropna. 
From 8de5ff27cfc9a04a2664118354c0a8f98363ada4 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 5 Sep 2021 19:58:42 +0100 Subject: [PATCH 25/73] Update test_rows.py --- pandas/tests/groupby/test_rows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_rows.py b/pandas/tests/groupby/test_rows.py index a5f3807e974f8..c662b1301f490 100644 --- a/pandas/tests/groupby/test_rows.py +++ b/pandas/tests/groupby/test_rows.py @@ -234,7 +234,7 @@ def test_against_df_iloc(start, stop, step): grouped = df.groupby("group") result = grouped.rows[start:stop:step] - expected = df.iloc[start:stop:step, :] + expected = df.iloc[start:stop:step] tm.assert_frame_equal(result, expected) From f51fa88e669a7943488a3c1093b602ec0a61ad0d Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 6 Sep 2021 09:10:54 +0100 Subject: [PATCH 26/73] Remove blank line at end of docstring --- pandas/core/groupby/indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 9d2a7d97fde8b..32a607a75401c 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -91,7 +91,6 @@ def rows(self) -> _rowsGroupByIndexer: 1 a 2 2 a 3 4 b 5 - """ return _rowsGroupByIndexer(self) From 3063f3a51c58357cbae6cf1605c334bac9f44b54 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 6 Sep 2021 20:24:57 +0100 Subject: [PATCH 27/73] Small change to force rebuild --- pandas/core/groupby/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 32a607a75401c..07aaf980bd9a2 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -32,7 +32,7 @@ def rows(self) -> _rowsGroupByIndexer: ``grouped.apply(lambda x: x.iloc[i:j])`` - but very much faster and preserving the original index and order. + but much faster and preserving the original index and order. The behaviour is different from GroupBy.nth: From 4228251f7b5aadf73db2daa0a0f2375ed0ca3456 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 8 Sep 2021 14:45:51 +0100 Subject: [PATCH 28/73] Make rows 100% compatible with nth --- pandas/core/groupby/indexing.py | 91 +++++++++++++++++++----------- pandas/tests/groupby/test_rows.py | 92 ++++++++++++++++++++----------- 2 files changed, 118 insertions(+), 65 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 07aaf980bd9a2..6fc4fbf049a9c 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -4,6 +4,9 @@ from pandas.util._decorators import doc +from pandas.core.groupby import groupby +from pandas.core.indexes.api import CategoricalIndex + class GroupByIndexingMixin: """ @@ -81,12 +84,12 @@ def rows(self) -> _rowsGroupByIndexer: -------- >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], ... 
columns=["A", "B"]) - >>> df.groupby("A").rows[1:2] + >>> df.groupby("A", as_index=False).rows[1:2] A B 1 a 2 4 b 5 - >>> df.groupby("A").rows[1, -1] + >>> df.groupby("A", as_index=False).rows[1, -1] A B 1 a 2 2 a 3 @@ -101,38 +104,62 @@ def __init__(self, grouped): self.grouped = grouped def __getitem__(self, arg): - self.grouped._reset_group_selection() - self._cached_ascending_count = None - self._cached_descending_count = None + with groupby.group_selection_context(self.grouped): + self._cached_ascending_count = None + self._cached_descending_count = None - if isinstance(arg, tuple): - mask = self._handle_tuple(arg) + if isinstance(arg, tuple): + if all(isinstance(i, int) for i in arg): + mask = self._handle_list(arg) - elif isinstance(arg, slice): - mask = self._handle_slice(arg) + else: + mask = self._handle_tuple(arg) - elif isinstance(arg, int): - mask = self._handle_int(arg) + elif isinstance(arg, slice): + mask = self._handle_slice(arg) - elif isinstance(arg, list): - mask = self._handle_list(arg) + elif isinstance(arg, int): + mask = self._handle_int(arg) - else: - try: - list_arg = list(arg) + elif isinstance(arg, list): + mask = self._handle_list(arg) - except TypeError: - raise ValueError( - f"Invalid index {type(arg)}. Must be iterable or a list of " - "integers and slices" - ) + else: + try: + list_arg = list(arg) + + except TypeError: + raise ValueError( + f"Invalid index {type(arg)}. Must be iterable or a list of " + "integers and slices" + ) + + mask = self._handle_list(list_arg) + + if mask is None or mask is True: + mask = slice(None) - mask = self._handle_list(list_arg) + result = self.grouped._selected_obj[mask] - if mask is None or mask is True: - mask = slice(None) + if self.grouped.as_index: + ids, _, _ = self.grouped.grouper.group_info - return self.grouped._selected_obj.iloc[mask] + # Drop NA values in grouping + mask &= ids != -1 + + result_index = self.grouped.grouper.result_index + result.index = result_index[ids[mask]] + + if not self.grouped.observed and isinstance( + result_index, CategoricalIndex + ): + result = result.reindex(result_index) + + result = self.grouped._reindex_output(result) + if self.grouped.sort: + result = result.sort_index() + + return result def _handle_int(self, arg): if arg >= 0: @@ -145,11 +172,10 @@ def _handle_list(self, args): positive = [arg for arg in args if arg >= 0] negative = [-arg - 1 for arg in args if arg < 0] - if positive: - mask = np.isin(self._ascending_count, positive) + mask = False - else: - mask = False + if positive: + mask |= np.isin(self._ascending_count, positive) if negative: mask |= np.isin(self._descending_count, negative) @@ -182,22 +208,23 @@ def _handle_slice(self, arg): raise ValueError(f"Invalid step {step}. 
Must be non-negative") mask = True + if step is None: step = 1 if start is None: if step > 1: - mask = self._ascending_count % step == 0 + mask &= self._ascending_count % step == 0 else: if start >= 0: - mask = self._ascending_count >= start + mask &= self._ascending_count >= start if step > 1: mask &= (self._ascending_count - start) % step == 0 else: - mask = self._descending_count < -start + mask &= self._descending_count < -start offset_array = self._descending_count + start + 1 limit_array = ( diff --git a/pandas/tests/groupby/test_rows.py b/pandas/tests/groupby/test_rows.py index c662b1301f490..f96dc38755128 100644 --- a/pandas/tests/groupby/test_rows.py +++ b/pandas/tests/groupby/test_rows.py @@ -24,6 +24,11 @@ def small_df(): return df.set_index("Index") +@pytest.fixture() +def small_grouped(small_df): + return small_df.groupby("Category", as_index=False) + + @pytest.mark.parametrize( "arg, expected_rows", [ @@ -35,19 +40,19 @@ def small_df(): [-6, []], ], ) -def test_int(small_df, arg, expected_rows): +def test_int(small_df, small_grouped, arg, expected_rows): """Test single integer""" - result = small_df.groupby("Category").rows[arg] + result = small_grouped.rows[arg] expected = small_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) -def test_slice(small_df): +def test_slice(small_df, small_grouped): """Test single slice""" - result = small_df.groupby("Category").rows[0:3:2] + result = small_grouped.rows[0:3:2] expected = small_df.iloc[[0, 1, 4, 5]] tm.assert_frame_equal(result, expected) @@ -68,42 +73,57 @@ def test_slice(small_df): "set", ], ) -def test_list(small_df, arg, expected_rows): +def test_list(small_df, small_grouped, arg, expected_rows): """Test lists of integers and integer valued iterables""" - result = small_df.groupby("Category").rows[arg] + result = small_grouped.rows[arg] expected = small_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) -def test_ints(small_df): +def test_ints(small_df, small_grouped): """Test tuple of ints""" - result = small_df.groupby("Category").rows[0, 2, -1] + result = small_grouped.rows[0, 2, -1] expected = small_df.iloc[[0, 1, 3, 4, 5, 7]] tm.assert_frame_equal(result, expected) -def test_slices(small_df): +def test_slices(small_df, small_grouped): """Test tuple of slices""" - result = small_df.groupby("Category").rows[:2, -2:] + result = small_grouped.rows[:2, -2:] expected = small_df.iloc[[0, 1, 2, 3, 4, 6, 7]] tm.assert_frame_equal(result, expected) -def test_mix(small_df): +def test_mix(small_df, small_grouped): """Test mixed tuple of ints and slices""" - result = small_df.groupby("Category").rows[0, 1, -2:] + result = small_grouped.rows[0, 1, -2:] expected = small_df.iloc[[0, 1, 2, 3, 4, 6, 7]] tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "arg, expected_rows", + [ + [0, [0, 1, 4]], + [[0, 2, -1], [0, 1, 3, 4, 5, 7]], + [(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]], + ], +) +def test_as_index(small_df, arg, expected_rows): + result = small_df.groupby("Category", sort=False).rows[arg] + expected = small_df.iloc[expected_rows].set_index("Category") + + tm.assert_frame_equal(result, expected) + + def test_doc_examples(): """Test the examples in the documentation""" @@ -111,7 +131,7 @@ def test_doc_examples(): [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"] ) - grouped = df.groupby("A") + grouped = df.groupby("A", as_index=False) result = grouped.rows[1:2] expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4]) @@ -126,19 
+146,8 @@ def test_doc_examples(): tm.assert_frame_equal(result, expected) -def test_multiindex(): - """Test the multiindex mentioned as the use-case in the documentation""" - - def make_df_from_data(data): - rows = {} - for date in dates: - for level in data[date]: - rows[(date, level[0])] = {"A": level[1], "B": level[2]} - - df = pd.DataFrame.from_dict(rows, orient="index") - df.index.names = ("Date", "Item") - return df - +@pytest.fixture() +def multiindex_data(): ndates = 100 nitems = 20 dates = pd.date_range("20130101", periods=ndates, freq="D") @@ -154,11 +163,28 @@ def make_df_from_data(data): levels.sort(key=lambda x: x[1]) data[date] = levels - df = make_df_from_data(data) - result = df.groupby("Date").rows[3:-3] + return data + + +def _make_df_from_data(data): + rows = {} + for date in data: + for level in data[date]: + rows[(date, level[0])] = {"A": level[1], "B": level[2]} + + df = pd.DataFrame.from_dict(rows, orient="index") + df.index.names = ("Date", "Item") + return df + + +def test_multiindex(multiindex_data): + """Test the multiindex mentioned as the use-case in the documentation""" + + df = _make_df_from_data(multiindex_data) + result = df.groupby("Date", as_index=False).nth(slice(3, -3)) - sliced = {date: data[date][3:-3] for date in dates} - expected = make_df_from_data(sliced) + sliced = {date: multiindex_data[date][3:-3] for date in multiindex_data} + expected = _make_df_from_data(sliced) tm.assert_frame_equal(result, expected) @@ -183,7 +209,7 @@ def test_against_head_and_tail(arg, method, simulated): ], } df = pd.DataFrame(data) - grouped = df.groupby("group") + grouped = df.groupby("group", as_index=False) if method == "head": result = grouped.rows[:arg] @@ -231,7 +257,7 @@ def test_against_df_iloc(start, stop, step): "value": list(range(n_rows)), } df = pd.DataFrame(data) - grouped = df.groupby("group") + grouped = df.groupby("group", as_index=False) result = grouped.rows[start:stop:step] expected = df.iloc[start:stop:step] @@ -259,7 +285,7 @@ def test_step(step): data += [["z", f"z{i}"] for i in range(3)] df = pd.DataFrame(data, columns=["A", "B"]) - grouped = df.groupby("A") + grouped = df.groupby("A", as_index=False) result = grouped.rows[::step] From 41b1c7323cb294f6d32c6264bc84ec959c323ff6 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 8 Sep 2021 14:46:58 +0100 Subject: [PATCH 29/73] Temporarily reroute nth list and slice to rows --- pandas/core/groupby/groupby.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 742ac597c6033..d6a63cc778e32 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2299,6 +2299,13 @@ def nth( 1 1 2.0 4 2 5.0 """ + from collections.abc import Iterable + if isinstance(n, Iterable): + return self.rows[tuple(n)] + + elif isinstance(n, slice): + return self.rows[n] + valid_containers = (set, list, tuple) if not isinstance(n, (valid_containers, int)): raise TypeError("n needs to be an int or a list/set/tuple of ints") From ce36210a8168321ecbabf491ac957ee195cbcfb2 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 9 Sep 2021 17:15:32 +0100 Subject: [PATCH 30/73] Rows for all non-dropna calls + types and tests --- doc/source/reference/groupby.rst | 1 - pandas/core/groupby/groupby.py | 52 +++------- pandas/core/groupby/indexing.py | 158 ++++++++----------------------- pandas/tests/groupby/test_nth.py | 46 +++++++++ 4 files changed, 99 insertions(+), 158 deletions(-) diff --git a/doc/source/reference/groupby.rst 
b/doc/source/reference/groupby.rst index ce82d7e9c482f..ccf130d03418c 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -72,7 +72,6 @@ Computations / descriptive stats GroupBy.prod GroupBy.rank GroupBy.pct_change - GroupBy.rows GroupBy.size GroupBy.sem GroupBy.std diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d6a63cc778e32..24ee8273df863 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -26,6 +26,7 @@ class providing the base-class of operations. Literal, Mapping, Sequence, + Iterable, TypeVar, Union, cast, @@ -2224,11 +2225,10 @@ def backfill(self, limit=None): @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def nth( - self, n: int | list[int], dropna: Literal["any", "all", None] = None + self, n: int | slice | list[int | slice], dropna: Literal["any", "all", None] = None ) -> DataFrame: """ - Take the nth row from each group if n is an int, or a subset of rows - if n is a list of ints. + Take the nth row from each group if n is an int, otherwise a subset of rows. If dropna, will take the nth non-null row, dropna is either 'all' or 'any'; this is equivalent to calling dropna(how=dropna) @@ -2236,8 +2236,8 @@ def nth( Parameters ---------- - n : int or list of ints - A single nth value for the row or a list of nth values. + n : int, slice or list of ints and slices + A single nth value for the row or a list of nth values or slices. dropna : {'any', 'all', None}, default None Apply the specified dropna operation before counting which row is the nth row. @@ -2275,6 +2275,12 @@ def nth( 1 2.0 2 3.0 2 5.0 + >>> g.nth(slice(None, -1)) + B + A + 1 NaN + 1 2.0 + 2 3.0 Specifying `dropna` allows count ignoring ``NaN`` @@ -2306,43 +2312,15 @@ def nth( elif isinstance(n, slice): return self.rows[n] - valid_containers = (set, list, tuple) + valid_containers = (set, list, tuple, range, slice) if not isinstance(n, (valid_containers, int)): raise TypeError("n needs to be an int or a list/set/tuple of ints") if not dropna: + if isinstance(n, Iterable): + return self.rows[tuple(n)] - if isinstance(n, int): - nth_values = [n] - elif isinstance(n, valid_containers): - nth_values = list(set(n)) - - nth_array = np.array(nth_values, dtype=np.intp) - with group_selection_context(self): - - mask_left = np.in1d(self._cumcount_array(), nth_array) - mask_right = np.in1d( - self._cumcount_array(ascending=False) + 1, -nth_array - ) - mask = mask_left | mask_right - - ids, _, _ = self.grouper.group_info - - # Drop NA values in grouping - mask = mask & (ids != -1) - - out = self._selected_obj[mask] - if not self.as_index: - return out - - result_index = self.grouper.result_index - out.index = result_index[ids[mask]] - - if not self.observed and isinstance(result_index, CategoricalIndex): - out = out.reindex(result_index) - - out = self._reindex_output(out) - return out.sort_index() if self.sort else out + return self.rows[n] # dropna is truthy if isinstance(n, valid_containers): diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 6fc4fbf049a9c..946c4fd34b6d1 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -1,8 +1,22 @@ from __future__ import annotations +from typing import Iterable + import numpy as np -from pandas.util._decorators import doc +from pandas._typing import ( + FrameOrSeries, + PositionalIndexer, +) +from pandas.util._decorators import ( + cache_readonly, + doc, +) + +from pandas.core.dtypes.common import ( 
+ is_integer, + is_list_like, +) from pandas.core.groupby import groupby from pandas.core.indexes.api import CategoricalIndex @@ -15,101 +29,18 @@ class GroupByIndexingMixin: @property def rows(self) -> _rowsGroupByIndexer: - """ - Purely integer-location based indexing for selection by position per group. - - ``.rows[]`` is primarily integer position based (from ``0`` to - ``length-1`` of the axis), - - Allowed inputs for the index are: - - - An integer valued iterable, e.g. ``range(2, 4)``. - - A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``. - - Note: the slice step cannot be negative. - - The output format is the same as GroupBy.head and GroupBy.tail, namely a subset - of the original DataFrame or Series with the index and order preserved. - - The effect of ``grouped.rows[i:j]`` is similar to - - ``grouped.apply(lambda x: x.iloc[i:j])`` - - but much faster and preserving the original index and order. - - The behaviour is different from GroupBy.nth: - - - Input to rows can include one or more slices whereas nth just handles - a list of indexes. - - Output from rows is in the same order as the original grouped DataFrame - or Series. - - Output from rows has the same index columns as the original grouped DataFrame - or Series. (nth behaves like an aggregator and removes the non-grouped - indexes.) - - GroupBy.rows can define a slice relative to the last row of each group. - - GroupBy.rows is faster than nth. - - GroupBy.rows does not handle dropna. - - An important use case for GroupBy.rows is a multi-indexed DataFrame with a - large primary index (Date, say) and a secondary index sorted to a different - order for each Date. - To reduce the DataFrame to a middle slice of each Date: - - ``df.groupby("Date").rows[5:-5]`` - - This returns a subset of df containing just the middle rows for each Date - and with its original order and indexing preserved. - - To reduce the DataFrame to the remaining rows: - - ``df.groupby("Date").rows[:5, -5:]`` - - Returns - ------- - Series - The filtered subset of the original grouped Series. - DataFrame - The filtered subset of the original grouped DataFrame. - - See Also - -------- - DataFrame.iloc : Purely integer-location based indexing for selection by - position. - GroupBy.head : Return first n rows of each group. - GroupBy.tail : Return last n rows of each group. - GroupBy.nth : Take the nth row from each group if n is an int, or a - subset of rows, if n is a list of ints. - - Examples - -------- - >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], - ... 
columns=["A", "B"]) - >>> df.groupby("A", as_index=False).rows[1:2] - A B - 1 a 2 - 4 b 5 - - >>> df.groupby("A", as_index=False).rows[1, -1] - A B - 1 a 2 - 2 a 3 - 4 b 5 - """ return _rowsGroupByIndexer(self) @doc(GroupByIndexingMixin.rows) class _rowsGroupByIndexer: - def __init__(self, grouped): + def __init__(self, grouped: groupby.GroupBy): self.grouped = grouped - def __getitem__(self, arg): + def __getitem__(self, arg: PositionalIndexer) -> FrameOrSeries: with groupby.group_selection_context(self.grouped): - self._cached_ascending_count = None - self._cached_descending_count = None - if isinstance(arg, tuple): - if all(isinstance(i, int) for i in arg): + if all(is_integer(i) for i in arg): mask = self._handle_list(arg) else: @@ -118,23 +49,23 @@ def __getitem__(self, arg): elif isinstance(arg, slice): mask = self._handle_slice(arg) - elif isinstance(arg, int): + elif is_integer(arg): mask = self._handle_int(arg) - elif isinstance(arg, list): + elif is_list_like(arg): mask = self._handle_list(arg) else: - try: - list_arg = list(arg) + raise ValueError( + f"Invalid index {type(arg)}. " + "Must be integer, list-like, slice or a tuple of " + "integers and slices" + ) - except TypeError: - raise ValueError( - f"Invalid index {type(arg)}. Must be iterable or a list of " - "integers and slices" - ) + ids, _, _ = self.grouped.grouper.group_info - mask = self._handle_list(list_arg) + # Drop NA values in grouping + mask &= ids != -1 if mask is None or mask is True: mask = slice(None) @@ -142,11 +73,6 @@ def __getitem__(self, arg): result = self.grouped._selected_obj[mask] if self.grouped.as_index: - ids, _, _ = self.grouped.grouper.group_info - - # Drop NA values in grouping - mask &= ids != -1 - result_index = self.grouped.grouper.result_index result.index = result_index[ids[mask]] @@ -161,14 +87,14 @@ def __getitem__(self, arg): return result - def _handle_int(self, arg): + def _handle_int(self, arg: int) -> np.ndarray: if arg >= 0: return self._ascending_count == arg else: return self._descending_count == (-arg - 1) - def _handle_list(self, args): + def _handle_list(self, args: Iterable[int]) -> np.ndarray: positive = [arg for arg in args if arg >= 0] negative = [-arg - 1 for arg in args if arg < 0] @@ -182,7 +108,7 @@ def _handle_list(self, args): return mask - def _handle_tuple(self, args): + def _handle_tuple(self, args: tuple) -> np.ndarray: mask = False for arg in args: @@ -199,7 +125,7 @@ def _handle_tuple(self, args): return mask - def _handle_slice(self, arg): + def _handle_slice(self, arg: slice) -> np.ndarray: start = arg.start stop = arg.stop step = arg.step @@ -245,18 +171,10 @@ def _handle_slice(self, arg): return mask - @property - def _ascending_count(self): - if self._cached_ascending_count is None: - self._cached_ascending_count = self.grouped._cumcount_array() - - return self._cached_ascending_count - - @property - def _descending_count(self): - if self._cached_descending_count is None: - self._cached_descending_count = self.grouped._cumcount_array( - ascending=False - ) + @cache_readonly + def _ascending_count(self) -> np.ndarray: + return self.grouped._cumcount_array() - return self._cached_descending_count + @cache_readonly + def _descending_count(self) -> np.ndarray: + return self.grouped._cumcount_array(ascending=False) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index e7a5e931f5297..0367bd8b164d6 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -689,3 +689,49 @@ def 
test_first_multi_key_groupbby_categorical(): [(1, 100), (1, 200), (2, 100)], names=["A", "B"] ) tm.assert_frame_equal(result, expected) + + +@pytest.fixture() +def small_df(): + data = [ + [0, "a", "a0_at_0"], + [1, "b", "b0_at_1"], + [2, "a", "a1_at_2"], + [3, "b", "b1_at_3"], + [4, "c", "c0_at_4"], + [5, "a", "a2_at_5"], + [6, "a", "a3_at_6"], + [7, "a", "a4_at_7"], + ] + df = pd.DataFrame(data, columns=["Index", "Category", "Value"]) + return df.set_index("Index") + + +@pytest.fixture() +def small_grouped(small_df): + return small_df.groupby("Category", as_index=False) + + +@pytest.mark.parametrize( + "arg, expected_rows", + [ + [slice(None, 3, 2), [0, 1, 4, 5]], + [slice(None, -2), [0, 2, 5]], + [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], + [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], + ], +) +def test_slice(small_df, small_grouped, arg, expected_rows): + """Test slices GH #42947""" + + result = small_grouped.nth(arg) + expected = small_df.iloc[expected_rows] + + tm.assert_frame_equal(result, expected) + + +def test_negative_step(small_grouped): + """Test for error on negative slice step""" + + with pytest.raises(ValueError, match="Invalid step"): + result = small_grouped.nth(slice(None, None, -1)) \ No newline at end of file From c024e410f5350ac2e3d316ace2d1e62967dd9b11 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 9 Sep 2021 17:50:59 +0100 Subject: [PATCH 31/73] Changes for flake8 --- my build.cmd | 4 ++++ pandas/core/groupby/groupby.py | 6 +++--- pandas/tests/groupby/test_nth.py | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) create mode 100644 my build.cmd diff --git a/my build.cmd b/my build.cmd new file mode 100644 index 0000000000000..9be478cea6198 --- /dev/null +++ b/my build.cmd @@ -0,0 +1,4 @@ +call conda activate pandas-dev +python setup.py build_ext -j 4 +python -m pip install -e . --no-build-isolation --no-use-pep517 +pause diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 894b5c196be1c..adf2ca4aff0ff 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -26,7 +26,6 @@ class providing the base-class of operations. Literal, Mapping, Sequence, - Iterable, TypeVar, Union, cast, @@ -2308,7 +2307,9 @@ def backfill(self, limit=None): @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def nth( - self, n: int | slice | list[int | slice], dropna: Literal["any", "all", None] = None + self, + n: int | slice | list[int | slice], + dropna: Literal["any", "all", None] = None ) -> DataFrame: """ Take the nth row from each group if n is an int, otherwise a subset of rows. 
@@ -2388,7 +2389,6 @@ def nth( 1 1 2.0 4 2 5.0 """ - from collections.abc import Iterable if isinstance(n, Iterable): return self.rows[tuple(n)] diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 1c34915cf5153..c05744f464845 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -703,7 +703,7 @@ def small_df(): [6, "a", "a3_at_6"], [7, "a", "a4_at_7"], ] - df = pd.DataFrame(data, columns=["Index", "Category", "Value"]) + df = DataFrame(data, columns=["Index", "Category", "Value"]) return df.set_index("Index") @@ -751,4 +751,4 @@ def test_negative_step(small_grouped): """Test for error on negative slice step""" with pytest.raises(ValueError, match="Invalid step"): - result = small_grouped.nth(slice(None, None, -1)) \ No newline at end of file + small_grouped.nth(slice(None, None, -1)) From 8abcac32b504909cf3812c2a6bb6e8b65d1ff8ea Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 9 Sep 2021 17:58:41 +0100 Subject: [PATCH 32/73] just one more comma... --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index adf2ca4aff0ff..b9c5b74c7a471 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2309,7 +2309,7 @@ def backfill(self, limit=None): def nth( self, n: int | slice | list[int | slice], - dropna: Literal["any", "all", None] = None + dropna: Literal["any", "all", None] = None, ) -> DataFrame: """ Take the nth row from each group if n is an int, otherwise a subset of rows. From add5727ae14d2ce0df5b87157298f40b5ea3d3f4 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 10 Sep 2021 11:06:23 +0100 Subject: [PATCH 33/73] Add type hints --- pandas/core/groupby/groupby.py | 6 ------ pandas/core/groupby/indexing.py | 36 ++++++++++++++++++--------------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b9c5b74c7a471..6c4b1f7e21f47 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2389,12 +2389,6 @@ def nth( 1 1 2.0 4 2 5.0 """ - if isinstance(n, Iterable): - return self.rows[tuple(n)] - - elif isinstance(n, slice): - return self.rows[n] - valid_containers = (set, list, tuple, range, slice) if not isinstance(n, (valid_containers, int)): raise TypeError("n needs to be an int or a list/set/tuple of ints") diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 946c4fd34b6d1..eff9d47a7822c 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import Iterable +from typing import ( + Iterable, + cast, +) import numpy as np @@ -29,7 +32,7 @@ class GroupByIndexingMixin: @property def rows(self) -> _rowsGroupByIndexer: - return _rowsGroupByIndexer(self) + return _rowsGroupByIndexer(cast(groupby.GroupBy, self)) @doc(GroupByIndexingMixin.rows) @@ -37,7 +40,7 @@ class _rowsGroupByIndexer: def __init__(self, grouped: groupby.GroupBy): self.grouped = grouped - def __getitem__(self, arg: PositionalIndexer) -> FrameOrSeries: + def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: with groupby.group_selection_context(self.grouped): if isinstance(arg, tuple): if all(is_integer(i) for i in arg): @@ -50,10 +53,10 @@ def __getitem__(self, arg: PositionalIndexer) -> FrameOrSeries: mask = self._handle_slice(arg) elif is_integer(arg): - mask = 
self._handle_int(arg) + mask = self._handle_int(cast(int, arg)) elif is_list_like(arg): - mask = self._handle_list(arg) + mask = self._handle_list(cast(Iterable[int], arg)) else: raise ValueError( @@ -68,9 +71,10 @@ def __getitem__(self, arg: PositionalIndexer) -> FrameOrSeries: mask &= ids != -1 if mask is None or mask is True: - mask = slice(None) + result = self.grouped._selected_obj[:] - result = self.grouped._selected_obj[mask] + else: + result = self.grouped._selected_obj[mask] if self.grouped.as_index: result_index = self.grouped.grouper.result_index @@ -87,18 +91,18 @@ def __getitem__(self, arg: PositionalIndexer) -> FrameOrSeries: return result - def _handle_int(self, arg: int) -> np.ndarray: + def _handle_int(self, arg: int) -> bool | np.ndarray: if arg >= 0: return self._ascending_count == arg else: return self._descending_count == (-arg - 1) - def _handle_list(self, args: Iterable[int]) -> np.ndarray: + def _handle_list(self, args: Iterable[int]) -> bool | np.ndarray: positive = [arg for arg in args if arg >= 0] negative = [-arg - 1 for arg in args if arg < 0] - mask = False + mask: bool | np.ndarray = False if positive: mask |= np.isin(self._ascending_count, positive) @@ -108,12 +112,12 @@ def _handle_list(self, args: Iterable[int]) -> np.ndarray: return mask - def _handle_tuple(self, args: tuple) -> np.ndarray: - mask = False + def _handle_tuple(self, args: tuple) -> bool | np.ndarray: + mask: bool | np.ndarray = False for arg in args: - if isinstance(arg, int): - mask |= self._handle_int(arg) + if is_integer(arg): + mask |= self._handle_int(cast(int, arg)) elif isinstance(arg, slice): mask |= self._handle_slice(arg) @@ -125,7 +129,7 @@ def _handle_tuple(self, args: tuple) -> np.ndarray: return mask - def _handle_slice(self, arg: slice) -> np.ndarray: + def _handle_slice(self, arg: slice) -> bool | np.ndarray: start = arg.start stop = arg.stop step = arg.step @@ -133,7 +137,7 @@ def _handle_slice(self, arg: slice) -> np.ndarray: if step is not None and step < 0: raise ValueError(f"Invalid step {step}. Must be non-negative") - mask = True + mask: bool | np.ndarray = True if step is None: step = 1 From 25459f7e0adf3cbc17cc443b4b9d9997518e7999 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 12 Sep 2021 17:11:46 +0100 Subject: [PATCH 34/73] Delete my build.cmd. Accidental commit --- my build.cmd | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 my build.cmd diff --git a/my build.cmd b/my build.cmd deleted file mode 100644 index 9be478cea6198..0000000000000 --- a/my build.cmd +++ /dev/null @@ -1,4 +0,0 @@ -call conda activate pandas-dev -python setup.py build_ext -j 4 -python -m pip install -e . 
--no-build-isolation --no-use-pep517 -pause From fefbacf26ff10fe3200dbf53e99fc426335629c9 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 13 Sep 2021 14:22:44 +0100 Subject: [PATCH 35/73] jreback 12 Sep requested changes --- pandas/core/groupby/base.py | 1 - pandas/core/groupby/groupby.py | 28 ++++++---- pandas/core/groupby/indexing.py | 6 +- pandas/tests/groupby/conftest.py | 21 +++++++ pandas/tests/groupby/test_allowlist.py | 1 - pandas/tests/groupby/test_nth.py | 52 ++++++++--------- pandas/tests/groupby/test_rows.py | 77 ++++++++++---------------- 7 files changed, 94 insertions(+), 92 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index ec52f90caa420..986aaa07a913c 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -139,7 +139,6 @@ class OutputKey: "plot", "resample", "rolling", - "rows", "tail", "take", "transform", diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d25b0f0cfa274..cedd5e898f36c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -47,6 +47,7 @@ class providing the base-class of operations. F, FrameOrSeries, IndexLabel, + PositionalIndexer, RandomState, Scalar, T, @@ -66,6 +67,7 @@ class providing the base-class of operations. is_bool_dtype, is_datetime64_dtype, is_float_dtype, + is_integer, is_integer_dtype, is_numeric_dtype, is_object_dtype, @@ -2387,9 +2389,9 @@ def backfill(self, limit=None): @Substitution(see_also=_common_see_also) def nth( self, - n: int | slice | list[int | slice], + arg: PositionalIndexer | tuple, dropna: Literal["any", "all", None] = None, - ) -> DataFrame: + ) -> FrameOrSeries: """ Take the nth row from each group if n is an int, otherwise a subset of rows. @@ -2401,6 +2403,11 @@ def nth( ---------- n : int, slice or list of ints and slices A single nth value for the row or a list of nth values or slices. + + .. versionchanged:: 1.4.0 + Added slice and lists containiing slices + + dropna : {'any', 'all', None}, default None Apply the specified dropna operation before counting which row is the nth row. @@ -2468,30 +2475,27 @@ def nth( 1 1 2.0 4 2 5.0 """ - valid_containers = (set, list, tuple, range, slice) - if not isinstance(n, (valid_containers, int)): - raise TypeError("n needs to be an int or a list/set/tuple of ints") - if not dropna: - if isinstance(n, Iterable): - return self.rows[tuple(n)] + if isinstance(arg, Iterable): + return self._rows[tuple(arg)] - return self.rows[n] + return self._rows[arg] # dropna is truthy - if isinstance(n, valid_containers): - raise ValueError("dropna option with a list of nth values is not supported") + if not is_integer(arg): + raise ValueError("dropna option only supported for an integer argument") if dropna not in ["any", "all"]: # Note: when agg-ing picker doesn't raise this, just returns NaN raise ValueError( - "For a DataFrame groupby, dropna must be " + "For a DataFrame groupby.nth, dropna must be " "either None, 'any' or 'all', " f"(was passed {dropna})." ) # old behaviour, but with all and any support for DataFrames. 
# modified in GH 7559 to have better perf + n = cast(int, arg) max_len = n if n >= 0 else -1 - n dropped = self.obj.dropna(how=dropna, axis=self.axis) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index eff9d47a7822c..6716198abd3a9 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -31,11 +31,11 @@ class GroupByIndexingMixin: """ @property - def rows(self) -> _rowsGroupByIndexer: + def _rows(self) -> _rowsGroupByIndexer: return _rowsGroupByIndexer(cast(groupby.GroupBy, self)) -@doc(GroupByIndexingMixin.rows) +@doc(GroupByIndexingMixin._rows) class _rowsGroupByIndexer: def __init__(self, grouped: groupby.GroupBy): self.grouped = grouped @@ -59,7 +59,7 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: mask = self._handle_list(cast(Iterable[int], arg)) else: - raise ValueError( + raise TypeError( f"Invalid index {type(arg)}. " "Must be integer, list-like, slice or a tuple of " "integers and slices" diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index d699d05963b46..622c56d707ead 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -116,6 +116,27 @@ def three_group(): ) +@pytest.fixture() +def slice_test_df(): + data = [ + [0, "a", "a0_at_0"], + [1, "b", "b0_at_1"], + [2, "a", "a1_at_2"], + [3, "b", "b1_at_3"], + [4, "c", "c0_at_4"], + [5, "a", "a2_at_5"], + [6, "a", "a3_at_6"], + [7, "a", "a4_at_7"], + ] + df = DataFrame(data, columns=["Index", "Group", "Value"]) + return df.set_index("Index") + + +@pytest.fixture() +def slice_test_grouped(slice_test_df): + return slice_test_df.groupby("Group", as_index=False) + + @pytest.fixture(params=sorted(reduction_kernels)) def reduction_func(request): """ diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py index aa690dd75eb7d..8be721c13eea8 100644 --- a/pandas/tests/groupby/test_allowlist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -309,7 +309,6 @@ def test_tab_completion(mframe): "rank", "cumprod", "tail", - "rows", "resample", "cummin", "fillna", diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index c05744f464845..5a35e7d1c6f92 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -691,22 +691,6 @@ def test_first_multi_key_groupbby_categorical(): tm.assert_frame_equal(result, expected) -@pytest.fixture() -def small_df(): - data = [ - [0, "a", "a0_at_0"], - [1, "b", "b0_at_1"], - [2, "a", "a1_at_2"], - [3, "b", "b1_at_3"], - [4, "c", "c0_at_4"], - [5, "a", "a2_at_5"], - [6, "a", "a3_at_6"], - [7, "a", "a4_at_7"], - ] - df = DataFrame(data, columns=["Index", "Category", "Value"]) - return df.set_index("Index") - - @pytest.mark.parametrize("method", ["first", "last", "nth"]) def test_groupby_last_first_nth_with_none(method, nulls_fixture): # GH29645 @@ -724,11 +708,6 @@ def test_groupby_last_first_nth_with_none(method, nulls_fixture): tm.assert_series_equal(result, expected) -@pytest.fixture() -def small_grouped(small_df): - return small_df.groupby("Category", as_index=False) - - @pytest.mark.parametrize( "arg, expected_rows", [ @@ -738,17 +717,38 @@ def small_grouped(small_df): [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], ], ) -def test_slice(small_df, small_grouped, arg, expected_rows): +def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): """Test slices GH #42947""" - result = small_grouped.nth(arg) - expected = small_df.iloc[expected_rows] + result = 
slice_test_grouped.nth(arg) + expected = slice_test_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) -def test_negative_step(small_grouped): +def test_invalid_argument(slice_test_grouped): + """Test for error on invalid argument""" + + with pytest.raises(TypeError, match="Invalid index"): + slice_test_grouped.nth(3.14) + + +def test_negative_step(slice_test_grouped): """Test for error on negative slice step""" with pytest.raises(ValueError, match="Invalid step"): - small_grouped.nth(slice(None, None, -1)) + slice_test_grouped.nth(slice(None, None, -1)) + + +def test_np_ints(slice_test_df, slice_test_grouped): + """Test np ints work""" + + result = slice_test_grouped.nth(np.int(0)) + expected = slice_test_df.iloc[[0, 1, 4]] + + tm.assert_frame_equal(result, expected) + + result = slice_test_grouped.nth(np.array([0, 1])) + expected = slice_test_df.iloc[[0, 1, 2, 3, 4]] + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_rows.py b/pandas/tests/groupby/test_rows.py index f96dc38755128..a8db715e668d7 100644 --- a/pandas/tests/groupby/test_rows.py +++ b/pandas/tests/groupby/test_rows.py @@ -8,27 +8,6 @@ import pandas._testing as tm -@pytest.fixture() -def small_df(): - data = [ - [0, "a", "a0_at_0"], - [1, "b", "b0_at_1"], - [2, "a", "a1_at_2"], - [3, "b", "b1_at_3"], - [4, "c", "c0_at_4"], - [5, "a", "a2_at_5"], - [6, "a", "a3_at_6"], - [7, "a", "a4_at_7"], - ] - df = pd.DataFrame(data, columns=["Index", "Category", "Value"]) - return df.set_index("Index") - - -@pytest.fixture() -def small_grouped(small_df): - return small_df.groupby("Category", as_index=False) - - @pytest.mark.parametrize( "arg, expected_rows", [ @@ -40,20 +19,20 @@ def small_grouped(small_df): [-6, []], ], ) -def test_int(small_df, small_grouped, arg, expected_rows): +def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): """Test single integer""" - result = small_grouped.rows[arg] - expected = small_df.iloc[expected_rows] + result = slice_test_grouped._rows[arg] + expected = slice_test_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) -def test_slice(small_df, small_grouped): +def test_slice(slice_test_df, slice_test_grouped): """Test single slice""" - result = small_grouped.rows[0:3:2] - expected = small_df.iloc[[0, 1, 4, 5]] + result = slice_test_grouped._rows[0:3:2] + expected = slice_test_df.iloc[[0, 1, 4, 5]] tm.assert_frame_equal(result, expected) @@ -73,38 +52,38 @@ def test_slice(small_df, small_grouped): "set", ], ) -def test_list(small_df, small_grouped, arg, expected_rows): +def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): """Test lists of integers and integer valued iterables""" - result = small_grouped.rows[arg] - expected = small_df.iloc[expected_rows] + result = slice_test_grouped._rows[arg] + expected = slice_test_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) -def test_ints(small_df, small_grouped): +def test_ints(slice_test_df, slice_test_grouped): """Test tuple of ints""" - result = small_grouped.rows[0, 2, -1] - expected = small_df.iloc[[0, 1, 3, 4, 5, 7]] + result = slice_test_grouped._rows[0, 2, -1] + expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]] tm.assert_frame_equal(result, expected) -def test_slices(small_df, small_grouped): +def test_slices(slice_test_df, slice_test_grouped): """Test tuple of slices""" - result = small_grouped.rows[:2, -2:] - expected = small_df.iloc[[0, 1, 2, 3, 4, 6, 7]] + result = slice_test_grouped._rows[:2, -2:] + expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] 
tm.assert_frame_equal(result, expected) -def test_mix(small_df, small_grouped): +def test_mix(slice_test_df, slice_test_grouped): """Test mixed tuple of ints and slices""" - result = small_grouped.rows[0, 1, -2:] - expected = small_df.iloc[[0, 1, 2, 3, 4, 6, 7]] + result = slice_test_grouped._rows[0, 1, -2:] + expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] tm.assert_frame_equal(result, expected) @@ -117,9 +96,9 @@ def test_mix(small_df, small_grouped): [(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]], ], ) -def test_as_index(small_df, arg, expected_rows): - result = small_df.groupby("Category", sort=False).rows[arg] - expected = small_df.iloc[expected_rows].set_index("Category") +def test_as_index(slice_test_df, arg, expected_rows): + result = slice_test_df.groupby("Group", sort=False)._rows[arg] + expected = slice_test_df.iloc[expected_rows].set_index("Group") tm.assert_frame_equal(result, expected) @@ -133,12 +112,12 @@ def test_doc_examples(): grouped = df.groupby("A", as_index=False) - result = grouped.rows[1:2] + result = grouped._rows[1:2] expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4]) tm.assert_frame_equal(result, expected) - result = grouped.rows[1, -1] + result = grouped._rows[1, -1] expected = pd.DataFrame( [["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4] ) @@ -212,7 +191,7 @@ def test_against_head_and_tail(arg, method, simulated): grouped = df.groupby("group", as_index=False) if method == "head": - result = grouped.rows[:arg] + result = grouped._rows[:arg] if simulated: indices = [] @@ -227,7 +206,7 @@ def test_against_head_and_tail(arg, method, simulated): expected = grouped.head(arg) else: - result = grouped.rows[-arg:] + result = grouped._rows[-arg:] if simulated: indices = [] @@ -259,7 +238,7 @@ def test_against_df_iloc(start, stop, step): df = pd.DataFrame(data) grouped = df.groupby("group", as_index=False) - result = grouped.rows[start:stop:step] + result = grouped._rows[start:stop:step] expected = df.iloc[start:stop:step] tm.assert_frame_equal(result, expected) @@ -270,7 +249,7 @@ def test_series(): ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"]) grouped = ser.groupby(level=0) - result = grouped.rows[1:2] + result = grouped._rows[1:2] expected = pd.Series([2, 5], index=["a", "b"]) tm.assert_series_equal(result, expected) @@ -287,7 +266,7 @@ def test_step(step): grouped = df.groupby("A", as_index=False) - result = grouped.rows[::step] + result = grouped._rows[::step] data = [["x", f"x{i}"] for i in range(0, 5, step)] data += [["y", f"y{i}"] for i in range(0, 4, step)] From b5894208ad3acd807e2a3a27eda7d2c27374775b Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 13 Sep 2021 14:33:58 +0100 Subject: [PATCH 36/73] remove white-space --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 49f0b7e5c8d4a..138232bdb3dbf 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2395,7 +2395,6 @@ def nth( .. versionchanged:: 1.4.0 Added slice and lists containiing slices - dropna : {'any', 'all', None}, default None Apply the specified dropna operation before counting which row is the nth row. 
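A note on usage: the slice support that nth() gains in the patches above (by delegating to GroupBy._rows) can be exercised as in the short sketch below. This is a minimal illustration assuming a pandas build with these patches applied; the frame contents and the variable names (df, grouped, middle, ends) are made up for the example and are not taken from the test suite above.

import pandas as pd

df = pd.DataFrame(
    {"group": ["a", "a", "a", "a", "b", "b", "b"], "value": range(7)}
)
grouped = df.groupby("group", as_index=False)

# Middle rows of each group: everything except the first and last row,
# with the original index and row order preserved.
middle = grouped.nth(slice(1, -1))

# First row plus the last two rows of each group, mixing ints and slices.
ends = grouped.nth([0, slice(-2, None)])

print(middle)
print(ends)

Unlike separate head() and tail() calls, a single nth() call can take rows from both ends of each group, and unlike an aggregation the result keeps the original index and order.
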
From 89deee3ea979f8bff8fdda79a1e50ab70087ae42 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 13 Sep 2021 15:36:49 +0100 Subject: [PATCH 37/73] Get rid of np.int test --- .gitignore | 13 +++++++++++++ Dockerfile | 2 +- pandas/tests/groupby/test_nth.py | 5 ----- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 2c337be60e94e..1d92e92640638 100644 --- a/.gitignore +++ b/.gitignore @@ -120,3 +120,16 @@ doc/build/html/index.html doc/tmp.sv env/ doc/source/savefig/ +Dockerfile +my_tests/ +my notebooks.cmd +*.ipynb +my build.cmd +my documentation.cmd +my tests.cmd +my docstring.cmd +my pre-commit.cmd +my test nth.cmd +my test rows.cmd +pandas/core/groupby/indexing.pyMINE_WITH_DOC +my static analysis.cmd diff --git a/Dockerfile b/Dockerfile index de1c564921de9..ae9aef34a556a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM quay.io/condaforge/miniforge3 # if you forked pandas, you can pass in your own GitHub username to use your fork # i.e. gh_username=myname -ARG gh_username=pandas-dev +ARG gh_username=johnzangwill ARG pandas_home="/home/pandas" # Avoid warnings by switching to noninteractive diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 5a35e7d1c6f92..edf64689e84cc 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -743,11 +743,6 @@ def test_negative_step(slice_test_grouped): def test_np_ints(slice_test_df, slice_test_grouped): """Test np ints work""" - result = slice_test_grouped.nth(np.int(0)) - expected = slice_test_df.iloc[[0, 1, 4]] - - tm.assert_frame_equal(result, expected) - result = slice_test_grouped.nth(np.array([0, 1])) expected = slice_test_df.iloc[[0, 1, 2, 3, 4]] From e28cdfba7c45c8f56e75821eb7161294dfe68adf Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 13 Sep 2021 16:54:29 +0100 Subject: [PATCH 38/73] Revert "Get rid of np.int test" This reverts commit 89deee3ea979f8bff8fdda79a1e50ab70087ae42. --- .gitignore | 13 ------------- Dockerfile | 2 +- pandas/tests/groupby/test_nth.py | 5 +++++ 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 1d92e92640638..2c337be60e94e 100644 --- a/.gitignore +++ b/.gitignore @@ -120,16 +120,3 @@ doc/build/html/index.html doc/tmp.sv env/ doc/source/savefig/ -Dockerfile -my_tests/ -my notebooks.cmd -*.ipynb -my build.cmd -my documentation.cmd -my tests.cmd -my docstring.cmd -my pre-commit.cmd -my test nth.cmd -my test rows.cmd -pandas/core/groupby/indexing.pyMINE_WITH_DOC -my static analysis.cmd diff --git a/Dockerfile b/Dockerfile index ae9aef34a556a..de1c564921de9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM quay.io/condaforge/miniforge3 # if you forked pandas, you can pass in your own GitHub username to use your fork # i.e. 
gh_username=myname -ARG gh_username=johnzangwill +ARG gh_username=pandas-dev ARG pandas_home="/home/pandas" # Avoid warnings by switching to noninteractive diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index edf64689e84cc..5a35e7d1c6f92 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -743,6 +743,11 @@ def test_negative_step(slice_test_grouped): def test_np_ints(slice_test_df, slice_test_grouped): """Test np ints work""" + result = slice_test_grouped.nth(np.int(0)) + expected = slice_test_df.iloc[[0, 1, 4]] + + tm.assert_frame_equal(result, expected) + result = slice_test_grouped.nth(np.array([0, 1])) expected = slice_test_df.iloc[[0, 1, 2, 3, 4]] From 424ab1413b8d07768bd308eef8e583ad10c5eb1f Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 13 Sep 2021 17:00:20 +0100 Subject: [PATCH 39/73] Try again... --- pandas/tests/groupby/test_nth.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 5a35e7d1c6f92..edf64689e84cc 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -743,11 +743,6 @@ def test_negative_step(slice_test_grouped): def test_np_ints(slice_test_df, slice_test_grouped): """Test np ints work""" - result = slice_test_grouped.nth(np.int(0)) - expected = slice_test_df.iloc[[0, 1, 4]] - - tm.assert_frame_equal(result, expected) - result = slice_test_grouped.nth(np.array([0, 1])) expected = slice_test_df.iloc[[0, 1, 2, 3, 4]] From 258530d89b63baf42530783149bec763aecd33f5 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 13 Sep 2021 19:05:19 +0100 Subject: [PATCH 40/73] More jreback requested changes --- pandas/core/groupby/indexing.py | 86 +++++++++++++++++++++++++++------ 1 file changed, 70 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 6716198abd3a9..87b7b6f61e8d0 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -41,6 +41,61 @@ def __init__(self, grouped: groupby.GroupBy): self.grouped = grouped def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: + """ + Positional index for selection by integer location per group. + + Used to implement GroupBy._rows which is used to implement GroupBy.nth + when keyword dropna is None or absent. + The behaviour extends GroupBy.nth and handles DataFrame.groupby() + keyword parameters such as as_index and dropna in a compatible way. + + The additions to nth(arg) are: + - Handles iterables such as range. + - Handles slice(start, stop, step) with + start: positive, negative or None. + stop: positive, negative or None. + step: positive or None. + + Parameters + ---------- + arg : PositionalIndexer | tuple + Allowed values are: + - Integer + - Integer values iterable such as list or range + - Slice + - Comma separated list of integers and slices + + Returns + ------- + Series + The filtered subset of the original groupby Series. + DataFrame + The filtered subset of the original groupby DataFrame. + + See Also + -------- + DataFrame.iloc : Purely integer-location based indexing for selection by + position. + GroupBy.head : Return first n rows of each group. + GroupBy.tail : Return last n rows of each group. + GroupBy.nth : Take the nth row from each group if n is an int, or a + subset of rows, if n is a list of ints. + + Examples + -------- + >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], + ... 
columns=["A", "B"]) + >>> df.groupby("A", as_index=False)._rows[1:2] + A B + 1 a 2 + 4 b 5 + + >>> df.groupby("A", as_index=False)._rows[1, -1] + A B + 1 a 2 + 2 a 3 + 4 b 5 + """ with groupby.group_selection_context(self.grouped): if isinstance(arg, tuple): if all(is_integer(i) for i in arg): @@ -146,25 +201,24 @@ def _handle_slice(self, arg: slice) -> bool | np.ndarray: if step > 1: mask &= self._ascending_count % step == 0 - else: - if start >= 0: - mask &= self._ascending_count >= start + elif start >= 0: + mask &= self._ascending_count >= start - if step > 1: - mask &= (self._ascending_count - start) % step == 0 + if step > 1: + mask &= (self._ascending_count - start) % step == 0 - else: - mask &= self._descending_count < -start - - offset_array = self._descending_count + start + 1 - limit_array = ( - self._ascending_count + self._descending_count + (start + 1) - ) < 0 - offset_array = np.where( - limit_array, self._ascending_count, offset_array - ) + else: + mask &= self._descending_count < -start + + offset_array = self._descending_count + start + 1 + limit_array = ( + self._ascending_count + self._descending_count + (start + 1) + ) < 0 + offset_array = np.where( + limit_array, self._ascending_count, offset_array + ) - mask &= offset_array % step == 0 + mask &= offset_array % step == 0 if stop is not None: if stop >= 0: From d49e48fe50a83298abc0c0717eae8a565eca3375 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 13 Sep 2021 19:33:00 +0100 Subject: [PATCH 41/73] More tweaks --- pandas/core/groupby/groupby.py | 4 ++-- pandas/core/groupby/indexing.py | 28 ++++++++++++++-------------- pandas/tests/groupby/test_nth.py | 10 +++++----- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 138232bdb3dbf..d4661f76f445f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2389,7 +2389,7 @@ def nth( Parameters ---------- - n : int, slice or list of ints and slices + arg : int, slice or list of ints and slices A single nth value for the row or a list of nth values or slices. .. versionchanged:: 1.4.0 @@ -2475,7 +2475,7 @@ def nth( if dropna not in ["any", "all"]: # Note: when agg-ing picker doesn't raise this, just returns NaN raise ValueError( - "For a DataFrame groupby.nth, dropna must be " + "For a DataFrame or Series groupby.nth, dropna must be " "either None, 'any' or 'all', " f"(was passed {dropna})." ) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 87b7b6f61e8d0..0744ab7147c08 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -30,15 +30,15 @@ class GroupByIndexingMixin: Mixin for adding .rows to GroupBy. 
""" - @property + @cache_readonly def _rows(self) -> _rowsGroupByIndexer: return _rowsGroupByIndexer(cast(groupby.GroupBy, self)) @doc(GroupByIndexingMixin._rows) class _rowsGroupByIndexer: - def __init__(self, grouped: groupby.GroupBy): - self.grouped = grouped + def __init__(self, groupByObject: groupby.GroupBy): + self.groupByObject = groupByObject def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: """ @@ -96,7 +96,7 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: 2 a 3 4 b 5 """ - with groupby.group_selection_context(self.grouped): + with groupby.group_selection_context(self.groupByObject): if isinstance(arg, tuple): if all(is_integer(i) for i in arg): mask = self._handle_list(arg) @@ -120,28 +120,28 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: "integers and slices" ) - ids, _, _ = self.grouped.grouper.group_info + ids, _, _ = self.groupByObject.grouper.group_info # Drop NA values in grouping mask &= ids != -1 if mask is None or mask is True: - result = self.grouped._selected_obj[:] + result = self.groupByObject._selected_obj[:] else: - result = self.grouped._selected_obj[mask] + result = self.groupByObject._selected_obj[mask] - if self.grouped.as_index: - result_index = self.grouped.grouper.result_index + if self.groupByObject.as_index: + result_index = self.groupByObject.grouper.result_index result.index = result_index[ids[mask]] - if not self.grouped.observed and isinstance( + if not self.groupByObject.observed and isinstance( result_index, CategoricalIndex ): result = result.reindex(result_index) - result = self.grouped._reindex_output(result) - if self.grouped.sort: + result = self.groupByObject._reindex_output(result) + if self.groupByObject.sort: result = result.sort_index() return result @@ -231,8 +231,8 @@ def _handle_slice(self, arg: slice) -> bool | np.ndarray: @cache_readonly def _ascending_count(self) -> np.ndarray: - return self.grouped._cumcount_array() + return self.groupByObject._cumcount_array() @cache_readonly def _descending_count(self) -> np.ndarray: - return self.grouped._cumcount_array(ascending=False) + return self.groupByObject._cumcount_array(ascending=False) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index edf64689e84cc..115da0414df2d 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -270,7 +270,7 @@ def test_nth(): result = s.groupby(g, sort=False).nth(0, dropna="all") tm.assert_series_equal(result, expected) - with pytest.raises(ValueError, match="For a DataFrame groupby"): + with pytest.raises(ValueError, match="For a DataFrame"): s.groupby(g, sort=False).nth(0, dropna=True) # doc example @@ -718,7 +718,7 @@ def test_groupby_last_first_nth_with_none(method, nulls_fixture): ], ) def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): - """Test slices GH #42947""" + #Test slices GH #42947 result = slice_test_grouped.nth(arg) expected = slice_test_df.iloc[expected_rows] @@ -727,21 +727,21 @@ def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): def test_invalid_argument(slice_test_grouped): - """Test for error on invalid argument""" + #Test for error on invalid argument with pytest.raises(TypeError, match="Invalid index"): slice_test_grouped.nth(3.14) def test_negative_step(slice_test_grouped): - """Test for error on negative slice step""" + #Test for error on negative slice step with pytest.raises(ValueError, match="Invalid step"): slice_test_grouped.nth(slice(None, None, 
-1)) def test_np_ints(slice_test_df, slice_test_grouped): - """Test np ints work""" + #Test np ints work result = slice_test_grouped.nth(np.array([0, 1])) expected = slice_test_df.iloc[[0, 1, 2, 3, 4]] From 1dd625889c2c145fe34bb5f526b18d8bb5e14220 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 13 Sep 2021 19:38:41 +0100 Subject: [PATCH 42/73] Whitespace --- pandas/core/groupby/indexing.py | 14 ++++++-------- pandas/tests/groupby/test_nth.py | 8 ++++---- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 0744ab7147c08..bd9320fbcadba 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -43,19 +43,19 @@ def __init__(self, groupByObject: groupby.GroupBy): def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: """ Positional index for selection by integer location per group. - + Used to implement GroupBy._rows which is used to implement GroupBy.nth when keyword dropna is None or absent. - The behaviour extends GroupBy.nth and handles DataFrame.groupby() + The behaviour extends GroupBy.nth and handles DataFrame.groupby() keyword parameters such as as_index and dropna in a compatible way. - + The additions to nth(arg) are: - Handles iterables such as range. - Handles slice(start, stop, step) with start: positive, negative or None. stop: positive, negative or None. step: positive or None. - + Parameters ---------- arg : PositionalIndexer | tuple @@ -64,7 +64,7 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: - Integer values iterable such as list or range - Slice - Comma separated list of integers and slices - + Returns ------- Series @@ -214,9 +214,7 @@ def _handle_slice(self, arg: slice) -> bool | np.ndarray: limit_array = ( self._ascending_count + self._descending_count + (start + 1) ) < 0 - offset_array = np.where( - limit_array, self._ascending_count, offset_array - ) + offset_array = np.where(limit_array, self._ascending_count, offset_array) mask &= offset_array % step == 0 diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 115da0414df2d..ad9b16583def7 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -718,7 +718,7 @@ def test_groupby_last_first_nth_with_none(method, nulls_fixture): ], ) def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): - #Test slices GH #42947 + # Test slices GH #42947 result = slice_test_grouped.nth(arg) expected = slice_test_df.iloc[expected_rows] @@ -727,21 +727,21 @@ def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): def test_invalid_argument(slice_test_grouped): - #Test for error on invalid argument + # Test for error on invalid argument with pytest.raises(TypeError, match="Invalid index"): slice_test_grouped.nth(3.14) def test_negative_step(slice_test_grouped): - #Test for error on negative slice step + # Test for error on negative slice step with pytest.raises(ValueError, match="Invalid step"): slice_test_grouped.nth(slice(None, None, -1)) def test_np_ints(slice_test_df, slice_test_grouped): - #Test np ints work + # Test np ints work result = slice_test_grouped.nth(np.array([0, 1])) expected = slice_test_df.iloc[[0, 1, 2, 3, 4]] From c068162eb48a59cbb8ee2fd8e02d2167d5c5e6e7 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 14 Sep 2021 09:12:57 +0100 Subject: [PATCH 43/73] Remove blank lines in conditionals --- pandas/core/groupby/indexing.py | 9 --------- 1 file changed, 9 
deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index bd9320fbcadba..242aabdd6171c 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -100,19 +100,15 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: if isinstance(arg, tuple): if all(is_integer(i) for i in arg): mask = self._handle_list(arg) - else: mask = self._handle_tuple(arg) elif isinstance(arg, slice): mask = self._handle_slice(arg) - elif is_integer(arg): mask = self._handle_int(cast(int, arg)) - elif is_list_like(arg): mask = self._handle_list(cast(Iterable[int], arg)) - else: raise TypeError( f"Invalid index {type(arg)}. " @@ -127,7 +123,6 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: if mask is None or mask is True: result = self.groupByObject._selected_obj[:] - else: result = self.groupByObject._selected_obj[mask] @@ -149,7 +144,6 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: def _handle_int(self, arg: int) -> bool | np.ndarray: if arg >= 0: return self._ascending_count == arg - else: return self._descending_count == (-arg - 1) @@ -173,10 +167,8 @@ def _handle_tuple(self, args: tuple) -> bool | np.ndarray: for arg in args: if is_integer(arg): mask |= self._handle_int(cast(int, arg)) - elif isinstance(arg, slice): mask |= self._handle_slice(arg) - else: raise ValueError( f"Invalid argument {type(arg)}. Should be int or slice." @@ -221,7 +213,6 @@ def _handle_slice(self, arg: slice) -> bool | np.ndarray: if stop is not None: if stop >= 0: mask &= self._ascending_count < stop - else: mask &= self._descending_count >= -stop From 4cfde7b08d9b5f98be36779985f47d9a05f836c8 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 14 Sep 2021 15:22:40 +0100 Subject: [PATCH 44/73] Mainly variable changes and some formatting --- pandas/core/groupby/indexing.py | 79 +++++++++++++++------------------ 1 file changed, 35 insertions(+), 44 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 242aabdd6171c..ee28e000cd1d8 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -27,43 +27,34 @@ class GroupByIndexingMixin: """ - Mixin for adding .rows to GroupBy. + Mixin for adding ._rows to GroupBy. """ - @cache_readonly - def _rows(self) -> _rowsGroupByIndexer: - return _rowsGroupByIndexer(cast(groupby.GroupBy, self)) + @property + def _rows(self) -> RowsGroupByIndexer: + return RowsGroupByIndexer(cast(groupby.GroupBy, self)) @doc(GroupByIndexingMixin._rows) -class _rowsGroupByIndexer: - def __init__(self, groupByObject: groupby.GroupBy): - self.groupByObject = groupByObject +class RowsGroupByIndexer: + def __init__(self, groupby_object: groupby.GroupBy): + self.groupby_object = groupby_object def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: """ Positional index for selection by integer location per group. Used to implement GroupBy._rows which is used to implement GroupBy.nth - when keyword dropna is None or absent. - The behaviour extends GroupBy.nth and handles DataFrame.groupby() - keyword parameters such as as_index and dropna in a compatible way. - - The additions to nth(arg) are: - - Handles iterables such as range. - - Handles slice(start, stop, step) with - start: positive, negative or None. - stop: positive, negative or None. - step: positive or None. + in the case when the keyword dropna is None or absent. 
Parameters ---------- arg : PositionalIndexer | tuple Allowed values are: - - Integer - - Integer values iterable such as list or range - - Slice - - Comma separated list of integers and slices + - int + - int valued iterable such as list or range + - slice with step either None or positive + - tuple of integers and slices Returns ------- @@ -83,20 +74,20 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: Examples -------- - >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], - ... columns=["A", "B"]) - >>> df.groupby("A", as_index=False)._rows[1:2] - A B - 1 a 2 - 4 b 5 - - >>> df.groupby("A", as_index=False)._rows[1, -1] - A B - 1 a 2 - 2 a 3 - 4 b 5 + >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], + ... columns=["A", "B"]) + >>> df.groupby("A", as_index=False)._rows[1:2] + A B + 1 a 2 + 4 b 5 + + >>> df.groupby("A", as_index=False)._rows[1, -1] + A B + 1 a 2 + 2 a 3 + 4 b 5 """ - with groupby.group_selection_context(self.groupByObject): + with groupby.group_selection_context(self.groupby_object): if isinstance(arg, tuple): if all(is_integer(i) for i in arg): mask = self._handle_list(arg) @@ -116,27 +107,27 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: "integers and slices" ) - ids, _, _ = self.groupByObject.grouper.group_info + ids, _, _ = self.groupby_object.grouper.group_info # Drop NA values in grouping mask &= ids != -1 if mask is None or mask is True: - result = self.groupByObject._selected_obj[:] + result = self.groupby_object._selected_obj[:] else: - result = self.groupByObject._selected_obj[mask] + result = self.groupby_object._selected_obj[mask] - if self.groupByObject.as_index: - result_index = self.groupByObject.grouper.result_index + if self.groupby_object.as_index: + result_index = self.groupby_object.grouper.result_index result.index = result_index[ids[mask]] - if not self.groupByObject.observed and isinstance( + if not self.groupby_object.observed and isinstance( result_index, CategoricalIndex ): result = result.reindex(result_index) - result = self.groupByObject._reindex_output(result) - if self.groupByObject.sort: + result = self.groupby_object._reindex_output(result) + if self.groupby_object.sort: result = result.sort_index() return result @@ -220,8 +211,8 @@ def _handle_slice(self, arg: slice) -> bool | np.ndarray: @cache_readonly def _ascending_count(self) -> np.ndarray: - return self.groupByObject._cumcount_array() + return self.groupby_object._cumcount_array() @cache_readonly def _descending_count(self) -> np.ndarray: - return self.groupByObject._cumcount_array(ascending=False) + return self.groupby_object._cumcount_array(ascending=False) From 33a2225143d5ca54d968a16e418299e6ad42ec06 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 14 Sep 2021 17:02:13 +0100 Subject: [PATCH 45/73] Make group_selection_context a private GroupBy class method I've just changed the groupby.group_selection_context reference. If this works then I will try to remove the import using TYPE_CHECKING. I'm not yet quite sure how to remove the casts without reducing the code to complete spaghetti... 
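As context for the change described above, the pattern being moved onto the class is the usual set / try / yield / finally-reset context manager. Below is a minimal, self-contained sketch of that pattern; the Selector class and the value it stores are illustrative stand-ins only, not the real pandas internals, which the diff that follows shows in full.

from contextlib import contextmanager

class Selector:
    # Toy stand-in for a GroupBy-like object holding a temporary selection.
    def __init__(self):
        self._group_selection = None

    def _set_group_selection(self):
        # In pandas this would cache the non-grouping columns to operate on.
        self._group_selection = ["value"]

    def _reset_group_selection(self):
        self._group_selection = None

    @contextmanager
    def _group_selection_context(self):
        # Set the selection, hand control to the caller, and always reset it,
        # even if the body raises.
        self._set_group_selection()
        try:
            yield self
        finally:
            self._reset_group_selection()

obj = Selector()
with obj._group_selection_context():
    assert obj._group_selection == ["value"]
assert obj._group_selection is None

Making this a method keeps the set/reset bookkeeping next to the state it manipulates and removes the need for callers to import a free function.
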
--- pandas/core/groupby/generic.py | 5 ++--- pandas/core/groupby/groupby.py | 39 ++++++++++++++++----------------- pandas/core/groupby/indexing.py | 2 +- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 46e42326d4191..c6eacfeea2a32 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -74,7 +74,6 @@ _agg_template, _apply_docs, _transform_template, - group_selection_context, ) from pandas.core.indexes.api import ( Index, @@ -236,7 +235,7 @@ def apply(self, func, *args, **kwargs): def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): - with group_selection_context(self): + with self._group_selection_context(): data = self._selected_obj result = self._aggregate_with_numba( data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs @@ -878,7 +877,7 @@ class DataFrameGroupBy(GroupBy[DataFrame]): def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): - with group_selection_context(self): + with self._group_selection_context(): data = self._selected_obj result = self._aggregate_with_numba( data, func, *args, engine_kwargs=engine_kwargs, **kwargs diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e512823788d76..38eb16586913b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -550,18 +550,6 @@ def f(self): return attr -@contextmanager -def group_selection_context(groupby: GroupBy) -> Iterator[GroupBy]: - """ - Set / reset the group_selection_context. - """ - groupby._set_group_selection() - try: - yield groupby - finally: - groupby._reset_group_selection() - - _KeysArgType = Union[ Hashable, List[Hashable], @@ -919,7 +907,7 @@ def __getattr__(self, attr: str): def _make_wrapper(self, name: str) -> Callable: assert name in self._apply_allowlist - with group_selection_context(self): + with self._group_selection_context(): # need to setup the selection # as are not passed directly but in the grouper f = getattr(self._obj_with_exclusions, name) @@ -996,6 +984,17 @@ def _reset_group_selection(self) -> None: self._group_selection = None self._reset_cache("_selected_obj") + @contextmanager + def _group_selection_context(self) -> Iterator[GroupBy]: + """ + Set / reset the _group_selection_context. + """ + self._set_group_selection() + try: + yield self + finally: + self._reset_group_selection() + def _iterate_slices(self) -> Iterable[Series]: raise AbstractMethodError(self) @@ -1368,7 +1367,7 @@ def f(g): # fails on *some* columns, e.g. 
a numeric operation # on a string grouper column - with group_selection_context(self): + with self._group_selection_context(): return self._python_apply_general(f, self._selected_obj) return result @@ -1452,7 +1451,7 @@ def _agg_general( npfunc: Callable, ): - with group_selection_context(self): + with self._group_selection_context(): # try a cython aggregation if we can result = self._cython_agg_general( how=alias, @@ -1517,7 +1516,7 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): # TODO: tests with self._selected_obj.ndim == 1 on DataFrameGroupBy - with group_selection_context(self): + with self._group_selection_context(): data = self._selected_obj df = data if data.ndim == 2 else data.to_frame() result = self._transform_with_numba( @@ -1907,7 +1906,7 @@ def var(self, ddof: int = 1): ) else: func = lambda x: x.var(ddof=ddof) - with group_selection_context(self): + with self._group_selection_context(): return self._python_agg_general(func) @final @@ -2099,7 +2098,7 @@ def ohlc(self) -> DataFrame: @doc(DataFrame.describe) def describe(self, **kwargs): - with group_selection_context(self): + with self._group_selection_context(): result = self.apply(lambda x: x.describe(**kwargs)) if self.axis == 1: return result.T @@ -2767,7 +2766,7 @@ def ngroup(self, ascending: bool = True): 5 0 dtype: int64 """ - with group_selection_context(self): + with self._group_selection_context(): index = self._selected_obj.index result = self._obj_1d_constructor( self.grouper.group_info[0], index, dtype=np.int64 @@ -2831,7 +2830,7 @@ def cumcount(self, ascending: bool = True): 5 0 dtype: int64 """ - with group_selection_context(self): + with self._group_selection_context(): index = self._selected_obj._get_axis(self.axis) cumcounts = self._cumcount_array(ascending=ascending) return self._obj_1d_constructor(cumcounts, index) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index ee28e000cd1d8..5803b7bc39aef 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -87,7 +87,7 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: 2 a 3 4 b 5 """ - with groupby.group_selection_context(self.groupby_object): + with self.groupby_object._group_selection_context(): if isinstance(arg, tuple): if all(is_integer(i) for i in arg): mask = self._handle_list(arg) From 0d91dca32e58cb0630987a59ae9547779f1613f5 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 14 Sep 2021 23:02:33 +0100 Subject: [PATCH 46/73] Add conditional typing for groupby import --- pandas/core/groupby/indexing.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 5803b7bc39aef..4065f3e3b5e5f 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -1,6 +1,7 @@ from __future__ import annotations from typing import ( + TYPE_CHECKING, Iterable, cast, ) @@ -21,7 +22,9 @@ is_list_like, ) -from pandas.core.groupby import groupby +if TYPE_CHECKING: + from pandas.core.groupby import groupby + from pandas.core.indexes.api import CategoricalIndex @@ -32,7 +35,12 @@ class GroupByIndexingMixin: @property def _rows(self) -> RowsGroupByIndexer: - return RowsGroupByIndexer(cast(groupby.GroupBy, self)) + if TYPE_CHECKING: + groupby_object = cast(groupby.GroupBy, self) + else: + groupby_object = self + + return RowsGroupByIndexer(groupby_object) @doc(GroupByIndexingMixin._rows) From 
f42ae412c75aecd36a764592900f1055d8141ce7 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 15 Sep 2021 08:48:47 +0100 Subject: [PATCH 47/73] Delete Example section --- pandas/core/groupby/indexing.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 4065f3e3b5e5f..0f9ccf344a079 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -79,21 +79,6 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: GroupBy.tail : Return last n rows of each group. GroupBy.nth : Take the nth row from each group if n is an int, or a subset of rows, if n is a list of ints. - - Examples - -------- - >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], - ... columns=["A", "B"]) - >>> df.groupby("A", as_index=False)._rows[1:2] - A B - 1 a 2 - 4 b 5 - - >>> df.groupby("A", as_index=False)._rows[1, -1] - A B - 1 a 2 - 2 a 3 - 4 b 5 """ with self.groupby_object._group_selection_context(): if isinstance(arg, tuple): From 898fad41187c844c9865efe75e77e784e55797be Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 17 Sep 2021 10:44:09 +0100 Subject: [PATCH 48/73] Changes for @rhshadrach. --- pandas/core/groupby/groupby.py | 18 +++++++++--------- pandas/core/groupby/indexing.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 38eb16586913b..008a677184013 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2396,7 +2396,7 @@ def backfill(self, limit=None): @Substitution(see_also=_common_see_also) def nth( self, - arg: PositionalIndexer | tuple, + n: PositionalIndexer | tuple, dropna: Literal["any", "all", None] = None, ) -> FrameOrSeries: """ @@ -2408,15 +2408,15 @@ def nth( Parameters ---------- - arg : int, slice or list of ints and slices + n : int, slice or list of ints and slices A single nth value for the row or a list of nth values or slices. .. versionchanged:: 1.4.0 - Added slice and lists containiing slices + Added slice and lists containiing slices. dropna : {'any', 'all', None}, default None Apply the specified dropna operation before counting which row is - the nth row. + the nth row. Only supported if n is an int. Returns ------- @@ -2482,13 +2482,13 @@ def nth( 4 2 5.0 """ if not dropna: - if isinstance(arg, Iterable): - return self._rows[tuple(arg)] + if isinstance(n, Iterable): + return self._rows[tuple(n)] - return self._rows[arg] + return self._rows[n] # dropna is truthy - if not is_integer(arg): + if not is_integer(n): raise ValueError("dropna option only supported for an integer argument") if dropna not in ["any", "all"]: @@ -2501,7 +2501,7 @@ def nth( # old behaviour, but with all and any support for DataFrames. 
# modified in GH 7559 to have better perf - n = cast(int, arg) + n = cast(int, n) max_len = n if n >= 0 else -1 - n dropped = self.obj.dropna(how=dropna, axis=self.axis) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 0f9ccf344a079..f60ae55129d9e 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -125,7 +125,7 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: return result - def _handle_int(self, arg: int) -> bool | np.ndarray: + def _handle_int(self, arg: int) -> np.ndarray: if arg >= 0: return self._ascending_count == arg else: From 02ec03cfb43cb7e1798e8aeeb208428e6ebb5dc7 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 17 Sep 2021 11:47:51 +0100 Subject: [PATCH 49/73] Remove more docstrings from tests --- pandas/tests/groupby/test_rows.py | 39 +++++++++++-------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/pandas/tests/groupby/test_rows.py b/pandas/tests/groupby/test_rows.py index a8db715e668d7..6f7cd2358ed0e 100644 --- a/pandas/tests/groupby/test_rows.py +++ b/pandas/tests/groupby/test_rows.py @@ -1,4 +1,4 @@ -""" Test GroupBy.rows positional grouped indexing GH#42864""" +# Test GroupBy._rows positional grouped indexing GH#42864 import random @@ -20,8 +20,7 @@ ], ) def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): - """Test single integer""" - + # Test single integer result = slice_test_grouped._rows[arg] expected = slice_test_df.iloc[expected_rows] @@ -29,8 +28,7 @@ def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): def test_slice(slice_test_df, slice_test_grouped): - """Test single slice""" - + # Test single slice result = slice_test_grouped._rows[0:3:2] expected = slice_test_df.iloc[[0, 1, 4, 5]] @@ -53,8 +51,7 @@ def test_slice(slice_test_df, slice_test_grouped): ], ) def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): - """Test lists of integers and integer valued iterables""" - + # Test lists of integers and integer valued iterables result = slice_test_grouped._rows[arg] expected = slice_test_df.iloc[expected_rows] @@ -62,8 +59,7 @@ def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): def test_ints(slice_test_df, slice_test_grouped): - """Test tuple of ints""" - + # Test tuple of ints result = slice_test_grouped._rows[0, 2, -1] expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]] @@ -71,8 +67,7 @@ def test_ints(slice_test_df, slice_test_grouped): def test_slices(slice_test_df, slice_test_grouped): - """Test tuple of slices""" - + # Test tuple of slices result = slice_test_grouped._rows[:2, -2:] expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] @@ -80,8 +75,7 @@ def test_slices(slice_test_df, slice_test_grouped): def test_mix(slice_test_df, slice_test_grouped): - """Test mixed tuple of ints and slices""" - + # Test mixed tuple of ints and slices result = slice_test_grouped._rows[0, 1, -2:] expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] @@ -97,6 +91,7 @@ def test_mix(slice_test_df, slice_test_grouped): ], ) def test_as_index(slice_test_df, arg, expected_rows): + # Test the default as_index behaviour result = slice_test_df.groupby("Group", sort=False)._rows[arg] expected = slice_test_df.iloc[expected_rows].set_index("Group") @@ -104,8 +99,7 @@ def test_as_index(slice_test_df, arg, expected_rows): def test_doc_examples(): - """Test the examples in the documentation""" - + # Test the examples in the documentation df = pd.DataFrame( [["a", 1], ["a", 2], ["a", 3], ["b", 4], 
["b", 5]], columns=["A", "B"] ) @@ -157,8 +151,7 @@ def _make_df_from_data(data): def test_multiindex(multiindex_data): - """Test the multiindex mentioned as the use-case in the documentation""" - + # Test the multiindex mentioned as the use-case in the documentation df = _make_df_from_data(multiindex_data) result = df.groupby("Date", as_index=False).nth(slice(3, -3)) @@ -172,8 +165,7 @@ def test_multiindex(multiindex_data): @pytest.mark.parametrize("method", ["head", "tail"]) @pytest.mark.parametrize("simulated", [True, False]) def test_against_head_and_tail(arg, method, simulated): - """Test gives the same results as grouped head and tail""" - + # Test gives the same results as grouped head and tail n_groups = 100 n_rows_per_group = 30 @@ -227,8 +219,7 @@ def test_against_head_and_tail(arg, method, simulated): @pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10]) @pytest.mark.parametrize("step", [None, 1, 5]) def test_against_df_iloc(start, stop, step): - """Test that a single group gives the same results as DataFame.iloc""" - + # Test that a single group gives the same results as DataFame.iloc n_rows = 30 data = { @@ -245,8 +236,7 @@ def test_against_df_iloc(start, stop, step): def test_series(): - """Test grouped Series""" - + # Test grouped Series ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"]) grouped = ser.groupby(level=0) result = grouped._rows[1:2] @@ -257,8 +247,7 @@ def test_series(): @pytest.mark.parametrize("step", [1, 2, 3, 4, 5]) def test_step(step): - """Test slice with various step values""" - + # Test slice with various step values data = [["x", f"x{i}"] for i in range(5)] data += [["y", f"y{i}"] for i in range(4)] data += [["z", f"z{i}"] for i in range(3)] From 44120e1abfb6624ea52f959ac4aa89c697b95468 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 17 Sep 2021 16:02:54 +0100 Subject: [PATCH 50/73] Don't need to check for None anymore --- pandas/core/groupby/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index f60ae55129d9e..6a5004517bbe7 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -105,7 +105,7 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: # Drop NA values in grouping mask &= ids != -1 - if mask is None or mask is True: + if mask is True: result = self.groupby_object._selected_obj[:] else: result = self.groupby_object._selected_obj[mask] From 88b8ac5d309c878e555a3884f046ac40eb16d900 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 18 Sep 2021 08:26:44 +0100 Subject: [PATCH 51/73] Speed up by checking dropna --- pandas/core/groupby/indexing.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 6a5004517bbe7..43b7a88585233 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -100,10 +100,11 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: "integers and slices" ) - ids, _, _ = self.groupby_object.grouper.group_info + if self.groupby_object.dropna: + # Drop NA values in grouping + ids, _, _ = self.groupby_object.grouper.group_info - # Drop NA values in grouping - mask &= ids != -1 + mask &= ids != -1 if mask is True: result = self.groupby_object._selected_obj[:] From 0ee53cdfa571a88cc08d787ef32d0ec5c78300bd Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 20 Sep 2021 15:21:26 +0100 Subject: [PATCH 52/73] Implement 
head, tail. column axis, change _rows to _middle and remove nth emulation --- pandas/core/groupby/groupby.py | 63 +++++-- pandas/core/groupby/indexing.py | 173 +++++++++--------- .../groupby/{test_rows.py => test_middle.py} | 48 +++-- pandas/tests/groupby/test_nth.py | 8 +- 4 files changed, 163 insertions(+), 129 deletions(-) rename pandas/tests/groupby/{test_rows.py => test_middle.py} (85%) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 813e0eb4bc930..b0caca50ba410 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2483,10 +2483,26 @@ def nth( 4 2 5.0 """ if not dropna: - if isinstance(n, Iterable): - return self._rows[tuple(n)] + with self._group_selection_context(): + mask = self._make_mask(n) + + ids, _, _ = self.grouper.group_info + + # Drop NA values in grouping + mask = mask & (ids != -1) + + out = self._selected_obj[mask] + if not self.as_index: + return out + + result_index = self.grouper.result_index + out.index = result_index[ids[mask]] - return self._rows[n] + if not self.observed and isinstance(result_index, CategoricalIndex): + out = out.reindex(result_index) + + out = self._reindex_output(out) + return out.sort_index() if self.sort else out # dropna is truthy if not is_integer(n): @@ -3237,11 +3253,16 @@ def head(self, n=5): from the original DataFrame with original index and order preserved (``as_index`` flag is ignored). - Does not work for negative values of `n`. + Parameters + ---------- + n : int + If positive: number of entries to include from start of each group. + If negative: number of entries to exclude from end of each group. Returns ------- Series or DataFrame + Subset of original Series or DataFrame as determined by n. %(see_also)s Examples -------- @@ -3253,16 +3274,12 @@ def head(self, n=5): 0 1 2 2 5 6 >>> df.groupby('A').head(-1) - Empty DataFrame - Columns: [A, B] - Index: [] + A B + 0 1 2 """ self._reset_group_selection() - mask = self._cumcount_array() < n - if self.axis == 0: - return self._selected_obj[mask] - else: - return self._selected_obj.iloc[:, mask] + mask = self._make_mask(slice(None, n)) + return self._apply_mask(mask) @final @Substitution(name="groupby") @@ -3275,11 +3292,16 @@ def tail(self, n=5): from the original DataFrame with original index and order preserved (``as_index`` flag is ignored). - Does not work for negative values of `n`. + Parameters + ---------- + n : int + If positive: number of entries to include from end of each group. + If negative: number of entries to exclude from start of each group. Returns ------- Series or DataFrame + Subset of original Series or DataFrame as determined by n. 
%(see_also)s Examples -------- @@ -3291,16 +3313,17 @@ def tail(self, n=5): 1 a 2 3 b 2 >>> df.groupby('A').tail(-1) - Empty DataFrame - Columns: [A, B] - Index: [] + A B + 1 a 2 + 3 b 2 """ self._reset_group_selection() - mask = self._cumcount_array(ascending=False) < n - if self.axis == 0: - return self._selected_obj[mask] + if n: + mask = self._make_mask(slice(-n, None)) else: - return self._selected_obj.iloc[:, mask] + mask = self._make_mask([]) + + return self._apply_mask(mask) @final def _reindex_output( diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 43b7a88585233..c78a904062234 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -25,106 +25,44 @@ if TYPE_CHECKING: from pandas.core.groupby import groupby -from pandas.core.indexes.api import CategoricalIndex - class GroupByIndexingMixin: """ - Mixin for adding ._rows to GroupBy. + Mixin for adding ._middle to GroupBy. """ - @property - def _rows(self) -> RowsGroupByIndexer: + @cache_readonly + def _middle(self) -> MiddleGroupByIndexer: if TYPE_CHECKING: - groupby_object = cast(groupby.GroupBy, self) - else: - groupby_object = self - - return RowsGroupByIndexer(groupby_object) - - -@doc(GroupByIndexingMixin._rows) -class RowsGroupByIndexer: - def __init__(self, groupby_object: groupby.GroupBy): - self.groupby_object = groupby_object - - def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: - """ - Positional index for selection by integer location per group. - - Used to implement GroupBy._rows which is used to implement GroupBy.nth - in the case when the keyword dropna is None or absent. - - Parameters - ---------- - arg : PositionalIndexer | tuple - Allowed values are: - - int - - int valued iterable such as list or range - - slice with step either None or positive - - tuple of integers and slices + self = cast(groupby.GroupBy, self) - Returns - ------- - Series - The filtered subset of the original groupby Series. - DataFrame - The filtered subset of the original groupby DataFrame. + return MiddleGroupByIndexer(self) - See Also - -------- - DataFrame.iloc : Purely integer-location based indexing for selection by - position. - GroupBy.head : Return first n rows of each group. - GroupBy.tail : Return last n rows of each group. - GroupBy.nth : Take the nth row from each group if n is an int, or a - subset of rows, if n is a list of ints. - """ - with self.groupby_object._group_selection_context(): - if isinstance(arg, tuple): - if all(is_integer(i) for i in arg): - mask = self._handle_list(arg) - else: - mask = self._handle_tuple(arg) - - elif isinstance(arg, slice): - mask = self._handle_slice(arg) - elif is_integer(arg): - mask = self._handle_int(cast(int, arg)) - elif is_list_like(arg): + def _make_mask(self, arg: PositionalIndexer | tuple) -> np.ndarray: + if is_list_like(arg): + if all(is_integer(i) for i in cast(Iterable, arg)): mask = self._handle_list(cast(Iterable[int], arg)) else: - raise TypeError( - f"Invalid index {type(arg)}. 
" - "Must be integer, list-like, slice or a tuple of " - "integers and slices" - ) + mask = self._handle_tuple(cast(tuple, arg)) - if self.groupby_object.dropna: - # Drop NA values in grouping - ids, _, _ = self.groupby_object.grouper.group_info - - mask &= ids != -1 - - if mask is True: - result = self.groupby_object._selected_obj[:] + elif isinstance(arg, slice): + mask = self._handle_slice(arg) + elif is_integer(arg): + mask = self._handle_int(cast(int, arg)) + else: + raise TypeError( + f"Invalid index {type(arg)}. " + "Must be integer, list-like, slice or a tuple of " + "integers and slices" + ) + + if isinstance(mask, bool): + if mask: + mask = self._ascending_count >= 0 else: - result = self.groupby_object._selected_obj[mask] - - if self.groupby_object.as_index: - result_index = self.groupby_object.grouper.result_index - result.index = result_index[ids[mask]] - - if not self.groupby_object.observed and isinstance( - result_index, CategoricalIndex - ): - result = result.reindex(result_index) + mask = self._ascending_count < 0 - result = self.groupby_object._reindex_output(result) - if self.groupby_object.sort: - result = result.sort_index() - - return result + return cast(np.ndarray, mask) def _handle_int(self, arg: int) -> np.ndarray: if arg >= 0: @@ -203,10 +141,67 @@ def _handle_slice(self, arg: slice) -> bool | np.ndarray: return mask + def _apply_mask(self, mask: np.ndarray): + if TYPE_CHECKING: + self = cast(groupby.GroupBy, self) + + if self.axis == 0: + return self._selected_obj[mask] + else: + return self._selected_obj.iloc[:, mask] + @cache_readonly def _ascending_count(self) -> np.ndarray: - return self.groupby_object._cumcount_array() + if TYPE_CHECKING: + self = cast(groupby.GroupBy, self) + + return self._cumcount_array() @cache_readonly def _descending_count(self) -> np.ndarray: - return self.groupby_object._cumcount_array(ascending=False) + if TYPE_CHECKING: + self = cast(groupby.GroupBy, self) + + return self._cumcount_array(ascending=False) + + +@doc(GroupByIndexingMixin._middle) +class MiddleGroupByIndexer: + def __init__(self, groupby_object: groupby.GroupBy): + self.groupby_object = groupby_object + + def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: + """ + Positional index for selection by integer location per group. + + Used to implement GroupBy._middle which is used to implement GroupBy.nth + in the case when the keyword dropna is None or absent. + + Parameters + ---------- + arg : PositionalIndexer | tuple + Allowed values are: + - int + - int valued iterable such as list or range + - slice with step either None or positive + - tuple of integers and slices + + Returns + ------- + Series + The filtered subset of the original groupby Series. + DataFrame + The filtered subset of the original groupby DataFrame. + + See Also + -------- + DataFrame.iloc : Purely integer-location based indexing for selection by + position. + GroupBy.head : Return first n rows of each group. + GroupBy.tail : Return last n rows of each group. + GroupBy.nth : Take the nth row from each group if n is an int, or a + subset of rows, if n is a list of ints. 
+ """ + self.groupby_object._reset_group_selection() + mask = self.groupby_object._make_mask(arg) + return self.groupby_object._apply_mask(mask) diff --git a/pandas/tests/groupby/test_rows.py b/pandas/tests/groupby/test_middle.py similarity index 85% rename from pandas/tests/groupby/test_rows.py rename to pandas/tests/groupby/test_middle.py index 6f7cd2358ed0e..e5414e2f3472f 100644 --- a/pandas/tests/groupby/test_rows.py +++ b/pandas/tests/groupby/test_middle.py @@ -1,4 +1,4 @@ -# Test GroupBy._rows positional grouped indexing GH#42864 +# Test GroupBy._middle positional grouped indexing GH#42864 import random @@ -21,7 +21,7 @@ ) def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): # Test single integer - result = slice_test_grouped._rows[arg] + result = slice_test_grouped._middle[arg] expected = slice_test_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) @@ -29,7 +29,7 @@ def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): def test_slice(slice_test_df, slice_test_grouped): # Test single slice - result = slice_test_grouped._rows[0:3:2] + result = slice_test_grouped._middle[0:3:2] expected = slice_test_df.iloc[[0, 1, 4, 5]] tm.assert_frame_equal(result, expected) @@ -52,7 +52,7 @@ def test_slice(slice_test_df, slice_test_grouped): ) def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): # Test lists of integers and integer valued iterables - result = slice_test_grouped._rows[arg] + result = slice_test_grouped._middle[arg] expected = slice_test_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) @@ -60,7 +60,7 @@ def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): def test_ints(slice_test_df, slice_test_grouped): # Test tuple of ints - result = slice_test_grouped._rows[0, 2, -1] + result = slice_test_grouped._middle[0, 2, -1] expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]] tm.assert_frame_equal(result, expected) @@ -68,7 +68,7 @@ def test_ints(slice_test_df, slice_test_grouped): def test_slices(slice_test_df, slice_test_grouped): # Test tuple of slices - result = slice_test_grouped._rows[:2, -2:] + result = slice_test_grouped._middle[:2, -2:] expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] tm.assert_frame_equal(result, expected) @@ -76,7 +76,7 @@ def test_slices(slice_test_df, slice_test_grouped): def test_mix(slice_test_df, slice_test_grouped): # Test mixed tuple of ints and slices - result = slice_test_grouped._rows[0, 1, -2:] + result = slice_test_grouped._middle[0, 1, -2:] expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] tm.assert_frame_equal(result, expected) @@ -92,8 +92,8 @@ def test_mix(slice_test_df, slice_test_grouped): ) def test_as_index(slice_test_df, arg, expected_rows): # Test the default as_index behaviour - result = slice_test_df.groupby("Group", sort=False)._rows[arg] - expected = slice_test_df.iloc[expected_rows].set_index("Group") + result = slice_test_df.groupby("Group", sort=False)._middle[arg] + expected = slice_test_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) @@ -106,12 +106,12 @@ def test_doc_examples(): grouped = df.groupby("A", as_index=False) - result = grouped._rows[1:2] + result = grouped._middle[1:2] expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4]) tm.assert_frame_equal(result, expected) - result = grouped._rows[1, -1] + result = grouped._middle[1, -1] expected = pd.DataFrame( [["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4] ) @@ -183,7 +183,7 @@ def test_against_head_and_tail(arg, method, 
simulated): grouped = df.groupby("group", as_index=False) if method == "head": - result = grouped._rows[:arg] + result = grouped._middle[:arg] if simulated: indices = [] @@ -198,7 +198,7 @@ def test_against_head_and_tail(arg, method, simulated): expected = grouped.head(arg) else: - result = grouped._rows[-arg:] + result = grouped._middle[-arg:] if simulated: indices = [] @@ -229,7 +229,7 @@ def test_against_df_iloc(start, stop, step): df = pd.DataFrame(data) grouped = df.groupby("group", as_index=False) - result = grouped._rows[start:stop:step] + result = grouped._middle[start:stop:step] expected = df.iloc[start:stop:step] tm.assert_frame_equal(result, expected) @@ -239,7 +239,7 @@ def test_series(): # Test grouped Series ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"]) grouped = ser.groupby(level=0) - result = grouped._rows[1:2] + result = grouped._middle[1:2] expected = pd.Series([2, 5], index=["a", "b"]) tm.assert_series_equal(result, expected) @@ -255,7 +255,7 @@ def test_step(step): grouped = df.groupby("A", as_index=False) - result = grouped._rows[::step] + result = grouped._middle[::step] data = [["x", f"x{i}"] for i in range(0, 5, step)] data += [["y", f"y{i}"] for i in range(0, 4, step)] @@ -268,3 +268,19 @@ def test_step(step): expected = pd.DataFrame(data, columns=["A", "B"], index=index) tm.assert_frame_equal(result, expected) + + +@pytest.fixture() +def column_group_df(): + return pd.DataFrame( + [[0, 1, 2, 3, 4, 5, 6], [0, 0, 1, 0, 1, 0, 2]], + columns=["A", "B", "C", "D", "E", "F", "G"], + ) + + +def test_column_axis(column_group_df): + g = column_group_df.groupby(column_group_df.iloc[1], axis=1) + result = g._middle[1:-1] + expected = column_group_df.iloc[:, [1, 3]] + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index ad9b16583def7..d923a4edb5a0b 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -517,11 +517,11 @@ def test_nth_multi_index_as_expected(): @pytest.mark.parametrize( "op, n, expected_rows", [ - ("head", -1, []), + ("head", -1, [0]), ("head", 0, []), ("head", 1, [0, 2]), ("head", 7, [0, 1, 2]), - ("tail", -1, []), + ("tail", -1, [1]), ("tail", 0, []), ("tail", 1, [1, 2]), ("tail", 7, [0, 1, 2]), @@ -543,11 +543,11 @@ def test_groupby_head_tail(op, n, expected_rows, columns, as_index): @pytest.mark.parametrize( "op, n, expected_cols", [ - ("head", -1, []), + ("head", -1, [0]), ("head", 0, []), ("head", 1, [0, 2]), ("head", 7, [0, 1, 2]), - ("tail", -1, []), + ("tail", -1, [1]), ("tail", 0, []), ("tail", 1, [1, 2]), ("tail", 7, [0, 1, 2]), From 9412e3eca764c0285e5c6614a606b3a608d1e40f Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 21 Sep 2021 08:33:35 +0100 Subject: [PATCH 53/73] Change _middle to _body --- pandas/core/groupby/indexing.py | 8 ++--- .../groupby/{test_middle.py => test_body.py} | 32 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) rename pandas/tests/groupby/{test_middle.py => test_body.py} (91%) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index c78a904062234..dd7a58539aaa3 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -28,11 +28,11 @@ class GroupByIndexingMixin: """ - Mixin for adding ._middle to GroupBy. + Mixin for adding ._body to GroupBy. 
""" @cache_readonly - def _middle(self) -> MiddleGroupByIndexer: + def _body(self) -> MiddleGroupByIndexer: if TYPE_CHECKING: self = cast(groupby.GroupBy, self) @@ -165,7 +165,7 @@ def _descending_count(self) -> np.ndarray: return self._cumcount_array(ascending=False) -@doc(GroupByIndexingMixin._middle) +@doc(GroupByIndexingMixin._body) class MiddleGroupByIndexer: def __init__(self, groupby_object: groupby.GroupBy): self.groupby_object = groupby_object @@ -174,7 +174,7 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: """ Positional index for selection by integer location per group. - Used to implement GroupBy._middle which is used to implement GroupBy.nth + Used to implement GroupBy._body which is used to implement GroupBy.nth in the case when the keyword dropna is None or absent. Parameters diff --git a/pandas/tests/groupby/test_middle.py b/pandas/tests/groupby/test_body.py similarity index 91% rename from pandas/tests/groupby/test_middle.py rename to pandas/tests/groupby/test_body.py index e5414e2f3472f..b2a61237975f3 100644 --- a/pandas/tests/groupby/test_middle.py +++ b/pandas/tests/groupby/test_body.py @@ -1,4 +1,4 @@ -# Test GroupBy._middle positional grouped indexing GH#42864 +# Test GroupBy._body positional grouped indexing GH#42864 import random @@ -21,7 +21,7 @@ ) def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): # Test single integer - result = slice_test_grouped._middle[arg] + result = slice_test_grouped._body[arg] expected = slice_test_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) @@ -29,7 +29,7 @@ def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): def test_slice(slice_test_df, slice_test_grouped): # Test single slice - result = slice_test_grouped._middle[0:3:2] + result = slice_test_grouped._body[0:3:2] expected = slice_test_df.iloc[[0, 1, 4, 5]] tm.assert_frame_equal(result, expected) @@ -52,7 +52,7 @@ def test_slice(slice_test_df, slice_test_grouped): ) def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): # Test lists of integers and integer valued iterables - result = slice_test_grouped._middle[arg] + result = slice_test_grouped._body[arg] expected = slice_test_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) @@ -60,7 +60,7 @@ def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): def test_ints(slice_test_df, slice_test_grouped): # Test tuple of ints - result = slice_test_grouped._middle[0, 2, -1] + result = slice_test_grouped._body[0, 2, -1] expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]] tm.assert_frame_equal(result, expected) @@ -68,7 +68,7 @@ def test_ints(slice_test_df, slice_test_grouped): def test_slices(slice_test_df, slice_test_grouped): # Test tuple of slices - result = slice_test_grouped._middle[:2, -2:] + result = slice_test_grouped._body[:2, -2:] expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] tm.assert_frame_equal(result, expected) @@ -76,7 +76,7 @@ def test_slices(slice_test_df, slice_test_grouped): def test_mix(slice_test_df, slice_test_grouped): # Test mixed tuple of ints and slices - result = slice_test_grouped._middle[0, 1, -2:] + result = slice_test_grouped._body[0, 1, -2:] expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] tm.assert_frame_equal(result, expected) @@ -92,7 +92,7 @@ def test_mix(slice_test_df, slice_test_grouped): ) def test_as_index(slice_test_df, arg, expected_rows): # Test the default as_index behaviour - result = slice_test_df.groupby("Group", sort=False)._middle[arg] + result = 
slice_test_df.groupby("Group", sort=False)._body[arg] expected = slice_test_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) @@ -106,12 +106,12 @@ def test_doc_examples(): grouped = df.groupby("A", as_index=False) - result = grouped._middle[1:2] + result = grouped._body[1:2] expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4]) tm.assert_frame_equal(result, expected) - result = grouped._middle[1, -1] + result = grouped._body[1, -1] expected = pd.DataFrame( [["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4] ) @@ -183,7 +183,7 @@ def test_against_head_and_tail(arg, method, simulated): grouped = df.groupby("group", as_index=False) if method == "head": - result = grouped._middle[:arg] + result = grouped._body[:arg] if simulated: indices = [] @@ -198,7 +198,7 @@ def test_against_head_and_tail(arg, method, simulated): expected = grouped.head(arg) else: - result = grouped._middle[-arg:] + result = grouped._body[-arg:] if simulated: indices = [] @@ -229,7 +229,7 @@ def test_against_df_iloc(start, stop, step): df = pd.DataFrame(data) grouped = df.groupby("group", as_index=False) - result = grouped._middle[start:stop:step] + result = grouped._body[start:stop:step] expected = df.iloc[start:stop:step] tm.assert_frame_equal(result, expected) @@ -239,7 +239,7 @@ def test_series(): # Test grouped Series ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"]) grouped = ser.groupby(level=0) - result = grouped._middle[1:2] + result = grouped._body[1:2] expected = pd.Series([2, 5], index=["a", "b"]) tm.assert_series_equal(result, expected) @@ -255,7 +255,7 @@ def test_step(step): grouped = df.groupby("A", as_index=False) - result = grouped._middle[::step] + result = grouped._body[::step] data = [["x", f"x{i}"] for i in range(0, 5, step)] data += [["y", f"y{i}"] for i in range(0, 4, step)] @@ -280,7 +280,7 @@ def column_group_df(): def test_column_axis(column_group_df): g = column_group_df.groupby(column_group_df.iloc[1], axis=1) - result = g._middle[1:-1] + result = g._body[1:-1] expected = column_group_df.iloc[:, [1, 3]] tm.assert_frame_equal(result, expected) From ea45bc671713bfb9cf7690bdeb751aad3c7d7a88 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 21 Sep 2021 08:59:20 +0100 Subject: [PATCH 54/73] Change class name to match --- pandas/core/groupby/indexing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index dd7a58539aaa3..b0a4846bd2c2b 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -32,11 +32,11 @@ class GroupByIndexingMixin: """ @cache_readonly - def _body(self) -> MiddleGroupByIndexer: + def _body(self) -> BodyGroupByIndexer: if TYPE_CHECKING: self = cast(groupby.GroupBy, self) - return MiddleGroupByIndexer(self) + return BodyGroupByIndexer(self) def _make_mask(self, arg: PositionalIndexer | tuple) -> np.ndarray: if is_list_like(arg): @@ -166,7 +166,7 @@ def _descending_count(self) -> np.ndarray: @doc(GroupByIndexingMixin._body) -class MiddleGroupByIndexer: +class BodyGroupByIndexer: def __init__(self, groupby_object: groupby.GroupBy): self.groupby_object = groupby_object From 19edf00aad501128d354ce14ce8ae0552cf1af78 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 22 Sep 2021 13:46:49 +0100 Subject: [PATCH 55/73] Add negative values to test_body.py/test_against_head_and_tail() --- pandas/tests/groupby/test_body.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git 
a/pandas/tests/groupby/test_body.py b/pandas/tests/groupby/test_body.py index b2a61237975f3..fa63b863e4995 100644 --- a/pandas/tests/groupby/test_body.py +++ b/pandas/tests/groupby/test_body.py @@ -161,7 +161,7 @@ def test_multiindex(multiindex_data): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("arg", [1, 5, 30, 1000]) +@pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000]) @pytest.mark.parametrize("method", ["head", "tail"]) @pytest.mark.parametrize("simulated", [True, False]) def test_against_head_and_tail(arg, method, simulated): @@ -181,13 +181,14 @@ def test_against_head_and_tail(arg, method, simulated): } df = pd.DataFrame(data) grouped = df.groupby("group", as_index=False) + size = arg if arg >= 0 else n_rows_per_group + arg if method == "head": result = grouped._body[:arg] if simulated: indices = [] - for j in range(arg): + for j in range(size): for i in range(n_groups): if j * n_groups + i < n_groups * n_rows_per_group: indices.append(j * n_groups + i) @@ -202,10 +203,10 @@ def test_against_head_and_tail(arg, method, simulated): if simulated: indices = [] - for j in range(arg): + for j in range(size): for i in range(n_groups): - if (n_rows_per_group + j - arg) * n_groups + i >= 0: - indices.append((n_rows_per_group + j - arg) * n_groups + i) + if (n_rows_per_group + j - size) * n_groups + i >= 0: + indices.append((n_rows_per_group + j - size) * n_groups + i) expected = df.iloc[indices] From ae2105908fbc6d74b4f35468ed619355027a5075 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 25 Sep 2021 17:39:42 +0100 Subject: [PATCH 56/73] Add _body docstring --- pandas/core/groupby/indexing.py | 82 +++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index b0a4846bd2c2b..0a808379d1b16 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -33,6 +33,79 @@ class GroupByIndexingMixin: @cache_readonly def _body(self) -> BodyGroupByIndexer: + """ + Return positional selection for each group. + + ``groupby._body[i:j]`` is similar to + ``groupby.apply(lambda x: x.iloc[i:j])`` + but much faster and preserves the original index and order. + + ``_body[]`` is compatible with and extends :meth:`~GroupBy.head` and + :meth:`~GroupBy.tail`. For example: + + - ``head(5)`` + - ``_body[5:-5]`` + - ``tail(5)`` + + together return all the rows. + + Allowed inputs for the index are: + + - An integer valued iterable, e.g. ``range(2, 4)``. + - A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``. + + The output format is the same as :meth:`~GroupBy.head` and + :meth:`~GroupBy.tail`, namely + a subset of the ``DataFrame`` or ``Series`` with the index and order preserved. + + Returns + ------- + Series + The filtered subset of the original Series. + DataFrame + The filtered subset of the original DataFrame. + + See Also + -------- + DataFrame.iloc : Purely integer-location based indexing for selection by + position. + GroupBy.head : Return first n rows of each group. + GroupBy.tail : Return last n rows of each group. + GroupBy.nth : Take the nth row from each group if n is an int, or a + subset of rows, if n is a list of ints. + + Notes + ----- + - The slice step cannot be negative. + - If the index specification results in overlaps, the item is not duplicated. + - If the index specification changes the order of items, then + they are returned in their original order. 
+ By contrast, ``DataFrame.iloc`` can change the row order. + - ``groupby()`` parameters such as as_index and dropna are ignored. + + The differences between ``_body`` and ``nth`` with ``as_index=False`` are: + + - Input to ``_body`` can include one or more slices whereas ``nth`` just handles + an integer or a list of integers. + - ``_body`` can accept a slice relative to the last row of each group. + - ``GroupBy._body`` does not have an equivalent to the ``nth`` ``dropna`` + parameter. + + Examples + -------- + >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], + ... columns=["A", "B"]) + >>> df.groupby("A", as_index=False)._body[1:2] + A B + 1 a 2 + 4 b 5 + + >>> df.groupby("A", as_index=False)._body[1, -1] + A B + 1 a 2 + 2 a 3 + 4 b 5 + """ if TYPE_CHECKING: self = cast(groupby.GroupBy, self) @@ -172,10 +245,9 @@ def __init__(self, groupby_object: groupby.GroupBy): def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: """ - Positional index for selection by integer location per group. + Select by positional index per group. - Used to implement GroupBy._body which is used to implement GroupBy.nth - in the case when the keyword dropna is None or absent. + Implements GroupBy._body Parameters ---------- @@ -195,10 +267,10 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: See Also -------- - DataFrame.iloc : Purely integer-location based indexing for selection by - position. + DataFrame.iloc : Integer-location based indexing for selection by position. GroupBy.head : Return first n rows of each group. GroupBy.tail : Return last n rows of each group. + GroupBy._body : Return positional selection for each group. GroupBy.nth : Take the nth row from each group if n is an int, or a subset of rows, if n is a list of ints. """ From 6ce90c41a8f1363daca12dae9e5a53c00076dd22 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 25 Sep 2021 19:56:55 +0100 Subject: [PATCH 57/73] Make nth a link --- pandas/core/groupby/indexing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 0a808379d1b16..a6f4b8ed6fe45 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -83,7 +83,8 @@ def _body(self) -> BodyGroupByIndexer: By contrast, ``DataFrame.iloc`` can change the row order. - ``groupby()`` parameters such as as_index and dropna are ignored. - The differences between ``_body`` and ``nth`` with ``as_index=False`` are: + The differences between ``_body[]`` and :meth:`~GroupBy.nth` + with ``as_index=False`` are: - Input to ``_body`` can include one or more slices whereas ``nth`` just handles an integer or a list of integers. From 4f6cbe1c24d70dcb279e121334c7b1ecd3164c50 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 26 Sep 2021 09:09:49 +0100 Subject: [PATCH 58/73] Improve doc --- pandas/core/groupby/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index a6f4b8ed6fe45..dd580b70b3205 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -89,7 +89,7 @@ def _body(self) -> BodyGroupByIndexer: - Input to ``_body`` can include one or more slices whereas ``nth`` just handles an integer or a list of integers. - ``_body`` can accept a slice relative to the last row of each group. 
- - ``GroupBy._body`` does not have an equivalent to the ``nth`` ``dropna`` + - ``_body`` does not have an equivalent to the ``nth()`` ``dropna`` parameter. Examples From 19b21bb4ab498bcdeee1f7001d588abc02f8fbf6 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 26 Sep 2021 12:42:30 +0100 Subject: [PATCH 59/73] Simplify examples --- pandas/core/groupby/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index dd580b70b3205..008f72c2b8cf3 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -96,12 +96,12 @@ def _body(self) -> BodyGroupByIndexer: -------- >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], ... columns=["A", "B"]) - >>> df.groupby("A", as_index=False)._body[1:2] + >>> df.groupby("A")._body[1:2] A B 1 a 2 4 b 5 - >>> df.groupby("A", as_index=False)._body[1, -1] + >>> df.groupby("A")._body[1, -1] A B 1 a 2 2 a 3 From ca164cf46e133c1a6f961ff8669ce11d437d341b Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 26 Sep 2021 17:04:17 +0100 Subject: [PATCH 60/73] Fix FrameOrSeries typing problem --- pandas/core/groupby/indexing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 008f72c2b8cf3..fb8dba38cb892 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -9,7 +9,7 @@ import numpy as np from pandas._typing import ( - FrameOrSeries, + FrameOrSeriesUnion, PositionalIndexer, ) from pandas.util._decorators import ( @@ -215,7 +215,7 @@ def _handle_slice(self, arg: slice) -> bool | np.ndarray: return mask - def _apply_mask(self, mask: np.ndarray): + def _apply_mask(self, mask: np.ndarray) -> FrameOrSeriesUnion: if TYPE_CHECKING: self = cast(groupby.GroupBy, self) @@ -244,7 +244,7 @@ class BodyGroupByIndexer: def __init__(self, groupby_object: groupby.GroupBy): self.groupby_object = groupby_object - def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeries: + def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeriesUnion: """ Select by positional index per group. From 337b15ce4a0a1571c91163d791778a17c9aabd96 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 26 Sep 2021 17:26:34 +0100 Subject: [PATCH 61/73] Fix more new typing problems --- pandas/core/groupby/indexing.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index fb8dba38cb892..0a62bc412c3ac 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -9,8 +9,9 @@ import numpy as np from pandas._typing import ( - FrameOrSeriesUnion, + DataFrame, PositionalIndexer, + Series, ) from pandas.util._decorators import ( cache_readonly, @@ -215,7 +216,7 @@ def _handle_slice(self, arg: slice) -> bool | np.ndarray: return mask - def _apply_mask(self, mask: np.ndarray) -> FrameOrSeriesUnion: + def _apply_mask(self, mask: np.ndarray) -> DataFrame | Series: if TYPE_CHECKING: self = cast(groupby.GroupBy, self) @@ -244,7 +245,7 @@ class BodyGroupByIndexer: def __init__(self, groupby_object: groupby.GroupBy): self.groupby_object = groupby_object - def __getitem__(self, arg: PositionalIndexer | tuple) -> FrameOrSeriesUnion: + def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: """ Select by positional index per group. 
From 69d895693d684eceb9bf2d578a1eb7f4b4f12e62 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 26 Sep 2021 19:05:35 +0100 Subject: [PATCH 62/73] More typing problems --- pandas/core/groupby/indexing.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 0a62bc412c3ac..8a4a598b88c66 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -8,11 +8,7 @@ import numpy as np -from pandas._typing import ( - DataFrame, - PositionalIndexer, - Series, -) +from pandas._typing import PositionalIndexer from pandas.util._decorators import ( cache_readonly, doc, @@ -23,6 +19,11 @@ is_list_like, ) +from pandas import ( + DataFrame, + Series, +) + if TYPE_CHECKING: from pandas.core.groupby import groupby From 98a94606f25c39696c38f8ba6eeb3c07bbab537e Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 26 Sep 2021 19:19:26 +0100 Subject: [PATCH 63/73] More typing woes --- pandas/core/groupby/indexing.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 8a4a598b88c66..66ed85ef20b44 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -19,12 +19,11 @@ is_list_like, ) -from pandas import ( - DataFrame, - Series, -) - if TYPE_CHECKING: + from pandas import ( + DataFrame, + Series, + ) from pandas.core.groupby import groupby From d0e9aa02ca0e2c3032026a262d0d879a4eea3640 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 28 Sep 2021 13:45:07 +0100 Subject: [PATCH 64/73] Create test_body.py --- test_body.py | 287 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 287 insertions(+) create mode 100644 test_body.py diff --git a/test_body.py b/test_body.py new file mode 100644 index 0000000000000..fa63b863e4995 --- /dev/null +++ b/test_body.py @@ -0,0 +1,287 @@ +# Test GroupBy._body positional grouped indexing GH#42864 + +import random + +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "arg, expected_rows", + [ + [0, [0, 1, 4]], + [2, [5]], + [5, []], + [-1, [3, 4, 7]], + [-2, [1, 6]], + [-6, []], + ], +) +def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): + # Test single integer + result = slice_test_grouped._body[arg] + expected = slice_test_df.iloc[expected_rows] + + tm.assert_frame_equal(result, expected) + + +def test_slice(slice_test_df, slice_test_grouped): + # Test single slice + result = slice_test_grouped._body[0:3:2] + expected = slice_test_df.iloc[[0, 1, 4, 5]] + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "arg, expected_rows", + [ + [[0, 2], [0, 1, 4, 5]], + [[0, 2, -1], [0, 1, 3, 4, 5, 7]], + [range(0, 3, 2), [0, 1, 4, 5]], + [{0, 2}, [0, 1, 4, 5]], + ], + ids=[ + "list", + "negative", + "range", + "set", + ], +) +def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): + # Test lists of integers and integer valued iterables + result = slice_test_grouped._body[arg] + expected = slice_test_df.iloc[expected_rows] + + tm.assert_frame_equal(result, expected) + + +def test_ints(slice_test_df, slice_test_grouped): + # Test tuple of ints + result = slice_test_grouped._body[0, 2, -1] + expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]] + + tm.assert_frame_equal(result, expected) + + +def test_slices(slice_test_df, slice_test_grouped): + # Test tuple of slices + result = slice_test_grouped._body[:2, -2:] + expected = 
slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] + + tm.assert_frame_equal(result, expected) + + +def test_mix(slice_test_df, slice_test_grouped): + # Test mixed tuple of ints and slices + result = slice_test_grouped._body[0, 1, -2:] + expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "arg, expected_rows", + [ + [0, [0, 1, 4]], + [[0, 2, -1], [0, 1, 3, 4, 5, 7]], + [(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]], + ], +) +def test_as_index(slice_test_df, arg, expected_rows): + # Test the default as_index behaviour + result = slice_test_df.groupby("Group", sort=False)._body[arg] + expected = slice_test_df.iloc[expected_rows] + + tm.assert_frame_equal(result, expected) + + +def test_doc_examples(): + # Test the examples in the documentation + df = pd.DataFrame( + [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"] + ) + + grouped = df.groupby("A", as_index=False) + + result = grouped._body[1:2] + expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4]) + + tm.assert_frame_equal(result, expected) + + result = grouped._body[1, -1] + expected = pd.DataFrame( + [["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4] + ) + + tm.assert_frame_equal(result, expected) + + +@pytest.fixture() +def multiindex_data(): + ndates = 100 + nitems = 20 + dates = pd.date_range("20130101", periods=ndates, freq="D") + items = [f"item {i}" for i in range(nitems)] + + data = {} + for date in dates: + nitems_for_date = nitems - random.randint(0, 12) + levels = [ + (item, random.randint(0, 10000) / 100, random.randint(0, 10000) / 100) + for item in items[:nitems_for_date] + ] + levels.sort(key=lambda x: x[1]) + data[date] = levels + + return data + + +def _make_df_from_data(data): + rows = {} + for date in data: + for level in data[date]: + rows[(date, level[0])] = {"A": level[1], "B": level[2]} + + df = pd.DataFrame.from_dict(rows, orient="index") + df.index.names = ("Date", "Item") + return df + + +def test_multiindex(multiindex_data): + # Test the multiindex mentioned as the use-case in the documentation + df = _make_df_from_data(multiindex_data) + result = df.groupby("Date", as_index=False).nth(slice(3, -3)) + + sliced = {date: multiindex_data[date][3:-3] for date in multiindex_data} + expected = _make_df_from_data(sliced) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000]) +@pytest.mark.parametrize("method", ["head", "tail"]) +@pytest.mark.parametrize("simulated", [True, False]) +def test_against_head_and_tail(arg, method, simulated): + # Test gives the same results as grouped head and tail + n_groups = 100 + n_rows_per_group = 30 + + data = { + "group": [ + f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups) + ], + "value": [ + f"group {g} row {j}" + for j in range(n_rows_per_group) + for g in range(n_groups) + ], + } + df = pd.DataFrame(data) + grouped = df.groupby("group", as_index=False) + size = arg if arg >= 0 else n_rows_per_group + arg + + if method == "head": + result = grouped._body[:arg] + + if simulated: + indices = [] + for j in range(size): + for i in range(n_groups): + if j * n_groups + i < n_groups * n_rows_per_group: + indices.append(j * n_groups + i) + + expected = df.iloc[indices] + + else: + expected = grouped.head(arg) + + else: + result = grouped._body[-arg:] + + if simulated: + indices = [] + for j in range(size): + for i in range(n_groups): + if (n_rows_per_group + j - 
size) * n_groups + i >= 0: + indices.append((n_rows_per_group + j - size) * n_groups + i) + + expected = df.iloc[indices] + + else: + expected = grouped.tail(arg) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10]) +@pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10]) +@pytest.mark.parametrize("step", [None, 1, 5]) +def test_against_df_iloc(start, stop, step): + # Test that a single group gives the same results as DataFame.iloc + n_rows = 30 + + data = { + "group": ["group 0"] * n_rows, + "value": list(range(n_rows)), + } + df = pd.DataFrame(data) + grouped = df.groupby("group", as_index=False) + + result = grouped._body[start:stop:step] + expected = df.iloc[start:stop:step] + + tm.assert_frame_equal(result, expected) + + +def test_series(): + # Test grouped Series + ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"]) + grouped = ser.groupby(level=0) + result = grouped._body[1:2] + expected = pd.Series([2, 5], index=["a", "b"]) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("step", [1, 2, 3, 4, 5]) +def test_step(step): + # Test slice with various step values + data = [["x", f"x{i}"] for i in range(5)] + data += [["y", f"y{i}"] for i in range(4)] + data += [["z", f"z{i}"] for i in range(3)] + df = pd.DataFrame(data, columns=["A", "B"]) + + grouped = df.groupby("A", as_index=False) + + result = grouped._body[::step] + + data = [["x", f"x{i}"] for i in range(0, 5, step)] + data += [["y", f"y{i}"] for i in range(0, 4, step)] + data += [["z", f"z{i}"] for i in range(0, 3, step)] + + index = [0 + i for i in range(0, 5, step)] + index += [5 + i for i in range(0, 4, step)] + index += [9 + i for i in range(0, 3, step)] + + expected = pd.DataFrame(data, columns=["A", "B"], index=index) + + tm.assert_frame_equal(result, expected) + + +@pytest.fixture() +def column_group_df(): + return pd.DataFrame( + [[0, 1, 2, 3, 4, 5, 6], [0, 0, 1, 0, 1, 0, 2]], + columns=["A", "B", "C", "D", "E", "F", "G"], + ) + + +def test_column_axis(column_group_df): + g = column_group_df.groupby(column_group_df.iloc[1], axis=1) + result = g._body[1:-1] + expected = column_group_df.iloc[:, [1, 3]] + + tm.assert_frame_equal(result, expected) From a3db969bdcae24a95d0aed3aee8df52cfdf9ed23 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 28 Sep 2021 14:57:21 +0100 Subject: [PATCH 65/73] Resolve conflicts --- pandas/core/groupby/groupby.py | 6 +++--- pandas/core/groupby/indexing.py | 28 ++++++++++++++++++---------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 53065ed75506f..700c617632644 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -45,8 +45,8 @@ class providing the base-class of operations. from pandas._typing import ( ArrayLike, IndexLabel, - PositionalIndexer, NDFrameT, + PositionalIndexer, RandomState, Scalar, T, @@ -558,7 +558,7 @@ def f(self): ] -class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT]): +class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): _group_selection: IndexLabel | None = None _apply_allowlist: frozenset[str] = frozenset() _hidden_attrs = PandasObject._hidden_attrs | { @@ -2448,7 +2448,7 @@ def nth( self, n: PositionalIndexer | tuple, dropna: Literal["any", "all", None] = None, - ) -> FrameOrSeries: + ) -> NDFrameT: """ Take the nth row from each group if n is an int, otherwise a subset of rows. 
diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 66ed85ef20b44..30cb1d845a71d 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -109,9 +109,11 @@ def _body(self) -> BodyGroupByIndexer: 4 b 5 """ if TYPE_CHECKING: - self = cast(groupby.GroupBy, self) + groupby = cast(groupby.GroupBy, self) + else: + groupby = self - return BodyGroupByIndexer(self) + return BodyGroupByIndexer(groupby) def _make_mask(self, arg: PositionalIndexer | tuple) -> np.ndarray: if is_list_like(arg): @@ -218,26 +220,32 @@ def _handle_slice(self, arg: slice) -> bool | np.ndarray: def _apply_mask(self, mask: np.ndarray) -> DataFrame | Series: if TYPE_CHECKING: - self = cast(groupby.GroupBy, self) + groupby = cast(groupby.GroupBy, self) + else: + groupby = self - if self.axis == 0: - return self._selected_obj[mask] + if groupby.axis == 0: + return groupby._selected_obj[mask] else: - return self._selected_obj.iloc[:, mask] + return groupby._selected_obj.iloc[:, mask] @cache_readonly def _ascending_count(self) -> np.ndarray: if TYPE_CHECKING: - self = cast(groupby.GroupBy, self) + groupby = cast(groupby.GroupBy, self) + else: + groupby = self - return self._cumcount_array() + return groupby._cumcount_array() @cache_readonly def _descending_count(self) -> np.ndarray: if TYPE_CHECKING: - self = cast(groupby.GroupBy, self) + groupby = cast(groupby.GroupBy, self) + else: + groupby = self - return self._cumcount_array(ascending=False) + return groupby._cumcount_array(ascending=False) @doc(GroupByIndexingMixin._body) From 4c4ba92999b4866f349242098925ed5f1d5a8ae2 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 28 Sep 2021 15:13:59 +0100 Subject: [PATCH 66/73] Avoid groupby name clash --- pandas/core/groupby/indexing.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 30cb1d845a71d..44273c09b19d1 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -109,11 +109,11 @@ def _body(self) -> BodyGroupByIndexer: 4 b 5 """ if TYPE_CHECKING: - groupby = cast(groupby.GroupBy, self) + groupby_self = cast(groupby.GroupBy, self) else: - groupby = self + groupby_self = self - return BodyGroupByIndexer(groupby) + return BodyGroupByIndexer(groupby_self) def _make_mask(self, arg: PositionalIndexer | tuple) -> np.ndarray: if is_list_like(arg): @@ -220,32 +220,32 @@ def _handle_slice(self, arg: slice) -> bool | np.ndarray: def _apply_mask(self, mask: np.ndarray) -> DataFrame | Series: if TYPE_CHECKING: - groupby = cast(groupby.GroupBy, self) + groupby_self = cast(groupby.GroupBy, self) else: - groupby = self + groupby_self = self - if groupby.axis == 0: - return groupby._selected_obj[mask] + if groupby_self.axis == 0: + return groupby_self._selected_obj[mask] else: - return groupby._selected_obj.iloc[:, mask] + return groupby_self._selected_obj.iloc[:, mask] @cache_readonly def _ascending_count(self) -> np.ndarray: if TYPE_CHECKING: - groupby = cast(groupby.GroupBy, self) + groupby_self = cast(groupby.GroupBy, self) else: - groupby = self + groupby_self = self - return groupby._cumcount_array() + return groupby_self._cumcount_array() @cache_readonly def _descending_count(self) -> np.ndarray: if TYPE_CHECKING: - groupby = cast(groupby.GroupBy, self) + groupby_self = cast(groupby.GroupBy, self) else: - groupby = self + groupby_self = self - return groupby._cumcount_array(ascending=False) + return 
groupby_self._cumcount_array(ascending=False) @doc(GroupByIndexingMixin._body) From acf67b17e11b20338f7bd4a8a5f5fc8f979dc0fb Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 29 Sep 2021 08:56:28 +0100 Subject: [PATCH 67/73] Delete duplicated test_body.py --- test_body.py | 287 --------------------------------------------------- 1 file changed, 287 deletions(-) delete mode 100644 test_body.py diff --git a/test_body.py b/test_body.py deleted file mode 100644 index fa63b863e4995..0000000000000 --- a/test_body.py +++ /dev/null @@ -1,287 +0,0 @@ -# Test GroupBy._body positional grouped indexing GH#42864 - -import random - -import pytest - -import pandas as pd -import pandas._testing as tm - - -@pytest.mark.parametrize( - "arg, expected_rows", - [ - [0, [0, 1, 4]], - [2, [5]], - [5, []], - [-1, [3, 4, 7]], - [-2, [1, 6]], - [-6, []], - ], -) -def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): - # Test single integer - result = slice_test_grouped._body[arg] - expected = slice_test_df.iloc[expected_rows] - - tm.assert_frame_equal(result, expected) - - -def test_slice(slice_test_df, slice_test_grouped): - # Test single slice - result = slice_test_grouped._body[0:3:2] - expected = slice_test_df.iloc[[0, 1, 4, 5]] - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "arg, expected_rows", - [ - [[0, 2], [0, 1, 4, 5]], - [[0, 2, -1], [0, 1, 3, 4, 5, 7]], - [range(0, 3, 2), [0, 1, 4, 5]], - [{0, 2}, [0, 1, 4, 5]], - ], - ids=[ - "list", - "negative", - "range", - "set", - ], -) -def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): - # Test lists of integers and integer valued iterables - result = slice_test_grouped._body[arg] - expected = slice_test_df.iloc[expected_rows] - - tm.assert_frame_equal(result, expected) - - -def test_ints(slice_test_df, slice_test_grouped): - # Test tuple of ints - result = slice_test_grouped._body[0, 2, -1] - expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]] - - tm.assert_frame_equal(result, expected) - - -def test_slices(slice_test_df, slice_test_grouped): - # Test tuple of slices - result = slice_test_grouped._body[:2, -2:] - expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] - - tm.assert_frame_equal(result, expected) - - -def test_mix(slice_test_df, slice_test_grouped): - # Test mixed tuple of ints and slices - result = slice_test_grouped._body[0, 1, -2:] - expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "arg, expected_rows", - [ - [0, [0, 1, 4]], - [[0, 2, -1], [0, 1, 3, 4, 5, 7]], - [(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]], - ], -) -def test_as_index(slice_test_df, arg, expected_rows): - # Test the default as_index behaviour - result = slice_test_df.groupby("Group", sort=False)._body[arg] - expected = slice_test_df.iloc[expected_rows] - - tm.assert_frame_equal(result, expected) - - -def test_doc_examples(): - # Test the examples in the documentation - df = pd.DataFrame( - [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"] - ) - - grouped = df.groupby("A", as_index=False) - - result = grouped._body[1:2] - expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4]) - - tm.assert_frame_equal(result, expected) - - result = grouped._body[1, -1] - expected = pd.DataFrame( - [["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4] - ) - - tm.assert_frame_equal(result, expected) - - -@pytest.fixture() -def multiindex_data(): - ndates = 100 - nitems = 20 - dates 
= pd.date_range("20130101", periods=ndates, freq="D") - items = [f"item {i}" for i in range(nitems)] - - data = {} - for date in dates: - nitems_for_date = nitems - random.randint(0, 12) - levels = [ - (item, random.randint(0, 10000) / 100, random.randint(0, 10000) / 100) - for item in items[:nitems_for_date] - ] - levels.sort(key=lambda x: x[1]) - data[date] = levels - - return data - - -def _make_df_from_data(data): - rows = {} - for date in data: - for level in data[date]: - rows[(date, level[0])] = {"A": level[1], "B": level[2]} - - df = pd.DataFrame.from_dict(rows, orient="index") - df.index.names = ("Date", "Item") - return df - - -def test_multiindex(multiindex_data): - # Test the multiindex mentioned as the use-case in the documentation - df = _make_df_from_data(multiindex_data) - result = df.groupby("Date", as_index=False).nth(slice(3, -3)) - - sliced = {date: multiindex_data[date][3:-3] for date in multiindex_data} - expected = _make_df_from_data(sliced) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000]) -@pytest.mark.parametrize("method", ["head", "tail"]) -@pytest.mark.parametrize("simulated", [True, False]) -def test_against_head_and_tail(arg, method, simulated): - # Test gives the same results as grouped head and tail - n_groups = 100 - n_rows_per_group = 30 - - data = { - "group": [ - f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups) - ], - "value": [ - f"group {g} row {j}" - for j in range(n_rows_per_group) - for g in range(n_groups) - ], - } - df = pd.DataFrame(data) - grouped = df.groupby("group", as_index=False) - size = arg if arg >= 0 else n_rows_per_group + arg - - if method == "head": - result = grouped._body[:arg] - - if simulated: - indices = [] - for j in range(size): - for i in range(n_groups): - if j * n_groups + i < n_groups * n_rows_per_group: - indices.append(j * n_groups + i) - - expected = df.iloc[indices] - - else: - expected = grouped.head(arg) - - else: - result = grouped._body[-arg:] - - if simulated: - indices = [] - for j in range(size): - for i in range(n_groups): - if (n_rows_per_group + j - size) * n_groups + i >= 0: - indices.append((n_rows_per_group + j - size) * n_groups + i) - - expected = df.iloc[indices] - - else: - expected = grouped.tail(arg) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10]) -@pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10]) -@pytest.mark.parametrize("step", [None, 1, 5]) -def test_against_df_iloc(start, stop, step): - # Test that a single group gives the same results as DataFame.iloc - n_rows = 30 - - data = { - "group": ["group 0"] * n_rows, - "value": list(range(n_rows)), - } - df = pd.DataFrame(data) - grouped = df.groupby("group", as_index=False) - - result = grouped._body[start:stop:step] - expected = df.iloc[start:stop:step] - - tm.assert_frame_equal(result, expected) - - -def test_series(): - # Test grouped Series - ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"]) - grouped = ser.groupby(level=0) - result = grouped._body[1:2] - expected = pd.Series([2, 5], index=["a", "b"]) - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("step", [1, 2, 3, 4, 5]) -def test_step(step): - # Test slice with various step values - data = [["x", f"x{i}"] for i in range(5)] - data += [["y", f"y{i}"] for i in range(4)] - data += [["z", f"z{i}"] for i in range(3)] - df = pd.DataFrame(data, columns=["A", "B"]) - - grouped = df.groupby("A", 
as_index=False) - - result = grouped._body[::step] - - data = [["x", f"x{i}"] for i in range(0, 5, step)] - data += [["y", f"y{i}"] for i in range(0, 4, step)] - data += [["z", f"z{i}"] for i in range(0, 3, step)] - - index = [0 + i for i in range(0, 5, step)] - index += [5 + i for i in range(0, 4, step)] - index += [9 + i for i in range(0, 3, step)] - - expected = pd.DataFrame(data, columns=["A", "B"], index=index) - - tm.assert_frame_equal(result, expected) - - -@pytest.fixture() -def column_group_df(): - return pd.DataFrame( - [[0, 1, 2, 3, 4, 5, 6], [0, 0, 1, 0, 1, 0, 2]], - columns=["A", "B", "C", "D", "E", "F", "G"], - ) - - -def test_column_axis(column_group_df): - g = column_group_df.groupby(column_group_df.iloc[1], axis=1) - result = g._body[1:-1] - expected = column_group_df.iloc[:, [1, 3]] - - tm.assert_frame_equal(result, expected) From 82360f573d658ae7b69ef8d373f5a377b3aea95e Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 4 Oct 2021 13:54:06 +0100 Subject: [PATCH 68/73] Rename test_body.py to test_indexing.py --- pandas/tests/groupby/{test_body.py => test_indexing.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pandas/tests/groupby/{test_body.py => test_indexing.py} (100%) diff --git a/pandas/tests/groupby/test_body.py b/pandas/tests/groupby/test_indexing.py similarity index 100% rename from pandas/tests/groupby/test_body.py rename to pandas/tests/groupby/test_indexing.py From 8abcad767778069a323fb8d714f671a6b93ab490 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 4 Oct 2021 15:03:59 +0100 Subject: [PATCH 69/73] @jreback suggested renames --- pandas/core/groupby/groupby.py | 12 ++-- pandas/core/groupby/indexing.py | 91 ++++++++++++++------------- pandas/tests/groupby/test_indexing.py | 32 +++++----- 3 files changed, 70 insertions(+), 65 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 217922e9c7699..8f4ed396e41ea 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2536,7 +2536,7 @@ def nth( """ if not dropna: with self._group_selection_context(): - mask = self._make_mask(n) + mask = self._make_mask_from_positional_indexer(n) ids, _, _ = self.grouper.group_info @@ -3319,8 +3319,8 @@ def head(self, n=5): 0 1 2 """ self._reset_group_selection() - mask = self._make_mask(slice(None, n)) - return self._apply_mask(mask) + mask = self._make_mask_from_positional_indexer(slice(None, n)) + return self._apply_positional_indexer_mask(mask) @final @Substitution(name="groupby") @@ -3360,11 +3360,11 @@ def tail(self, n=5): """ self._reset_group_selection() if n: - mask = self._make_mask(slice(-n, None)) + mask = self._make_mask_from_positional_indexer(slice(-n, None)) else: - mask = self._make_mask([]) + mask = self._make_mask_from_positional_indexer([]) - return self._apply_mask(mask) + return self._apply_positional_indexer_mask(mask) @final def _reindex_output( diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 44273c09b19d1..862c98cd28d46 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -29,23 +29,23 @@ class GroupByIndexingMixin: """ - Mixin for adding ._body to GroupBy. + Mixin for adding ._positional_selector to GroupBy. """ @cache_readonly - def _body(self) -> BodyGroupByIndexer: + def _positional_selector(self) -> GroupByPositionalSelector: """ Return positional selection for each group. 
- ``groupby._body[i:j]`` is similar to + ``groupby._positional_selector[i:j]`` is similar to ``groupby.apply(lambda x: x.iloc[i:j])`` but much faster and preserves the original index and order. - ``_body[]`` is compatible with and extends :meth:`~GroupBy.head` and - :meth:`~GroupBy.tail`. For example: + ``_positional_selector[]`` is compatible with and extends :meth:`~GroupBy.head` + and :meth:`~GroupBy.tail`. For example: - ``head(5)`` - - ``_body[5:-5]`` + - ``_positional_selector[5:-5]`` - ``tail(5)`` together return all the rows. @@ -84,48 +84,53 @@ def _body(self) -> BodyGroupByIndexer: By contrast, ``DataFrame.iloc`` can change the row order. - ``groupby()`` parameters such as as_index and dropna are ignored. - The differences between ``_body[]`` and :meth:`~GroupBy.nth` + The differences between ``_positional_selector[]`` and :meth:`~GroupBy.nth` with ``as_index=False`` are: - - Input to ``_body`` can include one or more slices whereas ``nth`` just handles - an integer or a list of integers. - - ``_body`` can accept a slice relative to the last row of each group. - - ``_body`` does not have an equivalent to the ``nth()`` ``dropna`` - parameter. + - Input to ``_positional_selector`` can include + one or more slices whereas ``nth`` + just handles an integer or a list of integers. + - ``_positional_selector`` can accept a slice relative to the + last row of each group. + - ``_positional_selector`` does not have an equivalent to the + ``nth()`` ``dropna`` parameter. Examples -------- - >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], - ... columns=["A", "B"]) - >>> df.groupby("A")._body[1:2] - A B - 1 a 2 - 4 b 5 - - >>> df.groupby("A")._body[1, -1] - A B - 1 a 2 - 2 a 3 - 4 b 5 + >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], + ... columns=["A", "B"]) + >>> df.groupby("A")._positional_selector[1:2] + A B + 1 a 2 + 4 b 5 + + >>> df.groupby("A")._positional_selector[1, -1] + A B + 1 a 2 + 2 a 3 + 4 b 5 """ if TYPE_CHECKING: groupby_self = cast(groupby.GroupBy, self) else: groupby_self = self - return BodyGroupByIndexer(groupby_self) + return GroupByPositionalSelector(groupby_self) - def _make_mask(self, arg: PositionalIndexer | tuple) -> np.ndarray: + def _make_mask_from_positional_indexer( + self, + arg: PositionalIndexer | tuple, + ) -> np.ndarray: if is_list_like(arg): if all(is_integer(i) for i in cast(Iterable, arg)): - mask = self._handle_list(cast(Iterable[int], arg)) + mask = self._make_mask_from_list(cast(Iterable[int], arg)) else: - mask = self._handle_tuple(cast(tuple, arg)) + mask = self._make_mask_from_tuple(cast(tuple, arg)) elif isinstance(arg, slice): - mask = self._handle_slice(arg) + mask = self._make_mask_from_slice(arg) elif is_integer(arg): - mask = self._handle_int(cast(int, arg)) + mask = self._make_mask_from_int(cast(int, arg)) else: raise TypeError( f"Invalid index {type(arg)}. 
" @@ -141,13 +146,13 @@ def _make_mask(self, arg: PositionalIndexer | tuple) -> np.ndarray: return cast(np.ndarray, mask) - def _handle_int(self, arg: int) -> np.ndarray: + def _make_mask_from_int(self, arg: int) -> np.ndarray: if arg >= 0: return self._ascending_count == arg else: return self._descending_count == (-arg - 1) - def _handle_list(self, args: Iterable[int]) -> bool | np.ndarray: + def _make_mask_from_list(self, args: Iterable[int]) -> bool | np.ndarray: positive = [arg for arg in args if arg >= 0] negative = [-arg - 1 for arg in args if arg < 0] @@ -161,14 +166,14 @@ def _handle_list(self, args: Iterable[int]) -> bool | np.ndarray: return mask - def _handle_tuple(self, args: tuple) -> bool | np.ndarray: + def _make_mask_from_tuple(self, args: tuple) -> bool | np.ndarray: mask: bool | np.ndarray = False for arg in args: if is_integer(arg): - mask |= self._handle_int(cast(int, arg)) + mask |= self._make_mask_from_int(cast(int, arg)) elif isinstance(arg, slice): - mask |= self._handle_slice(arg) + mask |= self._make_mask_from_slice(arg) else: raise ValueError( f"Invalid argument {type(arg)}. Should be int or slice." @@ -176,7 +181,7 @@ def _handle_tuple(self, args: tuple) -> bool | np.ndarray: return mask - def _handle_slice(self, arg: slice) -> bool | np.ndarray: + def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray: start = arg.start stop = arg.stop step = arg.step @@ -218,7 +223,7 @@ def _handle_slice(self, arg: slice) -> bool | np.ndarray: return mask - def _apply_mask(self, mask: np.ndarray) -> DataFrame | Series: + def _apply_positional_indexer_mask(self, mask: np.ndarray) -> DataFrame | Series: if TYPE_CHECKING: groupby_self = cast(groupby.GroupBy, self) else: @@ -248,8 +253,8 @@ def _descending_count(self) -> np.ndarray: return groupby_self._cumcount_array(ascending=False) -@doc(GroupByIndexingMixin._body) -class BodyGroupByIndexer: +@doc(GroupByIndexingMixin._positional_selector) +class GroupByPositionalSelector: def __init__(self, groupby_object: groupby.GroupBy): self.groupby_object = groupby_object @@ -257,7 +262,7 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: """ Select by positional index per group. - Implements GroupBy._body + Implements GroupBy._positional_selector Parameters ---------- @@ -280,10 +285,10 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: DataFrame.iloc : Integer-location based indexing for selection by position. GroupBy.head : Return first n rows of each group. GroupBy.tail : Return last n rows of each group. - GroupBy._body : Return positional selection for each group. + GroupBy._positional_selector : Return positional selection for each group. GroupBy.nth : Take the nth row from each group if n is an int, or a subset of rows, if n is a list of ints. 
""" self.groupby_object._reset_group_selection() - mask = self.groupby_object._make_mask(arg) - return self.groupby_object._apply_mask(mask) + mask = self.groupby_object._make_mask_from_positional_indexer(arg) + return self.groupby_object._apply_positional_indexer_mask(mask) diff --git a/pandas/tests/groupby/test_indexing.py b/pandas/tests/groupby/test_indexing.py index fa63b863e4995..b9f71fd4ed96a 100644 --- a/pandas/tests/groupby/test_indexing.py +++ b/pandas/tests/groupby/test_indexing.py @@ -1,4 +1,4 @@ -# Test GroupBy._body positional grouped indexing GH#42864 +# Test GroupBy._positional_selector positional grouped indexing GH#42864 import random @@ -21,7 +21,7 @@ ) def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): # Test single integer - result = slice_test_grouped._body[arg] + result = slice_test_grouped._positional_selector[arg] expected = slice_test_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) @@ -29,7 +29,7 @@ def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): def test_slice(slice_test_df, slice_test_grouped): # Test single slice - result = slice_test_grouped._body[0:3:2] + result = slice_test_grouped._positional_selector[0:3:2] expected = slice_test_df.iloc[[0, 1, 4, 5]] tm.assert_frame_equal(result, expected) @@ -52,7 +52,7 @@ def test_slice(slice_test_df, slice_test_grouped): ) def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): # Test lists of integers and integer valued iterables - result = slice_test_grouped._body[arg] + result = slice_test_grouped._positional_selector[arg] expected = slice_test_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) @@ -60,7 +60,7 @@ def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): def test_ints(slice_test_df, slice_test_grouped): # Test tuple of ints - result = slice_test_grouped._body[0, 2, -1] + result = slice_test_grouped._positional_selector[0, 2, -1] expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]] tm.assert_frame_equal(result, expected) @@ -68,7 +68,7 @@ def test_ints(slice_test_df, slice_test_grouped): def test_slices(slice_test_df, slice_test_grouped): # Test tuple of slices - result = slice_test_grouped._body[:2, -2:] + result = slice_test_grouped._positional_selector[:2, -2:] expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] tm.assert_frame_equal(result, expected) @@ -76,7 +76,7 @@ def test_slices(slice_test_df, slice_test_grouped): def test_mix(slice_test_df, slice_test_grouped): # Test mixed tuple of ints and slices - result = slice_test_grouped._body[0, 1, -2:] + result = slice_test_grouped._positional_selector[0, 1, -2:] expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] tm.assert_frame_equal(result, expected) @@ -92,7 +92,7 @@ def test_mix(slice_test_df, slice_test_grouped): ) def test_as_index(slice_test_df, arg, expected_rows): # Test the default as_index behaviour - result = slice_test_df.groupby("Group", sort=False)._body[arg] + result = slice_test_df.groupby("Group", sort=False)._positional_selector[arg] expected = slice_test_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) @@ -106,12 +106,12 @@ def test_doc_examples(): grouped = df.groupby("A", as_index=False) - result = grouped._body[1:2] + result = grouped._positional_selector[1:2] expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4]) tm.assert_frame_equal(result, expected) - result = grouped._body[1, -1] + result = grouped._positional_selector[1, -1] expected = pd.DataFrame( [["a", 2], ["a", 3], ["b", 5]], columns=["A", 
"B"], index=[1, 2, 4] ) @@ -184,7 +184,7 @@ def test_against_head_and_tail(arg, method, simulated): size = arg if arg >= 0 else n_rows_per_group + arg if method == "head": - result = grouped._body[:arg] + result = grouped._positional_selector[:arg] if simulated: indices = [] @@ -199,7 +199,7 @@ def test_against_head_and_tail(arg, method, simulated): expected = grouped.head(arg) else: - result = grouped._body[-arg:] + result = grouped._positional_selector[-arg:] if simulated: indices = [] @@ -230,7 +230,7 @@ def test_against_df_iloc(start, stop, step): df = pd.DataFrame(data) grouped = df.groupby("group", as_index=False) - result = grouped._body[start:stop:step] + result = grouped._positional_selector[start:stop:step] expected = df.iloc[start:stop:step] tm.assert_frame_equal(result, expected) @@ -240,7 +240,7 @@ def test_series(): # Test grouped Series ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"]) grouped = ser.groupby(level=0) - result = grouped._body[1:2] + result = grouped._positional_selector[1:2] expected = pd.Series([2, 5], index=["a", "b"]) tm.assert_series_equal(result, expected) @@ -256,7 +256,7 @@ def test_step(step): grouped = df.groupby("A", as_index=False) - result = grouped._body[::step] + result = grouped._positional_selector[::step] data = [["x", f"x{i}"] for i in range(0, 5, step)] data += [["y", f"y{i}"] for i in range(0, 4, step)] @@ -281,7 +281,7 @@ def column_group_df(): def test_column_axis(column_group_df): g = column_group_df.groupby(column_group_df.iloc[1], axis=1) - result = g._body[1:-1] + result = g._positional_selector[1:-1] expected = column_group_df.iloc[:, [1, 3]] tm.assert_frame_equal(result, expected) From 86c8e2034eef857ab9ed441301936a9756deee67 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 4 Oct 2021 18:04:56 +0100 Subject: [PATCH 70/73] Update whatsnew v1.4.0 --- doc/source/whatsnew/v1.4.0.rst | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 0c841078fe9b4..c35d984a462ff 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -110,6 +110,30 @@ Example: s.rolling(3).rank(method="max") +.. _whatsnew_140.enhancements.groupby_indexing: + +Groupby positional indexing +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is now posible to specify positional ranges relative to the ends of each group. + +Negative arguments for :meth:`.GroupBy.head` and :meth:`.GroupBy.tail` now work correctly and result in ranges relative to the end and start of each group, respectively. +Previously, negative arguments returned empty frames. + +.. ipython:: python + + df = pd.DataFrame([["g", "g0"], ["g", "g1"], ["g", "g2"], ["g", "g3"], + ["h", "h0"], ["h", "h1"]], columns=["A", "B"]) + df.groupby("A").head(-1) + + +:meth:`.GroupBy.nth` now accepts a slice or list of integers and slices. + +.. ipython:: python + + df.groupby("A").nth(slice(1, -1)) + df.groupby("A").nth([slice(None, 1), slice(-1, None)]) + .. 
_whatsnew_140.enhancements.other: Other enhancements From ee33df00be759147abc43d431433df6f5396fe90 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 4 Oct 2021 18:23:11 +0100 Subject: [PATCH 71/73] Correct typo in doc --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index c35d984a462ff..104ea154c7f6d 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -115,7 +115,7 @@ Example: Groupby positional indexing ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -It is now posible to specify positional ranges relative to the ends of each group. +It is now possible to specify positional ranges relative to the ends of each group. Negative arguments for :meth:`.GroupBy.head` and :meth:`.GroupBy.tail` now work correctly and result in ranges relative to the end and start of each group, respectively. Previously, negative arguments returned empty frames. From 534ea54c80792391978bededbdea93e8cfc5c400 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 9 Oct 2021 14:11:31 +0100 Subject: [PATCH 72/73] Resolve with another branch --- pandas/core/groupby/groupby.py | 13 ++++++++++--- pandas/core/groupby/indexing.py | 24 ++++++------------------ 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8f4ed396e41ea..8f21c6b85d3d7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2543,7 +2543,7 @@ def nth( # Drop NA values in grouping mask = mask & (ids != -1) - out = self._selected_obj[mask] + out = self._mask_selected_obj(mask) if not self.as_index: return out @@ -3320,7 +3320,7 @@ def head(self, n=5): """ self._reset_group_selection() mask = self._make_mask_from_positional_indexer(slice(None, n)) - return self._apply_positional_indexer_mask(mask) + return self._mask_selected_obj(mask) @final @Substitution(name="groupby") @@ -3364,7 +3364,14 @@ def tail(self, n=5): else: mask = self._make_mask_from_positional_indexer([]) - return self._apply_positional_indexer_mask(mask) + return self._mask_selected_obj(mask) + + @final + def _mask_selected_obj(self, mask: np.ndarray) -> NDFrameT: + if self.axis == 0: + return self._selected_obj[mask] + else: + return self._selected_obj.iloc[:, mask] @final def _reindex_output( diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 862c98cd28d46..7aaffe51f2ee8 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -8,7 +8,10 @@ import numpy as np -from pandas._typing import PositionalIndexer +from pandas._typing import ( + NDFrameT, + PositionalIndexer, +) from pandas.util._decorators import ( cache_readonly, doc, @@ -20,10 +23,6 @@ ) if TYPE_CHECKING: - from pandas import ( - DataFrame, - Series, - ) from pandas.core.groupby import groupby @@ -223,17 +222,6 @@ def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray: return mask - def _apply_positional_indexer_mask(self, mask: np.ndarray) -> DataFrame | Series: - if TYPE_CHECKING: - groupby_self = cast(groupby.GroupBy, self) - else: - groupby_self = self - - if groupby_self.axis == 0: - return groupby_self._selected_obj[mask] - else: - return groupby_self._selected_obj.iloc[:, mask] - @cache_readonly def _ascending_count(self) -> np.ndarray: if TYPE_CHECKING: @@ -258,7 +246,7 @@ class GroupByPositionalSelector: def __init__(self, groupby_object: groupby.GroupBy): self.groupby_object = groupby_object - def 
__getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: + def __getitem__(self, arg: PositionalIndexer | tuple) -> NDFrameT: """ Select by positional index per group. @@ -291,4 +279,4 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: """ self.groupby_object._reset_group_selection() mask = self.groupby_object._make_mask_from_positional_indexer(arg) - return self.groupby_object._apply_positional_indexer_mask(mask) + return self.groupby_object._mask_selected_obj(mask) From 97c3ac0b1b7ee5e12b6abd2340aca3438b0fc376 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 9 Oct 2021 16:40:39 +0100 Subject: [PATCH 73/73] NDFrameT cannot be used like that --- pandas/core/groupby/indexing.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 7aaffe51f2ee8..4b3bb6bc0aa50 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -8,10 +8,7 @@ import numpy as np -from pandas._typing import ( - NDFrameT, - PositionalIndexer, -) +from pandas._typing import PositionalIndexer from pandas.util._decorators import ( cache_readonly, doc, @@ -23,6 +20,10 @@ ) if TYPE_CHECKING: + from pandas import ( + DataFrame, + Series, + ) from pandas.core.groupby import groupby @@ -246,7 +247,7 @@ class GroupByPositionalSelector: def __init__(self, groupby_object: groupby.GroupBy): self.groupby_object = groupby_object - def __getitem__(self, arg: PositionalIndexer | tuple) -> NDFrameT: + def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: """ Select by positional index per group.