pandas-dev · jreback · Oct 15, 2021 · Aug 9, 2021 · Aug 9, 2021 · Aug 9, 2021
diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py
@@ -125,6 +125,7 @@
         "groups",
         "head",
         "hist",
+        "iloc",
         "indices",
         "ndim",
         "ngroups",

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -109,6 +109,8 @@ class providing the base-class of operations.
     maybe_use_numba,
 )
 
+from pandas.core.groupby.groupbyindexing import GroupByIndexingMixin
+
 _common_see_also = """
         See Also
         --------
@@ -565,7 +567,7 @@ def group_selection_context(groupby: GroupBy) -> Iterator[GroupBy]:
 ]
 
 
-class BaseGroupBy(PandasObject, SelectionMixin[FrameOrSeries]):
+class BaseGroupBy(PandasObject, SelectionMixin[FrameOrSeries], GroupByIndexingMixin):
     _group_selection: IndexLabel | None = None
     _apply_allowlist: frozenset[str] = frozenset()
     _hidden_attrs = PandasObject._hidden_attrs | {

diff --git a/pandas/core/groupby/groupbyindexing.py b/pandas/core/groupby/groupbyindexing.py
@@ -0,0 +1,214 @@
+from __future__ import annotations
+
+from pandas.util._decorators import doc
+import numpy as np
+
+
+class GroupByIndexingMixin:
+    """
+    Mixin for adding .iloc to GroupBy.
+    """
+
+    @property
+    def iloc(self) -> _ilocGroupByIndexer:
+        """
+        Integer location-based indexing for selection by position per group.
+
+        Similar to ``.apply(lambda x: x.iloc[i:j, k:l])``, but much faster and
+        returns a subset of rows from the original DataFrame with the original
+        index and order preserved.
+
+        The output is compatible with ``head()`` and ``tail()``. It differs from
+        ``take()`` and ``nth()``, which do not preserve the index or order.
+
+        Allowed inputs for the first (row) index are:
+
+        - An integer, e.g. ``5``.
+        - A slice object with ints and positive step, e.g. ``1:``, ``4:-3:2``.
+
+        Allowed inputs for the second (column) index are as for
+        ``DataFrame.iloc``, namely:
+
+        - An integer, e.g. ``5``.
+        - A list or array of integers, e.g. ``[4, 3, 0]``.
+        - A slice object with ints, e.g. ``1:7``.
+        - A boolean array.
+        - A ``callable`` function with one argument (the calling Series or
+          DataFrame) and that returns valid output for indexing (one of the above).
+
+        Returns
+        -------
+        Series or DataFrame
+
+        Notes
+        -----
+        Neither ``GroupBy.nth()`` nor ``GroupBy.take()`` takes a slice argument,
+        and neither of them preserves the original DataFrame order and index.
+        They are both slow for large integer lists, and ``take()`` is very slow
+        for large group counts.
+
+        A typical use case is a multi-indexed DataFrame with a large primary index
+        and a secondary index that is sorted in a different order for each primary
+        value. To reduce the DataFrame to a middle slice of each secondary, group
+        by the primary and then use ``iloc``. This preserves the original
+        DataFrame's order and indexing (see ``tests/groupby/test_groupby_iloc``).
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]],
+        ...                   columns=["A", "B"])
+        >>> df.groupby("A").iloc[1:2]
+           A  B
+        1  a  2
+        4  b  5
+        >>> df.groupby("A").iloc[:-1, -1:]
+           B
+        0  1
+        1  2
+        3  4
+        """
+        return _ilocGroupByIndexer(self)
+
+
+@doc(GroupByIndexingMixin.iloc)
+class _ilocGroupByIndexer:
+    # Object returned by ``GroupBy.iloc``. Its docstring is supplied by the
+    # ``@doc`` decorator above, so implementation notes are kept in comments.
+    #
+    # Rows are selected by building a boolean mask from each row's position
+    # within its group (counted from the front and from the back of the group)
+    # and applying that mask, together with the optional column indexer, to the
+    # grouped object's ``_selected_obj`` through the ordinary ``.iloc``.
+
+    def __init__(self, grouped):
+        # The GroupBy object whose rows are being selected.
+        self.grouped = grouped
+        # Never set to True by this class; presumably reserved for supporting
+        # negative row steps in the future.
+        self.reversed = False
+        # Per-row positions within each group, computed lazily on first use.
+        self._cached_ascending_count = None
+        self._cached_descending_count = None
+
+    def __getitem__(self, arg):
+        """
+        Select rows by position within each group and, optionally, columns.
+
+        Parameters
+        ----------
+        arg : int, slice or tuple
+            Either the row indexer alone or a tuple ``(rows, columns)``. A
+            one-element tuple is treated as a row indexer only.
+
+        Returns
+        -------
+        Series or DataFrame
+
+        Raises
+        ------
+        ValueError
+            If more than two indexers are given or the row indexer is neither
+            an integer nor a slice with a positive step.
+        """
+        self.reversed = False
+
+        rows, columns = arg, None
+        # An empty tuple is left as-is so that it is rejected as a row indexer.
+        if isinstance(arg, tuple) and arg:
+            if len(arg) > 2:
+                # Do not silently ignore indexers after the second one.
+                raise ValueError(
+                    "GroupBy.iloc accepts at most two indexers (rows, columns), "
+                    f"got {len(arg)}"
+                )
+            rows = arg[0]
+            if len(arg) == 2:
+                columns = arg[1]
+
+        return self._handle_item(rows, columns)
+
+    def _handle_item(self, arg0, arg1):
+        """
+        Validate the row indexer and convert it to ``start, stop, step``.
+
+        Parameters
+        ----------
+        arg0 : int or slice
+            Row position(s) within each group.
+        arg1 : object or None
+            Column indexer passed on unchanged, or None for all columns.
+
+        Returns
+        -------
+        Series or DataFrame
+
+        Raises
+        ------
+        ValueError
+            If ``arg0`` is a slice whose step is not positive, or is neither an
+            integer nor a slice.
+        """
+        if isinstance(arg0, slice):
+            start, stop, step = arg0.start, arg0.stop, arg0.step
+
+            # A zero step is as invalid as a negative one.
+            if step is not None and step <= 0:
+                raise ValueError(
+                    "GroupBy.iloc row slice step must be positive. "
+                    f"Slice was {start}:{stop}:{step}"
+                )
+
+            return self._handle_slice(start, stop, step, arg1)
+
+        # bool is a subclass of int but is not a valid position.
+        if isinstance(arg0, (int, np.integer)) and not isinstance(arg0, bool):
+            # Position -1 is the last row of each group. Using ``arg0 + 1`` as
+            # the stop would give 0, i.e. "before the first row", and select
+            # nothing, so the stop has to be left open instead.
+            stop = None if arg0 == -1 else arg0 + 1
+            return self._handle_slice(arg0, stop, 1, arg1)
+
+        raise ValueError(
+            f"GroupBy.iloc row must be an integer or a slice, not a {type(arg0)}"
+        )
+
+    def _handle_slice(self, start, stop, step, arg1):
+        """
+        Apply ``start:stop:step`` within each group and ``arg1`` to the columns.
+
+        Parameters
+        ----------
+        start, stop : int or None
+            Slice bounds relative to each group; negative values count from
+            the end of the group.
+        step : int or None
+            Positive stride; None is treated as 1.
+        arg1 : object or None
+            Column indexer, or None to keep all columns.
+
+        Returns
+        -------
+        Series or DataFrame
+            The selected rows in their original order with their original index.
+        """
+        mask = None
+        if step is None:
+            step = 1
+
+        self.grouped._reset_group_selection()
+
+        if start is None:
+            if step > 1:
+                mask = self._ascending_count % step == 0
+
+        elif start >= 0:
+            mask = self._ascending_count >= start
+
+            if step > 1:
+                mask &= (self._ascending_count - start) % step == 0
+
+        else:
+            mask = self._descending_count < -start
+
+            if step > 1:
+                # The stride has to be anchored on the first selected row of
+                # each group. ``descending + start + 1`` is minus the distance
+                # from that row, which has the same divisibility by ``step``.
+                #
+                # ``ascending + descending + 1`` is the length of the group.
+                # When -start exceeds it, the selection begins at the first
+                # row of the group, so the stride is anchored on the ascending
+                # count instead.
+                offset_array = self._descending_count + start + 1
+                limit_array = (
+                    self._ascending_count + self._descending_count + (start + 1)
+                ) < 0
+                offset_array = np.where(
+                    limit_array, self._ascending_count, offset_array
+                )
+
+                mask &= offset_array % step == 0
+
+        if stop is not None:
+            if stop >= 0:
+                before_stop = self._ascending_count < stop
+            else:
+                before_stop = self._descending_count >= -stop
+
+            mask = before_stop if mask is None else mask & before_stop
+
+        # No restriction at all means every row is kept.
+        arg0 = slice(None) if mask is None else mask
+
+        if arg1 is None:
+            return self._selected_obj.iloc[arg0]
+
+        return self._selected_obj.iloc[arg0, arg1]
+
+    @property
+    def _ascending_count(self):
+        # Position of each row within its group counted from the front;
+        # computed once per indexer and then reused.
+        if self._cached_ascending_count is None:
+            self._cached_ascending_count = self.grouped._cumcount_array()
+            if self.reversed:
+                self._cached_ascending_count = self._cached_ascending_count[::-1]
+
+        return self._cached_ascending_count
+
+    @property
+    def _descending_count(self):
+        # Position of each row within its group counted from the back;
+        # computed once per indexer and then reused.
+        if self._cached_descending_count is None:
+            self._cached_descending_count = self.grouped._cumcount_array(
+                ascending=False
+            )
+            if self.reversed:
+                self._cached_descending_count = self._cached_descending_count[::-1]
+
+        return self._cached_descending_count
+
+    @property
+    def _selected_obj(self):
+        # The object the mask is applied to, reversed when ``reversed`` is set.
+        if self.reversed:
+            return self.grouped._selected_obj.iloc[::-1]
+
+        return self.grouped._selected_obj
diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py
@@ -309,6 +309,7 @@ def test_tab_completion(mframe):
         "rank",
         "cumprod",
         "tail",
+        "iloc",
         "resample",
         "cummin",
         "fillna",

diff --git a/pandas/tests/groupby/test_groupby_iloc.py b/pandas/tests/groupby/test_groupby_iloc.py
@@ -0,0 +1,146 @@
+""" Test positional grouped indexing with iloc GH#42864"""
+
+import pandas as pd
+import pandas._testing as tm
+import random
+
+
+def test_doc_examples():
+    """Check the two examples shown in the GroupBy.iloc docstring."""
+
+    frame = pd.DataFrame({"A": list("aaabb"), "B": [1, 2, 3, 4, 5]})
+    by_a = frame.groupby("A")
+
+    # Second row of every group, all columns.
+    second_rows = by_a.iloc[1:2, :]
+    tm.assert_frame_equal(
+        second_rows,
+        pd.DataFrame({"A": ["a", "b"], "B": [2, 5]}, index=[1, 4]),
+    )
+
+    # Everything except the last row of every group, last column only.
+    all_but_last = by_a.iloc[:-1, -1:]
+    tm.assert_frame_equal(
+        all_but_last,
+        pd.DataFrame({"B": [1, 2, 4]}, index=[0, 1, 3]),
+    )
+
+
+def test_multiindex():
+    """Test the multiindex mentioned as the use-case in the documentation"""
+
+    def make_df_from_data(data):
+        # One row per (date, item); rows follow the order of ``data[date]``.
+        rows = {
+            (date, item): {"A": a, "B": b}
+            for date in dates
+            for item, a, b in data[date]
+        }
+
+        df = pd.DataFrame.from_dict(rows, orient="index")
+        df.index.names = ("Date", "Item")
+        return df
+
+    # Fixed seed so that any failure can be reproduced.
+    rng = random.Random(42)
+
+    ndates = 1000
+    nitems = 40
+    dates = pd.date_range("20130101", periods=ndates, freq="D")
+    items = [f"item {i}" for i in range(nitems)]
+
+    data = {}
+    for date in dates:
+        levels = [
+            (item, rng.randint(0, 10000) / 100, rng.randint(0, 10000) / 100)
+            for item in items
+        ]
+        # Give the secondary level a different order for every date.
+        levels.sort(key=lambda x: x[1])
+        data[date] = levels
+
+    df = make_df_from_data(data)
+    result = df.groupby("Date").iloc[3:7]
+
+    sliced = {date: data[date][3:7] for date in dates}
+    expected = make_df_from_data(sliced)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_against_head_and_tail():
+    """Test gives the same results as grouped head and tail"""
+
+    # Fixed seed so that any failure can be reproduced.
+    rng = random.Random(42)
+
+    n_groups = 100
+    n_rows_per_group = 30
+    n_rows = n_groups * n_rows_per_group
+
+    df = pd.DataFrame(
+        {
+            # Groups are interleaved: "group 0", "group 1", ..., then repeated.
+            "group": [f"group {g}" for g in range(n_groups)] * n_rows_per_group,
+            "value": [rng.randint(0, 10000) / 100 for _ in range(n_rows)],
+        }
+    )
+    grouped = df.groupby("group")
+
+    # Sizes below, equal to and above the number of rows in each group.
+    for i in [1, 5, 29, 30, 31, 1000]:
+        result = grouped.iloc[:i, :]
+        expected = grouped.head(i)
+
+        tm.assert_frame_equal(result, expected)
+
+        result = grouped.iloc[-i:, :]
+        expected = grouped.tail(i)
+
+        tm.assert_frame_equal(result, expected)
+
+
+def test_against_df_iloc():
+    """Test that a single group gives the same results as DataFrame.iloc"""
+
+    # Fixed seed so that any failure can be reproduced.
+    rng = random.Random(42)
+
+    n_rows_per_group = 30
+
+    df = pd.DataFrame(
+        {
+            "group": ["group 0"] * n_rows_per_group,
+            "value": [rng.randint(0, 10000) / 100 for _ in range(n_rows_per_group)],
+        }
+    )
+    grouped = df.groupby("group")
+
+    # Bounds inside, on and beyond the group length, counted from both ends.
+    bounds = [None, 0, 1, 10, 29, 30, 1000, -1, -10, -29, -30, -1000]
+    steps = [None, 1, 2, 3, 10, 29, 30, 100]
+
+    for start in bounds:
+        for stop in bounds:
+            for step in steps:
+                result = grouped.iloc[start:stop:step, :]
+                expected = df.iloc[start:stop:step, :]
+
+                tm.assert_frame_equal(result, expected)
+
+
+def test_series():
+    """Check positional selection on a Series grouped by its index."""
+
+    values = pd.Series(range(1, 6), index=list("aaabb"))
+
+    # Second element of every group.
+    result = values.groupby(level=0).iloc[1:2]
+
+    tm.assert_series_equal(result, pd.Series([2, 5], index=["a", "b"]))
+
+
+def test_step():
+    """Check a stepped row slice against rows picked by hand for each group."""
+
+    # Group label -> number of rows; groups are laid out one after another.
+    group_sizes = {"x": 5, "y": 4, "z": 3}
+
+    rows = [
+        [label, f"{label}{i}"]
+        for label, size in group_sizes.items()
+        for i in range(size)
+    ]
+    df = pd.DataFrame(rows, columns=["A", "B"])
+
+    grouped = df.groupby("A")
+
+    for step in [1, 2, 3, 4, 5]:
+        result = grouped.iloc[::step, :]
+
+        expected_rows = []
+        expected_index = []
+        first_row = 0
+        for label, size in group_sizes.items():
+            for i in range(0, size, step):
+                expected_rows.append([label, f"{label}{i}"])
+                expected_index.append(first_row + i)
+            first_row += size
+
+        expected = pd.DataFrame(
+            expected_rows, columns=["A", "B"], index=expected_index
+        )
+
+        tm.assert_frame_equal(result, expected)
-Original file line number
+Diff line change
@@ Expand Up / @@ -125,6 +125,7 @@ @@
             "groups",
             "head",
             "hist",
+            "iloc",
             "indices",
             "ndim",
             "ngroups",
@@ Expand Down @@