
Commit d2a673f (merge of 2 parents: 9042122 + 641427e)

Commit message: resolve conflict in whatsnew/v2.1.0.rst

File tree: 88 files changed, +1953 -465 lines


.github/workflows/code-checks.yml (+1)

@@ -77,6 +77,7 @@ jobs:

       - name: Install pandas in editable mode
         id: build-editable
+        if: ${{ steps.build.outcome == 'success' && always() }}
         uses: ./.github/actions/build_pandas
         with:
           editable: true
.github/workflows/deprecation-tracking-bot.yml (+47 -13)

@@ -1,11 +1,13 @@
+# This bot updates the issue with number DEPRECATION_TRACKER_ISSUE
+# with the PR number that issued the deprecation.
+
+# It runs on commits to main, and will trigger if the PR linked to a merged commit has the "Deprecate" label
 name: Deprecations Bot

 on:
-  pull_request:
+  push:
     branches:
       - main
-    types:
-      [closed]


 permissions:
@@ -15,17 +17,49 @@ jobs:
   deprecation_update:
     permissions:
       issues: write
-    if: >-
-      contains(github.event.pull_request.labels.*.name, 'Deprecate') && github.event.pull_request.merged == true
     runs-on: ubuntu-22.04
     env:
       DEPRECATION_TRACKER_ISSUE: 50578
     steps:
-      - name: Checkout
-        run: |
-          echo "Adding deprecation PR number to deprecation tracking issue"
-          export PR=${{ github.event.pull_request.number }}
-          BODY=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" https://api.github.com/repos/${{ github.repository }}/issues/${DEPRECATION_TRACKER_ISSUE} |
-            python3 -c "import sys, json, os; x = {'body': json.load(sys.stdin)['body']}; pr = os.environ['PR']; x['body'] += f'\n- [ ] #{pr}'; print(json.dumps(x))")
-          echo ${BODY}
-          curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -X PATCH -d "${BODY}" https://api.github.com/repos/${{ github.repository }}/issues/${DEPRECATION_TRACKER_ISSUE}
+      - uses: actions/github-script@v6
+        id: update-deprecation-issue
+        with:
+          script: |
+            body = await github.rest.issues.get({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: ${{ env.DEPRECATION_TRACKER_ISSUE }},
+            })
+            body = body["data"]["body"];
+            linkedPRs = await github.rest.repos.listPullRequestsAssociatedWithCommit({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              commit_sha: '${{ github.sha }}'
+            })
+            linkedPRs = linkedPRs["data"];
+            console.log(linkedPRs);
+            if (linkedPRs.length > 0) {
+              console.log("Found linked PR");
+              linkedPR = linkedPRs[0]
+              isDeprecation = false
+              for (label of linkedPR["labels"]) {
+                if (label["name"] == "Deprecate") {
+                  isDeprecation = true;
+                  break;
+                }
+              }
+
+              PR_NUMBER = linkedPR["number"];
+
+              body += ("\n- [ ] #" + PR_NUMBER);
+              if (isDeprecation) {
+                console.log("PR is a deprecation PR. Printing new body of issue");
+                console.log(body);
+                github.rest.issues.update({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  issue_number: ${{ env.DEPRECATION_TRACKER_ISSUE }},
+                  body: body
+                })
+              }
+            }
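For context, the rewritten step is just three GitHub REST calls: fetch the tracker issue, list the PRs associated with the pushed commit, and patch the issue body. A minimal standalone sketch of the same flow in Python (using the requests package; TOKEN, REPO, and SHA are placeholder values, not taken from this commit):

import requests

TOKEN = "ghp_..."         # placeholder personal access token
REPO = "pandas-dev/pandas"
SHA = "<commit on main>"  # placeholder commit SHA
ISSUE = 50578             # DEPRECATION_TRACKER_ISSUE

api = f"https://api.github.com/repos/{REPO}"
headers = {"Authorization": f"token {TOKEN}"}

# PRs associated with the pushed commit (the same endpoint that
# listPullRequestsAssociatedWithCommit hits in the github-script step)
prs = requests.get(f"{api}/commits/{SHA}/pulls", headers=headers).json()
if prs:
    pr = prs[0]
    if any(label["name"] == "Deprecate" for label in pr["labels"]):
        # fetch the tracker issue, append a checklist entry, write it back
        body = requests.get(f"{api}/issues/{ISSUE}", headers=headers).json()["body"]
        body += f"\n- [ ] #{pr['number']}"
        requests.patch(f"{api}/issues/{ISSUE}", headers=headers, json={"body": body})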

.github/workflows/wheels.yml (+6 -3)

@@ -110,9 +110,12 @@ jobs:
           path: ./dist

       - name: Build wheels
-        uses: pypa/cibuildwheel@<version obscured in this capture>
-        with:
-          package-dir: ./dist/${{ needs.build_sdist.outputs.sdist_file }}
+        uses: pypa/cibuildwheel@<new version obscured in this capture>
+        # TODO: Build wheels from sdist again
+        # There's some sort of weird race condition?
+        # within Github that makes the sdist be missing files
+        #with:
+        #  package-dir: ./dist/${{ needs.build_sdist.outputs.sdist_file }}
         env:
           CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}
asv_bench/benchmarks/groupby.py (+88 -9)

@@ -57,6 +57,38 @@
     },
 }

+# These aggregations don't have a kernel implemented for them yet
+_numba_unsupported_methods = [
+    "all",
+    "any",
+    "bfill",
+    "count",
+    "cumcount",
+    "cummax",
+    "cummin",
+    "cumprod",
+    "cumsum",
+    "describe",
+    "diff",
+    "ffill",
+    "first",
+    "head",
+    "last",
+    "median",
+    "nunique",
+    "pct_change",
+    "prod",
+    "quantile",
+    "rank",
+    "sem",
+    "shift",
+    "size",
+    "skew",
+    "tail",
+    "unique",
+    "value_counts",
+]
+

 class ApplyDictReturn:
     def setup(self):
@@ -453,9 +485,10 @@ class GroupByMethods:
         ],
         ["direct", "transformation"],
         [1, 5],
+        ["cython", "numba"],
     ]

-    def setup(self, dtype, method, application, ncols):
+    def setup(self, dtype, method, application, ncols, engine):
         if method in method_blocklist.get(dtype, {}):
             raise NotImplementedError  # skip benchmark

@@ -474,6 +507,19 @@ def setup(self, dtype, method, application, ncols):
             # DataFrameGroupBy doesn't have these methods
             raise NotImplementedError

+        # Numba currently doesn't support
+        # multiple transform functions or strs for transform,
+        # grouping on multiple columns
+        # and we lack kernels for a bunch of methods
+        if (
+            engine == "numba"
+            and method in _numba_unsupported_methods
+            or ncols > 1
+            or application == "transformation"
+            or dtype == "datetime"
+        ):
+            raise NotImplementedError
+
         if method == "describe":
             ngroups = 20
         elif method == "skew":
@@ -505,17 +551,30 @@ def setup(self, dtype, method, application, ncols):
         if len(cols) == 1:
             cols = cols[0]

+        # Not everything supports the engine keyword yet
+        kwargs = {}
+        if engine == "numba":
+            kwargs["engine"] = engine
+
         if application == "transformation":
-            self.as_group_method = lambda: df.groupby("key")[cols].transform(method)
-            self.as_field_method = lambda: df.groupby(cols)["key"].transform(method)
+            self.as_group_method = lambda: df.groupby("key")[cols].transform(
+                method, **kwargs
+            )
+            self.as_field_method = lambda: df.groupby(cols)["key"].transform(
+                method, **kwargs
+            )
         else:
-            self.as_group_method = getattr(df.groupby("key")[cols], method)
-            self.as_field_method = getattr(df.groupby(cols)["key"], method)
+            self.as_group_method = partial(
+                getattr(df.groupby("key")[cols], method), **kwargs
+            )
+            self.as_field_method = partial(
+                getattr(df.groupby(cols)["key"], method), **kwargs
+            )

-    def time_dtype_as_group(self, dtype, method, application, ncols):
+    def time_dtype_as_group(self, dtype, method, application, ncols, engine):
         self.as_group_method()

-    def time_dtype_as_field(self, dtype, method, application, ncols):
+    def time_dtype_as_field(self, dtype, method, application, ncols, engine):
         self.as_field_method()


@@ -532,8 +591,12 @@ class GroupByCythonAgg:
     [
         "sum",
         "prod",
-        "min",
-        "max",
+        # TODO: uncomment min/max
+        # Currently, min/max implemented very inefficiently
+        # because it re-uses the Window min/max kernel
+        # so it will time out ASVs
+        # "min",
+        # "max",
         "mean",
         "median",
         "var",
@@ -554,6 +617,22 @@ def time_frame_agg(self, dtype, method):
         self.df.groupby("key").agg(method)


+class GroupByNumbaAgg(GroupByCythonAgg):
+    """
+    Benchmarks specifically targeting our numba aggregation algorithms
+    (using a big enough dataframe with simple key, so a large part of the
+    time is actually spent in the grouped aggregation).
+    """
+
+    def setup(self, dtype, method):
+        if method in _numba_unsupported_methods:
+            raise NotImplementedError
+        super().setup(dtype, method)
+
+    def time_frame_agg(self, dtype, method):
+        self.df.groupby("key").agg(method, engine="numba")
+
+
 class GroupByCythonAggEaDtypes:
     """
     Benchmarks specifically targeting our cython aggregation algorithms
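The new engine parameter mirrors the keyword already exposed on the public groupby API, which is what the benchmark now exercises. A rough illustration of the two timed paths (the frame and column names are invented for this example, and the numba path requires numba to be installed):

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "key": np.random.randint(0, 100, size=1_000_000),
        "values": np.random.randn(1_000_000),
    }
)

df.groupby("key")["values"].mean()                # default Cython path
df.groupby("key")["values"].mean(engine="numba")  # JIT-compiled Numba path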

ci/code_checks.sh (-13)

@@ -105,8 +105,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.errors.UnsupportedFunctionCall \
         pandas.test \
         pandas.NaT \
-        pandas.Timestamp.as_unit \
-        pandas.Timestamp.ctime \
         pandas.Timestamp.date \
         pandas.Timestamp.dst \
         pandas.Timestamp.isocalendar \
@@ -121,16 +119,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.Timestamp.utcoffset \
         pandas.Timestamp.utctimetuple \
         pandas.Timestamp.weekday \
-        pandas.arrays.DatetimeArray \
-        pandas.Timedelta.view \
-        pandas.Timedelta.as_unit \
-        pandas.Timedelta.ceil \
-        pandas.Timedelta.floor \
-        pandas.Timedelta.round \
-        pandas.Timedelta.to_pytimedelta \
-        pandas.Timedelta.to_timedelta64 \
-        pandas.Timedelta.to_numpy \
-        pandas.Timedelta.total_seconds \
         pandas.arrays.TimedeltaArray \
         pandas.Period.asfreq \
         pandas.Period.now \
@@ -263,7 +251,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.core.window.ewm.ExponentialMovingWindow.cov \
         pandas.api.indexers.BaseIndexer \
         pandas.api.indexers.VariableOffsetWindowIndexer \
-        pandas.core.groupby.SeriesGroupBy.fillna \
         pandas.io.formats.style.Styler \
         pandas.io.formats.style.Styler.from_custom_template \
         pandas.io.formats.style.Styler.set_caption \
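The names removed above are entries in the docstring-validation ignore list; delisting them means their docstrings now pass the checks. As a quick illustration (my example, not part of the commit), one of the delisted methods:

import pandas as pd

ts = pd.Timestamp("2023-06-15 12:00:00")
ts.as_unit("s")  # same moment, stored at second resolution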

doc/source/development/contributing.rst (+16)

@@ -119,6 +119,22 @@ Some great resources for learning Git:
 * the `NumPy documentation <https://numpy.org/doc/stable/dev/index.html>`_.
 * Matthew Brett's `Pydagogue <https://matthew-brett.github.io/pydagogue/>`_.

+Also, the project follows a forking workflow further described on this page whereby
+contributors fork the repository, make changes and then create a pull request.
+So please be sure to read and follow all the instructions in this guide.
+
+If you are new to contributing to projects through forking on GitHub,
+take a look at the `GitHub documentation for contributing to projects <https://docs.github.com/en/get-started/quickstart/contributing-to-projects>`_.
+GitHub provides a quick tutorial using a test repository that may help you become more familiar
+with forking a repository, cloning a fork, creating a feature branch, pushing changes and
+making pull requests.
+
+Below are some useful resources for learning more about forking and pull requests on GitHub:
+
+* the `GitHub documentation for forking a repo <https://docs.github.com/en/get-started/quickstart/fork-a-repo>`_.
+* the `GitHub documentation for collaborating with pull requests <https://docs.github.com/en/pull-requests/collaborating-with-pull-requests>`_.
+* the `GitHub documentation for working with forks <https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks>`_.
+
 Getting started with Git
 ------------------------

doc/source/user_guide/basics.rst (+1 -3)

@@ -675,7 +675,7 @@ matching index:
 Value counts (histogramming) / mode
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The :meth:`~Series.value_counts` Series method and top-level function computes a histogram
+The :meth:`~Series.value_counts` Series method computes a histogram
 of a 1D array of values. It can also be used as a function on regular arrays:

 .. ipython:: python
@@ -684,7 +684,6 @@ of a 1D array of values. It can also be used as a function on regular arrays:
    data
    s = pd.Series(data)
    s.value_counts()
-   pd.value_counts(data)

 The :meth:`~DataFrame.value_counts` method can be used to count combinations across multiple columns.
 By default all columns are used but a subset can be selected using the ``subset`` argument.
@@ -733,7 +732,6 @@ normally distributed data into equal-size quartiles like so:
    arr = np.random.randn(30)
    factor = pd.qcut(arr, [0, 0.25, 0.5, 0.75, 1])
    factor
-   pd.value_counts(factor)

 We can also pass infinite values to define the bins:
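These edits track the move away from the top-level pd.value_counts toward the Series method. A minimal before/after sketch, assuming pandas 2.x:

import numpy as np
import pandas as pd

data = np.random.randint(0, 7, size=50)
pd.Series(data).value_counts()  # preferred spelling
# pd.value_counts(data)         # top-level form being removed from the docs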

doc/source/user_guide/missing_data.rst (-7)

@@ -551,13 +551,6 @@ For a DataFrame, you can specify individual values by column:

    df.replace({"a": 0, "b": 5}, 100)

-Instead of replacing with specified values, you can treat all given values as
-missing and interpolate over them:
-
-.. ipython:: python
-
-   ser.replace([1, 2, 3], method="pad")
-
 .. _missing_data.replace_expression:

 String/regular expression replacement
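The deleted example relied on replace(..., method="pad"), which is deprecated. A roughly equivalent idiom, offered here as a sketch rather than something from the commit, is to mark the matched values as missing and then pad-fill:

import numpy as np
import pandas as pd

ser = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0])
# same effect as the removed ser.replace([1, 2, 3], method="pad"):
# matched values become NaN, then are filled from the previous entry
ser.replace([1, 2, 3], np.nan).ffill()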

doc/source/whatsnew/v2.0.3.rst (+2)

@@ -21,8 +21,10 @@ Fixed regressions

 Bug fixes
 ~~~~~~~~~
+- Bug in :func:`RangeIndex.union` when using ``sort=True`` with another :class:`RangeIndex` (:issue:`53490`)
 - Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` for the ``"c"`` and ``"python"`` engines (:issue:`53390`)
 - Bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` with ``expand=True`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`53532`)
+- Bug in indexing methods (e.g. :meth:`DataFrame.__getitem__`) where taking the entire :class:`DataFrame`/:class:`Series` would raise an ``OverflowError`` when Copy on Write was enabled and the length of the array was over the maximum size a 32-bit integer can hold (:issue:`53616`)
 -

 .. ---------------------------------------------------------------------------
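One of the listed fixes can be exercised as follows. This is a hedged reproduction sketch (it assumes pyarrow is installed), not an example taken from the commit:

import io
import pandas as pd

buf = io.StringIO("flag\nTrue\nFalse\n")
# per the note above, this previously failed for the "c" and "python"
# engines when an Arrow-backed bool dtype was requested
df = pd.read_csv(buf, dtype={"flag": "bool[pyarrow]"}, engine="python")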
