pandas-dev · canthonyscott · Jun 8, 2023 · Jun 8, 2023 · Jun 8, 2023 · Jun 8, 2023
@@ -77,6 +77,7 @@ jobs:
 
     - name: Install pandas in editable mode
       id: build-editable
+      if: ${{ steps.build.outcome == 'success' && always() }}  # run even after a failed step, as long as 'build' succeeded
       uses: ./.github/actions/build_pandas
       with:
         editable: true

@@ -1,11 +1,13 @@
+# This bot updates the issue with number DEPRECATION_TRACKER_ISSUE
+# with the PR number that issued the deprecation.
+
+# It runs on commits to main, and will trigger if the PR linked to a merged commit has the "Deprecate" label
 name: Deprecations Bot
 
 on:
-  pull_request:
+  push:
     branches:
       - main
-    types:
-      [closed]
 
 
 permissions:
@@ -15,17 +17,49 @@ jobs:
   deprecation_update:
     permissions:
       issues: write
-    if: >-
-      contains(github.event.pull_request.labels.*.name, 'Deprecate') && github.event.pull_request.merged == true
     runs-on: ubuntu-22.04
     env:
       DEPRECATION_TRACKER_ISSUE: 50578
     steps:
-    - name: Checkout
-      run: |
-        echo "Adding deprecation PR number to deprecation tracking issue"
-        export PR=${{ github.event.pull_request.number }}
-        BODY=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" https://api.github.com/repos/${{ github.repository }}/issues/${DEPRECATION_TRACKER_ISSUE} |
-          python3 -c "import sys, json, os; x = {'body': json.load(sys.stdin)['body']}; pr = os.environ['PR']; x['body'] += f'\n- [ ] #{pr}'; print(json.dumps(x))")
-        echo ${BODY}
-        curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -X PATCH -d "${BODY}" https://api.github.com/repos/${{ github.repository }}/issues/${DEPRECATION_TRACKER_ISSUE}
+    - uses: actions/github-script@v6
+      id: update-deprecation-issue
+      with:
+        script: |
+          // Fetch the current body of the deprecation tracking issue.
+          const issue = await github.rest.issues.get({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            issue_number: ${{ env.DEPRECATION_TRACKER_ISSUE }},
+          });
+          // Look up the PR(s) associated with the commit pushed to main.
+          const { data: linkedPRs } = await github.rest.repos.listPullRequestsAssociatedWithCommit({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            commit_sha: '${{ github.sha }}'
+          });
+          console.log(linkedPRs);
+          if (linkedPRs.length > 0) {
+            console.log("Found linked PR");
+            // Only the first associated PR is considered.
+            const linkedPR = linkedPRs[0];
+            const isDeprecation = linkedPR.labels.some(
+              (label) => label.name === "Deprecate"
+            );
+
+            if (isDeprecation) {
+              // The API may return a null body for an issue without text.
+              const currentBody = issue.data.body ?? "";
+              // Append an unchecked task-list entry referencing the PR.
+              const body = currentBody + "\n- [ ] #" + linkedPR.number;
+              console.log("PR is a deprecation PR. Printing new body of issue");
+              console.log(body);
+              // Await the request so that a failed update fails this step
+              // instead of leaving a floating promise.
+              await github.rest.issues.update({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: ${{ env.DEPRECATION_TRACKER_ISSUE }},
+                body: body
+              });
+            }
+          }
@@ -230,7 +230,7 @@ jobs:
           /opt/python/cp39-cp39/bin/python -m venv ~/virtualenvs/pandas-dev
           . ~/virtualenvs/pandas-dev/bin/activate
           python -m pip install -U pip wheel setuptools meson[ninja]==1.0.1 meson-python==0.13.1
-          python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1
+          python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz 'pytest>=7.3.2' 'pytest-xdist>=2.2.0' 'pytest-asyncio>=0.17' 'hypothesis>=6.46.1'
           python -m pip install --no-cache-dir --no-build-isolation -e .
           python -m pip list --no-cache-dir
           export PANDAS_CI=1
@@ -268,7 +268,7 @@ jobs:
           /opt/python/cp39-cp39/bin/python -m venv ~/virtualenvs/pandas-dev
           . ~/virtualenvs/pandas-dev/bin/activate
           python -m pip install -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.0.1
-          python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1
+          python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz 'pytest>=7.3.2' 'pytest-xdist>=2.2.0' 'pytest-asyncio>=0.17' 'hypothesis>=6.46.1'
           python -m pip install --no-cache-dir --no-build-isolation -e .
           python -m pip list --no-cache-dir
 
@@ -337,10 +337,10 @@ jobs:
         run: |
           python --version
           python -m pip install --upgrade pip setuptools wheel
-          python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy
+          python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy
           python -m pip install git+https://github.com/nedbat/coveragepy.git
           python -m pip install versioneer[toml]
-          python -m pip install python-dateutil pytz cython hypothesis>=6.46.1 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17
+          python -m pip install python-dateutil pytz cython 'hypothesis>=6.46.1' 'pytest>=7.3.2' 'pytest-xdist>=2.2.0' pytest-cov 'pytest-asyncio>=0.17'
           python -m pip list
 
       - name: Build Pandas

@@ -110,9 +110,12 @@ jobs:
           path: ./dist
 
       - name: Build wheels
-        uses: pypa/[email protected]
-        with:
-          package-dir: ./dist/${{ needs.build_sdist.outputs.sdist_file }}
+        uses: pypa/[email protected]
+        # TODO: Build wheels from sdist again
+        # There's some sort of weird race condition?
+        # within Github that makes the sdist be missing files
+        #with:
+        #  package-dir: ./dist/${{ needs.build_sdist.outputs.sdist_file }}
         env:
           CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}
 
@@ -137,7 +140,7 @@ jobs:
         shell: pwsh
         run: |
           $TST_CMD = @"
-          python -m pip install pytz six numpy python-dateutil tzdata>=2022.1 hypothesis>=6.46.1 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17;
+          python -m pip install pytz six numpy python-dateutil tzdata>=2022.1 hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17;
           python -m pip install --find-links=pandas\wheelhouse --no-index pandas;
           python -c `'import pandas as pd; pd.test()`';
           "@
@@ -156,7 +159,7 @@ jobs:
           PANDAS_STAGING_UPLOAD_TOKEN: ${{ secrets.PANDAS_STAGING_UPLOAD_TOKEN }}
           PANDAS_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.PANDAS_NIGHTLY_UPLOAD_TOKEN }}
         # trigger an upload to
-        # https://anaconda.org/scipy-wheels-nightly/pandas
+        # https://anaconda.org/scientific-python-nightly-wheels/pandas
         # for cron jobs or "Run workflow" (restricted to main branch).
         # Tags will upload to
         # https://anaconda.org/multibuild-wheels-staging/pandas

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -57,6 +57,38 @@
     },
 }
 
+# Groupby methods with no numba kernel yet; numba benchmarks are skipped for them
+_numba_unsupported_methods = [
+    "all",
+    "any",
+    "bfill",
+    "count",
+    "cumcount",
+    "cummax",
+    "cummin",
+    "cumprod",
+    "cumsum",
+    "describe",
+    "diff",
+    "ffill",
+    "first",
+    "head",
+    "last",
+    "median",
+    "nunique",
+    "pct_change",
+    "prod",
+    "quantile",
+    "rank",
+    "sem",
+    "shift",
+    "size",
+    "skew",
+    "tail",
+    "unique",
+    "value_counts",
+]
+
 
 class ApplyDictReturn:
     def setup(self):
@@ -453,9 +485,10 @@ class GroupByMethods:
         ],
         ["direct", "transformation"],
         [1, 5],
+        ["cython", "numba"],
     ]
 
-    def setup(self, dtype, method, application, ncols):
+    def setup(self, dtype, method, application, ncols, engine):
         if method in method_blocklist.get(dtype, {}):
             raise NotImplementedError  # skip benchmark
 
@@ -474,6 +507,19 @@ def setup(self, dtype, method, application, ncols):
             # DataFrameGroupBy doesn't have these methods
             raise NotImplementedError
 
+        # Numba currently doesn't support
+        # multiple transform functions or strs for transform,
+        # grouping on multiple columns
+        # and we lack kernels for a bunch of methods.
+        # These restrictions only apply when engine == "numba".
+        if engine == "numba" and (
+            method in _numba_unsupported_methods
+            or ncols > 1
+            or application == "transformation"
+            or dtype == "datetime"
+        ):
+            raise NotImplementedError  # skip benchmark
+
         if method == "describe":
             ngroups = 20
         elif method == "skew":
@@ -505,17 +551,30 @@ def setup(self, dtype, method, application, ncols):
         if len(cols) == 1:
             cols = cols[0]
 
+        # Not everything supports the engine keyword yet
+        kwargs = {}
+        if engine == "numba":
+            kwargs["engine"] = engine
+
         if application == "transformation":
-            self.as_group_method = lambda: df.groupby("key")[cols].transform(method)
-            self.as_field_method = lambda: df.groupby(cols)["key"].transform(method)
+            self.as_group_method = lambda: df.groupby("key")[cols].transform(
+                method, **kwargs
+            )
+            self.as_field_method = lambda: df.groupby(cols)["key"].transform(
+                method, **kwargs
+            )
         else:
-            self.as_group_method = getattr(df.groupby("key")[cols], method)
-            self.as_field_method = getattr(df.groupby(cols)["key"], method)
+            self.as_group_method = partial(
+                getattr(df.groupby("key")[cols], method), **kwargs
+            )
+            self.as_field_method = partial(
+                getattr(df.groupby(cols)["key"], method), **kwargs
+            )
 
-    def time_dtype_as_group(self, dtype, method, application, ncols):
+    def time_dtype_as_group(self, dtype, method, application, ncols, engine):
         self.as_group_method()
 
-    def time_dtype_as_field(self, dtype, method, application, ncols):
+    def time_dtype_as_field(self, dtype, method, application, ncols, engine):
         self.as_field_method()
 
 
@@ -532,8 +591,12 @@ class GroupByCythonAgg:
         [
             "sum",
             "prod",
-            "min",
-            "max",
+            # TODO: uncomment min/max
+            # Currently, min/max implemented very inefficiently
+            # because it re-uses the Window min/max kernel
+            # so it will time out ASVs
+            # "min",
+            # "max",
             "mean",
             "median",
             "var",
@@ -554,6 +617,22 @@ def time_frame_agg(self, dtype, method):
         self.df.groupby("key").agg(method)
 
 
+class GroupByNumbaAgg(GroupByCythonAgg):
+    """
+    Benchmarks specifically targeting our numba aggregation algorithms
+    (using a big enough dataframe with simple key, so a large part of the
+    time is actually spent in the grouped aggregation).
+    """
+
+    def setup(self, dtype, method):
+        if method in _numba_unsupported_methods:
+            raise NotImplementedError  # skip benchmark: listed as lacking a numba kernel
+        super().setup(dtype, method)  # delegate to GroupByCythonAgg.setup
+
+    def time_frame_agg(self, dtype, method):
+        self.df.groupby("key").agg(method, engine="numba")  # same agg call, numba engine
+
+
 class GroupByCythonAggEaDtypes:
     """
     Benchmarks specifically targeting our cython aggregation algorithms

diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py
@@ -396,4 +396,30 @@ def time_putmask_all_different(self):
         self.midx.putmask(self.mask, self.midx_values_different)
 
 
+class Append:
+    params = ["datetime64[ns]", "int64", "string"]  # dtype of the second level
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        N1 = 1000  # number of values in the first level of each index
+        N2 = 500  # number of values in the second level
+        left_level1 = range(N1)
+        right_level1 = range(N1, N1 + N1)  # does not overlap left_level1
+
+        if dtype == "datetime64[ns]":
+            level2 = date_range(start="2000-01-01", periods=N2)
+        elif dtype == "int64":
+            level2 = range(N2)
+        elif dtype == "string":
+            level2 = tm.makeStringIndex(N2)
+        else:
+            raise NotImplementedError  # dtype not listed in params
+
+        self.left = MultiIndex.from_product([left_level1, level2])
+        self.right = MultiIndex.from_product([right_level1, level2])
+
+    def time_append(self, dtype):
+        self.left.append(self.right)  # timed operation: MultiIndex.append
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip