
Commit 445484c

Merge remote-tracking branch 'upstream/main' into bisect

2 parents: 3b18c35 + d0cf9b5

282 files changed: +6296 -3893 lines

.github/workflows/datamanger.yml (-54)

This file was deleted.

.github/workflows/posix.yml (+20 -1)

@@ -26,37 +26,52 @@ jobs:
       matrix:
         env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml]
         pattern: ["not single_cpu", "single_cpu"]
+        # Don't test pyarrow v2/3: Causes timeouts in read_csv engine
+        # even if tests are skipped/xfailed
+        pyarrow_version: ["5", "7"]
         include:
           - env_file: actions-38-downstream_compat.yaml
             pattern: "not slow and not network and not single_cpu"
             pytest_target: "pandas/tests/test_downstream.py"
+            name: "Downstream Compat"
           - env_file: actions-38-minimum_versions.yaml
             pattern: "not slow and not network and not single_cpu"
+            name: "Minimum Versions"
           - env_file: actions-38.yaml
             pattern: "not slow and not network and not single_cpu"
             extra_apt: "language-pack-it"
             lang: "it_IT.utf8"
             lc_all: "it_IT.utf8"
+            name: "Locale: it_IT.utf8"
           - env_file: actions-38.yaml
             pattern: "not slow and not network and not single_cpu"
             extra_apt: "language-pack-zh-hans"
             lang: "zh_CN.utf8"
             lc_all: "zh_CN.utf8"
+            name: "Locale: zh_CN.utf8"
+          - env_file: actions-38.yaml
+            pattern: "not slow and not network and not single_cpu"
+            pandas_data_manager: "array"
+            name: "Data Manager"
           - env_file: actions-pypy-38.yaml
             pattern: "not slow and not network and not single_cpu"
             test_args: "--max-worker-restart 0"
+            name: "Pypy"
           - env_file: actions-310-numpydev.yaml
             pattern: "not slow and not network and not single_cpu"
             pandas_testing_mode: "deprecate"
             test_args: "-W error"
+            name: "Numpy Dev"
       fail-fast: false
+    name: ${{ matrix.name || format('{0} pyarrow={1} {2}', matrix.env_file, matrix.pyarrow_version, matrix.pattern) }}
     env:
       ENV_FILE: ci/deps/${{ matrix.env_file }}
       PATTERN: ${{ matrix.pattern }}
       EXTRA_APT: ${{ matrix.extra_apt || '' }}
       LANG: ${{ matrix.lang || '' }}
       LC_ALL: ${{ matrix.lc_all || '' }}
       PANDAS_TESTING_MODE: ${{ matrix.pandas_testing_mode || '' }}
+      PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }}
       TEST_ARGS: ${{ matrix.test_args || '' }}
       PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }}
       PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
@@ -65,7 +80,7 @@ jobs:
       COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }}
     concurrency:
       # https://github.community/t/concurrecy-not-work-for-push/183068/7
-      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}
+      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.pyarrow_version || '' }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_data_manager || '' }}
       cancel-in-progress: true

     services:
@@ -133,6 +148,10 @@ jobs:
         use-only-tar-bz2: true
       if: ${{ env.IS_PYPY == 'false' }} # No pypy3.8 support

+    - name: Upgrade Arrow version
+      run: conda install -n pandas-dev -c conda-forge --no-update-deps pyarrow=${{ matrix.pyarrow_version }}
+      if: ${{ matrix.pyarrow_version }}
+
     - name: Setup PyPy
       uses: actions/setup-python@v2
       with:
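The new job-level `name:` prefers an explicit matrix name and otherwise composes one from the matrix values. A rough Python analogue of that expression (illustrative sketch only, not part of this commit):

# Illustrative: mimics ${{ matrix.name || format('{0} pyarrow={1} {2}', ...) }}
def job_name(matrix: dict) -> str:
    return matrix.get("name") or "{env_file} pyarrow={pyarrow_version} {pattern}".format(**matrix)

print(job_name({"env_file": "actions-38.yaml", "pyarrow_version": "5",
                "pattern": "not single_cpu"}))
# -> actions-38.yaml pyarrow=5 not single_cpu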

.github/workflows/sdist.yml (+2)

@@ -9,11 +9,13 @@ on:
     branches:
       - main
       - 1.4.x
+    types: [labeled, opened, synchronize, reopened]
   paths-ignore:
     - "doc/**"

 jobs:
   build:
+    if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
     runs-on: ubuntu-latest
     timeout-minutes: 60
     defaults:
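The effect of this gate: the sdist build runs for pushes, and for pull requests only when they carry (or have just been given) the 'Build' label. A minimal Python sketch of the same logic (illustrative; the function and argument names are ours, not the workflow's):

# Illustrative sketch of the new `if:` condition.
def should_build(event_name: str, label_name, pr_labels) -> bool:
    # `label_name` is set for `labeled` events; `pr_labels` lists the PR's labels
    return label_name == "Build" or "Build" in pr_labels or event_name == "push"

assert should_build("pull_request", None, ["Build"])
assert should_build("push", None, [])
assert not should_build("pull_request", None, [])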

.pre-commit-config.yaml (+1 -1)

@@ -178,7 +178,7 @@ repos:
         language: python
         files: ^pandas/core/generic\.py$
     -   id: pandas-errors-documented
-        name: Ensure pandas errors are documented in doc/source/reference/general_utility_functions.rst
+        name: Ensure pandas errors are documented in doc/source/reference/testing.rst
         entry: python scripts/pandas_errors_documented.py
         language: python
         files: ^pandas/errors/__init__.py$

Dockerfile (+6 -1)

@@ -1,4 +1,4 @@
-FROM quay.io/condaforge/miniforge3:4.11.0-0
+FROM quay.io/condaforge/miniforge3

 # if you forked pandas, you can pass in your own GitHub username to use your fork
 # i.e. gh_username=myname
@@ -12,6 +12,11 @@ ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update \
     && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \
     #
+    # Install tzdata and configure timezone (fix for tests which try to read from "/etc/localtime")
+    && apt-get -y install tzdata \
+    && ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime \
+    && dpkg-reconfigure -f noninteractive tzdata \
+    #
     # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed
     && apt-get -y install git iproute2 procps iproute2 lsb-release \
     #
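The tzdata step exists because some tests resolve the system timezone through /etc/localtime, which a minimal base image lacks. A hypothetical repro of the lookup involved (illustrative only):

# Illustrative: on Linux this resolves the local zone via /etc/localtime;
# on a minimal image without tzdata the file is missing.
from datetime import datetime

print(datetime.now().astimezone().tzinfo)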

asv_bench/benchmarks/groupby.py (+4 -2)

@@ -18,6 +18,7 @@

 method_blocklist = {
     "object": {
+        "diff",
         "median",
         "prod",
         "sem",
@@ -405,7 +406,7 @@ class GroupByMethods:

     param_names = ["dtype", "method", "application", "ncols"]
     params = [
-        ["int", "float", "object", "datetime", "uint"],
+        ["int", "int16", "float", "object", "datetime", "uint"],
         [
             "all",
             "any",
@@ -417,6 +418,7 @@ class GroupByMethods:
             "cumprod",
             "cumsum",
             "describe",
+            "diff",
             "ffill",
             "first",
             "head",
@@ -478,7 +480,7 @@ def setup(self, dtype, method, application, ncols):
         values = rng.take(taker, axis=0)
         if dtype == "int":
             key = np.random.randint(0, size, size=size)
-        elif dtype == "uint":
+        elif dtype in ("int16", "uint"):
             key = np.random.randint(0, size, size=size, dtype=dtype)
         elif dtype == "float":
             key = np.concatenate(
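The new `diff` method and `int16` dtype cases exercise operations like the following (sizes here are illustrative, not the benchmark's):

import numpy as np
import pandas as pd

# int16 grouping keys, as in the new benchmark parameter
key = np.random.randint(0, 100, size=10_000, dtype="int16")
df = pd.DataFrame({"key": key, "value": np.random.randn(10_000)})

# per-group first discrete difference, the newly benchmarked method
result = df.groupby("key")["value"].diff()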

asv_bench/benchmarks/indexing.py (+91 -21)

@@ -13,7 +13,6 @@
     CategoricalIndex,
     DataFrame,
     Float64Index,
-    IndexSlice,
     Int64Index,
     IntervalIndex,
     MultiIndex,
@@ -200,28 +199,81 @@ def time_take(self, index):


 class MultiIndexing:
-    def setup(self):
-        mi = MultiIndex.from_product([range(1000), range(1000)])
-        self.s = Series(np.random.randn(1000000), index=mi)
-        self.df = DataFrame(self.s)

-        n = 100000
-        with warnings.catch_warnings(record=True):
-            self.mdt = DataFrame(
-                {
-                    "A": np.random.choice(range(10000, 45000, 1000), n),
-                    "B": np.random.choice(range(10, 400), n),
-                    "C": np.random.choice(range(1, 150), n),
-                    "D": np.random.choice(range(10000, 45000), n),
-                    "x": np.random.choice(range(400), n),
-                    "y": np.random.choice(range(25), n),
-                }
-            )
-        self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000]
-        self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index()
+    params = [True, False]
+    param_names = ["unique_levels"]
+
+    def setup(self, unique_levels):
+        self.nlevels = 2
+        if unique_levels:
+            mi = MultiIndex.from_arrays([range(1000000)] * self.nlevels)
+        else:
+            mi = MultiIndex.from_product([range(1000)] * self.nlevels)
+        self.df = DataFrame(np.random.randn(len(mi)), index=mi)
+
+        self.tgt_slice = slice(200, 800)
+        self.tgt_null_slice = slice(None)
+        self.tgt_list = list(range(0, 1000, 10))
+        self.tgt_scalar = 500
+
+        bool_indexer = np.zeros(len(mi), dtype=np.bool_)
+        bool_indexer[slice(0, len(mi), 100)] = True
+        self.tgt_bool_indexer = bool_indexer
+
+    def time_loc_partial_key_slice(self, unique_levels):
+        self.df.loc[self.tgt_slice, :]
+
+    def time_loc_partial_key_null_slice(self, unique_levels):
+        self.df.loc[self.tgt_null_slice, :]
+
+    def time_loc_partial_key_list(self, unique_levels):
+        self.df.loc[self.tgt_list, :]
+
+    def time_loc_partial_key_scalar(self, unique_levels):
+        self.df.loc[self.tgt_scalar, :]
+
+    def time_loc_partial_key_bool_indexer(self, unique_levels):
+        self.df.loc[self.tgt_bool_indexer, :]
+
+    def time_loc_all_slices(self, unique_levels):
+        target = tuple([self.tgt_slice] * self.nlevels)
+        self.df.loc[target, :]
+
+    def time_loc_all_null_slices(self, unique_levels):
+        target = tuple([self.tgt_null_slice] * self.nlevels)
+        self.df.loc[target, :]
+
+    def time_loc_all_lists(self, unique_levels):
+        target = tuple([self.tgt_list] * self.nlevels)
+        self.df.loc[target, :]

-    def time_index_slice(self):
-        self.mdt.loc[self.idx, :]
+    def time_loc_all_scalars(self, unique_levels):
+        target = tuple([self.tgt_scalar] * self.nlevels)
+        self.df.loc[target, :]
+
+    def time_loc_all_bool_indexers(self, unique_levels):
+        target = tuple([self.tgt_bool_indexer] * self.nlevels)
+        self.df.loc[target, :]
+
+    def time_loc_slice_plus_null_slice(self, unique_levels):
+        target = (self.tgt_slice, self.tgt_null_slice)
+        self.df.loc[target, :]
+
+    def time_loc_null_slice_plus_slice(self, unique_levels):
+        target = (self.tgt_null_slice, self.tgt_slice)
+        self.df.loc[target, :]
+
+    def time_xs_level_0(self, unique_levels):
+        target = self.tgt_scalar
+        self.df.xs(target, level=0)
+
+    def time_xs_level_1(self, unique_levels):
+        target = self.tgt_scalar
+        self.df.xs(target, level=1)
+
+    def time_xs_full_key(self, unique_levels):
+        target = tuple([self.tgt_scalar] * self.nlevels)
+        self.df.xs(target)


 class IntervalIndexing:
@@ -257,6 +309,24 @@ def time_get_indexer_mismatched_tz(self):
         self.dti.get_indexer(self.dti2)


+class SortedAndUnsortedDatetimeIndexLoc:
+    def setup(self):
+        dti = date_range("2016-01-01", periods=10000, tz="US/Pacific")
+        index = np.array(dti)
+
+        unsorted_index = index.copy()
+        unsorted_index[10] = unsorted_index[20]
+
+        self.df_unsorted = DataFrame(index=unsorted_index, data={"a": 1})
+        self.df_sort = DataFrame(index=index, data={"a": 1})
+
+    def time_loc_unsorted(self):
+        self.df_unsorted.loc["2016-6-11"]
+
+    def time_loc_sorted(self):
+        self.df_sort.loc["2016-6-11"]
+
+
 class CategoricalIndexIndexing:

     params = ["monotonic_incr", "monotonic_decr", "non_monotonic"]
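For orientation, the indexer shapes the rewritten MultiIndexing benchmarks time look like this in ordinary use (tiny sizes, illustrative only):

import numpy as np
import pandas as pd

mi = pd.MultiIndex.from_product([range(10)] * 2)
df = pd.DataFrame(np.random.randn(len(mi)), index=mi)

df.loc[slice(2, 8), :]                  # partial key: slice on level 0 only
df.loc[(slice(2, 8), slice(None)), :]   # full key: slice plus null slice
df.xs(5, level=1)                       # cross-section on level 1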

asv_bench/benchmarks/join_merge.py (+13)

@@ -158,6 +158,19 @@ def time_left_outer_join_index(self):
         self.left.join(self.right, on="jim")


+class JoinEmpty:
+    def setup(self):
+        N = 100_000
+        self.df = DataFrame({"A": np.arange(N)})
+        self.df_empty = DataFrame(columns=["B", "C"], dtype="int64")
+
+    def time_inner_join_left_empty(self):
+        self.df_empty.join(self.df, how="inner")
+
+    def time_inner_join_right_empty(self):
+        self.df.join(self.df_empty, how="inner")
+
+
 class JoinNonUnique:
     # outer join of non-unique
     # GH 6329
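In miniature, the new JoinEmpty cases time joins like these (sizes here are illustrative, not the benchmark's N = 100_000):

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": np.arange(5)})
df_empty = pd.DataFrame(columns=["B", "C"], dtype="int64")

df.join(df_empty, how="inner")    # right side empty: empty result
df_empty.join(df, how="inner")    # left side empty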

asv_bench/benchmarks/reindex.py (+7 -1)

@@ -28,16 +28,22 @@ def setup(self):
         index = MultiIndex.from_arrays([level1, level2])
         self.s = Series(np.random.randn(N * K), index=index)
         self.s_subset = self.s[::2]
+        self.s_subset_no_cache = self.s[::2].copy()

     def time_reindex_dates(self):
         self.df.reindex(self.rng_subset)

     def time_reindex_columns(self):
         self.df2.reindex(columns=self.df.columns[1:5])

-    def time_reindex_multiindex(self):
+    def time_reindex_multiindex_with_cache(self):
+        # MultiIndex._values gets cached
         self.s.reindex(self.s_subset.index)

+    def time_reindex_multiindex_no_cache(self):
+        # Copy to avoid MultiIndex._values getting cached
+        self.s.reindex(self.s_subset_no_cache.index.copy())
+

 class ReindexMethod:
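The with_cache/no_cache split isolates the effect of MultiIndex._values caching noted in the comments above. A sketch of the distinction, assuming only the caching behavior those comments describe:

import numpy as np
import pandas as pd

mi = pd.MultiIndex.from_product([range(100), range(100)])
s = pd.Series(np.random.randn(len(mi)), index=mi)
subset_index = s[::2].index

s.reindex(subset_index)          # repeated use reuses the cached ._values
s.reindex(subset_index.copy())   # a fresh copy starts with a cold cache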

azure-pipelines.yml (-1)

@@ -22,7 +22,6 @@ variables:
   PANDAS_CI: 1

 jobs:
-  # Mac and Linux use the same template
   - template: ci/azure/posix.yml
     parameters:
       name: macOS
