pandas-dev
diff --git a/‎.github/workflows/code-checks.yml
+3 b/‎.github/workflows/code-checks.yml
+3
diff --git a/‎.github/workflows/macos-windows.yml
+1-1 b/‎.github/workflows/macos-windows.yml
+1-1
diff --git a/‎.github/workflows/ubuntu.yml
+17-3 b/‎.github/workflows/ubuntu.yml
+17-3
diff --git a/‎.libcst.codemod.yaml
+18 b/‎.libcst.codemod.yaml
+18
diff --git a/‎.pre-commit-config.yaml
+16-1 b/‎.pre-commit-config.yaml
+16-1
diff --git a/‎Dockerfile
+6-16 b/‎Dockerfile
+6-16
diff --git a/‎asv_bench/benchmarks/array.py
+4-3 b/‎asv_bench/benchmarks/array.py
+4-3
diff --git a/‎asv_bench/benchmarks/groupby.py
+2 b/‎asv_bench/benchmarks/groupby.py
+2
diff --git a/‎asv_bench/benchmarks/multiindex_object.py
+15 b/‎asv_bench/benchmarks/multiindex_object.py
+15
diff --git a/‎asv_bench/benchmarks/series_methods.py
+14 b/‎asv_bench/benchmarks/series_methods.py
+14
diff --git a/‎doc/source/development/contributing_environment.rst
+10 b/‎doc/source/development/contributing_environment.rst
+10
diff --git a/‎doc/source/user_guide/io.rst
+36 b/‎doc/source/user_guide/io.rst
+36
diff --git a/‎doc/source/user_guide/timeseries.rst
-1 b/‎doc/source/user_guide/timeseries.rst
-1
diff --git a/‎doc/source/whatsnew/v1.5.0.rst
+4 b/‎doc/source/whatsnew/v1.5.0.rst
+4
diff --git a/‎doc/source/whatsnew/v1.6.0.rst
+8-4 b/‎doc/source/whatsnew/v1.6.0.rst
+8-4
@@ -153,6 +153,9 @@ jobs:
       - name: Build image
         run: docker build --pull --no-cache --tag pandas-dev-env .
 
+      - name: Show environment
+        run: docker run -w /home/pandas pandas-dev-env mamba run -n pandas-dev python -c "import pandas as pd; print(pd.show_versions())"
+
   requirements-dev-text-installable:
     name: Test install requirements-dev.txt
     runs-on: ubuntu-latest
 
@@ -28,7 +28,7 @@ jobs:
     defaults:
       run:
         shell: bash -el {0}
-    timeout-minutes: 120
+    timeout-minutes: 180
     strategy:
       matrix:
         os: [macos-latest, windows-latest]
 
@@ -26,7 +26,7 @@ jobs:
     defaults:
       run:
         shell: bash -el {0}
-    timeout-minutes: 120
+    timeout-minutes: 180
     strategy:
       matrix:
         env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml]
@@ -42,18 +42,26 @@ jobs:
           - name: "Minimum Versions"
             env_file: actions-38-minimum_versions.yaml
             pattern: "not slow and not network and not single_cpu"
-          - name: "Locale: it_IT.utf8"
+          - name: "Locale: it_IT"
             env_file: actions-38.yaml
             pattern: "not slow and not network and not single_cpu"
             extra_apt: "language-pack-it"
+            # Use the utf8 version as the default, it has no bad side-effect.
             lang: "it_IT.utf8"
             lc_all: "it_IT.utf8"
-          - name: "Locale: zh_CN.utf8"
+            # Also install it_IT (its encoding is ISO8859-1) but do not activate it.
+            # It will be temporarily activated during tests with locale.setlocale
+            extra_loc: "it_IT"
+          - name: "Locale: zh_CN"
             env_file: actions-38.yaml
             pattern: "not slow and not network and not single_cpu"
             extra_apt: "language-pack-zh-hans"
+            # Use the utf8 version as the default, it has no bad side-effect.
             lang: "zh_CN.utf8"
             lc_all: "zh_CN.utf8"
+            # Also install zh_CN (its encoding is gb2312) but do not activate it.
+            # It will be temporarily activated during tests with locale.setlocale
+            extra_loc: "zh_CN"
           - name: "Copy-on-Write"
             env_file: actions-310.yaml
             pattern: "not slow and not network and not single_cpu"
@@ -148,6 +156,12 @@ jobs:
       # xsel for clipboard tests
       run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }}
 
+    - name: Generate extra locales
+      # These extra locales will be available for locale.setlocale() calls in tests
+      run: |
+        sudo locale-gen ${{ matrix.extra_loc }}
+      if: ${{ matrix.extra_loc }}
+
     - name: Set up Conda
       uses: ./.github/actions/setup-conda
       with:
 
@@ -0,0 +1,18 @@
+# String that LibCST should look for in code which indicates that the
+# module is generated code.
+generated_code_marker: '@generated'
+# Command line and arguments for invoking a code formatter. Anything
+# specified here must be capable of taking code via stdin and returning
+# formatted code via stdout.
+formatter: ['black', '-']
+# List of regex patterns which LibCST will evaluate against filenames to
+# determine if the module should be touched.
+blacklist_patterns: []
+# List of modules that contain codemods inside of them.
+modules:
+- 'libcst.codemod.commands'
+- 'autotyping'
+# Absolute or relative path of the repository root, used for providing
+# full-repo metadata. Relative paths should be specified with this file
+# location as the base.
+repo_root: '.'
@@ -22,10 +22,14 @@ repos:
     hooks:
     -   id: black
 -   repo: https://github.com/codespell-project/codespell
-    rev: v2.1.0
+    rev: v2.2.1
     hooks:
     -   id: codespell
         types_or: [python, rst, markdown]
+-   repo: https://github.com/MarcoGorelli/cython-lint
+    rev: v0.1.4
+    hooks:
+    -   id: cython-lint
 -   repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.3.0
     hooks:
@@ -252,3 +256,14 @@ repos:
             /(__init__\.py)|(api\.py)|(_version\.py)|(testing\.py)|(conftest\.py)$
             |/tests/
             |/_testing/
+    -   id: autotyping
+        name: autotyping
+        entry: python -m libcst.tool codemod autotyping.AutotypeCommand --none-return --scalar-return --annotate-magics --annotate-imprecise-magics
+        types_or: [python, pyi]
+        files: ^pandas
+        exclude: ^(pandas/tests|pandas/io/clipboard)
+        language: python
+        additional_dependencies:
+        - autotyping==22.9.0
+        - black==22.6.0
+        - libcst==0.4.7
@@ -1,4 +1,4 @@
-FROM quay.io/condaforge/mambaforge:4.13.0-1
+FROM quay.io/condaforge/mambaforge
 
 # if you forked pandas, you can pass in your own GitHub username to use your fork
 # i.e. gh_username=myname
@@ -10,16 +10,12 @@ ENV DEBIAN_FRONTEND=noninteractive
 
 # Configure apt and install packages
 RUN apt-get update \
-    && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \
+    && apt-get -y install --no-install-recommends apt-utils git tzdata dialog 2>&1 \
     #
-    # Install tzdata and configure timezone (fix for tests which try to read from "/etc/localtime")
-    && apt-get -y install tzdata \
+    # Configure timezone (fix for tests which try to read from "/etc/localtime")
     && ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime \
     && dpkg-reconfigure -f noninteractive tzdata \
     #
-    # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed
-    && apt-get -y install git iproute2 procps iproute2 lsb-release \
-    #
     # cleanup
     && apt-get autoremove -y \
     && apt-get clean -y \
@@ -35,18 +31,12 @@ RUN mkdir "$pandas_home" \
     && git remote add upstream "https://github.com/pandas-dev/pandas.git" \
     && git pull upstream main
 
-# Because it is surprisingly difficult to activate a conda environment inside a DockerFile
-# (from personal experience and per https://github.com/ContinuumIO/docker-images/issues/89),
-# we just update the base/root one from the 'environment.yml' file instead of creating a new one.
-#
 # Set up environment
-RUN mamba env update -n base -f "$pandas_home/environment.yml"
+RUN mamba env create -f "$pandas_home/environment.yml"
 
 # Build C extensions and pandas
-SHELL ["/bin/bash", "-c"]
-RUN . /opt/conda/etc/profile.d/conda.sh \
-    && conda activate base \
-    && cd "$pandas_home" \
+SHELL ["mamba", "run", "--no-capture-output", "-n", "pandas-dev", "/bin/bash", "-c"]
+RUN cd "$pandas_home" \
     && export \
     && python setup.py build_ext -j 4 \
     && python -m pip install --no-build-isolation -e .
@@ -32,9 +32,10 @@ def time_from_float_array(self):
 
 class IntegerArray:
     def setup(self):
-        self.values_integer = np.array([1, 0, 1, 0])
-        self.data = np.array([1, 2, 3, 4], dtype="int64")
-        self.mask = np.array([False, False, True, False])
+        N = 250_000
+        self.values_integer = np.array([1, 0, 1, 0] * N)
+        self.data = np.array([1, 2, 3, 4] * N, dtype="int64")
+        self.mask = np.array([False, False, True, False] * N)
 
     def time_constructor(self):
         pd.arrays.IntegerArray(self.data, self.mask)
 
@@ -5,6 +5,7 @@
 import numpy as np
 
 from pandas import (
+    NA,
     Categorical,
     DataFrame,
     Index,
@@ -592,6 +593,7 @@ def setup(self, dtype, method):
             columns=list("abcdefghij"),
             dtype=dtype,
         )
+        df.loc[list(range(1, N, 5)), list("abcdefghij")] = NA
         df["key"] = np.random.randint(0, 100, size=N)
         self.df = df
 
 
@@ -8,6 +8,7 @@
     MultiIndex,
     RangeIndex,
     Series,
+    array,
     date_range,
 )
 
@@ -176,6 +177,20 @@ def time_sortlevel_one(self):
         self.mi.sortlevel(1)
 
 
+class SortValues:
+
+    params = ["int64", "Int64"]
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        a = array(np.tile(np.arange(100), 1000), dtype=dtype)
+        b = array(np.tile(np.arange(1000), 100), dtype=dtype)
+        self.mi = MultiIndex.from_arrays([a, b])
+
+    def time_sort_values(self, dtype):
+        self.mi.sort_values()
+
+
 class Values:
     def setup_cache(self):
 
 
@@ -3,6 +3,7 @@
 import numpy as np
 
 from pandas import (
+    NA,
     Index,
     NaT,
     Series,
@@ -166,6 +167,19 @@ def time_value_counts(self, N, dtype):
         self.s.value_counts()
 
 
+class ValueCountsEA:
+
+    params = [[10**3, 10**4, 10**5], [True, False]]
+    param_names = ["N", "dropna"]
+
+    def setup(self, N, dropna):
+        self.s = Series(np.random.randint(0, N, size=10 * N), dtype="Int64")
+        self.s.loc[1] = NA
+
+    def time_value_counts(self, N, dropna):
+        self.s.value_counts(dropna=dropna)
+
+
 class ValueCountsObjectDropNAFalse:
 
     params = [10**3, 10**4, 10**5]
 
@@ -237,6 +237,16 @@ Run Container::
     # Run a container and bind your local repo to the container
     docker run -it -w /home/pandas --rm -v path-to-local-pandas-repo:/home/pandas pandas-yourname-env
 
+Then a ``pandas-dev`` virtual environment will be available with all the development dependencies.
+
+.. code-block:: shell
+
+    root@... :/home/pandas# conda env list
+    # conda environments:
+    #
+    base                  *  /opt/conda
+    pandas-dev               /opt/conda/envs/pandas-dev
+
 .. note::
     If you bind your local repo for the first time, you have to build the C extensions afterwards.
     Run the following command inside the container::
 
@@ -3174,6 +3174,42 @@ But assigning *any* temporary name to correct URI allows parsing by nodes.
 However, if XPath does not reference node names such as default, ``/*``, then
 ``namespaces`` is not required.
 
+.. note::
+
+   Since ``xpath`` identifies the parent of content to be parsed, only immediate
+   descendants which include child nodes or current attributes are parsed.
+   Therefore, ``read_xml`` will not parse the text of grandchildren or other
+   descendants and will not parse attributes of any descendant. To retrieve
+   lower level content, adjust xpath to lower level. For example,
+
+   .. ipython:: python
+        :okwarning:
+
+      xml = """
+      <data>
+        <row>
+          <shape sides="4">square</shape>
+          <degrees>360</degrees>
+        </row>
+        <row>
+          <shape sides="0">circle</shape>
+          <degrees>360</degrees>
+        </row>
+        <row>
+          <shape sides="3">triangle</shape>
+          <degrees>180</degrees>
+        </row>
+      </data>"""
+
+      df = pd.read_xml(xml, xpath="./row")
+      df
+
+   shows that, as expected, the attribute ``sides`` on the ``shape`` element was
+   not parsed, since this attribute resides on a child of the ``row`` element
+   and not on the ``row`` element itself. In other words, the ``sides`` attribute
+   is a grandchild-level descendant of the ``row`` element. However, the ``xpath``
+   targets the ``row`` element, which covers only its children and attributes.
+
 With `lxml`_ as parser, you can flatten nested XML documents with an XSLT
 script which also can be string/file/URL types. As background, `XSLT`_ is
 a special-purpose language written in a special XML file that can transform
 
@@ -1981,7 +1981,6 @@ frequency. Arithmetic is not allowed between ``Period`` with different ``freq``
    p = pd.Period("2012-01", freq="2M")
    p + 2
    p - 1
-   @okexcept
    p == pd.Period("2012-01", freq="3M")
 
 
 
@@ -308,6 +308,7 @@ Other enhancements
 - Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`)
 - :class:`Series` and :class:`DataFrame` with :class:`IntegerDtype` now supports bitwise operations (:issue:`34463`)
 - Add ``milliseconds`` field support for :class:`.DateOffset` (:issue:`43371`)
+- :meth:`DataFrame.where` tries to maintain dtype of :class:`DataFrame` if fill value can be cast without loss of precision (:issue:`45582`)
 - :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`)
 - :func:`concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
 - :func:`concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
@@ -331,6 +332,7 @@ Other enhancements
 - Added ``copy`` keyword to :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` to allow user to set axis on a new object without necessarily copying the underlying data (:issue:`47932`)
 - :meth:`DataFrame.set_index` now supports a ``copy`` keyword. If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`48043`)
 - The method :meth:`.ExtensionArray.factorize` accepts ``use_na_sentinel=False`` for determining how null values are to be treated (:issue:`46601`)
+- The ``Dockerfile`` now installs a dedicated ``pandas-dev`` virtual environment for pandas development instead of using the ``base`` environment (:issue:`48427`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.notable_bug_fixes:
@@ -1011,6 +1013,8 @@ Time Zones
 Numeric
 ^^^^^^^
 - Bug in operations with array-likes with ``dtype="boolean"`` and :attr:`NA` incorrectly altering the array in-place (:issue:`45421`)
+- Bug in arithmetic operations with nullable types without :attr:`NA` values not matching the same operation with non-nullable types (:issue:`48223`)
+- Bug in ``floordiv`` when dividing by ``IntegerDtype`` ``0`` would return ``0`` instead of ``inf`` (:issue:`48223`)
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
 - Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`)
 
@@ -100,7 +100,11 @@ Deprecations
 
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
+- Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`)
+- Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`)
 - Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
+- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
+- Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
 -
 
@@ -154,7 +158,7 @@ Indexing
 ^^^^^^^^
 - Bug in :meth:`DataFrame.reindex` filling with wrong values when indexing columns and index for ``uint`` dtypes (:issue:`48184`)
 - Bug in :meth:`DataFrame.reindex` casting dtype to ``object`` when :class:`DataFrame` has single extension array column when re-indexing ``columns`` and ``index`` (:issue:`48190`)
--
+- Bug in :func:`~DataFrame.describe` where percentiles formatted in the resulting index showed more decimals than needed (:issue:`46362`)
 
 Missing
 ^^^^^^^
@@ -174,7 +178,7 @@ I/O
 
 Period
 ^^^^^^
--
+- Bug in :meth:`Period.strftime` and :meth:`PeriodIndex.strftime`, raising ``UnicodeDecodeError`` when a locale-specific directive was passed (:issue:`46319`)
 -
 
 Plotting
@@ -189,7 +193,7 @@ Groupby/resample/rolling
 
 Reshaping
 ^^^^^^^^^
--
+- Bug in :func:`join` when ``left_on`` or ``right_on`` is or includes a :class:`CategoricalIndex` incorrectly raising ``AttributeError`` (:issue:`48464`)
 -
 
 Sparse
@@ -199,7 +203,7 @@ Sparse
 
 ExtensionArray
 ^^^^^^^^^^^^^^
--
+- Bug in :meth:`Series.mean` overflowing unnecessarily with nullable integers (:issue:`48378`)
 -
 
 Styler