Commit b276196

Merge branch 'pandas-dev:main' into raise-on-parse-int-overflow
2 parents: 5896e01 + d1d9b7f

File tree

22 files changed: +325 −67 lines

Dockerfile (+1 −1)

@@ -1,4 +1,4 @@
-FROM quay.io/condaforge/mambaforge
+FROM quay.io/condaforge/mambaforge:4.13.0-1
 
 # if you forked pandas, you can pass in your own GitHub username to use your fork
 # i.e. gh_username=myname

asv_bench/benchmarks/groupby.py (+2)

@@ -5,6 +5,7 @@
 import numpy as np
 
 from pandas import (
+    NA,
     Categorical,
     DataFrame,
     Index,

@@ -592,6 +593,7 @@ def setup(self, dtype, method):
             columns=list("abcdefghij"),
             dtype=dtype,
         )
+        df.loc[list(range(1, N, 5)), list("abcdefghij")] = NA
         df["key"] = np.random.randint(0, 100, size=N)
         self.df = df

doc/source/user_guide/io.rst (+36)

@@ -3174,6 +3174,42 @@ But assigning *any* temporary name to correct URI allows parsing by nodes.
 However, if XPath does not reference node names such as default, ``/*``, then
 ``namespaces`` is not required.
 
+.. note::
+
+   Since ``xpath`` identifies the parent of content to be parsed, only immediate
+   descendants, which include child nodes and current attributes, are parsed.
+   Therefore, ``read_xml`` will not parse the text of grandchildren or other
+   descendants and will not parse attributes of any descendant. To retrieve
+   lower level content, adjust the xpath to the lower level. For example,
+
+   .. ipython:: python
+       :okwarning:
+
+       xml = """
+       <data>
+         <row>
+           <shape sides="4">square</shape>
+           <degrees>360</degrees>
+         </row>
+         <row>
+           <shape sides="0">circle</shape>
+           <degrees>360</degrees>
+         </row>
+         <row>
+           <shape sides="3">triangle</shape>
+           <degrees>180</degrees>
+         </row>
+       </data>"""
+
+       df = pd.read_xml(xml, xpath="./row")
+       df
+
+   shows the attribute ``sides`` on the ``shape`` element was not parsed as
+   expected, since this attribute resides on the child of the ``row`` element
+   and not on the ``row`` element itself. In other words, the ``sides``
+   attribute is a grandchild-level descendant of the ``row`` element, while
+   the ``xpath`` targets the ``row`` element, which covers only its children
+   and attributes.
+
 With `lxml`_ as parser, you can flatten nested XML documents with an XSLT
 script which also can be string/file/URL types. As background, `XSLT`_ is
 a special-purpose language written in a special XML file that can transform
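To make the note above concrete, pointing ``xpath`` one level lower exposes the ``sides`` attribute. A minimal sketch, reusing the same ``xml`` document as the note and assuming the stdlib-backed ``etree`` parser (so ``lxml`` is not required):

```python
import pandas as pd

xml = """
<data>
  <row>
    <shape sides="4">square</shape>
    <degrees>360</degrees>
  </row>
  <row>
    <shape sides="0">circle</shape>
    <degrees>360</degrees>
  </row>
  <row>
    <shape sides="3">triangle</shape>
    <degrees>180</degrees>
  </row>
</data>"""

# Targeting the child element directly makes its own attributes
# (here, "sides") visible to the parser.
df = pd.read_xml(xml, xpath=".//shape", parser="etree")
print(df)
```

Each matched ``shape`` element should now contribute its ``sides`` attribute as a column, which the higher-level ``xpath="./row"`` query could not reach.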

doc/source/whatsnew/v1.5.0.rst (+2)

@@ -1011,6 +1011,8 @@ Time Zones
 Numeric
 ^^^^^^^
 - Bug in operations with array-likes with ``dtype="boolean"`` and :attr:`NA` incorrectly altering the array in-place (:issue:`45421`)
+- Bug in arithmetic operations with nullable types without :attr:`NA` values not matching the same operation with non-nullable types (:issue:`48223`)
+- Bug in ``floordiv`` when dividing by ``IntegerDtype`` ``0`` would return ``0`` instead of ``inf`` (:issue:`48223`)
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
 - Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`)

doc/source/whatsnew/v1.6.0.rst (+2 −1)

@@ -100,6 +100,7 @@ Deprecations
 
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
+- Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`)
 - Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
 -

@@ -154,7 +155,7 @@ Indexing
 ^^^^^^^^
 - Bug in :meth:`DataFrame.reindex` filling with wrong values when indexing columns and index for ``uint`` dtypes (:issue:`48184`)
 - Bug in :meth:`DataFrame.reindex` casting dtype to ``object`` when :class:`DataFrame` has single extension array column when re-indexing ``columns`` and ``index`` (:issue:`48190`)
--
+- Bug in :func:`~DataFrame.describe` when formatting percentiles in the resulting index showed more decimals than needed (:issue:`46362`)
 
 Missing
 ^^^^^^^

pandas/_libs/groupby.pyi (+2)

@@ -10,6 +10,8 @@ def group_median_float64(
     values: np.ndarray,  # ndarray[float64_t, ndim=2]
     labels: npt.NDArray[np.int64],
     min_count: int = ...,  # Py_ssize_t
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
 ) -> None: ...
 def group_cumprod_float64(
     out: np.ndarray,  # float64_t[:, ::1]

pandas/_libs/groupby.pyx (+95 −19)

@@ -41,6 +41,7 @@ from pandas._libs.algos import (
     ensure_platform_int,
     groupsort_indexer,
     rank_1d,
+    take_2d_axis1_bool_bool,
     take_2d_axis1_float64_float64,
 )

@@ -64,11 +65,48 @@ cdef enum InterpolationEnumType:
     INTERPOLATION_MIDPOINT
 
 
-cdef inline float64_t median_linear(float64_t* a, int n) nogil:
+cdef inline float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) nogil:
     cdef:
         int i, j, na_count = 0
+        float64_t* tmp
         float64_t result
+
+    if n == 0:
+        return NaN
+
+    # count NAs
+    for i in range(n):
+        if mask[i]:
+            na_count += 1
+
+    if na_count:
+        if na_count == n:
+            return NaN
+
+        tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
+
+        j = 0
+        for i in range(n):
+            if not mask[i]:
+                tmp[j] = a[i]
+                j += 1
+
+        a = tmp
+        n -= na_count
+
+    result = calc_median_linear(a, n, na_count)
+
+    if na_count:
+        free(a)
+
+    return result
+
+
+cdef inline float64_t median_linear(float64_t* a, int n) nogil:
+    cdef:
+        int i, j, na_count = 0
         float64_t* tmp
+        float64_t result
 
     if n == 0:
         return NaN

@@ -93,18 +131,34 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil:
         a = tmp
         n -= na_count
 
+    result = calc_median_linear(a, n, na_count)
+
+    if na_count:
+        free(a)
+
+    return result
+
+
+cdef inline float64_t calc_median_linear(float64_t* a, int n, int na_count) nogil:
+    cdef:
+        float64_t result
+
     if n % 2:
         result = kth_smallest_c(a, n // 2, n)
     else:
         result = (kth_smallest_c(a, n // 2, n) +
                   kth_smallest_c(a, n // 2 - 1, n)) / 2
 
-    if na_count:
-        free(a)
-
     return result
 
 
+ctypedef fused int64float_t:
+    int64_t
+    uint64_t
+    float32_t
+    float64_t
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_median_float64(

@@ -113,6 +167,8 @@ def group_median_float64(
     ndarray[float64_t, ndim=2] values,
     ndarray[intp_t] labels,
     Py_ssize_t min_count=-1,
+    const uint8_t[:, :] mask=None,
+    uint8_t[:, ::1] result_mask=None,
 ) -> None:
     """
     Only aggregates on axis=0

@@ -121,8 +177,12 @@ def group_median_float64(
         Py_ssize_t i, j, N, K, ngroups, size
         ndarray[intp_t] _counts
         ndarray[float64_t, ndim=2] data
+        ndarray[uint8_t, ndim=2] data_mask
         ndarray[intp_t] indexer
         float64_t* ptr
+        uint8_t* ptr_mask
+        float64_t result
+        bint uses_mask = mask is not None
 
     assert min_count == -1, "'min_count' only used in sum and prod"

@@ -137,15 +197,38 @@ def group_median_float64(
 
     take_2d_axis1_float64_float64(values.T, indexer, out=data)
 
-    with nogil:
+    if uses_mask:
+        data_mask = np.empty((K, N), dtype=np.uint8)
+        ptr_mask = <uint8_t *>cnp.PyArray_DATA(data_mask)
+
+        take_2d_axis1_bool_bool(mask.T, indexer, out=data_mask, fill_value=1)
 
-        for i in range(K):
-            # exclude NA group
-            ptr += _counts[0]
-            for j in range(ngroups):
-                size = _counts[j + 1]
-                out[j, i] = median_linear(ptr, size)
-                ptr += size
+        with nogil:
+
+            for i in range(K):
+                # exclude NA group
+                ptr += _counts[0]
+                ptr_mask += _counts[0]
+
+                for j in range(ngroups):
+                    size = _counts[j + 1]
+                    result = median_linear_mask(ptr, size, ptr_mask)
+                    out[j, i] = result
+
+                    if result != result:
+                        result_mask[j, i] = 1
+                    ptr += size
+                    ptr_mask += size
+
+    else:
+        with nogil:
+            for i in range(K):
+                # exclude NA group
+                ptr += _counts[0]
+                for j in range(ngroups):
+                    size = _counts[j + 1]
+                    out[j, i] = median_linear(ptr, size)
+                    ptr += size

@@ -206,13 +289,6 @@ def group_cumprod_float64(
             accum[lab, j] = NaN
 
 
-ctypedef fused int64float_t:
-    int64_t
-    uint64_t
-    float32_t
-    float64_t
-
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_cumsum(
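The user-visible effect of the masked kernel above is that ``GroupBy.median`` on a nullable dtype now skips ``pd.NA`` positions via the mask instead of round-tripping through a dense float array. A quick sketch:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "key": [1, 1, 1, 2, 2],
        "val": pd.array([1, pd.NA, 3, 4, pd.NA], dtype="Int64"),
    }
)

# NA positions are excluded by the mask, so group 1's median is
# taken over [1, 3] and group 2's over [4].
result = df.groupby("key")["val"].median()
print(result)
```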

pandas/core/algorithms.py (+23 −5)

@@ -14,6 +14,7 @@
     Sequence,
     cast,
     final,
+    overload,
 )
 import warnings

@@ -101,6 +102,7 @@
     Categorical,
     DataFrame,
     Index,
+    MultiIndex,
     Series,
 )
 from pandas.core.arrays import (

@@ -1780,7 +1782,7 @@ def safe_sort(
     na_sentinel: int = -1,
     assume_unique: bool = False,
     verify: bool = True,
-) -> np.ndarray | tuple[np.ndarray, np.ndarray]:
+) -> np.ndarray | MultiIndex | tuple[np.ndarray | MultiIndex, np.ndarray]:
     """
     Sort ``values`` and reorder corresponding ``codes``.

@@ -1809,7 +1811,7 @@ def safe_sort(
 
     Returns
     -------
-    ordered : ndarray
+    ordered : ndarray or MultiIndex
         Sorted ``values``
     new_codes : ndarray
         Reordered ``codes``; returned when ``codes`` is not None.

@@ -1827,6 +1829,7 @@ def safe_sort(
         raise TypeError(
             "Only list-like objects are allowed to be passed to safe_sort as values"
         )
+    original_values = values
 
     if not isinstance(values, (np.ndarray, ABCExtensionArray)):
         # don't convert to string types

@@ -1838,6 +1841,7 @@ def safe_sort(
         values = np.asarray(values, dtype=dtype)  # type: ignore[arg-type]
 
     sorter = None
+    ordered: np.ndarray | MultiIndex
 
     if (
         not is_extension_array_dtype(values)

@@ -1853,7 +1857,7 @@ def safe_sort(
         # which would work, but which fails for special case of 1d arrays
         # with tuples.
         if values.size and isinstance(values[0], tuple):
-            ordered = _sort_tuples(values)
+            ordered = _sort_tuples(values, original_values)
         else:
             ordered = _sort_mixed(values)

@@ -1915,19 +1919,33 @@ def _sort_mixed(values) -> np.ndarray:
     )
 
 
-def _sort_tuples(values: np.ndarray) -> np.ndarray:
+@overload
+def _sort_tuples(values: np.ndarray, original_values: np.ndarray) -> np.ndarray:
+    ...
+
+
+@overload
+def _sort_tuples(values: np.ndarray, original_values: MultiIndex) -> MultiIndex:
+    ...
+
+
+def _sort_tuples(
+    values: np.ndarray, original_values: np.ndarray | MultiIndex
+) -> np.ndarray | MultiIndex:
     """
     Convert array of tuples (1d) to array or array (2d).
     We need to keep the columns separately as they contain different types and
     nans (can't use `np.sort` as it may fail when str and nan are mixed in a
     column as types cannot be compared).
+    We have to apply the indexer to the original values to keep the dtypes in
+    case of MultiIndexes
     """
     from pandas.core.internals.construction import to_arrays
     from pandas.core.sorting import lexsort_indexer
 
     arrays, _ = to_arrays(values, None)
     indexer = lexsort_indexer(arrays, orders=True)
-    return values[indexer]
+    return original_values[indexer]
 
 
 def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:

pandas/core/dtypes/common.py (+3)

@@ -280,6 +280,9 @@ def is_categorical(arr) -> bool:
     """
     Check whether an array-like is a Categorical instance.
 
+    .. deprecated:: 1.1.0
+       Use ``is_categorical_dtype`` instead.
+
     Parameters
     ----------
     arr : array-like
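The replacement the deprecation note points to, sketched below. ``is_categorical_dtype`` accepts Series, arrays, or dtype objects; note it has itself been deprecated in later pandas versions in favor of an ``isinstance`` check against ``pd.CategoricalDtype``:

```python
import pandas as pd
from pandas.api.types import is_categorical_dtype

ser = pd.Series(["a", "b", "a"], dtype="category")

# Works on the object or on its dtype.
flag = is_categorical_dtype(ser)

# Equivalent modern spelling (no deprecation warning):
flag_modern = isinstance(ser.dtype, pd.CategoricalDtype)
print(flag, flag_modern)
```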

pandas/core/frame.py (+1 −1)

@@ -9862,7 +9862,7 @@ def join(
         values given, the `other` DataFrame must have a MultiIndex. Can
         pass an array as the join key if it is not already contained in
         the calling DataFrame. Like an Excel VLOOKUP operation.
-    how : {'left', 'right', 'outer', 'inner'}, default 'left'
+    how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
         How to handle the operation of the two objects.
 
         * left: use calling frame's index (or column if on is specified)
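The newly documented ``'cross'`` option forms the cartesian product of both frames' rows, ignoring the indexes. A sketch:

```python
import pandas as pd

left = pd.DataFrame({"color": ["red", "blue"]})
right = pd.DataFrame({"size": ["S", "M", "L"]})

# Every row of `left` is paired with every row of `right`,
# preserving the order of the left frame.
out = left.join(right, how="cross")
print(out)
```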
