Sync Fork from Upstream Repo #247

Merged: 10 commits, Jul 31, 2021
2 changes: 2 additions & 0 deletions ci/code_checks.sh
@@ -121,6 +121,8 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
pandas/io/parsers/ \
pandas/io/sas/ \
pandas/io/sql.py \
pandas/io/formats/format.py \
pandas/io/formats/style.py \
pandas/tseries/
RET=$(($RET + $?)) ; echo $MSG "DONE"

Binary file added doc/source/_static/style/df_pipe.png
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v1.3.2.rst
@@ -19,6 +19,7 @@ Fixed regressions
- Fixed regression in :meth:`DataFrame.from_records` with empty records (:issue:`42456`)
- Fixed regression in :meth:`DataFrame.shift` where a ``TypeError`` occurred when shifting a :class:`DataFrame` created by concatenating slices and filling with values (:issue:`42719`)
- Fixed regression in :meth:`DataFrame.agg` when the ``func`` argument returned lists and ``axis=1`` (:issue:`42727`)
- Fixed regression where :meth:`DataFrame.drop` did nothing if the :class:`MultiIndex` had duplicates and the indexer was a tuple or list of tuples (:issue:`42771`)
- Fixed regression where :meth:`pandas.read_csv` raised a ``ValueError`` when parameters ``names`` and ``prefix`` were both set to ``None`` (:issue:`42387`)
- Fixed regression in comparisons between :class:`Timestamp` objects and ``datetime64`` objects outside the implementation bounds for nanosecond ``datetime64`` (:issue:`42794`)
-
@@ -29,7 +30,7 @@ Fixed regressions

Bug fixes
~~~~~~~~~
-
- Bug where 1D slices over extension types turned into N-dimensional slices over :class:`ExtensionArray` objects (:issue:`42430`)
-

.. ---------------------------------------------------------------------------
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
@@ -37,6 +37,7 @@ Other enhancements
- :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
- Added ``sparse_index`` and ``sparse_columns`` keyword arguments to :meth:`.Styler.to_html` (:issue:`41946`)
- Added keyword argument ``environment`` to :meth:`.Styler.to_latex`, which also allows a specific "longtable" entry with a separate jinja2 template (:issue:`41866`)
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`); see the usage sketch after this list
-

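A minimal usage sketch of the new ``skipna`` argument (toy data; with ``skipna=False``, a NaN poisons the running extremum for the rest of its group):

import numpy as np
import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "a"], "v": [3.0, np.nan, 1.0]})

# Default skipna=True: the NaN is skipped when updating the running minimum
print(df.groupby("g")["v"].cummin().tolist())              # [3.0, nan, 1.0]

# skipna=False: once a NaN is seen, the cumulative result stays NaN
print(df.groupby("g")["v"].cummin(skipna=False).tolist())  # [3.0, nan, nan]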
.. ---------------------------------------------------------------------------
@@ -170,6 +171,8 @@ Performance improvements
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`)
- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`)
- Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`)
- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`)
-

.. ---------------------------------------------------------------------------

@@ -229,6 +232,8 @@ Indexing
- Bug in :meth:`Series.loc` with a :class:`MultiIndex` whose first level contains only ``np.nan`` values (:issue:`42055`)
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` where, when passing a string, the return type depended on whether the index was monotonic (:issue:`24892`)
- Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`)
- Bug in :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` failing to raise, or incorrectly raising, ``ValueError`` when passed an invalid ``ascending`` value (:issue:`41634`)
- Bug in updating the values of a :class:`pandas.Series` using a boolean index, when the Series was created by :meth:`pandas.DataFrame.pop` (:issue:`42530`)

Missing
^^^^^^^
62 changes: 49 additions & 13 deletions pandas/_libs/algos.pyx
@@ -326,8 +326,12 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
Py_ssize_t i, j, xi, yi, N, K
bint minpv
float64_t[:, ::1] result
# Initialized to None since these are only used in the no-missing-values case
float64_t[::1] means=None, ssqds=None
ndarray[uint8_t, ndim=2] mask
bint no_nans
int64_t nobs = 0
float64_t mean, ssqd, val
float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy

N, K = (<object>mat).shape
@@ -339,25 +343,57 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):

result = np.empty((K, K), dtype=np.float64)
mask = np.isfinite(mat).view(np.uint8)
no_nans = mask.all()

# Computing the online means and variances is expensive, so when possible we
# precompute them once and avoid repeating the computation for each
# (xi, yi) pair
if no_nans:
means = np.empty(K, dtype=np.float64)
ssqds = np.empty(K, dtype=np.float64)

with nogil:
for j in range(K):
ssqd = mean = 0
for i in range(N):
val = mat[i, j]
dx = val - mean
mean += 1 / (i + 1) * dx
ssqd += (val - mean) * dx

means[j] = mean
ssqds[j] = ssqd

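For reference, a pure-Python sketch of the Welford update performed by the precompute loop above (a single numerically stable pass per column; welford_mean_ssqd is a hypothetical helper, not pandas code):

def welford_mean_ssqd(xs):
    """Return (mean, sum of squared deviations from the mean) in one pass."""
    mean = ssqd = 0.0
    for i, x in enumerate(xs, start=1):
        dx = x - mean               # deviation from the old mean
        mean += dx / i              # running-mean update
        ssqd += (x - mean) * dx     # mixes old and new deviations; stays stable
    return mean, ssqd

# If the variance itself is needed: ssqd / (n - 1) gives the sample variance.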
with nogil:
for xi in range(K):
for yi in range(xi + 1):
# Welford's method for the variance calculation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
for i in range(N):
if mask[i, xi] and mask[i, yi]:
covxy = 0
if no_nans:
for i in range(N):
vx = mat[i, xi]
vy = mat[i, yi]
nobs += 1
dx = vx - meanx
dy = vy - meany
meanx += 1 / nobs * dx
meany += 1 / nobs * dy
ssqdmx += (vx - meanx) * dx
ssqdmy += (vy - meany) * dy
covxy += (vx - meanx) * dy
covxy += (vx - means[xi]) * (vy - means[yi])

ssqdmx = ssqds[xi]
ssqdmy = ssqds[yi]
nobs = N

else:
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
for i in range(N):
# Welford's method for the variance calculation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
if mask[i, xi] and mask[i, yi]:
vx = mat[i, xi]
vy = mat[i, yi]
nobs += 1
dx = vx - meanx
dy = vy - meany
meanx += 1 / nobs * dx
meany += 1 / nobs * dy
ssqdmx += (vx - meanx) * dx
ssqdmy += (vy - meany) * dy
covxy += (vx - meanx) * dy

if nobs < minpv:
result[xi, yi] = result[yi, xi] = NaN
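The rest of this hunk is collapsed, but the accumulated quantities combine in the standard way; a sketch under that assumption (names mirror the locals above, the helper itself is hypothetical):

import math

def assemble(covxy, ssqdmx, ssqdmy, nobs, cov=False):
    # Covariance uses the (nobs - 1) normalization; for the correlation
    # coefficient the normalization cancels between numerator and denominator.
    if cov:
        return covxy / (nobs - 1)
    divisor = math.sqrt(ssqdmx * ssqdmy)
    return covxy / divisor if divisor != 0 else math.nan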
86 changes: 61 additions & 25 deletions pandas/_libs/groupby.pyx
@@ -1317,6 +1317,7 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
bint skipna,
bint compute_max):
"""
Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1336,6 +1337,8 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
Number of groups, larger than all entries of `labels`.
is_datetimelike : bool
True if `values` contains datetime-like entries.
skipna : bool
If True, ignore nans in `values`.
compute_max : bool
True if cumulative maximum should be computed, False
if cumulative minimum should be computed
@@ -1356,9 +1359,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out,
accum[:] = -np.inf if compute_max else np.inf

if mask is not None:
masked_cummin_max(out, values, mask, labels, accum, compute_max)
masked_cummin_max(out, values, mask, labels, accum, skipna, compute_max)
else:
cummin_max(out, values, labels, accum, is_datetimelike, compute_max)
cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max)


@cython.boundscheck(False)
@@ -1367,6 +1370,7 @@ cdef cummin_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const intp_t[:] labels,
groupby_t[:, ::1] accum,
bint skipna,
bint is_datetimelike,
bint compute_max):
"""
@@ -1375,8 +1379,24 @@
"""
cdef:
Py_ssize_t i, j, N, K
groupby_t val, mval
groupby_t val, mval, na_val
uint8_t[:, ::1] seen_na
intp_t lab
bint na_possible

if groupby_t is float64_t or groupby_t is float32_t:
na_val = NaN
na_possible = True
elif is_datetimelike:
na_val = NPY_NAT
na_possible = True
else:
# Will never be used, just to avoid an uninitialized warning
na_val = 0
na_possible = False

if na_possible:
seen_na = np.zeros((<object>accum).shape, dtype=np.uint8)

N, K = (<object>values).shape
with nogil:
@@ -1385,18 +1405,22 @@
if lab < 0:
continue
for j in range(K):
val = values[i, j]
if not _treat_as_na(val, is_datetimelike):
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
if not skipna and na_possible and seen_na[lab, j]:
out[i, j] = na_val
else:
out[i, j] = val
val = values[i, j]
if not _treat_as_na(val, is_datetimelike):
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
else:
seen_na[lab, j] = 1
out[i, j] = val

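As a readable cross-check of the branches above, a pure-Python reference of the per-group cumulative minimum (hypothetical helper; the NaN handling mirrors seen_na/na_val):

import math

def cummin_per_group(values, labels, skipna=True):
    """Cumulative min within groups; with skipna=False a NaN poisons the rest."""
    accum, seen_na, out = {}, set(), []
    for val, lab in zip(values, labels):
        if not skipna and lab in seen_na:
            out.append(math.nan)    # group already saw a NaN: propagate it
        elif math.isnan(val):
            seen_na.add(lab)        # remember the NaN, emit it as-is
            out.append(val)
        else:
            accum[lab] = min(accum.get(lab, math.inf), val)
            out.append(accum[lab])
    return out

# cummin_per_group([3.0, float("nan"), 1.0], [0, 0, 0], skipna=False)
# -> [3.0, nan, nan]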

@cython.boundscheck(False)
@@ -1406,6 +1430,7 @@ cdef masked_cummin_max(groupby_t[:, ::1] out,
uint8_t[:, ::1] mask,
const intp_t[:] labels,
groupby_t[:, ::1] accum,
bint skipna,
bint compute_max):
"""
Compute the cumulative minimum/maximum of columns of `values`, in row groups
@@ -1414,25 +1439,32 @@ cdef masked_cummin_max(groupby_t[:, ::1] out,
cdef:
Py_ssize_t i, j, N, K
groupby_t val, mval
uint8_t[:, ::1] seen_na
intp_t lab

N, K = (<object>values).shape
seen_na = np.zeros((<object>accum).shape, dtype=np.uint8)
with nogil:
for i in range(N):
lab = labels[i]
if lab < 0:
continue
for j in range(K):
if not mask[i, j]:
val = values[i, j]
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
if not skipna and seen_na[lab, j]:
mask[i, j] = 1
else:
if not mask[i, j]:
val = values[i, j]
mval = accum[lab, j]
if compute_max:
if val > mval:
accum[lab, j] = mval = val
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
else:
if val < mval:
accum[lab, j] = mval = val
out[i, j] = mval
seen_na[lab, j] = 1

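The masked variant serves the nullable extension dtypes, where missingness lives in ``mask`` rather than in the values; with ``skipna=False`` the ``seen_na`` state is written back into the mask. A usage sketch (assuming nullable ``Int64`` input):

import pandas as pd

df = pd.DataFrame(
    {"g": ["a", "a", "a"], "v": pd.array([3, None, 1], dtype="Int64")}
)
print(df.groupby("g")["v"].cummin())              # 3, <NA>, 1
print(df.groupby("g")["v"].cummin(skipna=False))  # 3, <NA>, <NA>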

@cython.boundscheck(False)
@@ -1442,7 +1474,8 @@ def group_cummin(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None) -> None:
uint8_t[:, ::1] mask=None,
bint skipna=True) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
@@ -1451,6 +1484,7 @@
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=False
)

@@ -1462,7 +1496,8 @@ def group_cummax(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None) -> None:
uint8_t[:, ::1] mask=None,
bint skipna=True) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
Expand All @@ -1471,5 +1506,6 @@ def group_cummax(groupby_t[:, ::1] out,
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=True
)
4 changes: 2 additions & 2 deletions pandas/core/base.py
@@ -1221,7 +1221,7 @@ def factorize(self, sort: bool = False, na_sentinel: int | None = -1):
"""

@doc(_shared_docs["searchsorted"], klass="Index")
def searchsorted(self, value, side="left", sorter=None) -> np.ndarray:
def searchsorted(self, value, side="left", sorter=None) -> npt.NDArray[np.intp]:
return algorithms.searchsorted(self._values, value, side=side, sorter=sorter)

def drop_duplicates(self, keep="first"):
@@ -1232,5 +1232,5 @@ def drop_duplicates(self, keep="first"):
@final
def _duplicated(
self, keep: Literal["first", "last", False] = "first"
) -> np.ndarray:
) -> npt.NDArray[np.bool_]:
return duplicated(self._values, keep=keep)
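The tightened annotations use ``numpy.typing`` so the element dtype travels with the return type. A standalone sketch of the pattern (illustrative, not pandas code):

import numpy as np
import numpy.typing as npt

def positions(haystack: npt.NDArray[np.float64], needles) -> npt.NDArray[np.intp]:
    # np.searchsorted returns platform-native integer indices, i.e. np.intp
    return np.searchsorted(haystack, needles)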
2 changes: 1 addition & 1 deletion pandas/core/construction.py
@@ -402,7 +402,7 @@ def extract_array(
>>> extract_array([1, 2, 3])
[1, 2, 3]

For an ndarray-backed Series / Index a PandasArray is returned.
For an ndarray-backed Series / Index the ndarray is returned.

>>> extract_array(pd.Series([1, 2, 3]))
array([1, 2, 3])
3 changes: 2 additions & 1 deletion pandas/core/frame.py
@@ -85,6 +85,7 @@
rewrite_axis_style_signature,
)
from pandas.util._validators import (
validate_ascending,
validate_axis_style_args,
validate_bool_kwarg,
validate_percentile,
@@ -6202,7 +6203,7 @@ def sort_values( # type: ignore[override]
):
inplace = validate_bool_kwarg(inplace, "inplace")
axis = self._get_axis_number(axis)

ascending = validate_ascending(ascending)
if not isinstance(by, list):
by = [by]
if is_sequence(ascending) and len(by) != len(ascending):
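With ``validate_ascending`` in place, an invalid ``ascending`` argument fails fast instead of being silently accepted or raising inconsistently. A sketch of the intended behavior per GH 41634 (the exact error message may differ):

import pandas as pd

df = pd.DataFrame({"a": [2, 1, 3]})
df.sort_values("a", ascending=True)        # fine: a bool
df.sort_values("a", ascending=[True])      # fine: list of bools matching `by`
try:
    df.sort_values("a", ascending="yes")   # neither bool nor list of bools
except ValueError as err:
    print(err)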
13 changes: 11 additions & 2 deletions pandas/core/generic.py
@@ -99,7 +99,10 @@
ABCDataFrame,
ABCSeries,
)
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.inference import (
is_hashable,
is_nested_list_like,
)
from pandas.core.dtypes.missing import (
isna,
notna,
@@ -4182,6 +4185,7 @@ def _drop_axis(

# Case for non-unique axis
else:
is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple)
labels = ensure_object(com.index_labels_to_array(labels))
if level is not None:
if not isinstance(axis, MultiIndex):
Expand All @@ -4191,9 +4195,14 @@ def _drop_axis(
# GH 18561 MultiIndex.drop should raise if label is absent
if errors == "raise" and indexer.all():
raise KeyError(f"{labels} not found in axis")
elif isinstance(axis, MultiIndex) and labels.dtype == "object":
elif (
isinstance(axis, MultiIndex)
and labels.dtype == "object"
and not is_tuple_labels
):
# Set level to zero when the axis is a MultiIndex and the labels are
# strings, because isin can't handle strings for a MultiIndex (GH#36293).
# Tuples also come through with dtype object, but they must use isin (GH#42771)
indexer = ~axis.get_level_values(0).isin(labels)
else:
indexer = ~axis.isin(labels)
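The net effect for GH 42771: dropping a tuple label from a :class:`MultiIndex` with duplicates now removes every matching row instead of silently doing nothing. A small sketch:

import pandas as pd

mi = pd.MultiIndex.from_tuples([("a", 1), ("a", 1), ("b", 2)])
df = pd.DataFrame({"x": [0, 1, 2]}, index=mi)

# Previously a silent no-op on the non-unique index; now both ("a", 1) rows go
print(df.drop(("a", 1)))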