diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 9f59958b4e827..d04d0eaee6ec4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -121,6 +121,8 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then pandas/io/parsers/ \ pandas/io/sas/ \ pandas/io/sql.py \ + pandas/io/formats/format.py \ + pandas/io/formats/style.py \ pandas/tseries/ RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/doc/source/_static/style/df_pipe.png b/doc/source/_static/style/df_pipe.png new file mode 100644 index 0000000000000..071a481ad5acc Binary files /dev/null and b/doc/source/_static/style/df_pipe.png differ diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst index 116bdd6e1d98f..8723b1b766485 100644 --- a/doc/source/whatsnew/v1.3.2.rst +++ b/doc/source/whatsnew/v1.3.2.rst @@ -19,6 +19,7 @@ Fixed regressions - Regression in :meth:`DataFrame.from_records` with empty records (:issue:`42456`) - Fixed regression in :meth:`DataFrame.shift` where TypeError occurred when shifting DataFrame created by concatenation of slices and fills with values (:issue:`42719`) - Regression in :meth:`DataFrame.agg` when the ``func`` argument returned lists and ``axis=1`` (:issue:`42727`) +- Fixed regression in :meth:`DataFrame.drop` where it did nothing if the :class:`MultiIndex` had duplicates and the indexer was a tuple or list of tuples (:issue:`42771`) - Fixed regression where :meth:`pandas.read_csv` raised a ``ValueError`` when parameters ``names`` and ``prefix`` were both set to None (:issue:`42387`) - Fixed regression in comparisons between :class:`Timestamp` object and ``datetime64`` objects outside the implementation bounds for nanosecond ``datetime64`` (:issue:`42794`) - @@ -29,7 +30,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- 1D slices over extension types turn into N-dimensional slices over ExtensionArrays (:issue:`42430`) - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index e42360558d284..ad6a9d994bf7b 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -37,6 +37,7 @@ Other enhancements - :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`) - Added ``sparse_index`` and ``sparse_columns`` keyword arguments to :meth:`.Styler.to_html` (:issue:`41946`) - Added keyword argument ``environment`` to :meth:`.Styler.to_latex` also allowing a specific "longtable" entry with a separate jinja2 template (:issue:`41866`) +- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`) - .. --------------------------------------------------------------------------- @@ -170,6 +171,8 @@ Performance improvements - Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`) - Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`) - Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`) +- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`) +- ..
--------------------------------------------------------------------------- @@ -229,6 +232,8 @@ Indexing - Bug in :meth:`Series.loc` when with a :class:`MultiIndex` whose first level contains only ``np.nan`` values (:issue:`42055`) - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` when passing a string, the return type depended on whether the index was monotonic (:issue:`24892`) - Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`) +- Bug in :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` where passing an invalid ``ascending`` value either failed to raise or incorrectly raised ``ValueError`` (:issue:`41634`) +- Bug in updating values of a :class:`pandas.Series` using a boolean index created by :meth:`pandas.DataFrame.pop` (:issue:`42530`) Missing ^^^^^^^ diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index ff46c699c71e7..6c5388a38c345 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -326,8 +326,12 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): Py_ssize_t i, j, xi, yi, N, K bint minpv float64_t[:, ::1] result + # Initialize to None since we only use in the no missing value case + float64_t[::1] means=None, ssqds=None ndarray[uint8_t, ndim=2] mask + bint no_nans int64_t nobs = 0 + float64_t mean, ssqd, val float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy N, K = (<object>mat).shape @@ -339,25 +343,57 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): result = np.empty((K, K), dtype=np.float64) mask = np.isfinite(mat).view(np.uint8) + no_nans = mask.all() + + # Computing the online means and variances is expensive - so if possible we can + # precompute these and avoid repeating the computations each time we handle + # an (xi, yi) pair + if no_nans: + means = np.empty(K, dtype=np.float64) + ssqds = np.empty(K, dtype=np.float64) + + with nogil: + for j in range(K): + ssqd = mean = 0 + for i in range(N): + val = mat[i, j] + dx = val - mean + mean += 1 / (i + 1) * dx + ssqd += (val - mean) * dx + + means[j] = mean + ssqds[j] = ssqd with nogil: for xi in range(K): for yi in range(xi + 1): - # Welford's method for the variance-calculation - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 - for i in range(N): - if mask[i, xi] and mask[i, yi]: + covxy = 0 + if no_nans: + for i in range(N): vx = mat[i, xi] vy = mat[i, yi] - nobs += 1 - dx = vx - meanx - dy = vy - meany - meanx += 1 / nobs * dx - meany += 1 / nobs * dy - ssqdmx += (vx - meanx) * dx - ssqdmy += (vy - meany) * dy - covxy += (vx - meanx) * dy + covxy += (vx - means[xi]) * (vy - means[yi]) + + ssqdmx = ssqds[xi] + ssqdmy = ssqds[yi] + nobs = N + + else: + nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 + for i in range(N): + # Welford's method for the variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + if mask[i, xi] and mask[i, yi]: + vx = mat[i, xi] + vy = mat[i, yi] + nobs += 1 + dx = vx - meanx + dy = vy - meany + meanx += 1 / nobs * dx + meany += 1 / nobs * dy + ssqdmx += (vx - meanx) * dx + ssqdmy += (vy - meany) * dy + covxy += (vx - meanx) * dy if nobs < minpv: result[xi, yi] = result[yi, xi] = NaN diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 354b87e03e6c4..91921ba0e64c2 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1317,6 +1317,7 @@ cdef
group_cummin_max(groupby_t[:, ::1] out, const intp_t[:] labels, int ngroups, bint is_datetimelike, + bint skipna, bint compute_max): """ Cumulative minimum/maximum of columns of `values`, in row groups `labels`. @@ -1336,6 +1337,8 @@ cdef group_cummin_max(groupby_t[:, ::1] out, Number of groups, larger than all entries of `labels`. is_datetimelike : bool True if `values` contains datetime-like entries. + skipna : bool + If True, ignore nans in `values`. compute_max : bool True if cumulative maximum should be computed, False if cumulative minimum should be computed @@ -1356,9 +1359,9 @@ cdef group_cummin_max(groupby_t[:, ::1] out, accum[:] = -np.inf if compute_max else np.inf if mask is not None: - masked_cummin_max(out, values, mask, labels, accum, compute_max) + masked_cummin_max(out, values, mask, labels, accum, skipna, compute_max) else: - cummin_max(out, values, labels, accum, is_datetimelike, compute_max) + cummin_max(out, values, labels, accum, skipna, is_datetimelike, compute_max) @cython.boundscheck(False) @@ -1367,6 +1370,7 @@ cdef cummin_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, groupby_t[:, ::1] accum, + bint skipna, bint is_datetimelike, bint compute_max): """ @@ -1375,8 +1379,24 @@ cdef cummin_max(groupby_t[:, ::1] out, cdef: Py_ssize_t i, j, N, K - groupby_t val, mval + groupby_t val, mval, na_val + uint8_t[:, ::1] seen_na intp_t lab + bint na_possible + + if groupby_t is float64_t or groupby_t is float32_t: + na_val = NaN + na_possible = True + elif is_datetimelike: + na_val = NPY_NAT + na_possible = True + # Will never be used, just to avoid uninitialized warning + else: + na_val = 0 + na_possible = False + + if na_possible: + seen_na = np.zeros((<object>accum).shape, dtype=np.uint8) N, K = (<object>values).shape with nogil: @@ -1385,18 +1405,22 @@ cdef cummin_max(groupby_t[:, ::1] out, if lab < 0: continue for j in range(K): - val = values[i, j] - if not _treat_as_na(val, is_datetimelike): - mval = accum[lab, j] - if compute_max: - if val > mval: - accum[lab, j] = mval = val - else: - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval + if not skipna and na_possible and seen_na[lab, j]: + out[i, j] = na_val else: - out[i, j] = val + val = values[i, j] + if not _treat_as_na(val, is_datetimelike): + mval = accum[lab, j] + if compute_max: + if val > mval: + accum[lab, j] = mval = val + else: + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval + else: + seen_na[lab, j] = 1 + out[i, j] = val @cython.boundscheck(False) @@ -1406,6 +1430,7 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, uint8_t[:, ::1] mask, const intp_t[:] labels, groupby_t[:, ::1] accum, + bint skipna, bint compute_max): """ Compute the cumulative minimum/maximum of columns of `values`, in row groups @@ -1414,25 +1439,32 @@ cdef masked_cummin_max(groupby_t[:, ::1] out, cdef: Py_ssize_t i, j, N, K groupby_t val, mval + uint8_t[:, ::1] seen_na intp_t lab N, K = (<object>values).shape + seen_na = np.zeros((<object>accum).shape, dtype=np.uint8) with nogil: for i in range(N): lab = labels[i] if lab < 0: continue for j in range(K): - if not mask[i, j]: - val = values[i, j] - mval = accum[lab, j] - if compute_max: - if val > mval: - accum[lab, j] = mval = val + if not skipna and seen_na[lab, j]: + mask[i, j] = 1 + else: + if not mask[i, j]: + val = values[i, j] + mval = accum[lab, j] + if compute_max: + if val > mval: + accum[lab, j] = mval = val + else: + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval else: - if val < mval: - accum[lab, j] = mval = val - out[i, 
j] = mval + seen_na[lab, j] = 1 @cython.boundscheck(False) @@ -1442,7 +1474,8 @@ def group_cummin(groupby_t[:, ::1] out, const intp_t[:] labels, int ngroups, bint is_datetimelike, - uint8_t[:, ::1] mask=None) -> None: + uint8_t[:, ::1] mask=None, + bint skipna=True) -> None: """See group_cummin_max.__doc__""" group_cummin_max( out, @@ -1451,6 +1484,7 @@ def group_cummin(groupby_t[:, ::1] out, labels, ngroups, is_datetimelike, + skipna, compute_max=False ) @@ -1462,7 +1496,8 @@ def group_cummax(groupby_t[:, ::1] out, const intp_t[:] labels, int ngroups, bint is_datetimelike, - uint8_t[:, ::1] mask=None) -> None: + uint8_t[:, ::1] mask=None, + bint skipna=True) -> None: """See group_cummin_max.__doc__""" group_cummin_max( out, @@ -1471,5 +1506,6 @@ def group_cummax(groupby_t[:, ::1] out, labels, ngroups, is_datetimelike, + skipna, compute_max=True ) diff --git a/pandas/core/base.py b/pandas/core/base.py index 4d380c6831071..7d51b50f783a5 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1221,7 +1221,7 @@ def factorize(self, sort: bool = False, na_sentinel: int | None = -1): """ @doc(_shared_docs["searchsorted"], klass="Index") - def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: + def searchsorted(self, value, side="left", sorter=None) -> npt.NDArray[np.intp]: return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) def drop_duplicates(self, keep="first"): @@ -1232,5 +1232,5 @@ def drop_duplicates(self, keep="first"): @final def _duplicated( self, keep: Literal["first", "last", False] = "first" - ) -> np.ndarray: + ) -> npt.NDArray[np.bool_]: return duplicated(self._values, keep=keep) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 89591f27e9092..f84aaa907f3fc 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -402,7 +402,7 @@ def extract_array( >>> extract_array([1, 2, 3]) [1, 2, 3] - For an ndarray-backed Series / Index a PandasArray is returned. + For an ndarray-backed Series / Index the ndarray is returned. 
>>> extract_array(pd.Series([1, 2, 3])) array([1, 2, 3]) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 27aa2ed939c1a..48b18a33f9c9f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -85,6 +85,7 @@ rewrite_axis_style_signature, ) from pandas.util._validators import ( + validate_ascending, validate_axis_style_args, validate_bool_kwarg, validate_percentile, @@ -6202,7 +6203,7 @@ def sort_values( # type: ignore[override] ): inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) - + ascending = validate_ascending(ascending) if not isinstance(by, list): by = [by] if is_sequence(ascending) and len(by) != len(ascending): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bf79e58077179..19dd06074bf78 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -99,7 +99,10 @@ ABCDataFrame, ABCSeries, ) -from pandas.core.dtypes.inference import is_hashable +from pandas.core.dtypes.inference import ( + is_hashable, + is_nested_list_like, +) from pandas.core.dtypes.missing import ( isna, notna, @@ -4182,6 +4185,7 @@ def _drop_axis( # Case for non-unique axis else: + is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple) labels = ensure_object(com.index_labels_to_array(labels)) if level is not None: if not isinstance(axis, MultiIndex): @@ -4191,9 +4195,14 @@ def _drop_axis( # GH 18561 MultiIndex.drop should raise if label is absent if errors == "raise" and indexer.all(): raise KeyError(f"{labels} not found in axis") - elif isinstance(axis, MultiIndex) and labels.dtype == "object": + elif ( + isinstance(axis, MultiIndex) + and labels.dtype == "object" + and not is_tuple_labels + ): # Set level to zero in case of MultiIndex and label is string, # because isin can't handle strings for MultiIndexes GH#36293 + # In case of tuples we get dtype object but have to use isin GH#42771 indexer = ~axis.get_level_values(0).isin(labels) else: indexer = ~axis.isin(labels) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 939cff16bf1ae..e57e48cb3ab11 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2784,10 +2784,11 @@ def cummin(self, axis=0, **kwargs): ------- Series or DataFrame """ + skipna = kwargs.get("skipna", True) if axis != 0: return self.apply(lambda x: np.minimum.accumulate(x, axis)) - return self._cython_transform("cummin", numeric_only=False) + return self._cython_transform("cummin", numeric_only=False, skipna=skipna) @final @Substitution(name="groupby") @@ -2800,10 +2801,11 @@ def cummax(self, axis=0, **kwargs): ------- Series or DataFrame """ + skipna = kwargs.get("skipna", True) if axis != 0: return self.apply(lambda x: np.maximum.accumulate(x, axis)) - return self._cython_transform("cummax", numeric_only=False) + return self._cython_transform("cummax", numeric_only=False, skipna=skipna) @final def _get_cythonized_result( diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 07c6a84f75302..7dc59bdb1e840 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -238,10 +238,11 @@ def _format_attrs(self): """ attrs = super()._format_attrs() for attrib in self._attributes: + # iterating over _attributes prevents us from doing this for PeriodIndex if attrib == "freq": freq = self.freqstr if freq is not None: - freq = repr(freq) + freq = repr(freq) # e.g. 
D -> 'D' # Argument 1 to "append" of "list" has incompatible type # "Tuple[str, Optional[str]]"; expected "Tuple[str, Union[str, int]]" attrs.append(("freq", freq)) # type: ignore[arg-type] diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 3b42d1c4505da..60179b69f56a4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -642,7 +642,7 @@ def _get_setitem_indexer(self, key): self._ensure_listlike_indexer(key) if self.axis is not None: - return self._convert_tuple(key, is_setter=True) + return self._convert_tuple(key) ax = self.obj._get_axis(0) @@ -653,12 +653,12 @@ def _get_setitem_indexer(self, key): if isinstance(key, tuple): with suppress(IndexingError): - return self._convert_tuple(key, is_setter=True) + return self._convert_tuple(key) if isinstance(key, range): return list(key) - return self._convert_to_indexer(key, axis=0, is_setter=True) + return self._convert_to_indexer(key, axis=0) def _ensure_listlike_indexer(self, key, axis=None, value=None): """ @@ -755,21 +755,19 @@ def _is_nested_tuple_indexer(self, tup: tuple) -> bool: return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) return False - def _convert_tuple(self, key, is_setter: bool = False): + def _convert_tuple(self, key): keyidx = [] if self.axis is not None: axis = self.obj._get_axis_number(self.axis) for i in range(self.ndim): if i == axis: - keyidx.append( - self._convert_to_indexer(key, axis=axis, is_setter=is_setter) - ) + keyidx.append(self._convert_to_indexer(key, axis=axis)) else: keyidx.append(slice(None)) else: self._validate_key_length(key) for i, k in enumerate(key): - idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter) + idx = self._convert_to_indexer(k, axis=i) keyidx.append(idx) return tuple(keyidx) @@ -867,8 +865,8 @@ def _getitem_nested_tuple(self, tup: tuple): # a tuple passed to a series with a multi-index if len(tup) > self.ndim: if self.name != "loc": - # This should never be reached, but lets be explicit about it - raise ValueError("Too many indices") + # This should never be reached, but let's be explicit about it + raise ValueError("Too many indices") # pragma: no cover if all(is_hashable(x) or com.is_null_slice(x) for x in tup): # GH#10521 Series should reduce MultiIndex dimensions instead of # DataFrame, IndexingError is not raised when slice(None,None,None) @@ -911,7 +909,7 @@ def _getitem_nested_tuple(self, tup: tuple): return obj - def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): + def _convert_to_indexer(self, key, axis: int): raise AbstractMethodError(self) def __getitem__(self, key): @@ -1176,7 +1174,7 @@ def _get_slice_axis(self, slice_obj: slice, axis: int): # return a DatetimeIndex instead of a slice object. return self.obj.take(indexer, axis=axis) - def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): + def _convert_to_indexer(self, key, axis: int): """ Convert indexing key into something we can use to do actual fancy indexing on a ndarray. @@ -1486,7 +1484,7 @@ def _get_slice_axis(self, slice_obj: slice, axis: int): labels._validate_positional_slice(slice_obj) return self.obj._slice(slice_obj, axis=axis) - def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): + def _convert_to_indexer(self, key, axis: int): """ Much simpler as we only have to deal with our valid types. 
""" diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index bd049b80b281c..79c0aad66229c 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -167,15 +167,6 @@ def set_axis(self, axis: int, new_labels: Index) -> None: axis = self._normalize_axis(axis) self._axes[axis] = new_labels - def consolidate(self: T) -> T: - return self - - def is_consolidated(self) -> bool: - return True - - def _consolidate_inplace(self) -> None: - pass - def get_dtypes(self): return np.array([arr.dtype for arr in self.arrays], dtype="object") @@ -1262,9 +1253,6 @@ def _can_hold_na(self) -> bool: def is_single_block(self) -> bool: return True - def _consolidate_check(self): - pass - def fast_xs(self, loc: int) -> ArrayLike: raise NotImplementedError("Use series._values[loc] instead") diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 0ee22200ed495..e65318dd29c52 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -33,6 +33,7 @@ class DataManager(PandasObject): def items(self) -> Index: raise AbstractMethodError(self) + @final def __len__(self) -> int: return len(self.items) @@ -105,6 +106,7 @@ def _equal_values(self: T, other: T) -> bool: """ raise AbstractMethodError(self) + @final def equals(self, other: object) -> bool: """ Implementation for DataFrame.equals @@ -129,13 +131,27 @@ def apply( ) -> T: raise AbstractMethodError(self) + @final def isna(self: T, func) -> T: return self.apply("apply", func=func) + # -------------------------------------------------------------------- + # Consolidation: No-ops for all but BlockManager + + def is_consolidated(self) -> bool: + return True + + def consolidate(self: T) -> T: + return self + + def _consolidate_inplace(self) -> None: + return + class SingleDataManager(DataManager): ndim = 1 + @final @property def array(self): """ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index adfecb946d822..953ccedaa5222 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1552,12 +1552,10 @@ def _slice(self, slicer): def getitem_block_index(self, slicer: slice) -> ExtensionBlock: """ Perform __getitem__-like specialized to slicing along index. - - Assumes self.ndim == 2 """ - # error: Invalid index type "Tuple[ellipsis, slice]" for - # "Union[ndarray, ExtensionArray]"; expected type "Union[int, slice, ndarray]" - new_values = self.values[..., slicer] # type: ignore[index] + # GH#42787 in principle this is equivalent to values[..., slicer], but we don't + # require subclasses of ExtensionArray to support that form (for now). 
+ new_values = self.values[slicer] return type(self)(new_values, self._mgr_locs, ndim=self.ndim) def fillna( diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 34d0137c26fda..9bc2404cefcfa 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -672,6 +672,7 @@ def _combine_concat_plans(plans, concat_axis: int): offset += last_plc.as_slice.stop else: + # singleton list so we can modify it as a side-effect within _next_or_none num_ended = [0] def _next_or_none(seq): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8937c2c107c62..9d35e1e8d4929 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -465,19 +465,6 @@ def to_native_types(self: T, **kwargs) -> T: """ return self.apply("to_native_types", **kwargs) - def is_consolidated(self) -> bool: - """ - Return True if more than one block with the same dtype - """ - if not self._known_consolidated: - self._consolidate_check() - return self._is_consolidated - - def _consolidate_check(self) -> None: - dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate] - self._is_consolidated = len(dtypes) == len(set(dtypes)) - self._known_consolidated = True - @property def is_numeric_mixed_type(self) -> bool: return all(block.is_numeric for block in self.blocks) @@ -623,13 +610,6 @@ def consolidate(self: T) -> T: bm._consolidate_inplace() return bm - def _consolidate_inplace(self) -> None: - if not self.is_consolidated(): - self.blocks = tuple(_consolidate(self.blocks)) - self._is_consolidated = True - self._known_consolidated = True - self._rebuild_blknos_and_blklocs() - def reindex_indexer( self: T, new_axis: Index, @@ -1551,6 +1531,29 @@ def _interleave( return result + # ---------------------------------------------------------------- + # Consolidation + + def is_consolidated(self) -> bool: + """ + Return True if more than one block with the same dtype + """ + if not self._known_consolidated: + self._consolidate_check() + return self._is_consolidated + + def _consolidate_check(self) -> None: + dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate] + self._is_consolidated = len(dtypes) == len(set(dtypes)) + self._known_consolidated = True + + def _consolidate_inplace(self) -> None: + if not self.is_consolidated(): + self.blocks = tuple(_consolidate(self.blocks)) + self._is_consolidated = True + self._known_consolidated = True + self._rebuild_blknos_and_blklocs() + class SingleBlockManager(BaseBlockManager, SingleDataManager): """manage a single block with""" @@ -1710,15 +1713,6 @@ def array_values(self): def _can_hold_na(self) -> bool: return self._block._can_hold_na - def is_consolidated(self) -> bool: - return True - - def _consolidate_check(self): - pass - - def _consolidate_inplace(self): - pass - def idelete(self, indexer) -> SingleBlockManager: """ Delete single location from SingleBlockManager. 
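Note on the ``getitem_block_index`` change above (GH#42430 / GH#42787): 1D ExtensionArrays were previously sliced with the 2D-style ``values[..., slicer]``, a form that third-party ``ExtensionArray`` subclasses are not required to support, so a plain 1D slice of a Series could reach the EA as an N-dimensional indexer. A minimal sketch of the distinction, using only public APIs (illustrative, not part of the patch):

import numpy as np
import pandas as pd

# Every ExtensionArray must support a plain 1-D slice...
ea = pd.array(["hello", "world"], dtype="string")
print(ea[slice(None, 1)])  # a length-1 StringArray

# ...but the EA interface does not guarantee Ellipsis-based indexing, so
# ea[..., slice(None, 1)] may raise for third-party subclasses.

# For the 2-D ndarrays backing ordinary blocks, the Ellipsis form is the
# natural "slice along the last axis" spelling:
arr = np.arange(6).reshape(2, 3)
print(arr[..., :1])  # [[0], [3]]

This is why the fix slices with ``values[slicer]``, and why ``test_ellipsis_index`` later in this diff asserts that the argument reaching ``ExtensionArray.__getitem__`` is a plain slice.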
diff --git a/pandas/core/series.py b/pandas/core/series.py index 32b56462788e5..ce986f2dd8038 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -57,6 +57,7 @@ doc, ) from pandas.util._validators import ( + validate_ascending, validate_bool_kwarg, validate_percentile, ) @@ -69,7 +70,6 @@ ) from pandas.core.dtypes.common import ( ensure_platform_int, - is_bool, is_dict_like, is_integer, is_iterator, @@ -3438,8 +3438,7 @@ def sort_values( ) ascending = ascending[0] - if not is_bool(ascending): - raise ValueError("ascending must be boolean") + ascending = validate_ascending(ascending) if na_position not in ["first", "last"]: raise ValueError(f"invalid na_position: {na_position}") @@ -4386,7 +4385,7 @@ def _reduce( return op(delegate, skipna=skipna, **kwds) def _reindex_indexer( - self, new_index: Index | None, indexer: np.ndarray | None, copy: bool + self, new_index: Index | None, indexer: npt.NDArray[np.intp] | None, copy: bool ) -> Series: # Note: new_index is None iff indexer is None # if not None, indexer is np.intp diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 83e0086958b9a..3fd3d84f90161 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1956,16 +1956,14 @@ def __call__(self, num: int | float) -> str: """ Formats a number in engineering notation, appending a letter representing the power of 1000 of the original number. Some examples: - - >>> format_eng(0) # for self.accuracy = 0 + >>> format_eng = EngFormatter(accuracy=0, use_eng_prefix=True) + >>> format_eng(0) ' 0' - - >>> format_eng(1000000) # for self.accuracy = 1, - # self.use_eng_prefix = True + >>> format_eng = EngFormatter(accuracy=1, use_eng_prefix=True) + >>> format_eng(1_000_000) ' 1.0M' - - >>> format_eng("-1e-6") # for self.accuracy = 2 - # self.use_eng_prefix = False + >>> format_eng = EngFormatter(accuracy=2, use_eng_prefix=False) + >>> format_eng("-1e-6") '-1.00E-06' @param num: the value to represent diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index af1c0ca34ec0f..3d6705ed593d2 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -341,9 +341,10 @@ def set_tooltips( >>> df.style.set_tooltips(ttips, css_class='tt-add', props=[ ... ('visibility', 'hidden'), ... ('position', 'absolute'), - ... ('z-index', 1)]) + ... ('z-index', 1)]) # doctest: +SKIP >>> df.style.set_tooltips(ttips, css_class='tt-add', ... props='visibility:hidden; position:absolute; z-index:1;') + ... # doctest: +SKIP """ if not self.cell_ids: # tooltips not optimised for individual cell check. requires reasonable @@ -553,7 +554,7 @@ def to_latex( >>> s = df.style.highlight_max(axis=None, ... props='cellcolor:{red}; bfseries: ;') - >>> s.to_latex() + >>> s.to_latex() # doctest: +SKIP Internally these structured LaTeX ``(, )`` pairs are translated to the @@ -592,7 +593,7 @@ def to_latex( ... props='cellcolor:[HTML]{FFFF00}; color:{red};' ... 'textit:--rwrap; textbf:--rwrap;' ... ) - >>> s.to_latex() + >>> s.to_latex() # doctest: +SKIP .. figure:: ../../_static/style/latex_1.png @@ -653,7 +654,7 @@ def to_latex( ... column_format="rrrrr", position="h", position_float="centering", ... hrules=True, label="table:5", caption="Styled LaTeX Table", ... multirow_align="t", multicol_align="r" - ... ) + ... ) # doctest: +SKIP .. figure:: ../../_static/style/latex_2.png @@ -670,8 +671,14 @@ def to_latex( ... ("Numeric", "Integers"): '\${}', ... ("Numeric", "Floats"): '{:.3f}', ... ("Non-Numeric", "Strings"): str.upper - ... 
}) - >>> s.to_latex() + ... }) # doctest: +SKIP + Numeric Non-Numeric + Integers Floats Strings + L0 ix1 $1 2.200 DOGS + ix2 $3 4.400 CATS + L1 ix3 $2 6.600 COWS + + >>> s.to_latex() # doctest: +SKIP \begin{tabular}{llrrl} {} & {} & \multicolumn{2}{r}{Numeric} & {Non-Numeric} \\ {} & {} & {Integers} & {Floats} & {Strings} \\ @@ -713,7 +720,7 @@ def to_latex( >>> df = pd.DataFrame([[1]]) >>> df.style.set_properties( ... **{"font-weight": "bold /* --dwrap */", "Huge": "--latex--rwrap"} - ... ).to_latex(convert_css=True) + ... ).to_latex(convert_css=True) # doctest: +SKIP \begin{tabular}{lr} {} & {0} \\ 0 & {\bfseries}{\Huge{1}} \\ @@ -934,7 +941,7 @@ def set_td_classes(self, classes: DataFrame) -> Styler: ... ["min-val red", "", "blue"], ... ["red", None, "blue max-val"] ... ], index=df.index, columns=df.columns) - >>> df.style.set_td_classes(classes) + >>> df.style.set_td_classes(classes) # doctest: +SKIP Using `MultiIndex` columns and a `classes` `DataFrame` as a subset of the underlying, @@ -943,14 +950,14 @@ def set_td_classes(self, classes: DataFrame) -> Styler: ... columns=[["level0", "level0"], ["level1a", "level1b"]]) >>> classes = pd.DataFrame(["min-val"], index=["a"], ... columns=[["level0"],["level1a"]]) - >>> df.style.set_td_classes(classes) + >>> df.style.set_td_classes(classes) # doctest: +SKIP Form of the output with new additional css classes, >>> df = pd.DataFrame([[1]]) >>> css = pd.DataFrame([["other-class"]]) >>> s = Styler(df, uuid="_", cell_ids=False).set_td_classes(css) - >>> s.hide_index().render() + >>> s.hide_index().render() # doctest: +SKIP '' '' ' ' @@ -1178,19 +1185,26 @@ def apply( >>> def highlight_max(x, color): ... return np.where(x == np.nanmax(x.to_numpy()), f"color: {color};", None) >>> df = pd.DataFrame(np.random.randn(5, 2), columns=["A", "B"]) - >>> df.style.apply(highlight_max, color='red') - >>> df.style.apply(highlight_max, color='blue', axis=1) - >>> df.style.apply(highlight_max, color='green', axis=None) + >>> df.style.apply(highlight_max, color='red') # doctest: +SKIP + >>> df.style.apply(highlight_max, color='blue', axis=1) # doctest: +SKIP + >>> df.style.apply(highlight_max, color='green', axis=None) # doctest: +SKIP Using ``subset`` to restrict application to a single column or multiple columns >>> df.style.apply(highlight_max, color='red', subset="A") + ... # doctest: +SKIP >>> df.style.apply(highlight_max, color='red', subset=["A", "B"]) + ... # doctest: +SKIP Using a 2d input to ``subset`` to select rows in addition to columns - >>> df.style.apply(highlight_max, color='red', subset=([0,1,2], slice(None)) - >>> df.style.apply(highlight_max, color='red', subset=(slice(0,5,2), "A") + >>> df.style.apply(highlight_max, color='red', subset=([0,1,2], slice(None))) + ... # doctest: +SKIP + >>> df.style.apply(highlight_max, color='red', subset=(slice(0,5,2), "A")) + ... # doctest: +SKIP + + See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for + more details. """ self._todo.append( (lambda instance: getattr(instance, "_apply"), (func, axis, subset), kwargs) @@ -1246,17 +1260,24 @@ def applymap( >>> def color_negative(v, color): ... return f"color: {color};" if v < 0 else None >>> df = pd.DataFrame(np.random.randn(5, 2), columns=["A", "B"]) - >>> df.style.applymap(color_negative, color='red') + >>> df.style.applymap(color_negative, color='red') # doctest: +SKIP Using ``subset`` to restrict application to a single column or multiple columns >>> df.style.applymap(color_negative, color='red', subset="A") + ... 
# doctest: +SKIP >>> df.style.applymap(color_negative, color='red', subset=["A", "B"]) + ... # doctest: +SKIP Using a 2d input to ``subset`` to select rows in addition to columns - >>> df.style.applymap(color_negative, color='red', subset=([0,1,2], slice(None)) - >>> df.style.applymap(color_negative, color='red', subset=(slice(0,5,2), "A") + >>> df.style.applymap(color_negative, color='red', + ... subset=([0,1,2], slice(None))) # doctest: +SKIP + >>> df.style.applymap(color_negative, color='red', subset=(slice(0,5,2), "A")) + ... # doctest: +SKIP + + See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for + more details. """ self._todo.append( (lambda instance: getattr(instance, "_applymap"), (func, subset), kwargs) @@ -1317,6 +1338,7 @@ def where( >>> def cond(v, limit=4): ... return v > 1 and v != limit >>> df.style.where(cond, value='color:green;', other='color:red;') + ... # doctest: +SKIP should be refactored to: @@ -1324,6 +1346,7 @@ def where( ... cond = v > 1 and v != limit ... return value if cond else other >>> df.style.applymap(style_func, value='color:green;', other='color:red;') + ... # doctest: +SKIP """ warnings.warn( "this method is deprecated in favour of `Styler.applymap()`", @@ -1389,7 +1412,7 @@ def set_table_attributes(self, attributes: str) -> Styler: Examples -------- >>> df = pd.DataFrame(np.random.randn(10, 4)) - >>> df.style.set_table_attributes('class="pure-table"') + >>> df.style.set_table_attributes('class="pure-table"') # doctest: +SKIP # ...
... """ self.table_attributes = attributes @@ -1637,14 +1660,14 @@ def set_table_styles( >>> df.style.set_table_styles( ... [{'selector': 'tr:hover', ... 'props': [('background-color', 'yellow')]}] - ... ) + ... ) # doctest: +SKIP Or with CSS strings >>> df.style.set_table_styles( ... [{'selector': 'tr:hover', - ... 'props': 'background-color: yellow; font-size: 1em;']}] - ... ) + ... 'props': 'background-color: yellow; font-size: 1em;'}] + ... ) # doctest: +SKIP Adding column styling by name @@ -1652,15 +1675,18 @@ def set_table_styles( ... 'A': [{'selector': '', ... 'props': [('color', 'red')]}], ... 'B': [{'selector': 'td', - ... 'props': 'color: blue;']}] - ... }, overwrite=False) + ... 'props': 'color: blue;'}] + ... }, overwrite=False) # doctest: +SKIP Adding row styling >>> df.style.set_table_styles({ ... 0: [{'selector': 'td:hover', ... 'props': [('font-size', '25px')]}] - ... }, axis=1, overwrite=False) + ... }, axis=1, overwrite=False) # doctest: +SKIP + + See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for + more details. """ if isinstance(table_styles, dict): if axis in [0, "index"]: @@ -1753,7 +1779,7 @@ def hide_index(self, subset: Subset | None = None) -> Styler: Simple application hiding specific rows: >>> df = pd.DataFrame([[1,2], [3,4], [5,6]], index=["a", "b", "c"]) - >>> df.style.hide_index(["a", "b"]) + >>> df.style.hide_index(["a", "b"]) # doctest: +SKIP 0 1 c 5 6 @@ -1761,7 +1787,7 @@ def hide_index(self, subset: Subset | None = None) -> Styler: >>> midx = pd.MultiIndex.from_product([["x", "y"], ["a", "b", "c"]]) >>> df = pd.DataFrame(np.random.randn(6,6), index=midx, columns=midx) - >>> df.style.format("{:.1f}").hide_index() + >>> df.style.format("{:.1f}").hide_index() # doctest: +SKIP x y a b c a b c 0.1 0.0 0.4 1.3 0.6 -1.4 @@ -1774,6 +1800,7 @@ def hide_index(self, subset: Subset | None = None) -> Styler: Hide specific rows but retain the index: >>> df.style.format("{:.1f}").hide_index(subset=(slice(None), ["a", "c"])) + ... # doctest: +SKIP x y a b c a b c x b 0.7 1.0 1.3 1.5 -0.0 -0.2 @@ -1781,8 +1808,8 @@ def hide_index(self, subset: Subset | None = None) -> Styler: Hide specific rows and the index: - >>> df.style.format("{:.1f}").hide_index(subset=(slice(None), ["a", "c"])) - ... .hide_index() + >>> df.style.format("{:.1f}").hide_index( + ... 
subset=(slice(None), ["a", "c"])).hide_index() # doctest: +SKIP x y a b c a b c 0.7 1.0 1.3 1.5 -0.0 -0.2 @@ -1833,7 +1860,7 @@ def hide_columns(self, subset: Subset | None = None) -> Styler: Simple application hiding specific columns: >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) - >>> df.style.hide_columns(["a", "b"]) + >>> df.style.hide_columns(["a", "b"]) # doctest: +SKIP c 0 3 1 6 @@ -1842,17 +1869,18 @@ def hide_columns(self, subset: Subset | None = None) -> Styler: >>> midx = pd.MultiIndex.from_product([["x", "y"], ["a", "b", "c"]]) >>> df = pd.DataFrame(np.random.randn(6,6), index=midx, columns=midx) - >>> df.style.format("{:.1f}").hide_columns() - x d 0.1 0.0 0.4 1.3 0.6 -1.4 - e 0.7 1.0 1.3 1.5 -0.0 -0.2 - f 1.4 -0.8 1.6 -0.2 -0.4 -0.3 - y d 0.4 1.0 -0.2 -0.8 -1.2 1.1 - e -0.6 1.2 1.8 1.9 0.3 0.3 - f 0.8 0.5 -0.3 1.2 2.2 -0.8 + >>> df.style.format("{:.1f}").hide_columns() # doctest: +SKIP + x a 0.1 0.0 0.4 1.3 0.6 -1.4 + b 0.7 1.0 1.3 1.5 -0.0 -0.2 + c 1.4 -0.8 1.6 -0.2 -0.4 -0.3 + y a 0.4 1.0 -0.2 -0.8 -1.2 1.1 + b -0.6 1.2 1.8 1.9 0.3 0.3 + c 0.8 0.5 -0.3 1.2 2.2 -0.8 Hide specific columns but retain the column headers: >>> df.style.format("{:.1f}").hide_columns(subset=(slice(None), ["a", "c"])) + ... # doctest: +SKIP x y b b x a 0.0 0.6 @@ -1864,8 +1892,8 @@ def hide_columns(self, subset: Subset | None = None) -> Styler: Hide specific columns and the column headers: - >>> df.style.format("{:.1f}").hide_columns(subset=(slice(None), ["a", "c"])) - ... .hide_columns() + >>> df.style.format("{:.1f}").hide_columns( + ... subset=(slice(None), ["a", "c"])).hide_columns() # doctest: +SKIP x a 0.0 0.6 b 1.0 -0.0 c -0.8 -0.4 @@ -1995,31 +2023,32 @@ def background_gradient( Shading the values column-wise, with ``axis=0``, preselecting numeric columns - >>> df.style.{name}_gradient(axis=0) + >>> df.style.{name}_gradient(axis=0) # doctest: +SKIP .. figure:: ../../_static/style/{image_prefix}_ax0.png Shading all values collectively using ``axis=None`` - >>> df.style.{name}_gradient(axis=None) + >>> df.style.{name}_gradient(axis=None) # doctest: +SKIP .. figure:: ../../_static/style/{image_prefix}_axNone.png Compress the color map from the both ``low`` and ``high`` ends - >>> df.style.{name}_gradient(axis=None, low=0.75, high=1.0) + >>> df.style.{name}_gradient(axis=None, low=0.75, high=1.0) # doctest: +SKIP .. figure:: ../../_static/style/{image_prefix}_axNone_lowhigh.png Manually setting ``vmin`` and ``vmax`` gradient thresholds - >>> df.style.{name}_gradient(axis=None, vmin=6.7, vmax=21.6) + >>> df.style.{name}_gradient(axis=None, vmin=6.7, vmax=21.6) # doctest: +SKIP .. figure:: ../../_static/style/{image_prefix}_axNone_vminvmax.png Setting a ``gmap`` and applying to all columns with another ``cmap`` >>> df.style.{name}_gradient(axis=0, gmap=df['Temp (c)'], cmap='YlOrRd') + ... # doctest: +SKIP .. figure:: ../../_static/style/{image_prefix}_gmap.png @@ -2029,7 +2058,7 @@ def background_gradient( >>> gmap = np.array([[1,2,3], [2,3,4], [3,4,5]]) >>> df.style.{name}_gradient(axis=None, gmap=gmap, ... cmap='YlOrRd', subset=['Temp (c)', 'Rain (mm)', 'Wind (m/s)'] - ... ) + ... ) # doctest: +SKIP .. 
figure:: ../../_static/style/{image_prefix}_axNone_gmap.png """ @@ -2111,8 +2140,11 @@ def set_properties(self, subset: Subset | None = None, **kwargs) -> Styler: Examples -------- >>> df = pd.DataFrame(np.random.randn(10, 4)) - >>> df.style.set_properties(color="white", align="right") - >>> df.style.set_properties(**{'background-color': 'yellow'}) + >>> df.style.set_properties(color="white", align="right") # doctest: +SKIP + >>> df.style.set_properties(**{'background-color': 'yellow'}) # doctest: +SKIP + + See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for + more details. """ values = "".join([f"{p}: {v};" for p, v in kwargs.items()]) return self.applymap(lambda x: values, subset=subset) @@ -2447,7 +2479,7 @@ def highlight_between( ... 'Two': [2.9, 2.1, 2.5], ... 'Three': [3.1, 3.2, 3.8], ... }) - >>> df.style.highlight_between(left=2.1, right=2.9) + >>> df.style.highlight_between(left=2.1, right=2.9) # doctest: +SKIP .. figure:: ../../_static/style/hbetw_basic.png @@ -2455,7 +2487,7 @@ def highlight_between( and ``right`` for each column individually >>> df.style.highlight_between(left=[1.4, 2.4, 3.4], right=[1.6, 2.6, 3.6], - ... axis=1, color="#fffd75") + ... axis=1, color="#fffd75") # doctest: +SKIP .. figure:: ../../_static/style/hbetw_seq.png @@ -2463,14 +2495,14 @@ def highlight_between( matches the input DataFrame, with a constant ``right`` >>> df.style.highlight_between(left=[[2,2,3],[2,2,3],[3,3,3]], right=3.5, - ... axis=None, color="#fffd75") + ... axis=None, color="#fffd75") # doctest: +SKIP .. figure:: ../../_static/style/hbetw_axNone.png Using ``props`` instead of default background coloring >>> df.style.highlight_between(left=1.5, right=3.5, - ... props='font-weight:bold;color:#e83e8c') + ... props='font-weight:bold;color:#e83e8c') # doctest: +SKIP .. figure:: ../../_static/style/hbetw_props.png """ @@ -2547,19 +2579,21 @@ def highlight_quantile( >>> df = pd.DataFrame(np.arange(10).reshape(2,5) + 1) >>> df.style.highlight_quantile(axis=None, q_left=0.8, color="#fffd75") + ... # doctest: +SKIP .. figure:: ../../_static/style/hq_axNone.png Or highlight quantiles row-wise or column-wise, in this case by row-wise >>> df.style.highlight_quantile(axis=1, q_left=0.8, color="#fffd75") + ... # doctest: +SKIP .. figure:: ../../_static/style/hq_ax1.png Use ``props`` instead of default background coloring >>> df.style.highlight_quantile(axis=None, q_left=0.2, q_right=0.8, - ... props='font-weight:bold;color:#e83e8c') + ... props='font-weight:bold;color:#e83e8c') # doctest: +SKIP .. figure:: ../../_static/style/hq_props.png """ @@ -2703,6 +2737,9 @@ def pipe(self, func: Callable, *args, **kwargs): ... .highlight_min(subset=['conversion'], color='yellow') ... .pipe(format_conversion) ... .set_caption("Results with minimum conversion highlighted.")) + ... # doctest: +SKIP + + .. 
figure:: ../../_static/style/df_pipe.png """ return com.pipe(self, func, *args, **kwargs) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 96833a2e49fa1..ac181af7875b5 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -425,3 +425,23 @@ def test_item(self, data): with pytest.raises(ValueError, match=msg): s.item() + + def test_ellipsis_index(self): + # GH42430 1D slices over extension types turn into N-dimensional slices over + # ExtensionArrays + class CapturingStringArray(pd.arrays.StringArray): + """Extend StringArray to capture arguments to __getitem__""" + + def __getitem__(self, item): + self.last_item_arg = item + return super().__getitem__(item) + + df = pd.DataFrame( + {"col1": CapturingStringArray(np.array(["hello", "world"], dtype=object))} + ) + _ = df.iloc[:1] + + # String comparison because there's no native way to compare slices. + # Before the fix for GH42430, last_item_arg would get set to the 2D slice + # (Ellipsis, slice(None, 1, None)) + self.assert_equal(str(df["col1"].array.last_item_arg), "slice(None, 1, None)") diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index be29a3e50b9fa..fa658d87c3ca0 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -479,6 +479,17 @@ def test_drop_with_non_unique_multiindex(self): expected = DataFrame([2], index=MultiIndex.from_arrays([["y"], ["j"]])) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("indexer", [("a", "a"), [("a", "a")]]) + def test_drop_tuple_with_non_unique_multiindex(self, indexer): + # GH#42771 + idx = MultiIndex.from_product([["a", "b"], ["a", "a"]]) + df = DataFrame({"x": range(len(idx))}, index=idx) + result = df.drop(index=indexer) + expected = DataFrame( + {"x": [2, 3]}, index=MultiIndex.from_tuples([("b", "a"), ("b", "a")]) + ) + tm.assert_frame_equal(result, expected) + def test_drop_with_duplicate_columns(self): df = DataFrame( [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] ) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index d46796bcd978b..e104617552efc 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -868,3 +868,23 @@ def test_sort_values_pos_args_deprecation(self): result = df.sort_values("a", 0) expected = DataFrame({"a": [1, 2, 3]}) tm.assert_frame_equal(result, expected) + + def test_sort_values_validate_ascending_for_value_error(self): + # GH41634 + df = DataFrame({"D": [23, 7, 21]}) + + msg = 'For argument "ascending" expected type bool, received type str.' 
+ with pytest.raises(ValueError, match=msg): + df.sort_values(by="D", ascending="False") + + @pytest.mark.parametrize("ascending", [False, 0, 1, True]) + def test_sort_values_validate_ascending_functional(self, ascending): + df = DataFrame({"D": [23, 7, 21]}) + indexer = df["D"].argsort().values + + if not ascending: + indexer = indexer[::-1] + + expected = df.loc[df.index[indexer]] + result = df.sort_values(by="D", ascending=ascending) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 5434fc49e2174..77e5e9ba133f5 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -803,6 +803,39 @@ def test_cummax(dtypes_for_minmax): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) +@pytest.mark.parametrize( + "groups,expected_data", + [ + ([1, 1, 1], [1, None, None]), + ([1, 2, 3], [1, None, 2]), + ([1, 3, 3], [1, None, None]), + ], +) +def test_cummin_max_skipna(method, dtype, groups, expected_data): + # GH-34047 + df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) + gb = df.groupby(groups)["a"] + + result = getattr(gb, method)(skipna=False) + expected = Series(expected_data, dtype=dtype, name="a") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +def test_cummin_max_skipna_multiple_cols(method): + # Ensure missing value in "a" doesn't cause "b" to be nan-filled + df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]}) + gb = df.groupby([1, 1, 1])[["a", "b"]] + + result = getattr(gb, method)(skipna=False) + expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]}) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_32bit @pytest.mark.parametrize("method", ["cummin", "cummax"]) @pytest.mark.parametrize( diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 76ac86d798086..addd6c17809a2 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -349,7 +349,12 @@ def test_agg(): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) for t in cases: - result = t.aggregate([np.mean, np.std]) + warn = FutureWarning if t in cases[1:3] else None + with tm.assert_produces_warning( + warn, match="Dropping invalid columns", check_stacklevel=False + ): + # .var on dt64 column raises and is dropped + result = t.aggregate([np.mean, np.std]) tm.assert_frame_equal(result, expected) expected = pd.concat([a_mean, b_std], axis=1) diff --git a/pandas/tests/series/methods/test_sort_values.py b/pandas/tests/series/methods/test_sort_values.py index 67f986c0949ca..adc578d948163 100644 --- a/pandas/tests/series/methods/test_sort_values.py +++ b/pandas/tests/series/methods/test_sort_values.py @@ -51,7 +51,7 @@ def test_sort_values(self, datetime_series): expected = ts.sort_values(ascending=False, na_position="first") tm.assert_series_equal(expected, ordered) - msg = "ascending must be boolean" + msg = 'For argument "ascending" expected type bool, received type NoneType.' 
with pytest.raises(ValueError, match=msg): ts.sort_values(ascending=None) msg = r"Length of ascending \(0\) must be 1 for Series" @@ -63,7 +63,7 @@ def test_sort_values(self, datetime_series): msg = r"Length of ascending \(2\) must be 1 for Series" with pytest.raises(ValueError, match=msg): ts.sort_values(ascending=[False, False]) - msg = "ascending must be boolean" + msg = 'For argument "ascending" expected type bool, received type str.' with pytest.raises(ValueError, match=msg): ts.sort_values(ascending="foobar") @@ -206,6 +206,27 @@ def test_mergesort_decending_stability(self): expected = Series([3, 2, 1, 1], ["c", "b", "first", "second"]) tm.assert_series_equal(result, expected) + def test_sort_values_validate_ascending_for_value_error(self): + # GH41634 + ser = Series([23, 7, 21]) + + msg = 'For argument "ascending" expected type bool, received type str.' + with pytest.raises(ValueError, match=msg): + ser.sort_values(ascending="False") + + @pytest.mark.parametrize("ascending", [False, 0, 1, True]) + def test_sort_values_validate_ascending_functional(self, ascending): + # GH41634 + ser = Series([23, 7, 21]) + expected = np.sort(ser.values) + + sorted_ser = ser.sort_values(ascending=ascending) + if not ascending: + expected = expected[::-1] + + result = sorted_ser.values + tm.assert_numpy_array_equal(result, expected) + class TestSeriesSortingKey: def test_sort_values_key(self):
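Note on the new ``sort_values`` tests above: they exercise the ``validate_ascending`` helper that this patch threads through ``Series.sort_values`` and ``DataFrame.sort_values``. The sketch below reproduces the accepted and rejected inputs and the exact error message the tests match; the function name and internals are illustrative, not the actual pandas helper (which lives in ``pandas/util/_validators.py``):

def _validate_ascending_sketch(ascending):
    # bools pass, and ints such as 0/1 are allowed through unchanged;
    # anything else (None, str, ...) raises with the message asserted above.
    def check(value):
        if isinstance(value, (bool, int)):
            return value
        raise ValueError(
            f'For argument "ascending" expected type bool, '
            f"received type {type(value).__name__}."
        )

    if isinstance(ascending, (list, tuple)):
        return [check(v) for v in ascending]
    return check(ascending)

_validate_ascending_sketch(0)          # returns 0 (falsy, so descending)
_validate_ascending_sketch([True, 0])  # lists are validated element-wise
# _validate_ascending_sketch("False")  # ValueError with the message above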