diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 6ca951e946bad..48380bd9b46b8 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -369,6 +369,18 @@ def time_category_size(self): self.draws.groupby(self.cats).size() +class Shift: + def setup(self): + N = 18 + self.df = DataFrame({"g": ["a", "b"] * 9, "v": list(range(N))}) + + def time_defaults(self): + self.df.groupby("g").shift() + + def time_fill_value(self): + self.df.groupby("g").shift(fill_value=99) + + class FillNA: def setup(self): N = 100 diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst index f4804215db8c1..94d62ae988f0c 100644 --- a/doc/source/whatsnew/v1.3.2.rst +++ b/doc/source/whatsnew/v1.3.2.rst @@ -18,6 +18,7 @@ Fixed regressions - Regression in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`) - Regression in :meth:`DataFrame.from_records` with empty records (:issue:`42456`) - Fixed regression in :meth:`DataFrame.shift` where TypeError occurred when shifting DataFrame created by concatenation of slices and fills with values (:issue:`42719`) +- Regression in :meth:`DataFrame.agg` when the ``func`` argument returned lists and ``axis=1`` (:issue:`42727`) - .. 
--------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 432dd46000eb3..e42360558d284 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -35,6 +35,7 @@ Other enhancements - Additional options added to :meth:`.Styler.bar` to control alignment and display, with keyword only arguments (:issue:`26070`, :issue:`36419`) - :meth:`Styler.bar` now validates the input argument ``width`` and ``height`` (:issue:`42511`) - :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`) +- Added ``sparse_index`` and ``sparse_columns`` keyword arguments to :meth:`.Styler.to_html` (:issue:`41946`) - Added keyword argument ``environment`` to :meth:`.Styler.to_latex` also allowing a specific "longtable" entry with a separate jinja2 template (:issue:`41866`) - @@ -168,6 +169,7 @@ Performance improvements - Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`) - Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`) - Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`) +- Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`) .. 
--------------------------------------------------------------------------- @@ -262,6 +264,7 @@ Groupby/resample/rolling - Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`) - Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`) - Bug in :meth:`DataFrame.groupby.rolling.var` would calculate the rolling variance only on the first group (:issue:`42442`) +- Bug in :meth:`GroupBy.shift` that would return the grouping columns if ``fill_value`` was not None (:issue:`41556`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/apply.py b/pandas/core/apply.py index ff3fc30b870dc..bb39e18caeaa2 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -690,21 +690,28 @@ def agg(self): obj = self.obj axis = self.axis + # TODO: Avoid having to change state + self.obj = self.obj if self.axis == 0 else self.obj.T + self.axis = 0 + + result = None + try: + result = super().agg() + except TypeError as err: + exc = TypeError( + "DataFrame constructor called with " + f"incompatible data and dtype: {err}" + ) + raise exc from err + finally: + self.obj = obj + self.axis = axis + if axis == 1: - result = FrameRowApply( - obj.T, - self.orig_f, - self.raw, - self.result_type, - self.args, - self.kwargs, - ).agg() result = result.T if result is not None else result - else: - result = super().agg() if result is None: - result = obj.apply(self.orig_f, axis, args=self.args, **self.kwargs) + result = self.obj.apply(self.orig_f, axis, args=self.args, **self.kwargs) return result diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 68d7f6c6f8a22..387df6c6a6b70 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -420,9 +420,9 @@ def 
extract_array( return obj._values return obj - obj = obj.array + obj = obj._values - if extract_numpy and isinstance(obj, ABCPandasArray): + elif extract_numpy and isinstance(obj, ABCPandasArray): obj = obj.to_numpy() return obj diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9c695148a75c0..939cff16bf1ae 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2822,6 +2822,7 @@ def _get_cythonized_result( result_is_index: bool = False, pre_processing=None, post_processing=None, + fill_value=None, **kwargs, ): """ @@ -2872,6 +2873,8 @@ def _get_cythonized_result( second argument, i.e. the signature should be (ndarray, Type). If `needs_nullable=True`, a third argument should be `nullable`, to allow for processing specific to nullable values. + fill_value : any, default None + The scalar value to use for newly introduced missing values. **kwargs : dict Extra arguments to be passed back to Cython funcs @@ -2896,7 +2899,7 @@ def _get_cythonized_result( grouper = self.grouper ids, _, ngroups = grouper.group_info - output: dict[base.OutputKey, np.ndarray] = {} + output: dict[base.OutputKey, ArrayLike] = {} base_func = getattr(libgroupby, how) base_func = partial(base_func, labels=ids) @@ -2911,6 +2914,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: else: result_sz = len(values) + result: ArrayLike result = np.zeros(result_sz, dtype=cython_dtype) if needs_2d: result = result.reshape((-1, 1)) @@ -2946,7 +2950,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: result = result.reshape(-1) if result_is_index: - result = algorithms.take_nd(values, result) + result = algorithms.take_nd(values, result, fill_value=fill_value) if post_processing: pp_kwargs = {} @@ -3022,7 +3026,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): tshift : Shift the time index, using the index’s frequency if available. 
""" - if freq is not None or axis != 0 or not isna(fill_value): + if freq is not None or axis != 0: return self.apply(lambda x: x.shift(periods, freq, axis, fill_value)) return self._get_cythonized_result( @@ -3032,6 +3036,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): needs_ngroups=True, result_is_index=True, periods=periods, + fill_value=fill_value, ) @final diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5d63fcdf7b0dc..adfecb946d822 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1920,11 +1920,11 @@ def get_block_type(values, dtype: DtypeObj | None = None): def new_block(values, placement, *, ndim: int, klass=None) -> Block: + # caller is responsible for ensuring values is NOT a PandasArray if not isinstance(placement, BlockPlacement): placement = BlockPlacement(placement) - values, _ = extract_pandas_array(values, None, ndim) check_ndim(values, placement, ndim) if klass is None: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 03bb47f3a6b6e..8937c2c107c62 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1779,11 +1779,6 @@ def create_block_manager_from_blocks( return mgr -# We define this here so we can override it in tests.extension.test_numpy -def _extract_array(obj): - return extract_array(obj, extract_numpy=True) - - def create_block_manager_from_arrays( arrays, names: Index, @@ -1795,7 +1790,7 @@ def create_block_manager_from_arrays( # assert isinstance(axes, list) # assert all(isinstance(x, Index) for x in axes) - arrays = [_extract_array(x) for x in arrays] + arrays = [extract_array(x, extract_numpy=True) for x in arrays] try: blocks = _form_blocks(arrays, names, axes, consolidate) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index c5d06bcef72a4..656d38a50f77f 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -418,7 +418,11 
@@ def _bins_to_cuts( bins = unique_bins side = "left" if right else "right" - ids = ensure_platform_int(bins.searchsorted(x, side=side)) + # error: No overload variant of "searchsorted" of "ndarray" matches + # argument types "Any", "str" + ids = ensure_platform_int( + bins.searchsorted(x, side=side) # type: ignore[call-overload] + ) if include_lowest: ids[np.asarray(x) == bins[0]] = 1 diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 717287360df8f..ef01602be7654 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -34,6 +34,7 @@ from pandas.core.dtypes.missing import isna from pandas.core.base import NoNewAttributesMixin +from pandas.core.construction import extract_array if TYPE_CHECKING: from pandas import ( @@ -213,10 +214,7 @@ def _validate(data): # see _libs/lib.pyx for list of inferred types allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] - # TODO: avoid kludge for tests.extension.test_numpy - from pandas.core.internals.managers import _extract_array - - data = _extract_array(data) + data = extract_array(data) values = getattr(data, "categories", data) # categorical / normal diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 9dade82e9809c..af1c0ca34ec0f 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -476,8 +476,8 @@ def to_latex( Defaults to ``pandas.options.styler.sparse.index`` value. sparse_columns : bool, optional Whether to sparsify the display of a hierarchical index. Setting to False - will display each explicit level element in a hierarchical key for each row. - Defaults to ``pandas.options.styler.sparse.columns`` value. + will display each explicit level element in a hierarchical key for each + column. Defaults to ``pandas.options.styler.sparse.columns`` value. multirow_align : {"c", "t", "b"} If sparsifying hierarchical MultiIndexes whether to align text centrally, at the top or bottom. 
@@ -815,6 +815,8 @@ def to_html( *, table_uuid: str | None = None, table_attributes: str | None = None, + sparse_index: bool | None = None, + sparse_columns: bool | None = None, encoding: str | None = None, doctype_html: bool = False, exclude_styles: bool = False, @@ -840,6 +842,18 @@ def to_html( ``<table .. <table_attributes> >`` If not given defaults to Styler's preexisting value. + sparse_index : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each row. + Defaults to ``pandas.options.styler.sparse.index`` value. + + .. versionadded:: 1.4.0 + sparse_columns : bool, optional + Whether to sparsify the display of a hierarchical index. Setting to False + will display each explicit level element in a hierarchical key for each + column. Defaults to ``pandas.options.styler.sparse.columns`` value. + + .. versionadded:: 1.4.0 encoding : str, optional Character encoding setting for file output, and HTML meta tags, defaults to "utf-8" if None. @@ -866,8 +880,15 @@ def to_html( if table_attributes: self.set_table_attributes(table_attributes) + if sparse_index is None: + sparse_index = get_option("styler.sparse.index") + if sparse_columns is None: + sparse_columns = get_option("styler.sparse.columns") + # Build HTML string.. 
- html = self.render( + html = self._render_html( + sparse_index=sparse_index, + sparse_columns=sparse_columns, exclude_styles=exclude_styles, encoding=encoding if encoding else "utf-8", doctype_html=doctype_html, diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 801cbdf3d0a87..62983b5327a26 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -644,13 +644,14 @@ def test_apply_dup_names_multi_agg(): tm.assert_frame_equal(result, expected) -def test_apply_nested_result_axis_1(): +@pytest.mark.parametrize("op", ["apply", "agg"]) +def test_apply_nested_result_axis_1(op): # GH 13820 def apply_list(row): return [2 * row["A"], 2 * row["C"], 2 * row["B"]] df = DataFrame(np.zeros((4, 4)), columns=list("ABCD")) - result = df.apply(apply_list, axis=1) + result = getattr(df, op)(apply_list, axis=1) expected = Series( [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] ) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index a680ae5cd695c..ed26bf6481bd9 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -23,32 +23,17 @@ ExtensionDtype, PandasDtype, ) -from pandas.core.dtypes.generic import ABCPandasArray import pandas as pd import pandas._testing as tm from pandas.core.arrays.numpy_ import PandasArray -from pandas.core.internals import ( - blocks, - managers, -) +from pandas.core.internals import blocks from pandas.tests.extension import base # TODO(ArrayManager) PandasArray pytestmark = td.skip_array_manager_not_yet_implemented -def _extract_array_patched(obj): - if isinstance(obj, (pd.Index, pd.Series)): - obj = obj._values - if isinstance(obj, ABCPandasArray): - # TODO for reasons unclear, we get here in a couple of tests - # with PandasArray._typ *not* patched - obj = obj.to_numpy() - - return obj - - def _can_hold_element_patched(obj, element) -> bool: if isinstance(element, 
PandasArray): element = element.to_numpy() @@ -98,7 +83,6 @@ def allow_in_pandas(monkeypatch): """ with monkeypatch.context() as m: m.setattr(PandasArray, "_typ", "extension") - m.setattr(managers, "_extract_array", _extract_array_patched) m.setattr(blocks, "can_hold_element", _can_hold_element_patched) m.setattr(tm.asserters, "assert_attr_equal", _assert_attr_equal) yield diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py index c6f3e7618e3f7..e9517b4544f0b 100644 --- a/pandas/tests/groupby/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/test_groupby_shift_diff.py @@ -55,7 +55,7 @@ def test_group_shift_with_fill_value(): columns=["Z"], index=None, ) - result = g.shift(-1, fill_value=0)[["Z"]] + result = g.shift(-1, fill_value=0) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 38a6209283080..362252e1a6b72 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1376,9 +1376,11 @@ def test_make_block_no_pandas_array(block_maker): # PandasArray, no dtype result = block_maker(arr, slice(len(arr)), ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] - assert result.is_extension is False if block_maker is make_block: + # new_block requires caller to unwrap PandasArray + assert result.is_extension is False + # PandasArray, PandasDtype result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] diff --git a/pandas/tests/io/formats/style/test_html.py b/pandas/tests/io/formats/style/test_html.py index 4e71cb4c46626..2657370bf8258 100644 --- a/pandas/tests/io/formats/style/test_html.py +++ b/pandas/tests/io/formats/style/test_html.py @@ -6,6 +6,7 @@ from pandas import ( DataFrame, MultiIndex, + option_context, ) jinja2 = pytest.importorskip("jinja2") @@ -429,3 +430,24 @@ def test_sticky_levels(styler_mi, index, 
columns): def test_sticky_raises(styler): with pytest.raises(ValueError, match="`axis` must be"): styler.set_sticky(axis="bad") + + +@pytest.mark.parametrize( + "sparse_index, sparse_columns", + [(True, True), (True, False), (False, True), (False, False)], +) +def test_sparse_options(sparse_index, sparse_columns): + cidx = MultiIndex.from_tuples([("Z", "a"), ("Z", "b"), ("Y", "c")]) + ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")]) + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=ridx, columns=cidx) + styler = df.style + + default_html = styler.to_html() # defaults under pd.options to (True , True) + + with option_context( + "styler.sparse.index", sparse_index, "styler.sparse.columns", sparse_columns + ): + html1 = styler.to_html() + assert (html1 == default_html) is (sparse_index and sparse_columns) + html2 = styler.to_html(sparse_index=sparse_index, sparse_columns=sparse_columns) + assert html1 == html2