Skip to content

Commit 340a55d

Browse files
authored
Merge branch 'main' into groupby.expanding_doc
2 parents 7daa3c0 + 183b327 commit 340a55d

File tree

19 files changed

+269
-305
lines changed

19 files changed

+269
-305
lines changed

doc/source/reference/groupby.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ Function application
7979
DataFrameGroupBy.cumsum
8080
DataFrameGroupBy.describe
8181
DataFrameGroupBy.diff
82+
DataFrameGroupBy.ewm
8283
DataFrameGroupBy.expanding
8384
DataFrameGroupBy.ffill
8485
DataFrameGroupBy.first
@@ -131,6 +132,7 @@ Function application
131132
SeriesGroupBy.cumsum
132133
SeriesGroupBy.describe
133134
SeriesGroupBy.diff
135+
SeriesGroupBy.ewm
134136
SeriesGroupBy.expanding
135137
SeriesGroupBy.ffill
136138
SeriesGroupBy.first

doc/source/whatsnew/v3.0.0.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,7 @@ Other Deprecations
421421
- Deprecated lowercase strings ``w``, ``w-mon``, ``w-tue``, etc. denoting frequencies in :class:`Week` in favour of ``W``, ``W-MON``, ``W-TUE``, etc. (:issue:`58998`)
422422
- Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`)
423423
- Deprecated strings ``w``, ``d``, ``MIN``, ``MS``, ``US`` and ``NS`` denoting units in :class:`Timedelta` in favour of ``W``, ``D``, ``min``, ``ms``, ``us`` and ``ns`` (:issue:`59051`)
424+
- Deprecated the ``arg`` parameter of ``Series.map``; pass the added ``func`` argument instead. (:issue:`61260`)
424425
- Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`)
425426

426427
.. ---------------------------------------------------------------------------
@@ -622,6 +623,7 @@ Performance improvements
622623
- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`)
623624
- Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`)
624625
- Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`)
626+
- Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame does not have a :class:`MultiIndex` (:issue:`58391`)
625627
- Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`)
626628
- Performance improvement in :meth:`to_hdf`, avoiding unnecessary reopenings of the HDF5 file to speed up data addition to files with a very large number of groups (:issue:`58248`)
627629
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
@@ -637,6 +639,7 @@ Bug fixes
637639
Categorical
638640
^^^^^^^^^^^
639641
- Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
642+
- Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`)
640643
- Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`)
641644
-
642645

@@ -649,6 +652,7 @@ Datetimelike
649652
- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
650653
- Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
651654
- Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
655+
- Bug in :meth:`DataFrame.fillna` raising an ``AssertionError`` instead of ``OutOfBoundsDatetime`` when filling a ``datetime64[ns]`` column with an out-of-bounds timestamp. Now correctly raises ``OutOfBoundsDatetime``. (:issue:`61208`)
652656
- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` casting ``datetime64`` and ``timedelta64`` columns to ``float64`` and losing precision (:issue:`60850`)
653657
- Bug in :meth:`DataFrame.agg` with a DataFrame containing missing values resulting in an ``IndexError`` (:issue:`58810`)
654658
- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` not raising on custom business day frequencies bigger than ``1C`` (:issue:`58664`)

pandas/__init__.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,17 @@
44

55
# Let users know if they're missing any of our hard dependencies
66
_hard_dependencies = ("numpy", "dateutil")
7-
_missing_dependencies = []
87

98
for _dependency in _hard_dependencies:
109
try:
1110
__import__(_dependency)
1211
except ImportError as _e: # pragma: no cover
13-
_missing_dependencies.append(f"{_dependency}: {_e}")
12+
raise ImportError(
13+
f"Unable to import required dependency {_dependency}. "
14+
"Please see the traceback for details."
15+
) from _e
1416

15-
if _missing_dependencies: # pragma: no cover
16-
raise ImportError(
17-
"Unable to import required dependencies:\n" + "\n".join(_missing_dependencies)
18-
)
19-
del _hard_dependencies, _dependency, _missing_dependencies
17+
del _hard_dependencies, _dependency
2018

2119
try:
2220
# numpy compat

pandas/core/arrays/categorical.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,7 @@ def __init__(
452452
if isinstance(values, Index):
453453
arr = values._data._pa_array.combine_chunks()
454454
else:
455-
arr = values._pa_array.combine_chunks()
455+
arr = extract_array(values)._pa_array.combine_chunks()
456456
categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype)
457457
codes = arr.indices.to_numpy()
458458
dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)

pandas/core/groupby/groupby.py

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3866,15 +3866,79 @@ def expanding(self, *args, **kwargs) -> ExpandingGroupby:
38663866
)
38673867

38683868
@final
3869-
@Substitution(name="groupby")
3870-
@Appender(_common_see_also)
38713869
def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby:
38723870
"""
38733871
Return an ewm grouper, providing ewm functionality per group.
38743872
3873+
Parameters
3874+
----------
3875+
*args : tuple
3876+
Positional arguments passed to the EWM window constructor.
3877+
**kwargs : dict
3878+
Keyword arguments passed to the EWM window constructor, such as:
3879+
3880+
com : float, optional
3881+
Specify decay in terms of center of mass.
3882+
``span``, ``halflife``, and ``alpha`` are alternative ways to specify
3883+
decay.
3884+
span : float, optional
3885+
Specify decay in terms of span.
3886+
halflife : float, optional
3887+
Specify decay in terms of half-life.
3888+
alpha : float, optional
3889+
Specify smoothing factor directly.
3890+
min_periods : int, default 0
3891+
Minimum number of observations in the window required to have a value;
3892+
otherwise, result is ``np.nan``.
3893+
adjust : bool, default True
3894+
Divide by decaying adjustment factor to account for imbalance in
3895+
relative weights.
3896+
ignore_na : bool, default False
3897+
Ignore missing values when calculating weights.
3898+
times : str or array-like of datetime64, optional
3899+
Times corresponding to the observations.
3900+
axis : {0 or 'index', 1 or 'columns'}, default 0
3901+
Axis along which the EWM function is applied.
3902+
38753903
Returns
38763904
-------
38773905
pandas.api.typing.ExponentialMovingWindowGroupby
3906+
An object that supports exponentially weighted moving transformations over
3907+
each group.
3908+
3909+
See Also
3910+
--------
3911+
Series.ewm : EWM transformations for Series.
3912+
DataFrame.ewm : EWM transformations for DataFrames.
3913+
Series.groupby : Apply a function groupby to a Series.
3914+
DataFrame.groupby : Apply a function groupby.
3915+
3916+
Examples
3917+
--------
3918+
>>> df = pd.DataFrame(
3919+
... {
3920+
... "Class": ["A", "A", "A", "B", "B", "B"],
3921+
... "Value": [10, 20, 30, 40, 50, 60],
3922+
... }
3923+
... )
3924+
>>> df
3925+
Class Value
3926+
0 A 10
3927+
1 A 20
3928+
2 A 30
3929+
3 B 40
3930+
4 B 50
3931+
5 B 60
3932+
3933+
>>> df.groupby("Class").ewm(com=0.5).mean()
3934+
Value
3935+
Class
3936+
A 0 10.000000
3937+
1 17.500000
3938+
2 26.153846
3939+
B 3 40.000000
3940+
4 47.500000
3941+
5 56.153846
38783942
"""
38793943
from pandas.core.window import ExponentialMovingWindowGroupby
38803944

pandas/core/internals/blocks.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1679,6 +1679,8 @@ def where(self, other, cond) -> list[Block]:
16791679

16801680
try:
16811681
res_values = arr._where(cond, other).T
1682+
except OutOfBoundsDatetime:
1683+
raise
16821684
except (ValueError, TypeError):
16831685
if self.ndim == 1 or self.shape[0] == 1:
16841686
if isinstance(self.dtype, (IntervalDtype, StringDtype)):
@@ -1746,6 +1748,8 @@ def putmask(self, mask, new) -> list[Block]:
17461748
try:
17471749
# Caller is responsible for ensuring matching lengths
17481750
values._putmask(mask, new)
1751+
except OutOfBoundsDatetime:
1752+
raise
17491753
except (TypeError, ValueError):
17501754
if self.ndim == 1 or self.shape[0] == 1:
17511755
if isinstance(self.dtype, IntervalDtype):

pandas/core/reshape/reshape.py

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -936,7 +936,20 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
936936
[k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels]
937937
)
938938

939-
result = stack_reshape(frame, level, set_levels, stack_cols)
939+
result: Series | DataFrame
940+
if not isinstance(frame.columns, MultiIndex):
941+
# GH#58817 Fast path when we're stacking the columns of a non-MultiIndex.
942+
# When columns are homogeneous EAs, we pass through object
943+
# dtype but this is still slightly faster than the normal path.
944+
if len(frame.columns) > 0 and frame._is_homogeneous_type:
945+
dtype = frame._mgr.blocks[0].dtype
946+
else:
947+
dtype = None
948+
result = frame._constructor_sliced(
949+
frame._values.reshape(-1, order="F"), dtype=dtype
950+
)
951+
else:
952+
result = stack_reshape(frame, level, set_levels, stack_cols)
940953

941954
# Construct the correct MultiIndex by combining the frame's index and
942955
# stacked columns.
@@ -1018,6 +1031,8 @@ def stack_reshape(
10181031
-------
10191032
The data of behind the stacked DataFrame.
10201033
"""
1034+
# non-MultIndex takes a fast path.
1035+
assert isinstance(frame.columns, MultiIndex)
10211036
# If we need to drop `level` from columns, it needs to be in descending order
10221037
drop_levnums = sorted(level, reverse=True)
10231038

@@ -1027,18 +1042,14 @@ def stack_reshape(
10271042
if len(frame.columns) == 1:
10281043
data = frame.copy(deep=False)
10291044
else:
1030-
if not isinstance(frame.columns, MultiIndex) and not isinstance(idx, tuple):
1031-
# GH#57750 - if the frame is an Index with tuples, .loc below will fail
1032-
column_indexer = idx
1033-
else:
1034-
# Take the data from frame corresponding to this idx value
1035-
if len(level) == 1:
1036-
idx = (idx,)
1037-
gen = iter(idx)
1038-
column_indexer = tuple(
1039-
next(gen) if k in set_levels else slice(None)
1040-
for k in range(frame.columns.nlevels)
1041-
)
1045+
# Take the data from frame corresponding to this idx value
1046+
if len(level) == 1:
1047+
idx = (idx,)
1048+
gen = iter(idx)
1049+
column_indexer = tuple(
1050+
next(gen) if k in set_levels else slice(None)
1051+
for k in range(frame.columns.nlevels)
1052+
)
10421053
data = frame.loc[:, column_indexer]
10431054

10441055
if len(level) < frame.columns.nlevels:

pandas/core/series.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@
5252
doc,
5353
set_module,
5454
)
55+
from pandas.util._exceptions import (
56+
find_stack_level,
57+
)
5558
from pandas.util._validators import (
5659
validate_ascending,
5760
validate_bool_kwarg,
@@ -4320,7 +4323,7 @@ def unstack(
43204323

43214324
def map(
43224325
self,
4323-
arg: Callable | Mapping | Series,
4326+
func: Callable | Mapping | Series | None = None,
43244327
na_action: Literal["ignore"] | None = None,
43254328
**kwargs,
43264329
) -> Series:
@@ -4333,8 +4336,8 @@ def map(
43334336
43344337
Parameters
43354338
----------
4336-
arg : function, collections.abc.Mapping subclass or Series
4337-
Mapping correspondence.
4339+
func : function, collections.abc.Mapping subclass or Series
4340+
Function or mapping correspondence.
43384341
na_action : {None, 'ignore'}, default None
43394342
If 'ignore', propagate NaN values, without passing them to the
43404343
mapping correspondence.
@@ -4404,9 +4407,22 @@ def map(
44044407
3 I am a rabbit
44054408
dtype: object
44064409
"""
4407-
if callable(arg):
4408-
arg = functools.partial(arg, **kwargs)
4409-
new_values = self._map_values(arg, na_action=na_action)
4410+
if func is None:
4411+
if "arg" in kwargs:
4412+
# `.map(arg=my_func)`
4413+
func = kwargs.pop("arg")
4414+
warnings.warn(
4415+
"The parameter `arg` has been renamed to `func`, and it "
4416+
"will stop being supported in a future version of pandas.",
4417+
FutureWarning,
4418+
stacklevel=find_stack_level(),
4419+
)
4420+
else:
4421+
raise ValueError("The `func` parameter is required")
4422+
4423+
if callable(func):
4424+
func = functools.partial(func, **kwargs)
4425+
new_values = self._map_values(func, na_action=na_action)
44104426
return self._constructor(new_values, index=self.index, copy=False).__finalize__(
44114427
self, method="map"
44124428
)

pandas/tests/extension/base/reshaping.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import numpy as np
44
import pytest
55

6+
from pandas.core.dtypes.dtypes import NumpyEADtype
7+
68
import pandas as pd
79
import pandas._testing as tm
810
from pandas.api.extensions import ExtensionArray
@@ -266,7 +268,13 @@ def test_stack(self, data, columns, future_stack):
266268
expected = expected.astype(object)
267269

268270
if isinstance(expected, pd.Series):
269-
assert result.dtype == df.iloc[:, 0].dtype
271+
if future_stack and isinstance(data.dtype, NumpyEADtype):
272+
# GH#58817 future_stack=True constructs the result specifying the dtype
273+
# using the dtype of the input; we thus get the underlying
274+
# NumPy dtype as the result instead of the NumpyExtensionArray
275+
assert result.dtype == df.iloc[:, 0].to_numpy().dtype
276+
else:
277+
assert result.dtype == df.iloc[:, 0].dtype
270278
else:
271279
assert all(result.dtypes == df.iloc[:, 0].dtype)
272280

pandas/tests/frame/methods/test_fillna.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import numpy as np
22
import pytest
33

4+
from pandas.errors import OutOfBoundsDatetime
5+
46
from pandas import (
57
Categorical,
68
DataFrame,
@@ -781,3 +783,15 @@ def test_fillna_with_none_object(test_frame, dtype):
781783
if test_frame:
782784
expected = expected.to_frame()
783785
tm.assert_equal(result, expected)
786+
787+
788+
def test_fillna_out_of_bounds_datetime():
789+
# GH#61208
790+
df = DataFrame(
791+
{"datetime": date_range("1/1/2011", periods=3, freq="h"), "value": [1, 2, 3]}
792+
)
793+
df.iloc[0, 0] = None
794+
795+
msg = "Cannot cast 0001-01-01 00:00:00 to unit='ns' without overflow"
796+
with pytest.raises(OutOfBoundsDatetime, match=msg):
797+
df.fillna(Timestamp("0001-01-01"))

pandas/tests/reshape/test_pivot.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import pandas as pd
1717
from pandas import (
18+
ArrowDtype,
1819
Categorical,
1920
DataFrame,
2021
Grouper,
@@ -2851,3 +2852,31 @@ def test_pivot_margins_with_none_index(self):
28512852
),
28522853
)
28532854
tm.assert_frame_equal(result, expected)
2855+
2856+
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
2857+
def test_pivot_with_pyarrow_categorical(self):
2858+
# GH#53051
2859+
pa = pytest.importorskip("pyarrow")
2860+
2861+
df = DataFrame(
2862+
{"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]}
2863+
).astype(
2864+
{
2865+
"string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())),
2866+
"number_column": "float[pyarrow]",
2867+
}
2868+
)
2869+
2870+
df = df.pivot(columns=["string_column"], values=["number_column"])
2871+
2872+
multi_index = MultiIndex.from_arrays(
2873+
[["number_column", "number_column", "number_column"], ["A", "B", "C"]],
2874+
names=(None, "string_column"),
2875+
)
2876+
df_expected = DataFrame(
2877+
[[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 3.0]],
2878+
columns=multi_index,
2879+
)
2880+
tm.assert_frame_equal(
2881+
df, df_expected, check_dtype=False, check_column_type=False
2882+
)

0 commit comments

Comments
 (0)