Commit eb36d8c

Author: MarcoGorelli
Merge remote-tracking branch 'upstream/main' into allow-mixed-iso

2 parents: 2f66f87 + 33f4f7b

File tree: 10 files changed, +612 -63 lines

ci/code_checks.sh (+494)

Large diffs are not rendered by default.

doc/source/reference/window.rst (+3, -3)

@@ -6,9 +6,9 @@
 Window
 ======
 
-Rolling objects are returned by ``.rolling`` calls: :func:`pandas.DataFrame.rolling`, :func:`pandas.Series.rolling`, etc.
-Expanding objects are returned by ``.expanding`` calls: :func:`pandas.DataFrame.expanding`, :func:`pandas.Series.expanding`, etc.
-ExponentialMovingWindow objects are returned by ``.ewm`` calls: :func:`pandas.DataFrame.ewm`, :func:`pandas.Series.ewm`, etc.
+Rolling objects are returned by ``.rolling`` calls: :func:`pandas.DataFrame.rolling` and :func:`pandas.Series.rolling`.
+Expanding objects are returned by ``.expanding`` calls: :func:`pandas.DataFrame.expanding` and :func:`pandas.Series.expanding`.
+ExponentialMovingWindow objects are returned by ``.ewm`` calls: :func:`pandas.DataFrame.ewm` and :func:`pandas.Series.ewm`.
 
 .. _api.functions_rolling:
 
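
For readers skimming the API reference, a minimal illustration of the three entry points named above (standard pandas usage, not part of the diff):

    import pandas as pd

    ser = pd.Series([1.0, 2.0, 3.0, 4.0])

    roll = ser.rolling(window=2)  # returns a Rolling object
    expd = ser.expanding()        # returns an Expanding object
    ewm = ser.ewm(alpha=0.5)      # returns an ExponentialMovingWindow object

    # Aggregating a window object yields a Series:
    print(roll.mean().tolist())  # [nan, 1.5, 2.5, 3.5]
    print(expd.sum().tolist())   # [1.0, 3.0, 6.0, 10.0]
    print(type(ewm).__name__)    # ExponentialMovingWindow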

doc/source/whatsnew/v2.0.0.rst (+4)

@@ -908,6 +908,7 @@ Performance improvements
 - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`)
 - Performance improvement in :func:`to_datetime` when parsing strings with timezone offsets (:issue:`50107`)
 - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
+- Performance improvement for :meth:`Series.replace` with categorical dtype (:issue:`49404`)
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
 - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`)
 - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
@@ -952,6 +953,8 @@ Bug fixes
 Categorical
 ^^^^^^^^^^^
 - Bug in :meth:`Categorical.set_categories` losing dtype information (:issue:`48812`)
+- Bug in :meth:`Series.replace` with categorical dtype when ``to_replace`` values overlap with new values (:issue:`49404`)
+- Bug in :meth:`Series.replace` with categorical dtype losing nullable dtypes of underlying categories (:issue:`49404`)
 - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`)
 - Bug in :class:`Categorical` constructor when constructing from a :class:`Categorical` object and ``dtype="category"`` losing ordered-ness (:issue:`49309`)
 -
@@ -1132,6 +1135,7 @@ Groupby/resample/rolling
 - Bug in :class:`.DataFrameGroupBy` would raise when used with an empty DataFrame, categorical grouper, and ``dropna=False`` (:issue:`50634`)
 - Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`)
 - Bug in :meth:`.DataFrameGroupBy.resample` raises ``KeyError`` when getting the result from a key list when resampling on time index (:issue:`50840`)
+- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"ngroup"`` argument (:issue:`45986`)
 -
 
 Reshaping

pandas/core/arrays/arrow/array.py (+34, -2)

@@ -985,7 +985,18 @@ def _accumulate(
         pyarrow_meth = getattr(pc, pyarrow_name, None)
         if pyarrow_meth is None:
             return super()._accumulate(name, skipna=skipna, **kwargs)
-        result = pyarrow_meth(self._data, skip_nulls=skipna, **kwargs)
+
+        data_to_accum = self._data
+
+        pa_dtype = data_to_accum.type
+        if pa.types.is_duration(pa_dtype):
+            data_to_accum = data_to_accum.cast(pa.int64())
+
+        result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
+
+        if pa.types.is_duration(pa_dtype):
+            result = result.cast(pa_dtype)
+
         return type(self)(result)
 
     def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
@@ -1012,6 +1023,26 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
         ------
         TypeError : subclass does not define reductions
         """
+        pa_type = self._data.type
+
+        data_to_reduce = self._data
+
+        if name in ["any", "all"] and (
+            pa.types.is_integer(pa_type)
+            or pa.types.is_floating(pa_type)
+            or pa.types.is_duration(pa_type)
+        ):
+            # pyarrow only supports any/all for boolean dtype, we allow
+            # for other dtypes, matching our non-pyarrow behavior
+
+            if pa.types.is_duration(pa_type):
+                data_to_cmp = self._data.cast(pa.int64())
+            else:
+                data_to_cmp = self._data
+
+            not_eq = pc.not_equal(data_to_cmp, 0)
+            data_to_reduce = not_eq
+
         if name == "sem":
 
             def pyarrow_meth(data, skip_nulls, **kwargs):
@@ -1033,8 +1064,9 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
         if pyarrow_meth is None:
             # Let ExtensionArray._reduce raise the TypeError
             return super()._reduce(name, skipna=skipna, **kwargs)
+
         try:
-            result = pyarrow_meth(self._data, skip_nulls=skipna, **kwargs)
+            result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs)
         except (AttributeError, NotImplementedError, TypeError) as err:
             msg = (
                 f"'{type(self).__name__}' with dtype {self.dtype} "

pandas/core/arrays/categorical.py (+21, -40)

@@ -1137,14 +1137,9 @@ def remove_categories(self, removals):
         if not is_list_like(removals):
             removals = [removals]
 
-        removal_set = set(removals)
-        not_included = removal_set - set(self.dtype.categories)
-        new_categories = [c for c in self.dtype.categories if c not in removal_set]
-
-        # GH 10156
-        if any(isna(removals)):
-            not_included = {x for x in not_included if notna(x)}
-            new_categories = [x for x in new_categories if notna(x)]
+        removals = {x for x in set(removals) if notna(x)}
+        new_categories = self.dtype.categories.difference(removals)
+        not_included = removals.difference(self.dtype.categories)
 
         if len(not_included) != 0:
             raise ValueError(f"removals must all be in old categories: {not_included}")
@@ -2273,42 +2268,28 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
         return algorithms.isin(self.codes, code_values)
 
     def _replace(self, *, to_replace, value, inplace: bool = False):
+        from pandas import Index
+
         inplace = validate_bool_kwarg(inplace, "inplace")
         cat = self if inplace else self.copy()
 
-        # other cases, like if both to_replace and value are list-like or if
-        # to_replace is a dict, are handled separately in NDFrame
-        if not is_list_like(to_replace):
-            to_replace = [to_replace]
-
-        categories = cat.categories.tolist()
-        removals = set()
-        for replace_value in to_replace:
-            if value == replace_value:
-                continue
-            if replace_value not in cat.categories:
-                continue
-            if isna(value):
-                removals.add(replace_value)
-                continue
-
-            index = categories.index(replace_value)
-
-            if value in cat.categories:
-                value_index = categories.index(value)
-                cat._codes[cat._codes == index] = value_index
-                removals.add(replace_value)
-            else:
-                categories[index] = value
-                cat._set_categories(categories)
+        mask = isna(np.asarray(value))
+        if mask.any():
+            removals = np.asarray(to_replace)[mask]
+            removals = cat.categories[cat.categories.isin(removals)]
+            new_cat = cat.remove_categories(removals)
+            NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype)
 
-        if len(removals):
-            new_categories = [c for c in categories if c not in removals]
-            new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered)
-            codes = recode_for_categories(
-                cat.codes, cat.categories, new_dtype.categories
-            )
-            NDArrayBacked.__init__(cat, codes, new_dtype)
+        ser = cat.categories.to_series()
+        ser = ser.replace(to_replace=to_replace, value=value)
+
+        all_values = Index(ser)
+        new_categories = Index(ser.drop_duplicates(keep="first"))
+        new_codes = recode_for_categories(
+            cat._codes, all_values, new_categories, copy=False
+        )
+        new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered)
+        NDArrayBacked.__init__(cat, new_codes, new_dtype)
 
         if not inplace:
             return cat
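
Net effect of the rewrite, which the new tests below also exercise: replacement is now a single pass over the categories, so overlapping ``to_replace``/``value`` pairs no longer chain, and replacing with a null drops the category. A quick sketch:

    import pandas as pd

    ser = pd.Series([1, 2, 3], dtype="category")

    # Overlapping to_replace and value (GH 49404): each original value is
    # mapped once; 1 -> 2 is not then re-mapped by 2 -> 3.
    print(ser.replace([1, 2, 3], [2, 3, 4]).tolist())  # [2, 3, 4]

    # Replacing with a null removes the category (GH 50872, GH 46884).
    print(ser.replace(1, None).cat.categories.tolist())  # [2, 3]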

pandas/core/groupby/groupby.py (+1, -1)

@@ -3394,7 +3394,7 @@ def ngroup(self, ascending: bool = True):
         dtype: int64
         """
         with self._group_selection_context():
-            index = self._selected_obj.index
+            index = self._selected_obj._get_axis(self.axis)
             comp_ids = self.grouper.group_info[0]
 
             dtype: type
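
The one-line fix makes ``ngroup`` look up the grouped axis instead of always using the row index. A sketch of the fixed behavior, mirroring the test un-xfailed below:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])

    # With axis=1 the result is indexed by the columns (GH 45986);
    # previously this raised because ngroup always used df.index.
    print(df.groupby([0, 0, 1], axis=1).ngroup())
    # a    0
    # b    0
    # c    1
    # dtype: int64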

pandas/core/internals/blocks.py (+11, -5)

@@ -536,12 +536,10 @@ def replace(
 
         if isinstance(values, Categorical):
             # TODO: avoid special-casing
+            # GH49404
             blk = self if inplace else self.copy()
-            # error: Item "ExtensionArray" of "Union[ndarray[Any, Any],
-            # ExtensionArray]" has no attribute "_replace"
-            blk.values._replace(  # type: ignore[union-attr]
-                to_replace=to_replace, value=value, inplace=True
-            )
+            values = cast(Categorical, blk.values)
+            values._replace(to_replace=to_replace, value=value, inplace=True)
             return [blk]
 
         if not self._can_hold_element(to_replace):
@@ -651,6 +649,14 @@ def replace_list(
         """
         values = self.values
 
+        if isinstance(values, Categorical):
+            # TODO: avoid special-casing
+            # GH49404
+            blk = self if inplace else self.copy()
+            values = cast(Categorical, blk.values)
+            values._replace(to_replace=src_list, value=dest_list, inplace=True)
+            return [blk]
+
         # Exclude anything that we know we won't contain
         pairs = [
             (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
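
The second hunk routes list-to-list replacements through the same ``Categorical._replace`` fast path. A small sketch (toy values, not from the diff):

    import pandas as pd

    # List-to-list replacement on a categorical Series now dispatches to
    # Categorical._replace in one pass instead of element-wise block logic.
    ser = pd.Series(["a", "b", "c"], dtype="category")
    print(ser.replace(["a", "b"], ["b", "c"]).tolist())  # ['b', 'c', 'c']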

pandas/tests/arrays/categorical/test_replace.py (+13)

@@ -21,6 +21,11 @@
         ((5, 6), 2, [1, 2, 3], False),
         ([1], [2], [2, 2, 3], False),
         ([1, 4], [5, 2], [5, 2, 3], False),
+        # GH49404: overlap between to_replace and value
+        ([1, 2, 3], [2, 3, 4], [2, 3, 4], False),
+        # GH50872, GH46884: replace with null
+        (1, None, [None, 2, 3], False),
+        (1, pd.NA, [None, 2, 3], False),
         # check_categorical sorts categories, which crashes on mixed dtypes
         (3, "4", [1, 2, "4"], False),
         ([1, 2, "3"], "5", ["5", "5", 3], True),
@@ -65,3 +70,11 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg):
 
     pd.Series(cat).replace(to_replace, value, inplace=True)
     tm.assert_categorical_equal(cat, expected)
+
+
+def test_replace_categorical_ea_dtype():
+    # GH49404
+    cat = Categorical(pd.array(["a", "b"], dtype="string"))
+    result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values
+    expected = Categorical(pd.array(["c", pd.NA], dtype="string"))
+    tm.assert_categorical_equal(result, expected)

pandas/tests/extension/test_arrow.py (+31, -8)

@@ -372,16 +372,27 @@ def test_getitem_scalar(self, data):
 
 
 class TestBaseAccumulateTests(base.BaseAccumulateTests):
-    def check_accumulate(self, s, op_name, skipna):
-        result = getattr(s, op_name)(skipna=skipna).astype("Float64")
-        expected = getattr(s.astype("Float64"), op_name)(skipna=skipna)
+    def check_accumulate(self, ser, op_name, skipna):
+        result = getattr(ser, op_name)(skipna=skipna)
+
+        if ser.dtype.kind == "m":
+            # Just check that we match the integer behavior.
+            ser = ser.astype("int64[pyarrow]")
+            result = result.astype("int64[pyarrow]")
+
+        result = result.astype("Float64")
+        expected = getattr(ser.astype("Float64"), op_name)(skipna=skipna)
         self.assert_series_equal(result, expected, check_dtype=False)
 
     @pytest.mark.parametrize("skipna", [True, False])
     def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna):
         pa_type = data.dtype.pyarrow_dtype
         if (
-            (pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type))
+            (
+                pa.types.is_integer(pa_type)
+                or pa.types.is_floating(pa_type)
+                or pa.types.is_duration(pa_type)
+            )
             and all_numeric_accumulations == "cumsum"
             and not pa_version_under9p0
         ):
@@ -423,9 +434,7 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request):
                     raises=NotImplementedError,
                 )
             )
-        elif all_numeric_accumulations == "cumsum" and (
-            pa.types.is_duration(pa_type) or pa.types.is_boolean(pa_type)
-        ):
+        elif all_numeric_accumulations == "cumsum" and (pa.types.is_boolean(pa_type)):
            request.node.add_marker(
                pytest.mark.xfail(
                    reason=f"{all_numeric_accumulations} not implemented for {pa_type}",
@@ -566,10 +575,24 @@ def test_reduce_series(
                     f"pyarrow={pa.__version__} for {pa_dtype}"
                 ),
             )
-        if not pa.types.is_boolean(pa_dtype):
+        if pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype):
+            # We *might* want to make this behave like the non-pyarrow cases,
+            # but have not yet decided.
             request.node.add_marker(xfail_mark)
+
         op_name = all_boolean_reductions
         ser = pd.Series(data)
+
+        if pa.types.is_temporal(pa_dtype) and not pa.types.is_duration(pa_dtype):
+            # xref GH#34479 we support this in our non-pyarrow datetime64 dtypes,
+            # but it isn't obvious we _should_. For now, we keep the pyarrow
+            # behavior which does not support this.
+
+            with pytest.raises(TypeError, match="does not support reduction"):
+                getattr(ser, op_name)(skipna=skipna)
+
+            return
+
         result = getattr(ser, op_name)(skipna=skipna)
         assert result is (op_name == "any")

pandas/tests/groupby/transform/test_transform.py (-4)

@@ -164,10 +164,6 @@ def test_transform_broadcast(tsframe, ts):
 def test_transform_axis_1(request, transformation_func):
     # GH 36308
 
-    if transformation_func == "ngroup":
-        msg = "ngroup fails with axis=1: #45986"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
-
     df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])
     args = get_groupby_method_args(transformation_func, df)
     result = df.groupby([0, 0, 1], axis=1).transform(transformation_func, *args)
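
With the xfail removed, ``"ngroup"`` runs through the same parametrized test as the other transformation funcs. The equivalent standalone check would look roughly like this (a sketch; the expected frame is my reading of the broadcast semantics, not output copied from the test):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])

    # transform("ngroup") with axis=1 broadcasts each column's group number
    # across the frame instead of raising (GH 45986).
    result = df.groupby([0, 0, 1], axis=1).transform("ngroup")
    print(result)
    #    a  b  c
    # x  0  0  1
    # y  0  0  1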
