Skip to content

Commit 6119771

Browse files
Merge branch 'main' into triage-doc-changes
2 parents bd096c0 + 6a83910 commit 6119771

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

58 files changed

+520
-119
lines changed

asv_bench/benchmarks/algorithms.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from importlib import import_module
22

33
import numpy as np
4+
import pyarrow as pa
45

56
import pandas as pd
67

@@ -72,7 +73,16 @@ class Duplicated:
7273
params = [
7374
[True, False],
7475
["first", "last", False],
75-
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
76+
[
77+
"int",
78+
"uint",
79+
"float",
80+
"string",
81+
"datetime64[ns]",
82+
"datetime64[ns, tz]",
83+
"timestamp[ms][pyarrow]",
84+
"duration[s][pyarrow]",
85+
],
7686
]
7787
param_names = ["unique", "keep", "dtype"]
7888

@@ -87,6 +97,12 @@ def setup(self, unique, keep, dtype):
8797
"datetime64[ns, tz]": pd.date_range(
8898
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
8999
),
100+
"timestamp[ms][pyarrow]": pd.Index(
101+
np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
102+
),
103+
"duration[s][pyarrow]": pd.Index(
104+
np.arange(N), dtype=pd.ArrowDtype(pa.duration("s"))
105+
),
90106
}[dtype]
91107
if not unique:
92108
data = data.repeat(5)

doc/source/reference/extensions.rst

+1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ objects.
4949
api.extensions.ExtensionArray.copy
5050
api.extensions.ExtensionArray.view
5151
api.extensions.ExtensionArray.dropna
52+
api.extensions.ExtensionArray.duplicated
5253
api.extensions.ExtensionArray.equals
5354
api.extensions.ExtensionArray.factorize
5455
api.extensions.ExtensionArray.fillna

doc/source/user_guide/10min.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -451,7 +451,7 @@ Merge
451451
Concat
452452
~~~~~~
453453

454-
pandas provides various facilities for easily combining together :class:`Series`` and
454+
pandas provides various facilities for easily combining together :class:`Series` and
455455
:class:`DataFrame` objects with various kinds of set logic for the indexes
456456
and relational algebra functionality in the case of join / merge-type
457457
operations.

doc/source/user_guide/enhancingperf.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ can be improved by passing an ``np.ndarray``.
184184
...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b,
185185
...: np.ndarray col_N):
186186
...: assert (col_a.dtype == np.float64
187-
...: and col_b.dtype == np.float64 and col_N.dtype == np.int_)
187+
...: and col_b.dtype == np.float64 and col_N.dtype == np.dtype(int))
188188
...: cdef Py_ssize_t i, n = len(col_N)
189189
...: assert (len(col_a) == len(col_b) == n)
190190
...: cdef np.ndarray[double] res = np.empty(n)

doc/source/whatsnew/v2.1.2.rst

+7
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,15 @@ Fixed regressions
2222

2323
Bug fixes
2424
~~~~~~~~~
25+
- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
26+
- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)
27+
- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
28+
- Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`)
2529
- Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
2630
- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
31+
- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`)
32+
- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`)
33+
- Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`)
2734
-
2835

2936
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v2.2.0.rst

+33-3
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ Other enhancements
7676

7777
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
7878
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
79+
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
7980
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
8081
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
8182
-
@@ -132,10 +133,36 @@ and ``sort=False``:
132133
133134
result
134135
135-
.. _whatsnew_220.notable_bug_fixes.notable_bug_fix2:
136+
.. _whatsnew_220.notable_bug_fixes.multiindex_join_different_levels:
136137

137-
notable_bug_fix2
138-
^^^^^^^^^^^^^^^^
138+
:func:`merge` and :meth:`DataFrame.join` no longer reorder levels when levels differ
139+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
140+
141+
In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` would reorder
142+
index levels when joining on two indexes with different levels (:issue:`34133`).
143+
144+
.. ipython:: python
145+
146+
left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"]))
147+
right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"]))
148+
result = left.join(right)
149+
150+
*Old Behavior*
151+
152+
.. code-block:: ipython
153+
154+
In [5]: result
155+
Out[5]:
156+
left right
157+
B A C
158+
1 x 1 1 2
159+
2 x 2 1 2
160+
161+
*New Behavior*
162+
163+
.. ipython:: python
164+
165+
result
139166
140167
.. ---------------------------------------------------------------------------
141168
.. _whatsnew_220.api_breaking:
@@ -241,6 +268,7 @@ Performance improvements
241268
- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
242269
- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
243270
- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
271+
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
244272
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
245273
- Performance improvement when localizing time to UTC (:issue:`55241`)
246274

@@ -252,6 +280,7 @@ Bug fixes
252280
- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
253281
- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
254282
- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
283+
- Bug in :meth:`pandas.DataFrame.melt` where it would not preserve the datetime (:issue:`55254`)
255284
- Bug in :meth:`pandas.read_excel` with a ODS file without cached formatted cell for float values (:issue:`55219`)
256285

257286
Categorical
@@ -339,6 +368,7 @@ Reshaping
339368
^^^^^^^^^
340369
- Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`)
341370
- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
371+
-
342372

343373
Sparse
344374
^^^^^^

pandas/compat/numpy/__init__.py

+15
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,21 @@
2121
)
2222

2323

24+
np_long: type
25+
np_ulong: type
26+
27+
if _nlv >= Version("2.0.0.dev0"):
28+
try:
29+
np_long = np.long # type: ignore[attr-defined]
30+
np_ulong = np.ulong # type: ignore[attr-defined]
31+
except AttributeError:
32+
np_long = np.int_
33+
np_ulong = np.uint
34+
else:
35+
np_long = np.int_
36+
np_ulong = np.uint
37+
38+
2439
__all__ = [
2540
"np",
2641
"_np_version",

pandas/core/algorithms.py

+8-13
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@
5555
)
5656
from pandas.core.dtypes.concat import concat_compat
5757
from pandas.core.dtypes.dtypes import (
58-
ArrowDtype,
5958
BaseMaskedDtype,
6059
CategoricalDtype,
6160
ExtensionDtype,
@@ -979,36 +978,32 @@ def value_counts_arraylike(
979978

980979

981980
def duplicated(
982-
values: ArrayLike, keep: Literal["first", "last", False] = "first"
981+
values: ArrayLike,
982+
keep: Literal["first", "last", False] = "first",
983+
mask: npt.NDArray[np.bool_] | None = None,
983984
) -> npt.NDArray[np.bool_]:
984985
"""
985986
Return boolean ndarray denoting duplicate values.
986987
987988
Parameters
988989
----------
989-
values : nd.array, ExtensionArray or Series
990+
values : np.ndarray or ExtensionArray
990991
Array over which to check for duplicate values.
991992
keep : {'first', 'last', False}, default 'first'
992993
- ``first`` : Mark duplicates as ``True`` except for the first
993994
occurrence.
994995
- ``last`` : Mark duplicates as ``True`` except for the last
995996
occurrence.
996997
- False : Mark all duplicates as ``True``.
998+
mask : ndarray[bool], optional
999+
array indicating which elements to exclude from checking
9971000
9981001
Returns
9991002
-------
10001003
duplicated : ndarray[bool]
10011004
"""
1002-
if hasattr(values, "dtype"):
1003-
if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub":
1004-
values = values._to_masked() # type: ignore[union-attr]
1005-
1006-
if isinstance(values.dtype, BaseMaskedDtype):
1007-
values = cast("BaseMaskedArray", values)
1008-
return htable.duplicated(values._data, keep=keep, mask=values._mask)
1009-
10101005
values = _ensure_data(values)
1011-
return htable.duplicated(values, keep=keep)
1006+
return htable.duplicated(values, keep=keep, mask=mask)
10121007

10131008

10141009
def mode(
@@ -1641,7 +1636,7 @@ def safe_sort(
16411636
else:
16421637
mask = None
16431638
else:
1644-
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
1639+
reverse_indexer = np.empty(len(sorter), dtype=int)
16451640
reverse_indexer.put(sorter, np.arange(len(sorter)))
16461641
# Out of bound indices will be masked with `-1` next, so we
16471642
# may deal with them here without performance loss using `mode='wrap'`

pandas/core/arrays/arrow/array.py

+68-10
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,12 @@
3030
from pandas.util._decorators import doc
3131
from pandas.util._validators import validate_fillna_kwargs
3232

33-
from pandas.core.dtypes.cast import can_hold_element
33+
from pandas.core.dtypes.cast import (
34+
can_hold_element,
35+
infer_dtype_from_scalar,
36+
)
3437
from pandas.core.dtypes.common import (
38+
CategoricalDtype,
3539
is_array_like,
3640
is_bool_dtype,
3741
is_integer,
@@ -42,6 +46,7 @@
4246
from pandas.core.dtypes.missing import isna
4347

4448
from pandas.core import (
49+
algorithms as algos,
4550
missing,
4651
roperator,
4752
)
@@ -627,7 +632,9 @@ def __setstate__(self, state) -> None:
627632

628633
def _cmp_method(self, other, op):
629634
pc_func = ARROW_CMP_FUNCS[op.__name__]
630-
if isinstance(other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)):
635+
if isinstance(
636+
other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)
637+
) or isinstance(getattr(other, "dtype", None), CategoricalDtype):
631638
result = pc_func(self._pa_array, self._box_pa(other))
632639
elif is_scalar(other):
633640
try:
@@ -1289,6 +1296,30 @@ def to_numpy(
12891296
result[~mask] = data[~mask]._pa_array.to_numpy()
12901297
return result
12911298

1299+
@doc(ExtensionArray.duplicated)
1300+
def duplicated(
1301+
self, keep: Literal["first", "last", False] = "first"
1302+
) -> npt.NDArray[np.bool_]:
1303+
pa_type = self._pa_array.type
1304+
if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type):
1305+
values = self.to_numpy(na_value=0)
1306+
elif pa.types.is_boolean(pa_type):
1307+
values = self.to_numpy(na_value=False)
1308+
elif pa.types.is_temporal(pa_type):
1309+
if pa_type.bit_width == 32:
1310+
pa_type = pa.int32()
1311+
else:
1312+
pa_type = pa.int64()
1313+
arr = self.astype(ArrowDtype(pa_type))
1314+
values = arr.to_numpy(na_value=0)
1315+
else:
1316+
# factorize the values to avoid the performance penalty of
1317+
# converting to object dtype
1318+
values = self.factorize()[0]
1319+
1320+
mask = self.isna() if self._hasna else None
1321+
return algos.duplicated(values, keep=keep, mask=mask)
1322+
12921323
def unique(self) -> Self:
12931324
"""
12941325
Compute the ArrowExtensionArray of unique values.
@@ -1599,13 +1630,21 @@ def _reduce(
15991630
pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)
16001631

16011632
if keepdims:
1602-
result = pa.array([pa_result.as_py()], type=pa_result.type)
1633+
if isinstance(pa_result, pa.Scalar):
1634+
result = pa.array([pa_result.as_py()], type=pa_result.type)
1635+
else:
1636+
result = pa.array(
1637+
[pa_result],
1638+
type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]),
1639+
)
16031640
return type(self)(result)
16041641

16051642
if pc.is_null(pa_result).as_py():
16061643
return self.dtype.na_value
1607-
else:
1644+
elif isinstance(pa_result, pa.Scalar):
16081645
return pa_result.as_py()
1646+
else:
1647+
return pa_result
16091648

16101649
def _explode(self):
16111650
"""
@@ -1708,7 +1747,7 @@ def __setitem__(self, key, value) -> None:
17081747
data = pa.chunked_array([data])
17091748
self._pa_array = data
17101749

1711-
def _rank(
1750+
def _rank_calc(
17121751
self,
17131752
*,
17141753
axis: AxisInt = 0,
@@ -1717,9 +1756,6 @@ def _rank(
17171756
ascending: bool = True,
17181757
pct: bool = False,
17191758
):
1720-
"""
1721-
See Series.rank.__doc__.
1722-
"""
17231759
if pa_version_under9p0 or axis != 0:
17241760
ranked = super()._rank(
17251761
axis=axis,
@@ -1734,7 +1770,7 @@ def _rank(
17341770
else:
17351771
pa_type = pa.uint64()
17361772
result = pa.array(ranked, type=pa_type, from_pandas=True)
1737-
return type(self)(result)
1773+
return result
17381774

17391775
data = self._pa_array.combine_chunks()
17401776
sort_keys = "ascending" if ascending else "descending"
@@ -1773,7 +1809,29 @@ def _rank(
17731809
divisor = pc.count(result)
17741810
result = pc.divide(result, divisor)
17751811

1776-
return type(self)(result)
1812+
return result
1813+
1814+
def _rank(
1815+
self,
1816+
*,
1817+
axis: AxisInt = 0,
1818+
method: str = "average",
1819+
na_option: str = "keep",
1820+
ascending: bool = True,
1821+
pct: bool = False,
1822+
):
1823+
"""
1824+
See Series.rank.__doc__.
1825+
"""
1826+
return type(self)(
1827+
self._rank_calc(
1828+
axis=axis,
1829+
method=method,
1830+
na_option=na_option,
1831+
ascending=ascending,
1832+
pct=pct,
1833+
)
1834+
)
17771835

17781836
def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self:
17791837
"""

0 commit comments

Comments
 (0)