Skip to content

Commit f55ca62

Browse files
committed
Merge remote-tracking branch 'upstream/main' into split-arrow
2 parents 14c059c + 78a2ef2 commit f55ca62

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+618
-409
lines changed

ci/code_checks.sh

+2-54
Original file line numberDiff line numberDiff line change
@@ -70,19 +70,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
7070
--format=actions \
7171
-i ES01 `# For now it is ok if docstrings are missing the extended summary` \
7272
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
73-
-i "pandas.Categorical.__array__ SA01" \
74-
-i "pandas.Categorical.codes SA01" \
75-
-i "pandas.Categorical.dtype SA01" \
76-
-i "pandas.Categorical.from_codes SA01" \
77-
-i "pandas.Categorical.ordered SA01" \
78-
-i "pandas.CategoricalDtype.categories SA01" \
79-
-i "pandas.CategoricalDtype.ordered SA01" \
80-
-i "pandas.CategoricalIndex.codes SA01" \
81-
-i "pandas.CategoricalIndex.ordered SA01" \
8273
-i "pandas.DataFrame.__dataframe__ SA01" \
8374
-i "pandas.DataFrame.at_time PR01" \
84-
-i "pandas.DataFrame.hist RT03" \
85-
-i "pandas.DataFrame.infer_objects RT03" \
8675
-i "pandas.DataFrame.kurt RT03,SA01" \
8776
-i "pandas.DataFrame.kurtosis RT03,SA01" \
8877
-i "pandas.DataFrame.max RT03" \
@@ -92,62 +81,29 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
9281
-i "pandas.DataFrame.plot PR02,SA01" \
9382
-i "pandas.DataFrame.prod RT03" \
9483
-i "pandas.DataFrame.product RT03" \
95-
-i "pandas.DataFrame.reorder_levels SA01" \
9684
-i "pandas.DataFrame.sem PR01,RT03,SA01" \
9785
-i "pandas.DataFrame.skew RT03,SA01" \
9886
-i "pandas.DataFrame.sparse PR01" \
9987
-i "pandas.DataFrame.std PR01,RT03,SA01" \
10088
-i "pandas.DataFrame.sum RT03" \
10189
-i "pandas.DataFrame.swaplevel SA01" \
10290
-i "pandas.DataFrame.to_markdown SA01" \
103-
-i "pandas.DataFrame.to_parquet RT03" \
10491
-i "pandas.DataFrame.var PR01,RT03,SA01" \
105-
-i "pandas.DatetimeIndex.freqstr SA01" \
106-
-i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \
107-
-i "pandas.DatetimeIndex.indexer_between_time RT03" \
108-
-i "pandas.DatetimeIndex.snap PR01,RT03" \
109-
-i "pandas.DatetimeIndex.std PR01,RT03" \
110-
-i "pandas.DatetimeIndex.to_period RT03" \
111-
-i "pandas.DatetimeIndex.to_pydatetime RT03,SA01" \
112-
-i "pandas.DatetimeIndex.tz_convert RT03" \
113-
-i "pandas.DatetimeTZDtype SA01" \
114-
-i "pandas.DatetimeTZDtype.tz SA01" \
11592
-i "pandas.Grouper PR02" \
11693
-i "pandas.Index PR07" \
117-
-i "pandas.Index.T SA01" \
11894
-i "pandas.Index.append PR07,RT03,SA01" \
119-
-i "pandas.Index.copy PR07,SA01" \
12095
-i "pandas.Index.difference PR07,RT03,SA01" \
121-
-i "pandas.Index.drop PR07,SA01" \
122-
-i "pandas.Index.drop_duplicates RT03" \
123-
-i "pandas.Index.droplevel RT03,SA01" \
124-
-i "pandas.Index.dropna RT03,SA01" \
125-
-i "pandas.Index.duplicated RT03" \
126-
-i "pandas.Index.empty GL08" \
127-
-i "pandas.Index.fillna RT03" \
12896
-i "pandas.Index.get_indexer PR07,SA01" \
12997
-i "pandas.Index.get_indexer_for PR01,SA01" \
13098
-i "pandas.Index.get_indexer_non_unique PR07,SA01" \
13199
-i "pandas.Index.get_loc PR07,RT03,SA01" \
132-
-i "pandas.Index.get_slice_bound PR07" \
133-
-i "pandas.Index.identical PR01,SA01" \
134-
-i "pandas.Index.inferred_type SA01" \
135-
-i "pandas.Index.insert PR07,RT03,SA01" \
136-
-i "pandas.Index.intersection PR07,RT03,SA01" \
137-
-i "pandas.Index.item SA01" \
138100
-i "pandas.Index.join PR07,RT03,SA01" \
139-
-i "pandas.Index.memory_usage RT03" \
140101
-i "pandas.Index.names GL08" \
141-
-i "pandas.Index.nunique RT03" \
142102
-i "pandas.Index.putmask PR01,RT03" \
143103
-i "pandas.Index.ravel PR01,RT03" \
144-
-i "pandas.Index.reindex PR07" \
145104
-i "pandas.Index.slice_indexer PR07,RT03,SA01" \
146-
-i "pandas.Index.slice_locs RT03" \
147105
-i "pandas.Index.str PR01,SA01" \
148-
-i "pandas.Index.symmetric_difference PR07,RT03,SA01" \
149106
-i "pandas.Index.take PR01,PR07" \
150-
-i "pandas.Index.union PR07,RT03,SA01" \
151107
-i "pandas.Index.view GL08" \
152108
-i "pandas.Int16Dtype SA01" \
153109
-i "pandas.Int32Dtype SA01" \
@@ -175,7 +131,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
175131
-i "pandas.MultiIndex.append PR07,SA01" \
176132
-i "pandas.MultiIndex.copy PR07,RT03,SA01" \
177133
-i "pandas.MultiIndex.drop PR07,RT03,SA01" \
178-
-i "pandas.MultiIndex.droplevel RT03,SA01" \
179134
-i "pandas.MultiIndex.dtypes SA01" \
180135
-i "pandas.MultiIndex.get_indexer PR07,SA01" \
181136
-i "pandas.MultiIndex.get_level_values SA01" \
@@ -216,7 +171,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
216171
-i "pandas.PeriodIndex.dayofyear SA01" \
217172
-i "pandas.PeriodIndex.days_in_month SA01" \
218173
-i "pandas.PeriodIndex.daysinmonth SA01" \
219-
-i "pandas.PeriodIndex.freqstr SA01" \
220174
-i "pandas.PeriodIndex.from_fields PR07,SA01" \
221175
-i "pandas.PeriodIndex.from_ordinals SA01" \
222176
-i "pandas.PeriodIndex.hour SA01" \
@@ -237,7 +191,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
237191
-i "pandas.RangeIndex.step SA01" \
238192
-i "pandas.RangeIndex.stop SA01" \
239193
-i "pandas.Series SA01" \
240-
-i "pandas.Series.T SA01" \
241194
-i "pandas.Series.__iter__ RT03,SA01" \
242195
-i "pandas.Series.add PR07" \
243196
-i "pandas.Series.at_time PR01" \
@@ -248,7 +201,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
248201
-i "pandas.Series.cat.as_ordered PR01" \
249202
-i "pandas.Series.cat.as_unordered PR01" \
250203
-i "pandas.Series.cat.codes SA01" \
251-
-i "pandas.Series.cat.ordered SA01" \
252204
-i "pandas.Series.cat.remove_categories PR01,PR02" \
253205
-i "pandas.Series.cat.remove_unused_categories PR01" \
254206
-i "pandas.Series.cat.rename_categories PR01,PR02" \
@@ -272,23 +224,20 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
272224
-i "pandas.Series.dt.round PR01,PR02" \
273225
-i "pandas.Series.dt.seconds SA01" \
274226
-i "pandas.Series.dt.strftime PR01,PR02" \
275-
-i "pandas.Series.dt.to_period PR01,PR02,RT03" \
227+
-i "pandas.Series.dt.to_period PR01,PR02" \
276228
-i "pandas.Series.dt.total_seconds PR01" \
277-
-i "pandas.Series.dt.tz_convert PR01,PR02,RT03" \
229+
-i "pandas.Series.dt.tz_convert PR01,PR02" \
278230
-i "pandas.Series.dt.tz_localize PR01,PR02" \
279231
-i "pandas.Series.dt.unit GL08" \
280232
-i "pandas.Series.dtype SA01" \
281-
-i "pandas.Series.empty GL08" \
282233
-i "pandas.Series.eq PR07,SA01" \
283234
-i "pandas.Series.floordiv PR07" \
284235
-i "pandas.Series.ge PR07,SA01" \
285236
-i "pandas.Series.gt PR07,SA01" \
286237
-i "pandas.Series.hasnans SA01" \
287-
-i "pandas.Series.infer_objects RT03" \
288238
-i "pandas.Series.is_monotonic_decreasing SA01" \
289239
-i "pandas.Series.is_monotonic_increasing SA01" \
290240
-i "pandas.Series.is_unique SA01" \
291-
-i "pandas.Series.item SA01" \
292241
-i "pandas.Series.kurt RT03,SA01" \
293242
-i "pandas.Series.kurtosis RT03,SA01" \
294243
-i "pandas.Series.le PR07,SA01" \
@@ -304,7 +253,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
304253
-i "pandas.Series.mode SA01" \
305254
-i "pandas.Series.mul PR07" \
306255
-i "pandas.Series.ne PR07,SA01" \
307-
-i "pandas.Series.nunique RT03" \
308256
-i "pandas.Series.pad PR01,SA01" \
309257
-i "pandas.Series.plot PR02,SA01" \
310258
-i "pandas.Series.pop RT03,SA01" \

doc/source/whatsnew/v3.0.0.rst

+8
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Other enhancements
4242
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
4343
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
4444
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
45+
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
4546

4647
.. ---------------------------------------------------------------------------
4748
.. _whatsnew_300.notable_bug_fixes:
@@ -220,6 +221,7 @@ Removal of prior version deprecations/changes
220221
- :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`)
221222
- All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`)
222223
- All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`)
224+
- Changed behavior of :meth:`Series.__getitem__` and :meth:`Series.__setitem__` to always treat integer keys as labels, never as positional, consistent with :class:`DataFrame` behavior (:issue:`50617`)
223225
- Disallow logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``); wrap the objects in :class:`Series`, :class:`Index`, or ``np.array`` first instead (:issue:`52264`)
224226
- Disallow automatic casting to object in :class:`Series` logical operations (``&``, ``^``, ``|``) between series with mismatched indexes and dtypes other than ``object`` or ``bool`` (:issue:`52538`)
225227
- Disallow calling :meth:`Series.replace` or :meth:`DataFrame.replace` without a ``value`` and with non-dict-like ``to_replace`` (:issue:`33302`)
@@ -340,6 +342,9 @@ Performance improvements
340342
- Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`)
341343
- Performance improvement in :meth:`RangeIndex.insert` returning a :class:`RangeIndex` instead of a :class:`Index` when the :class:`RangeIndex` is empty. (:issue:`57833`)
342344
- Performance improvement in :meth:`RangeIndex.round` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57824`)
345+
- Performance improvement in :meth:`RangeIndex.searchsorted` (:issue:`58376`)
346+
- Performance improvement in :meth:`RangeIndex.to_numpy` when specifying an ``na_value`` (:issue:`58376`)
347+
- Performance improvement in :meth:`RangeIndex.value_counts` (:issue:`58376`)
343348
- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`)
344349
- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`)
345350
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`)
@@ -385,6 +390,7 @@ Numeric
385390

386391
Conversion
387392
^^^^^^^^^^
393+
- Bug in :meth:`DataFrame.astype` not casting ``values`` for Arrow-based dictionary dtype correctly (:issue:`58479`)
388394
- Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`)
389395
- Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`)
390396
- Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`)
@@ -466,6 +472,7 @@ Styler
466472
Other
467473
^^^^^
468474
- Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
475+
- Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`)
469476
- Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
470477
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
471478
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow use of the ``tan`` function. (:issue:`55091`)
@@ -477,6 +484,7 @@ Other
477484
- Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
478485
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
479486
- Bug in the DataFrame Interchange Protocol implementation that was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
487+
- Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
480488

481489
.. ***DO NOT USE THIS SECTION***
482490

pandas/_libs/lib.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -2808,14 +2808,14 @@ def maybe_convert_objects(ndarray[object] objects,
28082808
from pandas.core.arrays import IntegerArray
28092809

28102810
# Set these values to 1 to be deterministic, match
2811-
# IntegerArray._internal_fill_value
2811+
# IntegerDtype._internal_fill_value
28122812
result[mask] = 1
28132813
result = IntegerArray(result, mask)
28142814
elif result is floats and convert_to_nullable_dtype:
28152815
from pandas.core.arrays import FloatingArray
28162816

28172817
# Set these values to 1.0 to be deterministic, match
2818-
# FloatingArray._internal_fill_value
2818+
# FloatingDtype._internal_fill_value
28192819
result[mask] = 1.0
28202820
result = FloatingArray(result, mask)
28212821

pandas/_typing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ def readline(self) -> bytes: ...
314314

315315

316316
class WriteExcelBuffer(WriteBuffer[bytes], Protocol):
317-
def truncate(self, size: int | None = ...) -> int: ...
317+
def truncate(self, size: int | None = ..., /) -> int: ...
318318

319319

320320
class ReadCsvBuffer(ReadBuffer[AnyStr_co], Protocol):

pandas/core/arrays/arrow/accessors.py

+14-8
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,9 @@ def len(self) -> Series:
110110
from pandas import Series
111111

112112
value_lengths = pc.list_value_length(self._pa_array)
113-
return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))
113+
return Series(
114+
value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index
115+
)
114116

115117
def __getitem__(self, key: int | slice) -> Series:
116118
"""
@@ -149,7 +151,9 @@ def __getitem__(self, key: int | slice) -> Series:
149151
# if key < 0:
150152
# key = pc.add(key, pc.list_value_length(self._pa_array))
151153
element = pc.list_element(self._pa_array, key)
152-
return Series(element, dtype=ArrowDtype(element.type))
154+
return Series(
155+
element, dtype=ArrowDtype(element.type), index=self._data.index
156+
)
153157
elif isinstance(key, slice):
154158
if pa_version_under11p0:
155159
raise NotImplementedError(
@@ -167,7 +171,7 @@ def __getitem__(self, key: int | slice) -> Series:
167171
if step is None:
168172
step = 1
169173
sliced = pc.list_slice(self._pa_array, start, stop, step)
170-
return Series(sliced, dtype=ArrowDtype(sliced.type))
174+
return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index)
171175
else:
172176
raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
173177

@@ -195,15 +199,17 @@ def flatten(self) -> Series:
195199
... )
196200
>>> s.list.flatten()
197201
0 1
198-
1 2
199-
2 3
200-
3 3
202+
0 2
203+
0 3
204+
1 3
201205
dtype: int64[pyarrow]
202206
"""
203207
from pandas import Series
204208

205-
flattened = pc.list_flatten(self._pa_array)
206-
return Series(flattened, dtype=ArrowDtype(flattened.type))
209+
counts = pa.compute.list_value_length(self._pa_array)
210+
flattened = pa.compute.list_flatten(self._pa_array)
211+
index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type)))
212+
return Series(flattened, dtype=ArrowDtype(flattened.type), index=index)
207213

208214

209215
class StructAccessor(ArrowAccessor):

pandas/core/arrays/arrow/array.py

+2
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,8 @@ def _box_pa_array(
525525
if pa_type is not None and pa_array.type != pa_type:
526526
if pa.types.is_dictionary(pa_type):
527527
pa_array = pa_array.dictionary_encode()
528+
if pa_array.type != pa_type:
529+
pa_array = pa_array.cast(pa_type)
528530
else:
529531
try:
530532
pa_array = pa_array.cast(pa_type)

pandas/core/arrays/boolean.py

+3-7
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ class BooleanDtype(BaseMaskedDtype):
6868

6969
name: ClassVar[str] = "boolean"
7070

71+
# The value used to fill '_data' to avoid upcasting
72+
_internal_fill_value = False
73+
7174
# https://github.com/python/mypy/issues/4125
7275
# error: Signature of "type" incompatible with supertype "BaseMaskedDtype"
7376
@property
@@ -293,13 +296,6 @@ class BooleanArray(BaseMaskedArray):
293296
Length: 3, dtype: boolean
294297
"""
295298

296-
# The value used to fill '_data' to avoid upcasting
297-
_internal_fill_value = False
298-
# Fill values used for any/all
299-
# Incompatible types in assignment (expression has type "bool", base class
300-
# "BaseMaskedArray" defined the type as "<typing special form>")
301-
_truthy_value = True # type: ignore[assignment]
302-
_falsey_value = False # type: ignore[assignment]
303299
_TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
304300
_FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}
305301

pandas/core/arrays/categorical.py

+28
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,11 @@ def dtype(self) -> CategoricalDtype:
497497
"""
498498
The :class:`~pandas.api.types.CategoricalDtype` for this instance.
499499
500+
See Also
501+
--------
502+
astype : Cast argument to a specified dtype.
503+
CategoricalDtype : Type for categorical data.
504+
500505
Examples
501506
--------
502507
>>> cat = pd.Categorical(["a", "b"], ordered=True)
@@ -721,6 +726,11 @@ def from_codes(
721726
-------
722727
Categorical
723728
729+
See Also
730+
--------
731+
codes : The category codes of the categorical.
732+
CategoricalIndex : An Index with an underlying ``Categorical``.
733+
724734
Examples
725735
--------
726736
>>> dtype = pd.CategoricalDtype(["a", "b"], ordered=True)
@@ -810,6 +820,12 @@ def ordered(self) -> Ordered:
810820
"""
811821
Whether the categories have an ordered relationship.
812822
823+
See Also
824+
--------
825+
set_ordered : Set the ordered attribute.
826+
as_ordered : Set the Categorical to be ordered.
827+
as_unordered : Set the Categorical to be unordered.
828+
813829
Examples
814830
--------
815831
For :class:`pandas.Series`:
@@ -861,6 +877,11 @@ def codes(self) -> np.ndarray:
861877
ndarray[int]
862878
A non-writable view of the ``codes`` array.
863879
880+
See Also
881+
--------
882+
Categorical.from_codes : Make a Categorical from codes.
883+
CategoricalIndex : An Index with an underlying ``Categorical``.
884+
864885
Examples
865886
--------
866887
For :class:`pandas.Categorical`:
@@ -1641,6 +1662,9 @@ def __array__(
16411662
"""
16421663
The numpy array interface.
16431664
1665+
Users should not call this directly. Rather, it is invoked by
1666+
:func:`numpy.array` and :func:`numpy.asarray`.
1667+
16441668
Parameters
16451669
----------
16461670
dtype : np.dtype or None
@@ -1656,6 +1680,10 @@ def __array__(
16561680
if dtype==None (default), the same dtype as
16571681
categorical.categories.dtype.
16581682
1683+
See Also
1684+
--------
1685+
numpy.asarray : Convert input to numpy.ndarray.
1686+
16591687
Examples
16601688
--------
16611689

pandas/core/arrays/datetimelike.py

+5
Original file line numberDiff line numberDiff line change
@@ -875,6 +875,11 @@ def freqstr(self) -> str | None:
875875
"""
876876
Return the frequency object as a string if it's set, otherwise None.
877877
878+
See Also
879+
--------
880+
DatetimeIndex.inferred_freq : Returns a string representing a frequency
881+
generated by infer_freq.
882+
878883
Examples
879884
--------
880885
For DatetimeIndex:

0 commit comments

Comments
 (0)