Skip to content

Commit f6582c1

Browse files
Merge branch 'main' into reflect-changes-in-components
2 parents 9ea73cf + 564d0d9 commit f6582c1

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+1168
-630
lines changed

ci/code_checks.sh

+1-50
Original file line numberDiff line numberDiff line change
@@ -70,64 +70,24 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
7070
--format=actions \
7171
-i ES01 `# For now it is ok if docstrings are missing the extended summary` \
7272
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
73-
-i "pandas.Categorical.__array__ SA01" \
74-
-i "pandas.Categorical.codes SA01" \
75-
-i "pandas.Categorical.dtype SA01" \
76-
-i "pandas.Categorical.from_codes SA01" \
77-
-i "pandas.Categorical.ordered SA01" \
78-
-i "pandas.CategoricalDtype.categories SA01" \
79-
-i "pandas.CategoricalDtype.ordered SA01" \
80-
-i "pandas.CategoricalIndex.codes SA01" \
81-
-i "pandas.CategoricalIndex.ordered SA01" \
82-
-i "pandas.DataFrame.__dataframe__ SA01" \
83-
-i "pandas.DataFrame.at_time PR01" \
84-
-i "pandas.DataFrame.kurt RT03,SA01" \
85-
-i "pandas.DataFrame.kurtosis RT03,SA01" \
8673
-i "pandas.DataFrame.max RT03" \
8774
-i "pandas.DataFrame.mean RT03,SA01" \
8875
-i "pandas.DataFrame.median RT03,SA01" \
8976
-i "pandas.DataFrame.min RT03" \
9077
-i "pandas.DataFrame.plot PR02,SA01" \
91-
-i "pandas.DataFrame.prod RT03" \
92-
-i "pandas.DataFrame.product RT03" \
9378
-i "pandas.DataFrame.sem PR01,RT03,SA01" \
94-
-i "pandas.DataFrame.skew RT03,SA01" \
95-
-i "pandas.DataFrame.sparse PR01" \
9679
-i "pandas.DataFrame.std PR01,RT03,SA01" \
9780
-i "pandas.DataFrame.sum RT03" \
9881
-i "pandas.DataFrame.swaplevel SA01" \
9982
-i "pandas.DataFrame.to_markdown SA01" \
10083
-i "pandas.DataFrame.var PR01,RT03,SA01" \
101-
-i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \
102-
-i "pandas.DatetimeIndex.snap PR01,RT03" \
103-
-i "pandas.DatetimeIndex.to_period RT03" \
10484
-i "pandas.Grouper PR02" \
10585
-i "pandas.Index PR07" \
106-
-i "pandas.Index.append PR07,RT03,SA01" \
107-
-i "pandas.Index.difference PR07,RT03,SA01" \
108-
-i "pandas.Index.duplicated RT03" \
109-
-i "pandas.Index.get_indexer PR07,SA01" \
110-
-i "pandas.Index.get_indexer_for PR01,SA01" \
111-
-i "pandas.Index.get_indexer_non_unique PR07,SA01" \
11286
-i "pandas.Index.get_loc PR07,RT03,SA01" \
113-
-i "pandas.Index.identical PR01,SA01" \
114-
-i "pandas.Index.insert PR07,RT03,SA01" \
115-
-i "pandas.Index.intersection PR07,RT03,SA01" \
11687
-i "pandas.Index.join PR07,RT03,SA01" \
11788
-i "pandas.Index.names GL08" \
118-
-i "pandas.Index.nunique RT03" \
119-
-i "pandas.Index.putmask PR01,RT03" \
12089
-i "pandas.Index.ravel PR01,RT03" \
121-
-i "pandas.Index.slice_indexer PR07,RT03,SA01" \
12290
-i "pandas.Index.str PR01,SA01" \
123-
-i "pandas.Index.symmetric_difference PR07,RT03,SA01" \
124-
-i "pandas.Index.take PR01,PR07" \
125-
-i "pandas.Index.union PR07,RT03,SA01" \
126-
-i "pandas.Index.view GL08" \
127-
-i "pandas.Int16Dtype SA01" \
128-
-i "pandas.Int32Dtype SA01" \
129-
-i "pandas.Int64Dtype SA01" \
130-
-i "pandas.Int8Dtype SA01" \
13191
-i "pandas.Interval PR02" \
13292
-i "pandas.Interval.closed SA01" \
13393
-i "pandas.Interval.left SA01" \
@@ -137,7 +97,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
13797
-i "pandas.IntervalDtype.subtype SA01" \
13898
-i "pandas.IntervalIndex.closed SA01" \
13999
-i "pandas.IntervalIndex.contains RT03" \
140-
-i "pandas.IntervalIndex.get_indexer PR07,SA01" \
141100
-i "pandas.IntervalIndex.get_loc PR07,RT03,SA01" \
142101
-i "pandas.IntervalIndex.is_non_overlapping_monotonic SA01" \
143102
-i "pandas.IntervalIndex.left GL08" \
@@ -151,7 +110,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
151110
-i "pandas.MultiIndex.copy PR07,RT03,SA01" \
152111
-i "pandas.MultiIndex.drop PR07,RT03,SA01" \
153112
-i "pandas.MultiIndex.dtypes SA01" \
154-
-i "pandas.MultiIndex.get_indexer PR07,SA01" \
155113
-i "pandas.MultiIndex.get_level_values SA01" \
156114
-i "pandas.MultiIndex.get_loc PR07" \
157115
-i "pandas.MultiIndex.get_loc_level PR07" \
@@ -212,15 +170,13 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
212170
-i "pandas.Series SA01" \
213171
-i "pandas.Series.__iter__ RT03,SA01" \
214172
-i "pandas.Series.add PR07" \
215-
-i "pandas.Series.at_time PR01" \
216173
-i "pandas.Series.backfill PR01,SA01" \
217174
-i "pandas.Series.case_when RT03" \
218175
-i "pandas.Series.cat PR07,SA01" \
219176
-i "pandas.Series.cat.add_categories PR01,PR02" \
220177
-i "pandas.Series.cat.as_ordered PR01" \
221178
-i "pandas.Series.cat.as_unordered PR01" \
222179
-i "pandas.Series.cat.codes SA01" \
223-
-i "pandas.Series.cat.ordered SA01" \
224180
-i "pandas.Series.cat.remove_categories PR01,PR02" \
225181
-i "pandas.Series.cat.remove_unused_categories PR01" \
226182
-i "pandas.Series.cat.rename_categories PR01,PR02" \
@@ -244,7 +200,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
244200
-i "pandas.Series.dt.round PR01,PR02" \
245201
-i "pandas.Series.dt.seconds SA01" \
246202
-i "pandas.Series.dt.strftime PR01,PR02" \
247-
-i "pandas.Series.dt.to_period PR01,PR02,RT03" \
203+
-i "pandas.Series.dt.to_period PR01,PR02" \
248204
-i "pandas.Series.dt.total_seconds PR01" \
249205
-i "pandas.Series.dt.tz_convert PR01,PR02" \
250206
-i "pandas.Series.dt.tz_localize PR01,PR02" \
@@ -273,7 +229,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
273229
-i "pandas.Series.mode SA01" \
274230
-i "pandas.Series.mul PR07" \
275231
-i "pandas.Series.ne PR07,SA01" \
276-
-i "pandas.Series.nunique RT03" \
277232
-i "pandas.Series.pad PR01,SA01" \
278233
-i "pandas.Series.plot PR02,SA01" \
279234
-i "pandas.Series.pop RT03,SA01" \
@@ -432,10 +387,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
432387
-i "pandas.Timestamp.weekday SA01" \
433388
-i "pandas.Timestamp.weekofyear SA01" \
434389
-i "pandas.Timestamp.year GL08" \
435-
-i "pandas.UInt16Dtype SA01" \
436-
-i "pandas.UInt32Dtype SA01" \
437-
-i "pandas.UInt64Dtype SA01" \
438-
-i "pandas.UInt8Dtype SA01" \
439390
-i "pandas.api.extensions.ExtensionArray SA01" \
440391
-i "pandas.api.extensions.ExtensionArray._accumulate RT03,SA01" \
441392
-i "pandas.api.extensions.ExtensionArray._concat_same_type PR07,SA01" \

doc/source/whatsnew/v3.0.0.rst

+11
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,11 @@ Other enhancements
3939
- Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
4040
- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
4141
- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
42+
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
4243
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
4344
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
4445
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
46+
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
4547

4648
.. ---------------------------------------------------------------------------
4749
.. _whatsnew_300.notable_bug_fixes:
@@ -220,6 +222,7 @@ Removal of prior version deprecations/changes
220222
- :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`)
221223
- All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`)
222224
- All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`)
225+
- Changed behavior of :meth:`Series.__getitem__` and :meth:`Series.__setitem__` to always treat integer keys as labels, never as positional, consistent with :class:`DataFrame` behavior (:issue:`50617`)
223226
- Disallow logical operations (``|``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``); wrap the objects in :class:`Series`, :class:`Index`, or ``np.array`` first instead (:issue:`52264`)
224227
- Disallow automatic casting to object in :class:`Series` logical operations (``&``, ``^``, ``|``) between series with mismatched indexes and dtypes other than ``object`` or ``bool`` (:issue:`52538`)
225228
- Disallow calling :meth:`Series.replace` or :meth:`DataFrame.replace` without a ``value`` and with non-dict-like ``to_replace`` (:issue:`33302`)
@@ -333,13 +336,17 @@ Performance improvements
333336
- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
334337
- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
335338
- Performance improvement in :meth:`Index.to_frame` returning :class:`RangeIndex` columns instead of :class:`Index` columns when possible. (:issue:`58018`)
339+
- Performance improvement in :meth:`MultiIndex._engine` to use smaller dtypes if possible (:issue:`58411`)
336340
- Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
337341
- Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`)
338342
- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)
339343
- Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`)
340344
- Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`)
341345
- Performance improvement in :meth:`RangeIndex.insert` returning a :class:`RangeIndex` instead of a :class:`Index` when the :class:`RangeIndex` is empty. (:issue:`57833`)
342346
- Performance improvement in :meth:`RangeIndex.round` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57824`)
347+
- Performance improvement in :meth:`RangeIndex.searchsorted` (:issue:`58376`)
348+
- Performance improvement in :meth:`RangeIndex.to_numpy` when specifying an ``na_value`` (:issue:`58376`)
349+
- Performance improvement in :meth:`RangeIndex.value_counts` (:issue:`58376`)
343350
- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`)
344351
- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`)
345352
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`)
@@ -397,6 +404,7 @@ Numeric
397404

398405
Conversion
399406
^^^^^^^^^^
407+
- Bug in :meth:`DataFrame.astype` not casting ``values`` for Arrow-based dictionary dtype correctly (:issue:`58479`)
400408
- Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`)
401409
- Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`)
402410
- Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`)
@@ -453,6 +461,7 @@ Groupby/resample/rolling
453461
- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
454462
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
455463
- Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
464+
- Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
456465

457466

458467
Reshaping
@@ -477,6 +486,7 @@ Styler
477486
Other
478487
^^^^^
479488
- Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
489+
- Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`)
480490
- Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
481491
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
482492
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow using the ``tan`` function. (:issue:`55091`)
@@ -488,6 +498,7 @@ Other
488498
- Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
489499
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
490500
- Bug in the DataFrame Interchange Protocol implementation that was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
501+
- Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
491502

492503
.. ***DO NOT USE THIS SECTION***
493504

pandas/_libs/hashing.pyx

+2-1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import numpy as np
1111

1212
from numpy cimport (
1313
import_array,
14+
ndarray,
1415
uint8_t,
1516
uint64_t,
1617
)
@@ -22,7 +23,7 @@ from pandas._libs.util cimport is_nan
2223

2324
@cython.boundscheck(False)
2425
def hash_object_array(
25-
object[:] arr, str key, str encoding="utf8"
26+
ndarray[object, ndim=1] arr, str key, str encoding="utf8"
2627
) -> np.ndarray[np.uint64]:
2728
"""
2829
Parameters

pandas/_libs/index.pyi

+2-2
Original file line numberDiff line numberDiff line change
@@ -74,13 +74,13 @@ class MaskedBoolEngine(MaskedUInt8Engine): ...
7474

7575
class BaseMultiIndexCodesEngine:
7676
levels: list[np.ndarray]
77-
offsets: np.ndarray # ndarray[uint64_t, ndim=1]
77+
offsets: np.ndarray # np.ndarray[..., ndim=1]
7878

7979
def __init__(
8080
self,
8181
levels: list[Index], # all entries hashable
8282
labels: list[np.ndarray], # all entries integer-dtyped
83-
offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1]
83+
offsets: np.ndarray, # np.ndarray[..., ndim=1]
8484
) -> None: ...
8585
def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
8686
def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...

pandas/_libs/index.pyx

+38-10
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ from numpy cimport (
99
intp_t,
1010
ndarray,
1111
uint8_t,
12-
uint64_t,
1312
)
1413

1514
cnp.import_array()
@@ -699,16 +698,15 @@ cdef class BaseMultiIndexCodesEngine:
699698
Keys are located by first locating each component against the respective
700699
level, then locating (the integer representation of) codes.
701700
"""
702-
def __init__(self, object levels, object labels,
703-
ndarray[uint64_t, ndim=1] offsets):
701+
def __init__(self, object levels, object labels, ndarray offsets):
704702
"""
705703
Parameters
706704
----------
707705
levels : list-like of numpy arrays
708706
Levels of the MultiIndex.
709707
labels : list-like of numpy arrays of integer dtype
710708
Labels of the MultiIndex.
711-
offsets : numpy array of uint64 dtype
709+
offsets : numpy array of int dtype
712710
Pre-calculated offsets, one for each level of the index.
713711
"""
714712
self.levels = levels
@@ -718,8 +716,9 @@ cdef class BaseMultiIndexCodesEngine:
718716
# with positive integers (-1 for NaN becomes 1). This enables us to
719717
# differentiate between values that are missing in other and matching
720718
# NaNs. We will set values that are not found to 0 later:
721-
labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift
722-
codes = labels_arr.astype("uint64", copy=False)
719+
codes = np.array(labels).T
720+
codes += multiindex_nulls_shift # inplace sum optimisation
721+
723722
self.level_has_nans = [-1 in lab for lab in labels]
724723

725724
# Map each codes combination in the index to an integer unambiguously
@@ -731,8 +730,37 @@ cdef class BaseMultiIndexCodesEngine:
731730
# integers representing labels: we will use its get_loc and get_indexer
732731
self._base.__init__(self, lab_ints)
733732

734-
def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
735-
raise NotImplementedError("Implemented by subclass") # pragma: no cover
733+
def _codes_to_ints(self, ndarray codes) -> np.ndarray:
734+
"""
735+
Transform combination(s) of uint in one uint or Python integer (each), in a
736+
strictly monotonic way (i.e. respecting the lexicographic order of integer
737+
combinations).
738+
739+
Parameters
740+
----------
741+
codes : 1- or 2-dimensional array of dtype uint
742+
Combinations of integers (one per row)
743+
744+
Returns
745+
-------
746+
scalar or 1-dimensional array, of dtype _codes_dtype
747+
Integer(s) representing one combination (each).
748+
"""
749+
# To avoid overflows, first make sure we are working with the right dtype:
750+
codes = codes.astype(self._codes_dtype, copy=False)
751+
752+
# Shift the representation of each level by the pre-calculated number of bits:
753+
codes <<= self.offsets # inplace shift optimisation
754+
755+
# Now sum and OR are in fact interchangeable. This is a simple
756+
# composition of the (disjunct) significant bits of each level (i.e.
757+
# each column in "codes") in a single positive integer (per row):
758+
if codes.ndim == 1:
759+
# Single key
760+
return np.bitwise_or.reduce(codes)
761+
762+
# Multiple keys
763+
return np.bitwise_or.reduce(codes, axis=1)
736764

737765
def _extract_level_codes(self, target) -> np.ndarray:
738766
"""
@@ -757,7 +785,7 @@ cdef class BaseMultiIndexCodesEngine:
757785
codes[codes > 0] += 1
758786
if self.level_has_nans[i]:
759787
codes[target.codes[i] == -1] += 1
760-
return self._codes_to_ints(np.array(level_codes, dtype="uint64").T)
788+
return self._codes_to_ints(np.array(level_codes, dtype=self._codes_dtype).T)
761789

762790
def get_indexer(self, target: np.ndarray) -> np.ndarray:
763791
"""
@@ -788,7 +816,7 @@ cdef class BaseMultiIndexCodesEngine:
788816
raise KeyError(key)
789817

790818
# Transform indices into single integer:
791-
lab_int = self._codes_to_ints(np.array(indices, dtype="uint64"))
819+
lab_int = self._codes_to_ints(np.array(indices, dtype=self._codes_dtype))
792820

793821
return self._base.get_loc(self, lab_int)
794822

pandas/_libs/lib.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -2808,14 +2808,14 @@ def maybe_convert_objects(ndarray[object] objects,
28082808
from pandas.core.arrays import IntegerArray
28092809

28102810
# Set these values to 1 to be deterministic, match
2811-
# IntegerArray._internal_fill_value
2811+
# IntegerDtype._internal_fill_value
28122812
result[mask] = 1
28132813
result = IntegerArray(result, mask)
28142814
elif result is floats and convert_to_nullable_dtype:
28152815
from pandas.core.arrays import FloatingArray
28162816

28172817
# Set these values to 1.0 to be deterministic, match
2818-
# FloatingArray._internal_fill_value
2818+
# FloatingDtype._internal_fill_value
28192819
result[mask] = 1.0
28202820
result = FloatingArray(result, mask)
28212821

0 commit comments

Comments
 (0)