Commit 121ecd2

Merge branch 'main' into main
2 parents 1713918 + 59f6a33 commit 121ecd2


Showing 52 changed files with 830 additions and 487 deletions.

ci/code_checks.sh

Lines changed: 1 addition & 35 deletions
@@ -70,19 +70,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         --format=actions \
         -i ES01 `# For now it is ok if docstrings are missing the extended summary` \
         -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
-        -i "pandas.Categorical.__array__ SA01" \
-        -i "pandas.Categorical.codes SA01" \
-        -i "pandas.Categorical.dtype SA01" \
-        -i "pandas.Categorical.from_codes SA01" \
-        -i "pandas.Categorical.ordered SA01" \
-        -i "pandas.CategoricalDtype.categories SA01" \
-        -i "pandas.CategoricalDtype.ordered SA01" \
-        -i "pandas.CategoricalIndex.codes SA01" \
-        -i "pandas.CategoricalIndex.ordered SA01" \
-        -i "pandas.DataFrame.__dataframe__ SA01" \
-        -i "pandas.DataFrame.at_time PR01" \
-        -i "pandas.DataFrame.kurt RT03,SA01" \
-        -i "pandas.DataFrame.kurtosis RT03,SA01" \
         -i "pandas.DataFrame.max RT03" \
         -i "pandas.DataFrame.mean RT03,SA01" \
         -i "pandas.DataFrame.median RT03,SA01" \
@@ -98,35 +85,17 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.DataFrame.swaplevel SA01" \
         -i "pandas.DataFrame.to_markdown SA01" \
         -i "pandas.DataFrame.var PR01,RT03,SA01" \
-        -i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \
-        -i "pandas.DatetimeIndex.snap PR01,RT03" \
-        -i "pandas.DatetimeIndex.to_period RT03" \
-        -i "pandas.DatetimeIndex.to_pydatetime RT03,SA01" \
         -i "pandas.Grouper PR02" \
         -i "pandas.Index PR07" \
-        -i "pandas.Index.append PR07,RT03,SA01" \
-        -i "pandas.Index.difference PR07,RT03,SA01" \
-        -i "pandas.Index.drop PR07,SA01" \
-        -i "pandas.Index.duplicated RT03" \
         -i "pandas.Index.get_indexer PR07,SA01" \
         -i "pandas.Index.get_indexer_for PR01,SA01" \
         -i "pandas.Index.get_indexer_non_unique PR07,SA01" \
         -i "pandas.Index.get_loc PR07,RT03,SA01" \
-        -i "pandas.Index.identical PR01,SA01" \
-        -i "pandas.Index.insert PR07,RT03,SA01" \
-        -i "pandas.Index.intersection PR07,RT03,SA01" \
         -i "pandas.Index.join PR07,RT03,SA01" \
         -i "pandas.Index.names GL08" \
-        -i "pandas.Index.nunique RT03" \
         -i "pandas.Index.putmask PR01,RT03" \
         -i "pandas.Index.ravel PR01,RT03" \
-        -i "pandas.Index.reindex PR07" \
-        -i "pandas.Index.slice_indexer PR07,RT03,SA01" \
         -i "pandas.Index.str PR01,SA01" \
-        -i "pandas.Index.symmetric_difference PR07,RT03,SA01" \
-        -i "pandas.Index.take PR01,PR07" \
-        -i "pandas.Index.union PR07,RT03,SA01" \
-        -i "pandas.Index.view GL08" \
         -i "pandas.Int16Dtype SA01" \
         -i "pandas.Int32Dtype SA01" \
         -i "pandas.Int64Dtype SA01" \
@@ -215,15 +184,13 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Series SA01" \
         -i "pandas.Series.__iter__ RT03,SA01" \
         -i "pandas.Series.add PR07" \
-        -i "pandas.Series.at_time PR01" \
         -i "pandas.Series.backfill PR01,SA01" \
         -i "pandas.Series.case_when RT03" \
         -i "pandas.Series.cat PR07,SA01" \
         -i "pandas.Series.cat.add_categories PR01,PR02" \
         -i "pandas.Series.cat.as_ordered PR01" \
         -i "pandas.Series.cat.as_unordered PR01" \
         -i "pandas.Series.cat.codes SA01" \
-        -i "pandas.Series.cat.ordered SA01" \
         -i "pandas.Series.cat.remove_categories PR01,PR02" \
         -i "pandas.Series.cat.remove_unused_categories PR01" \
         -i "pandas.Series.cat.rename_categories PR01,PR02" \
@@ -247,7 +214,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Series.dt.round PR01,PR02" \
         -i "pandas.Series.dt.seconds SA01" \
         -i "pandas.Series.dt.strftime PR01,PR02" \
-        -i "pandas.Series.dt.to_period PR01,PR02,RT03" \
+        -i "pandas.Series.dt.to_period PR01,PR02" \
         -i "pandas.Series.dt.total_seconds PR01" \
         -i "pandas.Series.dt.tz_convert PR01,PR02" \
         -i "pandas.Series.dt.tz_localize PR01,PR02" \
@@ -276,7 +243,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Series.mode SA01" \
         -i "pandas.Series.mul PR07" \
         -i "pandas.Series.ne PR07,SA01" \
-        -i "pandas.Series.nunique RT03" \
         -i "pandas.Series.pad PR01,SA01" \
         -i "pandas.Series.plot PR02,SA01" \
         -i "pandas.Series.pop RT03,SA01" \
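Note: the codes dropped from the ignore list above (ES01, PR01, PR02, PR07, RT03, SA01, GL08) are numpydoc validation rules; removing an `-i` entry means the named docstring must now pass that rule. As a rough illustration only (a hypothetical function, not part of this commit), a docstring shaped like the following satisfies the most common of these checks:

def clip_lower(values, threshold):
    """
    Clip values below a threshold.

    Values smaller than ``threshold`` are replaced by ``threshold``.
    This extended summary is what the ES01 check looks for.

    Parameters
    ----------
    values : list of float
        Input values to clip (documented parameters satisfy PR01/PR07).
    threshold : float
        Lower bound applied to ``values``.

    Returns
    -------
    list of float
        The clipped values (a return description satisfies RT03).

    See Also
    --------
    clip_upper : Hypothetical counterpart clipping from above (SA01 wants this section).

    Examples
    --------
    >>> clip_lower([1.0, -2.0], 0.0)
    [1.0, 0.0]
    """
    return [v if v >= threshold else threshold for v in values]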

doc/source/whatsnew/v3.0.0.rst

Lines changed: 10 additions & 0 deletions
@@ -51,9 +51,11 @@ Other enhancements
 - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
 - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
 - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
+- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
 - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
 - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
+- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_300.notable_bug_fixes:
@@ -232,6 +234,7 @@ Removal of prior version deprecations/changes
 - :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`)
 - All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`)
 - All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`)
+- Changed behavior of :meth:`Series.__getitem__` and :meth:`Series.__setitem__` to always treat integer keys as labels, never as positional, consistent with :class:`DataFrame` behavior (:issue:`50617`)
 - Disallow allowing logical operations (``||``, ``&``, ``^``) between pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``); wrap the objects in :class:`Series`, :class:`Index`, or ``np.array`` first instead (:issue:`52264`)
 - Disallow automatic casting to object in :class:`Series` logical operations (``&``, ``^``, ``||``) between series with mismatched indexes and dtypes other than ``object`` or ``bool`` (:issue:`52538`)
 - Disallow calling :meth:`Series.replace` or :meth:`DataFrame.replace` without a ``value`` and with non-dict-like ``to_replace`` (:issue:`33302`)
@@ -345,13 +348,17 @@ Performance improvements
 - Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
 - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
 - Performance improvement in :meth:`Index.to_frame` returning a :class:`RangeIndex` columns of a :class:`Index` when possible. (:issue:`58018`)
+- Performance improvement in :meth:`MultiIndex._engine` to use smaller dtypes if possible (:issue:`58411`)
 - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
 - Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`)
 - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)
 - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`)
 - Performance improvement in :meth:`RangeIndex.argmin` and :meth:`RangeIndex.argmax` (:issue:`57823`)
 - Performance improvement in :meth:`RangeIndex.insert` returning a :class:`RangeIndex` instead of a :class:`Index` when the :class:`RangeIndex` is empty. (:issue:`57833`)
 - Performance improvement in :meth:`RangeIndex.round` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57824`)
+- Performance improvement in :meth:`RangeIndex.searchsorted` (:issue:`58376`)
+- Performance improvement in :meth:`RangeIndex.to_numpy` when specifying an ``na_value`` (:issue:`58376`)
+- Performance improvement in :meth:`RangeIndex.value_counts` (:issue:`58376`)
 - Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`)
 - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`)
 - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`)
@@ -397,6 +404,7 @@ Numeric
 
 Conversion
 ^^^^^^^^^^
+- Bug in :meth:`DataFrame.astype` not casting ``values`` for Arrow-based dictionary dtype correctly (:issue:`58479`)
 - Bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`)
 - Bug in :meth:`Series.astype` might modify read-only array inplace when casting to a string dtype (:issue:`57212`)
 - Bug in :meth:`Series.reindex` not maintaining ``float32`` type when a ``reindex`` introduces a missing value (:issue:`45857`)
@@ -477,6 +485,7 @@ Styler
 Other
 ^^^^^
 - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
+- Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`)
 - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
 - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
 - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`)
@@ -488,6 +497,7 @@ Other
 - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
 - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
 - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
+- Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
 
 .. ***DO NOT USE THIS SECTION***
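Two of the behavior changes recorded above are easy to misread, so here is a small illustrative sketch written against the pandas 3.0 development API described in these notes (not part of the commit itself):

import numpy as np
import pandas as pd

# DataFrame.corrwith now takes min_periods, mirroring DataFrame.corr and
# Series.corr (GH 9490): column pairs with fewer overlapping observations
# than min_periods produce NaN.
df = pd.DataFrame({"a": [1.0, 2.0, np.nan, 4.0], "b": [4.0, 3.0, 2.0, 1.0]})
other = pd.DataFrame({"a": [1.0, np.nan, np.nan, 4.0], "b": [1.0, 2.0, 3.0, 4.0]})
print(df.corrwith(other, min_periods=3))  # "a" has only 2 shared points -> NaN

# Series.__getitem__ / __setitem__ now always treat integer keys as labels,
# never as positions (GH 50617); use .iloc for positional access.
s = pd.Series([10, 20, 30], index=[3, 2, 1])
print(s[3])       # label lookup -> 10
print(s.iloc[0])  # positional lookup -> 10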

pandas/_libs/hashing.pyx

Lines changed: 2 additions & 1 deletion
@@ -11,6 +11,7 @@ import numpy as np
 
 from numpy cimport (
     import_array,
+    ndarray,
     uint8_t,
     uint64_t,
 )
@@ -22,7 +23,7 @@ from pandas._libs.util cimport is_nan
 
 @cython.boundscheck(False)
 def hash_object_array(
-    object[:] arr, str key, str encoding="utf8"
+    ndarray[object, ndim=1] arr, str key, str encoding="utf8"
 ) -> np.ndarray[np.uint64]:
     """
     Parameters
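The hashing.pyx change above only swaps the typed memoryview parameter for an object ndarray; callers see no difference. For reference, this code path is reached from the public API roughly as follows (a sketch, assuming a current pandas install):

import pandas as pd

# Hashing object-dtype data goes through pandas._libs.hashing.hash_object_array.
s = pd.Series(["a", "b", None], dtype=object)
hashed = pd.util.hash_pandas_object(s, index=False)
print(hashed.dtype)  # uint64, one hash per element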

pandas/_libs/index.pyi

Lines changed: 2 additions & 2 deletions
@@ -74,13 +74,13 @@ class MaskedBoolEngine(MaskedUInt8Engine): ...
 
 class BaseMultiIndexCodesEngine:
     levels: list[np.ndarray]
-    offsets: np.ndarray  # ndarray[uint64_t, ndim=1]
+    offsets: np.ndarray  # np.ndarray[..., ndim=1]
 
     def __init__(
         self,
         levels: list[Index],  # all entries hashable
         labels: list[np.ndarray],  # all entries integer-dtyped
-        offsets: np.ndarray,  # np.ndarray[np.uint64, ndim=1]
+        offsets: np.ndarray,  # np.ndarray[..., ndim=1]
     ) -> None: ...
     def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
     def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...

pandas/_libs/index.pyx

Lines changed: 38 additions & 10 deletions
@@ -9,7 +9,6 @@ from numpy cimport (
     intp_t,
     ndarray,
     uint8_t,
-    uint64_t,
 )
 
 cnp.import_array()
@@ -699,16 +698,15 @@ cdef class BaseMultiIndexCodesEngine:
     Keys are located by first locating each component against the respective
     level, then locating (the integer representation of) codes.
     """
-    def __init__(self, object levels, object labels,
-                 ndarray[uint64_t, ndim=1] offsets):
+    def __init__(self, object levels, object labels, ndarray offsets):
         """
         Parameters
         ----------
         levels : list-like of numpy arrays
            Levels of the MultiIndex.
         labels : list-like of numpy arrays of integer dtype
            Labels of the MultiIndex.
-        offsets : numpy array of uint64 dtype
+        offsets : numpy array of int dtype
            Pre-calculated offsets, one for each level of the index.
         """
         self.levels = levels
@@ -718,8 +716,9 @@
         # with positive integers (-1 for NaN becomes 1). This enables us to
         # differentiate between values that are missing in other and matching
         # NaNs. We will set values that are not found to 0 later:
-        labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift
-        codes = labels_arr.astype("uint64", copy=False)
+        codes = np.array(labels).T
+        codes += multiindex_nulls_shift  # inplace sum optimisation
+
         self.level_has_nans = [-1 in lab for lab in labels]
 
         # Map each codes combination in the index to an integer unambiguously
@@ -731,8 +730,37 @@
         # integers representing labels: we will use its get_loc and get_indexer
         self._base.__init__(self, lab_ints)
 
-    def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
-        raise NotImplementedError("Implemented by subclass")  # pragma: no cover
+    def _codes_to_ints(self, ndarray codes) -> np.ndarray:
+        """
+        Transform combination(s) of uint in one uint or Python integer (each), in a
+        strictly monotonic way (i.e. respecting the lexicographic order of integer
+        combinations).
+
+        Parameters
+        ----------
+        codes : 1- or 2-dimensional array of dtype uint
+            Combinations of integers (one per row)
+
+        Returns
+        -------
+        scalar or 1-dimensional array, of dtype _codes_dtype
+            Integer(s) representing one combination (each).
+        """
+        # To avoid overflows, first make sure we are working with the right dtype:
+        codes = codes.astype(self._codes_dtype, copy=False)
+
+        # Shift the representation of each level by the pre-calculated number of bits:
+        codes <<= self.offsets  # inplace shift optimisation
+
+        # Now sum and OR are in fact interchangeable. This is a simple
+        # composition of the (disjunct) significant bits of each level (i.e.
+        # each column in "codes") in a single positive integer (per row):
+        if codes.ndim == 1:
+            # Single key
+            return np.bitwise_or.reduce(codes)
+
+        # Multiple keys
+        return np.bitwise_or.reduce(codes, axis=1)
 
     def _extract_level_codes(self, target) -> np.ndarray:
         """
@@ -757,7 +785,7 @@
             codes[codes > 0] += 1
             if self.level_has_nans[i]:
                 codes[target.codes[i] == -1] += 1
-        return self._codes_to_ints(np.array(level_codes, dtype="uint64").T)
+        return self._codes_to_ints(np.array(level_codes, dtype=self._codes_dtype).T)
 
     def get_indexer(self, target: np.ndarray) -> np.ndarray:
         """
@@ -788,7 +816,7 @@
             raise KeyError(key)
 
         # Transform indices into single integer:
-        lab_int = self._codes_to_ints(np.array(indices, dtype="uint64"))
+        lab_int = self._codes_to_ints(np.array(indices, dtype=self._codes_dtype))
 
         return self._base.get_loc(self, lab_int)
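The new _codes_to_ints body shifts each level's code into its own bit range (using the pre-computed offsets) and then OR-reduces, so each MultiIndex key collapses to a single integer while lexicographic order is preserved. A minimal NumPy sketch of that packing idea, with made-up level widths and outside of Cython:

import numpy as np

# Hypothetical 3-level index: the rightmost level needs 16 bits, the middle 8,
# the leftmost 4, so the shifts (offsets) are 24, 16 and 0 bits respectively.
offsets = np.array([24, 16, 0], dtype=np.uint64)

def codes_to_ints(codes):
    codes = codes.astype(np.uint64, copy=True)  # widen first to avoid overflow
    codes <<= offsets                           # move each level into its bit range
    if codes.ndim == 1:                         # a single key
        return np.bitwise_or.reduce(codes)
    return np.bitwise_or.reduce(codes, axis=1)  # one packed integer per key (row)

keys = np.array([[1, 2, 3], [1, 2, 4]], dtype=np.uint64)
packed = codes_to_ints(keys)
assert packed[0] < packed[1]  # ordering of the original keys is preserved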

pandas/_libs/lib.pyx

Lines changed: 2 additions & 2 deletions
@@ -2808,14 +2808,14 @@ def maybe_convert_objects(ndarray[object] objects,
                 from pandas.core.arrays import IntegerArray
 
                 # Set these values to 1 to be deterministic, match
-                # IntegerArray._internal_fill_value
+                # IntegerDtype._internal_fill_value
                 result[mask] = 1
                 result = IntegerArray(result, mask)
             elif result is floats and convert_to_nullable_dtype:
                 from pandas.core.arrays import FloatingArray
 
                 # Set these values to 1.0 to be deterministic, match
-                # FloatingArray._internal_fill_value
+                # FloatingDtype._internal_fill_value
                 result[mask] = 1.0
                 result = FloatingArray(result, mask)
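The lib.pyx hunk above only corrects two comments so they point at IntegerDtype / FloatingDtype for _internal_fill_value; behavior is unchanged. For context, a nullable result of the kind this branch produces simply masks the missing entries, with a fixed placeholder stored underneath, e.g.:

import pandas as pd

# Converting data with missing values to a nullable dtype masks the gap; the
# value stored under the mask is an arbitrary but deterministic fill value.
arr = pd.array([1, 2, None], dtype="Int64")
print(arr)         # [1, 2, <NA>]
print(arr.isna())  # the last position is masked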

pandas/core/arrays/arrow/accessors.py

Lines changed: 14 additions & 8 deletions
@@ -110,7 +110,9 @@ def len(self) -> Series:
         from pandas import Series
 
         value_lengths = pc.list_value_length(self._pa_array)
-        return Series(value_lengths, dtype=ArrowDtype(value_lengths.type))
+        return Series(
+            value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index
+        )
 
     def __getitem__(self, key: int | slice) -> Series:
         """
@@ -149,7 +151,9 @@ def __getitem__(self, key: int | slice) -> Series:
             # if key < 0:
            #     key = pc.add(key, pc.list_value_length(self._pa_array))
             element = pc.list_element(self._pa_array, key)
-            return Series(element, dtype=ArrowDtype(element.type))
+            return Series(
+                element, dtype=ArrowDtype(element.type), index=self._data.index
+            )
         elif isinstance(key, slice):
             if pa_version_under11p0:
                 raise NotImplementedError(
@@ -167,7 +171,7 @@ def __getitem__(self, key: int | slice) -> Series:
             if step is None:
                 step = 1
             sliced = pc.list_slice(self._pa_array, start, stop, step)
-            return Series(sliced, dtype=ArrowDtype(sliced.type))
+            return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index)
         else:
             raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
 
@@ -195,15 +199,17 @@ def flatten(self) -> Series:
         ... )
         >>> s.list.flatten()
         0    1
-        1    2
-        2    3
-        3    3
+        0    2
+        0    3
+        1    3
         dtype: int64[pyarrow]
         """
         from pandas import Series
 
-        flattened = pc.list_flatten(self._pa_array)
-        return Series(flattened, dtype=ArrowDtype(flattened.type))
+        counts = pa.compute.list_value_length(self._pa_array)
+        flattened = pa.compute.list_flatten(self._pa_array)
+        index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type)))
+        return Series(flattened, dtype=ArrowDtype(flattened.type), index=index)
 
 
 class StructAccessor(ArrowAccessor):
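The accessor changes above make the Arrow list accessor keep the calling Series' index: .list.len() and .list[...] reuse it directly, and .list.flatten() repeats each label by the length of its list (GH 58425). A usage sketch, assuming pyarrow is installed:

import pandas as pd
import pyarrow as pa

s = pd.Series(
    [[1, 2, 3], [3]],
    dtype=pd.ArrowDtype(pa.list_(pa.int64())),
    index=["x", "y"],  # non-default index, to show it is preserved
)

print(s.list.len())      # index x, y (previously reset to 0, 1)
print(s.list[0])         # first element of each list, index x, y
print(s.list.flatten())  # index x, x, x, y: labels repeated per element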

pandas/core/arrays/arrow/array.py

Lines changed: 2 additions & 0 deletions
@@ -525,6 +525,8 @@ def _box_pa_array(
         if pa_type is not None and pa_array.type != pa_type:
             if pa.types.is_dictionary(pa_type):
                 pa_array = pa_array.dictionary_encode()
+                if pa_array.type != pa_type:
+                    pa_array = pa_array.cast(pa_type)
             else:
                 try:
                     pa_array = pa_array.cast(pa_type)
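The array.py change handles requested Arrow dictionary types where dictionary_encode() alone does not yield the exact type (for example a different value type) by casting afterwards; it is the constructor-side counterpart of the DataFrame.astype dictionary fix noted in the whatsnew above (GH 58479). A rough public-API sketch, assuming pyarrow is installed; the exact construction path exercised is an assumption:

import pandas as pd
import pyarrow as pa

# Ask for a dictionary type whose value type (large_string) differs from what
# dictionary-encoding plain Python strings would yield (string); the extra
# cast added above is what makes the stored type match the request.
dict_type = pa.dictionary(pa.int32(), pa.large_string())
ser = pd.Series(["a", "b", "a"], dtype=pd.ArrowDtype(dict_type))
print(ser.dtype)  # should report large_string dictionary values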
