
Commit 87aa7ac

Merge remote-tracking branch 'upstream/main' into tst/np/2
2 parents 5431ab5 + c46fb76

File tree

26 files changed: +118 -57 lines


ci/code_checks.sh

-1 line

@@ -462,7 +462,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \
         -i "pandas.io.stata.StataWriter.write_file SA01" \
         -i "pandas.json_normalize RT03,SA01" \
-        -i "pandas.merge_asof PR07,RT03" \
         -i "pandas.period_range RT03,SA01" \
         -i "pandas.plotting.andrews_curves RT03,SA01" \
         -i "pandas.plotting.lag_plot RT03,SA01" \

doc/source/user_guide/basics.rst

+1 -1

@@ -1606,7 +1606,7 @@ For instance:
 This method does not convert the row to a Series object; it merely
 returns the values inside a namedtuple. Therefore,
 :meth:`~DataFrame.itertuples` preserves the data type of the values
-and is generally faster as :meth:`~DataFrame.iterrows`.
+and is generally faster than :meth:`~DataFrame.iterrows`.

 .. note::

doc/source/user_guide/io.rst

+4 -4

@@ -3003,7 +3003,7 @@ However, if XPath does not reference node names such as default, ``/*``, then
 .. note::

   Since ``xpath`` identifies the parent of content to be parsed, only immediate
-  desendants which include child nodes or current attributes are parsed.
+  descendants which include child nodes or current attributes are parsed.
   Therefore, ``read_xml`` will not parse the text of grandchildren or other
   descendants and will not parse attributes of any descendant. To retrieve
   lower level content, adjust xpath to lower level. For example,

@@ -3535,7 +3535,7 @@ For example, to read in a ``MultiIndex`` index without names:
   df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1])
   df

-If the index has level names, they will parsed as well, using the same
+If the index has level names, they will be parsed as well, using the same
 parameters.

 .. ipython:: python

@@ -5847,10 +5847,10 @@ You can check if a table exists using :func:`~pandas.io.sql.has_table`
 Schema support
 ''''''''''''''

-Reading from and writing to different schema's is supported through the ``schema``
+Reading from and writing to different schemas is supported through the ``schema``
 keyword in the :func:`~pandas.read_sql_table` and :func:`~pandas.DataFrame.to_sql`
 functions. Note however that this depends on the database flavor (sqlite does not
-have schema's). For example:
+have schemas). For example:

 .. code-block:: python
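The parent/descendant rule touched up in the first hunk can be sketched with made-up XML and the stdlib ``etree`` parser: ``xpath`` selects the parent nodes, and only their immediate children become columns.

```python
from io import StringIO

import pandas as pd

xml = """<data>
  <row><shape>square</shape><sides>4</sides></row>
  <row><shape>circle</shape><sides>0</sides></row>
</data>"""

# xpath picks the parent <row> nodes; their immediate children become columns
df = pd.read_xml(StringIO(xml), xpath=".//row", parser="etree")
```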

doc/source/user_guide/missing_data.rst

+1 -1

@@ -319,7 +319,7 @@ Missing values propagate through arithmetic operations between pandas objects.

 The descriptive statistics and computational methods discussed in the
 :ref:`data structure overview <basics.stats>` (and listed :ref:`here
-<api.series.stats>` and :ref:`here <api.dataframe.stats>`) are all
+<api.series.stats>` and :ref:`here <api.dataframe.stats>`) all
 account for missing data.

 When summing data, NA values or empty data will be treated as zero.

doc/source/user_guide/options.rst

+1 -1

@@ -8,7 +8,7 @@ Options and settings

 Overview
 --------
-pandas has an options API configure and customize global behavior related to
+pandas has an options API to configure and customize global behavior related to
 :class:`DataFrame` display, data behavior and more.

 Options have a full "dotted-style", case-insensitive name (e.g. ``display.max_rows``).

doc/source/user_guide/timeseries.rst

+1 -1

@@ -1479,7 +1479,7 @@ or some other non-observed day. Defined observance rules are:
     "after_nearest_workday", "apply ``nearest_workday`` and then move to next workday after that day"
     "sunday_to_monday", "move Sunday to following Monday"
     "next_monday_or_tuesday", "move Saturday to Monday and Sunday/Monday to Tuesday"
-    "previous_friday", move Saturday and Sunday to previous Friday"
+    "previous_friday", "move Saturday and Sunday to previous Friday"
     "next_monday", "move Saturday and Sunday to following Monday"
     "weekend_to_monday", "same as ``next_monday``"
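The observance rules listed in this table are plain callables in ``pandas.tseries.holiday`` and can be applied directly; the dates below are illustrative.

```python
from datetime import datetime

from pandas.tseries.holiday import previous_friday, sunday_to_monday

# 2024-01-06 is a Saturday, so previous_friday rolls back to Friday 2024-01-05
obs = previous_friday(datetime(2024, 1, 6))

# 2024-01-07 is a Sunday, so sunday_to_monday moves it to Monday 2024-01-08
mon = sunday_to_monday(datetime(2024, 1, 7))
```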

doc/source/whatsnew/v3.0.0.rst

+3 -1

@@ -503,8 +503,8 @@ Timezones

 Numeric
 ^^^^^^^
+- Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`)
 - Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`)
--

 Conversion
 ^^^^^^^^^^

@@ -546,6 +546,7 @@ I/O
 - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
 - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`)
 - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
+- Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)

@@ -608,6 +609,7 @@ Other
 - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
 - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
 - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
+- Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
 - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
 - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
 - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)

pandas/core/arrays/arrow/array.py

+18 -17

@@ -18,7 +18,6 @@

 from pandas._libs import lib
 from pandas._libs.tslibs import (
-    NaT,
     Timedelta,
     Timestamp,
     timezones,

@@ -2612,17 +2611,19 @@ def _str_wrap(self, width: int, **kwargs) -> Self:
     @property
     def _dt_days(self) -> Self:
         return type(self)(
-            pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32())
+            pa.array(
+                self._to_timedeltaarray().components.days,
+                from_pandas=True,
+                type=pa.int32(),
+            )
         )

     @property
     def _dt_hours(self) -> Self:
         return type(self)(
             pa.array(
-                [
-                    td.components.hours if td is not NaT else None
-                    for td in self._to_timedeltaarray()
-                ],
+                self._to_timedeltaarray().components.hours,
+                from_pandas=True,
                 type=pa.int32(),
             )
         )

@@ -2631,10 +2632,8 @@ def _dt_hours(self) -> Self:
     def _dt_minutes(self) -> Self:
         return type(self)(
             pa.array(
-                [
-                    td.components.minutes if td is not NaT else None
-                    for td in self._to_timedeltaarray()
-                ],
+                self._to_timedeltaarray().components.minutes,
+                from_pandas=True,
                 type=pa.int32(),
             )
         )

@@ -2643,18 +2642,18 @@ def _dt_minutes(self) -> Self:
     def _dt_seconds(self) -> Self:
         return type(self)(
             pa.array(
-                self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32()
+                self._to_timedeltaarray().components.seconds,
+                from_pandas=True,
+                type=pa.int32(),
             )
         )

     @property
     def _dt_milliseconds(self) -> Self:
         return type(self)(
             pa.array(
-                [
-                    td.components.milliseconds if td is not NaT else None
-                    for td in self._to_timedeltaarray()
-                ],
+                self._to_timedeltaarray().components.milliseconds,
+                from_pandas=True,
                 type=pa.int32(),
             )
         )

@@ -2663,7 +2662,7 @@ def _dt_milliseconds(self) -> Self:
     def _dt_microseconds(self) -> Self:
         return type(self)(
             pa.array(
-                self._to_timedeltaarray().microseconds,
+                self._to_timedeltaarray().components.microseconds,
                 from_pandas=True,
                 type=pa.int32(),
             )

@@ -2673,7 +2672,9 @@ def _dt_microseconds(self) -> Self:
     def _dt_nanoseconds(self) -> Self:
         return type(self)(
             pa.array(
-                self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32()
+                self._to_timedeltaarray().components.nanoseconds,
+                from_pandas=True,
+                type=pa.int32(),
             )
         )
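The change above leans on the fact that a timedelta array exposes every unit field at once through ``.components``, so the per-element Python loop (and its explicit ``NaT`` check) is unnecessary. A small illustration on the public ``TimedeltaIndex``, with made-up values:

```python
import pandas as pd

tdi = pd.to_timedelta(["1 days 02:03:04", pd.NaT])

# .components is a DataFrame with one column per unit; NaT rows become NaN
comps = tdi.components
```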

pandas/core/frame.py

+1 -1

@@ -13078,7 +13078,7 @@ def quantile(

         if len(data.columns) == 0:
             # GH#23925 _get_numeric_data may have dropped all columns
-            cols = Index([], name=self.columns.name)
+            cols = self.columns[:0]

             dtype = np.float64
             if axis == 1:
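The fix works because slicing an ``Index`` with ``[:0]`` yields an empty index that keeps both the name and the dtype of the original, whereas rebuilding it with ``Index([], name=...)`` falls back to ``object`` dtype. A quick illustration with an invented index:

```python
import pandas as pd

cols = pd.Index([10, 20], name="fields")

empty = cols[:0]                        # empty, but keeps name and dtype
rebuilt = pd.Index([], name=cols.name)  # empty, dtype information lost (object)
```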

pandas/core/generic.py

+1 -2

@@ -158,7 +158,6 @@
     Index,
     MultiIndex,
     PeriodIndex,
-    RangeIndex,
     default_index,
     ensure_index,
 )

@@ -1852,7 +1851,7 @@ def _drop_labels_or_levels(self, keys, axis: AxisInt = 0):
         else:
             # Drop the last level of Index by replacing with
             # a RangeIndex
-            dropped.columns = RangeIndex(dropped.columns.size)
+            dropped.columns = default_index(dropped.columns.size)

         # Handle dropping index labels
         if labels_to_drop:
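``default_index(n)`` is pandas' internal helper for the default positional labels and behaves essentially like the public ``RangeIndex(n)``; for ``n == 0`` it also avoids the ``object``-dtype empty ``Index([])`` that the pre-commit code produced in several places. A sketch using only public classes (the internal helper itself is not part of the public API):

```python
import pandas as pd

rng = pd.RangeIndex(3)       # the default 0..n-1 labels, stored lazily
empty = pd.RangeIndex(0)     # an empty index that is still integer-dtyped
obj_empty = pd.Index([])     # by contrast, this defaults to object dtype
```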

pandas/core/groupby/groupby.py

+2 -3

@@ -128,7 +128,6 @@ class providing the base-class of operations.
 from pandas.core.indexes.api import (
     Index,
     MultiIndex,
-    RangeIndex,
     default_index,
 )
 from pandas.core.internals.blocks import ensure_block_shape

@@ -1264,7 +1263,7 @@ def _set_result_index_ordered(
         if self._grouper.has_dropped_na:
             # Add back in any missing rows due to dropna - index here is integral
             # with values referring to the row of the input so can use RangeIndex
-            result = result.reindex(RangeIndex(len(index)), axis=0)
+            result = result.reindex(default_index(len(index)), axis=0)
         result = result.set_axis(index, axis=0)

         return result

@@ -1334,7 +1333,7 @@ def _wrap_aggregated_output(
             # enforced in __init__
             result = self._insert_inaxis_grouper(result, qs=qs)
             result = result._consolidate()
-            result.index = RangeIndex(len(result))
+            result.index = default_index(len(result))

         else:
             index = self._grouper.result_index

pandas/core/groupby/grouper.py

+2 -1

@@ -34,6 +34,7 @@
 from pandas.core.indexes.api import (
     Index,
     MultiIndex,
+    default_index,
 )
 from pandas.core.series import Series

@@ -901,7 +902,7 @@ def is_in_obj(gpr) -> bool:
     if len(groupings) == 0 and len(obj):
         raise ValueError("No group keys passed!")
     if len(groupings) == 0:
-        groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))
+        groupings.append(Grouping(default_index(0), np.array([], dtype=np.intp)))

     # create the internals grouper
     grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna)

pandas/core/indexes/api.py

+1 -1

@@ -130,7 +130,7 @@ def _get_combined_index(
     # TODO: handle index names!
     indexes = _get_distinct_objs(indexes)
     if len(indexes) == 0:
-        index = Index([])
+        index: Index = default_index(0)
     elif len(indexes) == 1:
         index = indexes[0]
     elif intersect:

pandas/core/internals/managers.py

+1 -1

@@ -249,7 +249,7 @@ def blklocs(self) -> npt.NDArray[np.intp]:
     def make_empty(self, axes=None) -> Self:
         """return an empty BlockManager with the items axis of len 0"""
         if axes is None:
-            axes = [Index([])] + self.axes[1:]
+            axes = [default_index(0)] + self.axes[1:]

         # preserve dtype if possible
         if self.ndim == 1:

pandas/core/methods/selectn.py

+4 -3

@@ -29,6 +29,8 @@
 )
 from pandas.core.dtypes.dtypes import BaseMaskedDtype

+from pandas.core.indexes.api import default_index
+
 if TYPE_CHECKING:
     from pandas._typing import (
         DtypeObj,

@@ -38,6 +40,7 @@

     from pandas import (
         DataFrame,
+        Index,
         Series,
     )
 else:

@@ -199,8 +202,6 @@ def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> No
         self.columns = columns

     def compute(self, method: str) -> DataFrame:
-        from pandas.core.api import Index
-
         n = self.n
         frame = self.obj
         columns = self.columns

@@ -227,7 +228,7 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index:
         original_index = frame.index
         cur_frame = frame = frame.reset_index(drop=True)
         cur_n = n
-        indexer = Index([], dtype=np.int64)
+        indexer: Index = default_index(0)

         for i, column in enumerate(columns):
             # For each column we apply method to cur_frame[column].
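The ``compute`` method touched here backs the public ``DataFrame.nlargest``/``nsmallest``: columns are processed one at a time, accumulating an indexer of kept rows. Typical usage, with invented data:

```python
import pandas as pd

df = pd.DataFrame({"population": [30, 10, 20], "gdp": [3, 1, 2]})

# keep the two rows with the largest population; ties would fall through
# to the later columns in the list
top = df.nlargest(2, columns=["population", "gdp"])
```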

pandas/core/reshape/merge.py

+3

@@ -673,7 +673,9 @@ def merge_asof(
     Parameters
     ----------
     left : DataFrame or named Series
+        First pandas object to merge.
     right : DataFrame or named Series
+        Second pandas object to merge.
     on : label
         Field name to join on. Must be found in both DataFrames.
         The data MUST be ordered. Furthermore this must be a numeric column,

@@ -712,6 +714,7 @@ def merge_asof(
     Returns
     -------
     DataFrame
+        A DataFrame of the two merged objects.

     See Also
     --------
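Usage of the documented signature, with made-up data: in the default backward direction, each left row is matched with the last right row whose ``on`` value is less than or equal to the left's, and both inputs must be sorted on that key.

```python
import pandas as pd

left = pd.DataFrame({"t": [1, 5, 10], "v": ["a", "b", "c"]})
right = pd.DataFrame({"t": [2, 6], "w": [100, 200]})

# backward (default) asof join: match the most recent right.t <= left.t
out = pd.merge_asof(left, right, on="t")
```

The first left row (``t=1``) has no earlier right row, so its ``w`` is missing.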

pandas/core/reshape/reshape.py

+2 -2

@@ -42,7 +42,7 @@
 from pandas.core.indexes.api import (
     Index,
     MultiIndex,
-    RangeIndex,
+    default_index,
 )
 from pandas.core.reshape.concat import concat
 from pandas.core.series import Series

@@ -1047,7 +1047,7 @@ def stack_reshape(
         if data.ndim == 1:
             data.name = 0
         else:
-            data.columns = RangeIndex(len(data.columns))
+            data.columns = default_index(len(data.columns))
         buf.append(data)

     if len(buf) > 0 and not frame.empty:

pandas/io/html.py

+1 -1

@@ -1178,7 +1178,7 @@ def read_html(
     **after** `skiprows` is applied.

     This function will *always* return a list of :class:`DataFrame` *or*
-    it will fail, e.g., it will *not* return an empty list.
+    it will fail, i.e., it will *not* return an empty list.

     Examples
     --------
