Merge branch 'main' of github.com:Wong2333/pandas

Wong2333 · Wong2333 · commit dd59499784de · 2024-12-10T09:54:19.000+08:00
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -84,7 +84,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.arrays.IntervalArray.length SA01" \
         -i "pandas.arrays.NumpyExtensionArray SA01" \
         -i "pandas.arrays.TimedeltaArray PR07,SA01" \
-        -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
         -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
         -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \
@@ -95,9 +94,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.resample.Resampler.std SA01" \
         -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \
         -i "pandas.core.resample.Resampler.var SA01" \
-        -i "pandas.errors.NullFrequencyError SA01" \
-        -i "pandas.errors.NumbaUtilError SA01" \
-        -i "pandas.errors.PerformanceWarning SA01" \
         -i "pandas.errors.UndefinedVariableError PR01,SA01" \
         -i "pandas.errors.ValueLabelTypeMismatch SA01" \
         -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
@@ -185,7 +185,6 @@ Reindexing / selection / label manipulation
    DataFrame.duplicated
    DataFrame.equals
    DataFrame.filter
-   DataFrame.head
    DataFrame.idxmax
    DataFrame.idxmin
    DataFrame.reindex
@@ -196,7 +195,6 @@ Reindexing / selection / label manipulation
    DataFrame.sample
    DataFrame.set_axis
    DataFrame.set_index
-   DataFrame.tail
    DataFrame.take
    DataFrame.truncate
 
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -626,6 +626,7 @@ Datetimelike
 - Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`)
 - Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`)
 - Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`)
+- Bug in :meth:`to_datetime` with a float32 :class:`DataFrame` containing year, month, day etc. columns leading to precision issues and incorrect results. (:issue:`60506`)
 - Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`)
 - Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`)
 - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)
@@ -798,6 +799,7 @@ Other
 - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
 - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
 - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
+- Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`)
 - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
 
 .. ***DO NOT USE THIS SECTION***
diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py
@@ -117,7 +117,10 @@ def len(self) -> Series:
 
         value_lengths = pc.list_value_length(self._pa_array)
         return Series(
-            value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index
+            value_lengths,
+            dtype=ArrowDtype(value_lengths.type),
+            index=self._data.index,
+            name=self._data.name,
         )
 
     def __getitem__(self, key: int | slice) -> Series:
@@ -162,7 +165,10 @@ def __getitem__(self, key: int | slice) -> Series:
             #     key = pc.add(key, pc.list_value_length(self._pa_array))
             element = pc.list_element(self._pa_array, key)
             return Series(
-                element, dtype=ArrowDtype(element.type), index=self._data.index
+                element,
+                dtype=ArrowDtype(element.type),
+                index=self._data.index,
+                name=self._data.name,
             )
         elif isinstance(key, slice):
             if pa_version_under11p0:
@@ -181,7 +187,12 @@ def __getitem__(self, key: int | slice) -> Series:
             if step is None:
                 step = 1
             sliced = pc.list_slice(self._pa_array, start, stop, step)
-            return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index)
+            return Series(
+                sliced,
+                dtype=ArrowDtype(sliced.type),
+                index=self._data.index,
+                name=self._data.name,
+            )
         else:
             raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
 
@@ -223,7 +234,12 @@ def flatten(self) -> Series:
         counts = pa.compute.list_value_length(self._pa_array)
         flattened = pa.compute.list_flatten(self._pa_array)
         index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type)))
-        return Series(flattened, dtype=ArrowDtype(flattened.type), index=index)
+        return Series(
+            flattened,
+            dtype=ArrowDtype(flattened.type),
+            index=index,
+            name=self._data.name,
+        )
 
 
 class StructAccessor(ArrowAccessor):
diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
@@ -205,7 +205,7 @@ def generate(self, v) -> str:
         val = v.tostring(self.encoding)
         return f"({self.lhs} {self.op} {val})"
 
-    def convert_value(self, v) -> TermValue:
+    def convert_value(self, conv_val) -> TermValue:
         """
         convert the expression that is in the term to something that is
         accepted by pytables
@@ -219,44 +219,44 @@ def stringify(value):
         kind = ensure_decoded(self.kind)
         meta = ensure_decoded(self.meta)
         if kind == "datetime" or (kind and kind.startswith("datetime64")):
-            if isinstance(v, (int, float)):
-                v = stringify(v)
-            v = ensure_decoded(v)
-            v = Timestamp(v).as_unit("ns")
-            if v.tz is not None:
-                v = v.tz_convert("UTC")
-            return TermValue(v, v._value, kind)
+            if isinstance(conv_val, (int, float)):
+                conv_val = stringify(conv_val)
+            conv_val = ensure_decoded(conv_val)
+            conv_val = Timestamp(conv_val).as_unit("ns")
+            if conv_val.tz is not None:
+                conv_val = conv_val.tz_convert("UTC")
+            return TermValue(conv_val, conv_val._value, kind)
         elif kind in ("timedelta64", "timedelta"):
-            if isinstance(v, str):
-                v = Timedelta(v)
+            if isinstance(conv_val, str):
+                conv_val = Timedelta(conv_val)
             else:
-                v = Timedelta(v, unit="s")
-            v = v.as_unit("ns")._value
-            return TermValue(int(v), v, kind)
+                conv_val = Timedelta(conv_val, unit="s")
+            conv_val = conv_val.as_unit("ns")._value
+            return TermValue(int(conv_val), conv_val, kind)
         elif meta == "category":
             metadata = extract_array(self.metadata, extract_numpy=True)
             result: npt.NDArray[np.intp] | np.intp | int
-            if v not in metadata:
+            if conv_val not in metadata:
                 result = -1
             else:
-                result = metadata.searchsorted(v, side="left")
+                result = metadata.searchsorted(conv_val, side="left")
             return TermValue(result, result, "integer")
         elif kind == "integer":
             try:
-                v_dec = Decimal(v)
+                v_dec = Decimal(conv_val)
             except InvalidOperation:
                 # GH 54186
-                # convert v to float to raise float's ValueError
+                # convert conv_val to float to raise float's ValueError
-                float(v)
+                float(conv_val)
             else:
-                v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN"))
-            return TermValue(v, v, kind)
+                conv_val = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN"))
+            return TermValue(conv_val, conv_val, kind)
         elif kind == "float":
-            v = float(v)
-            return TermValue(v, v, kind)
+            conv_val = float(conv_val)
+            return TermValue(conv_val, conv_val, kind)
         elif kind == "bool":
-            if isinstance(v, str):
-                v = v.strip().lower() not in [
+            if isinstance(conv_val, str):
+                conv_val = conv_val.strip().lower() not in [
                     "false",
                     "f",
                     "no",
@@ -268,13 +268,13 @@ def stringify(value):
                     "",
                 ]
             else:
-                v = bool(v)
-            return TermValue(v, v, kind)
-        elif isinstance(v, str):
+                conv_val = bool(conv_val)
+            return TermValue(conv_val, conv_val, kind)
+        elif isinstance(conv_val, str):
             # string quoting
-            return TermValue(v, stringify(v), "string")
+            return TermValue(conv_val, stringify(conv_val), "string")
         else:
-            raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column")
+            raise TypeError(f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column")
 
     def convert_values(self) -> None:
         pass
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1018,7 +1018,7 @@ def shape(self) -> tuple[int, int]:
 
         See Also
         --------
-        ndarray.shape : Tuple of array dimensions.
+        numpy.ndarray.shape : Tuple of array dimensions.
 
         Examples
         --------
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -640,7 +640,7 @@ def ndim(self) -> int:
 
         See Also
         --------
-        ndarray.ndim : Number of array dimensions.
+        numpy.ndarray.ndim : Number of array dimensions.
 
         Examples
         --------
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
@@ -694,7 +694,7 @@ def bfill(self, limit: int | None = None):
 
         References
         ----------
-        .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)
+        .. [1] https://en.wikipedia.org/wiki/Imputation_%28statistics%29
 
         Examples
         --------
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -1374,6 +1374,11 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default):
         """
         Determine if each string starts with a match of a regular expression.
 
+        Determines whether each string in the Series or Index starts with a
+        match to a specified regular expression. This function is especially
+        useful for validating prefixes, such as ensuring that codes, tags, or
+        identifiers begin with a specific pattern.
+
         Parameters
         ----------
         pat : str
@@ -1419,6 +1424,11 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
         """
         Determine if each string entirely matches a regular expression.
 
+        Checks if each string in the Series or Index fully matches the
+        specified regular expression pattern. This function is useful when the
+        requirement is for an entire string to conform to a pattern, such as
+        validating formats like phone numbers or email addresses.
+
         Parameters
         ----------
         pat : str
@@ -1647,6 +1657,10 @@ def repeat(self, repeats):
         """
         Duplicate each string in the Series or Index.
 
+        Duplicates each string in the Series or Index, either by applying the
+        same repeat count to all elements or by using different repeat values
+        for each element.
+
         Parameters
         ----------
         repeats : int or sequence of int
@@ -1710,6 +1724,12 @@ def pad(
         """
         Pad strings in the Series/Index up to width.
 
+        This function pads strings in a Series or Index to a specified width,
+        filling the extra space with a character of your choice. It provides
+        flexibility in positioning the padding, allowing it to be added to the
+        left, right, or both sides. This is useful for formatting strings to
+        align text or ensure consistent string lengths in data processing.
+
         Parameters
         ----------
         width : int
@@ -1920,6 +1940,11 @@ def slice(self, start=None, stop=None, step=None):
         """
         Slice substrings from each element in the Series or Index.
 
+        Slicing substrings from strings in a Series or Index helps extract
+        specific portions of data, making it easier to analyze or manipulate
+        text. This is useful for tasks like parsing structured text fields or
+        isolating parts of strings with a consistent format.
+
         Parameters
         ----------
         start : int, optional
@@ -1996,6 +2021,11 @@ def slice_replace(self, start=None, stop=None, repl=None):
         """
         Replace a positional slice of a string with another value.
 
+        This function allows replacing specific parts of a string in a Series
+        or Index by specifying start and stop positions. It is useful for
+        modifying substrings in a controlled way, such as updating sections of
+        text based on their positions or patterns.
+
         Parameters
         ----------
         start : int, optional
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -44,6 +44,7 @@
 from pandas.core.dtypes.common import (
     ensure_object,
     is_float,
+    is_float_dtype,
     is_integer,
     is_integer_dtype,
     is_list_like,
@@ -1153,6 +1154,10 @@ def coerce(values):
         # we allow coercion to if errors allows
         values = to_numeric(values, errors=errors)
 
+        # prevent precision issues in case of float32 # GH#60506
+        if is_float_dtype(values.dtype):
+            values = values.astype("float64")
+
         # prevent overflow in case of int8 or int16
         if is_integer_dtype(values.dtype):
             values = values.astype("int64")
diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py
@@ -45,6 +45,11 @@ class NullFrequencyError(ValueError):
     Particularly ``DatetimeIndex.shift``, ``TimedeltaIndex.shift``,
     ``PeriodIndex.shift``.
 
+    See Also
+    --------
+    Index.shift : Shift values of Index.
+    Series.shift : Shift values of Series.
+
     Examples
     --------
     >>> df = pd.DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None)
@@ -58,6 +63,12 @@ class PerformanceWarning(Warning):
     """
     Warning raised when there is a possible performance impact.
 
+    See Also
+    --------
+    DataFrame.set_index : Set the DataFrame index using existing columns.
+    DataFrame.loc : Access a group of rows and columns by label(s) \
+    or a boolean array.
+
     Examples
     --------
     >>> df = pd.DataFrame(
@@ -385,6 +396,13 @@ class NumbaUtilError(Exception):
     """
     Error raised for unsupported Numba engine routines.
 
+    See Also
+    --------
+    DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns.
+    Series.groupby : Group Series using a mapper or by a Series of columns.
+    DataFrame.agg : Aggregate using one or more operations over the specified axis.
+    Series.agg : Aggregate using one or more operations over the specified axis.
+
     Examples
     --------
     >>> df = pd.DataFrame(
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py