Merge remote-tracking branch 'upstream/main' into fix-docstring

fbourgey · fbourgey · commit 347fc7f63edb · 2024-09-13T10:12:38.000-04:00
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -151,14 +151,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \
         -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \
-        -i "pandas.core.groupby.DataFrameGroupBy.max SA01" \
-        -i "pandas.core.groupby.DataFrameGroupBy.min SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \
         -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
         -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \
-        -i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.agg RT03" \
         -i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \
@@ -167,13 +164,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \
-        -i "pandas.core.groupby.SeriesGroupBy.max SA01" \
-        -i "pandas.core.groupby.SeriesGroupBy.min SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \
         -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
         -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \
-        -i "pandas.core.groupby.SeriesGroupBy.sum SA01" \
         -i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \
         -i "pandas.core.resample.Resampler.ffill RT03" \
         -i "pandas.core.resample.Resampler.get_group RT03,SA01" \
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -102,6 +102,7 @@ Conversion
 
 Strings
 ^^^^^^^
+- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
 - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
 - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
 - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -627,6 +627,7 @@ I/O
 - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
+- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -600,6 +600,8 @@ def array_equivalent_object(ndarray left, ndarray right) -> bool:
                     if not array_equivalent(x, y):
                         return False
 
+            elif PyArray_Check(x) or PyArray_Check(y):
+                return False
             elif (x is C_NA) ^ (y is C_NA):
                 return False
             elif not (
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -1338,7 +1338,13 @@ def string_storage(request):
         pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
         pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
         ("python", np.nan),
-    ]
+    ],
+    ids=[
+        "string=string[python]",
+        "string=string[pyarrow]",
+        "string=str[pyarrow]",
+        "string=str[python]",
+    ],
 )
 def string_dtype_arguments(request):
     """
@@ -1369,6 +1375,7 @@ def dtype_backend(request):
 
 # Alias so we can test with cartesian product of string_storage
 string_storage2 = string_storage
+string_dtype_arguments2 = string_dtype_arguments
 
 
 @pytest.fixture(params=tm.BYTES_DTYPES)
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1999,7 +1999,7 @@ def _rank(
         """
         See Series.rank.__doc__.
         """
-        return self._convert_int_result(
+        return self._convert_rank_result(
             self._rank_calc(
                 axis=axis,
                 method=method,
@@ -2318,6 +2318,9 @@ def _convert_bool_result(self, result):
     def _convert_int_result(self, result):
         return type(self)(result)
 
+    def _convert_rank_result(self, result):
+        return type(self)(result)
+
     def _str_count(self, pat: str, flags: int = 0) -> Self:
         if flags:
             raise NotImplementedError(f"count not implemented with {flags=}")
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -46,6 +46,7 @@
     nanops,
     ops,
 )
+from pandas.core.algorithms import isin
 from pandas.core.array_algos import masked_reductions
 from pandas.core.arrays.base import ExtensionArray
 from pandas.core.arrays.floating import (
@@ -65,6 +66,7 @@
     import pyarrow
 
     from pandas._typing import (
+        ArrayLike,
         AxisInt,
         Dtype,
         DtypeObj,
@@ -735,6 +737,24 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
         # base class implementation that uses __setitem__
         ExtensionArray._putmask(self, mask, value)
 
+    def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
+        if isinstance(values, BaseStringArray) or (
+            isinstance(values, ExtensionArray) and is_string_dtype(values.dtype)
+        ):
+            values = values.astype(self.dtype, copy=False)
+        else:
+            if not lib.is_string_array(np.asarray(values), skipna=True):
+                values = np.array(
+                    [val for val in values if isinstance(val, str) or isna(val)],
+                    dtype=object,
+                )
+                if not len(values):
+                    return np.zeros(self.shape, dtype=bool)
+
+            values = self._from_sequence(values, dtype=self.dtype)
+
+        return isin(np.asarray(self), np.asarray(values))
+
     def astype(self, dtype, copy: bool = True):
         dtype = pandas_dtype(dtype)
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -29,6 +29,7 @@
 from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
 from pandas.core.arrays.arrow import ArrowExtensionArray
 from pandas.core.arrays.boolean import BooleanDtype
+from pandas.core.arrays.floating import Float64Dtype
 from pandas.core.arrays.integer import Int64Dtype
 from pandas.core.arrays.numeric import NumericDtype
 from pandas.core.arrays.string_ import (
@@ -395,6 +396,16 @@ def _convert_int_result(self, result):
 
         return Int64Dtype().__from_arrow__(result)
 
+    def _convert_rank_result(self, result):
+        if self.dtype.na_value is np.nan:
+            if isinstance(result, pa.Array):
+                result = result.to_numpy(zero_copy_only=False)
+            else:
+                result = result.to_numpy()
+            return result.astype("float64", copy=False)
+
+        return Float64Dtype().__from_arrow__(result)
+
     def _reduce(
         self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
     ):
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -615,6 +615,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs):
 
         See Also
         --------
+        Series.filter: Filter elements of ungrouped Series.
         DataFrameGroupBy.filter : Filter elements from groups base on criterion.
 
         Notes
@@ -1963,6 +1964,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame:
 
         See Also
         --------
+        DataFrame.filter: Filter elements of ungrouped DataFrame.
         SeriesGroupBy.filter : Filter elements from groups base on criterion.
 
         Notes
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -199,6 +199,15 @@ class providing the base-class of operations.
 Series or DataFrame
     Computed {fname} of values within each group.
 
+See Also
+--------
+SeriesGroupBy.min : Return the min of the group values.
+DataFrameGroupBy.min : Return the min of the group values.
+SeriesGroupBy.max : Return the max of the group values.
+DataFrameGroupBy.max : Return the max of the group values.
+SeriesGroupBy.sum : Return the sum of the group values.
+DataFrameGroupBy.sum : Return the sum of the group values.
+
 Examples
 --------
 {example}
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -1648,7 +1648,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = T
             if keep_default_na:
                 v = set(v) | STR_NA_VALUES
 
-            na_values[k] = v
+            na_values[k] = _stringify_na_values(v, floatify)
         na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
     else:
         if not is_list_like(na_values):
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -30,6 +30,12 @@ def dtype(string_dtype_arguments):
     return pd.StringDtype(storage=storage, na_value=na_value)
 
 
+@pytest.fixture
+def dtype2(string_dtype_arguments2):
+    storage, na_value = string_dtype_arguments2
+    return pd.StringDtype(storage=storage, na_value=na_value)
+
+
 @pytest.fixture
 def cls(dtype):
     """Fixture giving array type from parametrized 'dtype'"""
@@ -662,11 +668,7 @@ def test_isin(dtype, fixed_now_ts):
     tm.assert_series_equal(result, expected)
 
     result = s.isin(["a", pd.NA])
-    if dtype.storage == "python" and dtype.na_value is np.nan:
-        # TODO(infer_string) we should make this consistent
-        expected = pd.Series([True, False, False])
-    else:
-        expected = pd.Series([True, False, True])
+    expected = pd.Series([True, False, True])
     tm.assert_series_equal(result, expected)
 
     result = s.isin([])
@@ -677,6 +679,35 @@ def test_isin(dtype, fixed_now_ts):
     expected = pd.Series([True, False, False])
     tm.assert_series_equal(result, expected)
 
+    result = s.isin([fixed_now_ts])
+    expected = pd.Series([False, False, False])
+    tm.assert_series_equal(result, expected)
+
+
+def test_isin_string_array(dtype, dtype2):
+    s = pd.Series(["a", "b", None], dtype=dtype)
+
+    result = s.isin(pd.array(["a", "c"], dtype=dtype2))
+    expected = pd.Series([True, False, False])
+    tm.assert_series_equal(result, expected)
+
+    result = s.isin(pd.array(["a", None], dtype=dtype2))
+    expected = pd.Series([True, False, True])
+    tm.assert_series_equal(result, expected)
+
+
+def test_isin_arrow_string_array(dtype):
+    pa = pytest.importorskip("pyarrow")
+    s = pd.Series(["a", "b", None], dtype=dtype)
+
+    result = s.isin(pd.array(["a", "c"], dtype=pd.ArrowDtype(pa.string())))
+    expected = pd.Series([True, False, False])
+    tm.assert_series_equal(result, expected)
+
+    result = s.isin(pd.array(["a", None], dtype=pd.ArrowDtype(pa.string())))
+    expected = pd.Series([True, False, True])
+    tm.assert_series_equal(result, expected)
+
 
 def test_setitem_scalar_with_mask_validation(dtype):
     # https://github.com/pandas-dev/pandas/issues/47628
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
@@ -1,4 +1,3 @@
-from contextlib import nullcontext
 from datetime import datetime
 from decimal import Decimal
 
@@ -7,7 +6,6 @@
 
 from pandas._libs import missing as libmissing
 from pandas._libs.tslibs import iNaT
-from pandas.compat.numpy import np_version_gte1p25
 
 from pandas.core.dtypes.common import (
     is_float,
@@ -458,15 +456,7 @@ def test_array_equivalent_dti(dtype_equal):
 )
 def test_array_equivalent_series(val):
     arr = np.array([1, 2])
-    msg = "elementwise comparison failed"
-    cm = (
-        # stacklevel is chosen to make sense when called from .equals
-        tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False)
-        if isinstance(val, str) and not np_version_gte1p25
-        else nullcontext()
-    )
-    with cm:
-        assert not array_equivalent(Series([arr, arr]), Series([arr, val]))
+    assert not array_equivalent(Series([arr, arr]), Series([arr, val]))
 
 
 def test_array_equivalent_array_mismatched_shape():
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
@@ -6,15 +6,11 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.algos import (
     Infinity,
     NegInfinity,
 )
-from pandas.compat import HAS_PYARROW
 
-import pandas as pd
 from pandas import (
     DataFrame,
     Index,
@@ -467,23 +463,10 @@ def test_rank_inf_nans_na_option(
             ("top", False, [2.0, 3.0, 1.0, 4.0]),
         ],
     )
-    def test_rank_object_first(
-        self,
-        request,
-        frame_or_series,
-        na_option,
-        ascending,
-        expected,
-        using_infer_string,
-    ):
+    def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
         obj = frame_or_series(["foo", "foo", None, "foo"])
-        if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series):
-            request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
-
         result = obj.rank(method="first", na_option=na_option, ascending=ascending)
         expected = frame_or_series(expected)
-        if using_infer_string and isinstance(obj, Series):
-            expected = expected.astype("uint64")
         tm.assert_equal(result, expected)
 
     @pytest.mark.parametrize(
@@ -507,7 +490,9 @@ def test_rank_string_dtype(self, string_dtype_no_object):
         # GH#55362
         obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
         result = obj.rank(method="first")
-        exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64"
+        exp_dtype = (
+            "Float64" if string_dtype_no_object == "string[pyarrow]" else "float64"
+        )
         if string_dtype_no_object.storage == "python":
             # TODO nullable string[python] should also return nullable Int64
             exp_dtype = "float64"
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
@@ -812,3 +812,21 @@ def test_bool_and_nan_to_float(all_parsers):
     result = parser.read_csv(StringIO(data), dtype="float")
     expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
     tm.assert_frame_equal(result, expected)
+
+
+@xfail_pyarrow
+@pytest.mark.parametrize(
+    "na_values",
+    [[-99.0, -99], [-99, -99.0]],
+)
+def test_na_values_dict_without_dtype(all_parsers, na_values):
+    parser = all_parsers
+    data = """A
+-99
+-99
+-99.0
+-99.0"""
+
+    result = parser.read_csv(StringIO(data), na_values=na_values)
+    expected = DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]})
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py
diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py
diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py