Skip to content

Commit 347fc7f

Browse files
committed
Merge remote-tracking branch 'upstream/main' into fix-docstring
2 parents 5c543ca + 0d2505d commit 347fc7f

File tree

18 files changed

+179
-71
lines changed

18 files changed

+179
-71
lines changed

ci/code_checks.sh

-6
Original file line numberDiff line numberDiff line change
@@ -151,14 +151,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
151151
-i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \
152152
-i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \
153153
-i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \
154-
-i "pandas.core.groupby.DataFrameGroupBy.max SA01" \
155-
-i "pandas.core.groupby.DataFrameGroupBy.min SA01" \
156154
-i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \
157155
-i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \
158156
-i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \
159157
-i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
160158
-i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \
161-
-i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \
162159
-i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \
163160
-i "pandas.core.groupby.SeriesGroupBy.agg RT03" \
164161
-i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \
@@ -167,13 +164,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
167164
-i "pandas.core.groupby.SeriesGroupBy.indices SA01" \
168165
-i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \
169166
-i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \
170-
-i "pandas.core.groupby.SeriesGroupBy.max SA01" \
171-
-i "pandas.core.groupby.SeriesGroupBy.min SA01" \
172167
-i "pandas.core.groupby.SeriesGroupBy.nth PR02" \
173168
-i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \
174169
-i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
175170
-i "pandas.core.groupby.SeriesGroupBy.sem SA01" \
176-
-i "pandas.core.groupby.SeriesGroupBy.sum SA01" \
177171
-i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \
178172
-i "pandas.core.resample.Resampler.ffill RT03" \
179173
-i "pandas.core.resample.Resampler.get_group RT03,SA01" \

doc/source/whatsnew/v2.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ Conversion
102102

103103
Strings
104104
^^^^^^^
105+
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
105106
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
106107
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
107108
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -627,6 +627,7 @@ I/O
627627
- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
628628
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
629629
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
630+
- Bug in :meth:`read_csv` where the order of the ``na_values`` caused inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)
630631
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
631632
- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
632633
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)

pandas/_libs/lib.pyx

+2
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,8 @@ def array_equivalent_object(ndarray left, ndarray right) -> bool:
600600
if not array_equivalent(x, y):
601601
return False
602602

603+
elif PyArray_Check(x) or PyArray_Check(y):
604+
return False
603605
elif (x is C_NA) ^ (y is C_NA):
604606
return False
605607
elif not (

pandas/conftest.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -1338,7 +1338,13 @@ def string_storage(request):
13381338
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
13391339
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
13401340
("python", np.nan),
1341-
]
1341+
],
1342+
ids=[
1343+
"string=string[python]",
1344+
"string=string[pyarrow]",
1345+
"string=str[pyarrow]",
1346+
"string=str[python]",
1347+
],
13421348
)
13431349
def string_dtype_arguments(request):
13441350
"""
@@ -1369,6 +1375,7 @@ def dtype_backend(request):
13691375

13701376
# Alias so we can test with cartesian product of string_storage
13711377
string_storage2 = string_storage
1378+
string_dtype_arguments2 = string_dtype_arguments
13721379

13731380

13741381
@pytest.fixture(params=tm.BYTES_DTYPES)

pandas/core/arrays/arrow/array.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1999,7 +1999,7 @@ def _rank(
19991999
"""
20002000
See Series.rank.__doc__.
20012001
"""
2002-
return self._convert_int_result(
2002+
return self._convert_rank_result(
20032003
self._rank_calc(
20042004
axis=axis,
20052005
method=method,
@@ -2318,6 +2318,9 @@ def _convert_bool_result(self, result):
23182318
def _convert_int_result(self, result):
23192319
return type(self)(result)
23202320

2321+
def _convert_rank_result(self, result):
2322+
return type(self)(result)
2323+
23212324
def _str_count(self, pat: str, flags: int = 0) -> Self:
23222325
if flags:
23232326
raise NotImplementedError(f"count not implemented with {flags=}")

pandas/core/arrays/string_.py

+20
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
nanops,
4747
ops,
4848
)
49+
from pandas.core.algorithms import isin
4950
from pandas.core.array_algos import masked_reductions
5051
from pandas.core.arrays.base import ExtensionArray
5152
from pandas.core.arrays.floating import (
@@ -65,6 +66,7 @@
6566
import pyarrow
6667

6768
from pandas._typing import (
69+
ArrayLike,
6870
AxisInt,
6971
Dtype,
7072
DtypeObj,
@@ -735,6 +737,24 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
735737
# base class implementation that uses __setitem__
736738
ExtensionArray._putmask(self, mask, value)
737739

740+
def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
741+
if isinstance(values, BaseStringArray) or (
742+
isinstance(values, ExtensionArray) and is_string_dtype(values.dtype)
743+
):
744+
values = values.astype(self.dtype, copy=False)
745+
else:
746+
if not lib.is_string_array(np.asarray(values), skipna=True):
747+
values = np.array(
748+
[val for val in values if isinstance(val, str) or isna(val)],
749+
dtype=object,
750+
)
751+
if not len(values):
752+
return np.zeros(self.shape, dtype=bool)
753+
754+
values = self._from_sequence(values, dtype=self.dtype)
755+
756+
return isin(np.asarray(self), np.asarray(values))
757+
738758
def astype(self, dtype, copy: bool = True):
739759
dtype = pandas_dtype(dtype)
740760

pandas/core/arrays/string_arrow.py

+11
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
3030
from pandas.core.arrays.arrow import ArrowExtensionArray
3131
from pandas.core.arrays.boolean import BooleanDtype
32+
from pandas.core.arrays.floating import Float64Dtype
3233
from pandas.core.arrays.integer import Int64Dtype
3334
from pandas.core.arrays.numeric import NumericDtype
3435
from pandas.core.arrays.string_ import (
@@ -395,6 +396,16 @@ def _convert_int_result(self, result):
395396

396397
return Int64Dtype().__from_arrow__(result)
397398

399+
def _convert_rank_result(self, result):
400+
if self.dtype.na_value is np.nan:
401+
if isinstance(result, pa.Array):
402+
result = result.to_numpy(zero_copy_only=False)
403+
else:
404+
result = result.to_numpy()
405+
return result.astype("float64", copy=False)
406+
407+
return Float64Dtype().__from_arrow__(result)
408+
398409
def _reduce(
399410
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
400411
):

pandas/core/groupby/generic.py

+2
Original file line numberDiff line numberDiff line change
@@ -615,6 +615,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs):
615615
616616
See Also
617617
--------
618+
Series.filter : Filter elements of ungrouped Series.
618619
DataFrameGroupBy.filter : Filter elements from groups based on a criterion.
619620
620621
Notes
@@ -1963,6 +1964,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame:
19631964
19641965
See Also
19651966
--------
1967+
DataFrame.filter : Filter elements of ungrouped DataFrame.
19661968
SeriesGroupBy.filter : Filter elements from groups based on a criterion.
19671969
19681970
Notes

pandas/core/groupby/groupby.py

+9
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,15 @@ class providing the base-class of operations.
199199
Series or DataFrame
200200
Computed {fname} of values within each group.
201201
202+
See Also
203+
--------
204+
SeriesGroupBy.min : Return the min of the group values.
205+
DataFrameGroupBy.min : Return the min of the group values.
206+
SeriesGroupBy.max : Return the max of the group values.
207+
DataFrameGroupBy.max : Return the max of the group values.
208+
SeriesGroupBy.sum : Return the sum of the group values.
209+
DataFrameGroupBy.sum : Return the sum of the group values.
210+
202211
Examples
203212
--------
204213
{example}

pandas/io/parsers/readers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1648,7 +1648,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = T
16481648
if keep_default_na:
16491649
v = set(v) | STR_NA_VALUES
16501650

1651-
na_values[k] = v
1651+
na_values[k] = _stringify_na_values(v, floatify)
16521652
na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
16531653
else:
16541654
if not is_list_like(na_values):

pandas/tests/arrays/string_/test_string.py

+36-5
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ def dtype(string_dtype_arguments):
3030
return pd.StringDtype(storage=storage, na_value=na_value)
3131

3232

33+
@pytest.fixture
34+
def dtype2(string_dtype_arguments2):
35+
storage, na_value = string_dtype_arguments2
36+
return pd.StringDtype(storage=storage, na_value=na_value)
37+
38+
3339
@pytest.fixture
3440
def cls(dtype):
3541
"""Fixture giving array type from parametrized 'dtype'"""
@@ -662,11 +668,7 @@ def test_isin(dtype, fixed_now_ts):
662668
tm.assert_series_equal(result, expected)
663669

664670
result = s.isin(["a", pd.NA])
665-
if dtype.storage == "python" and dtype.na_value is np.nan:
666-
# TODO(infer_string) we should make this consistent
667-
expected = pd.Series([True, False, False])
668-
else:
669-
expected = pd.Series([True, False, True])
671+
expected = pd.Series([True, False, True])
670672
tm.assert_series_equal(result, expected)
671673

672674
result = s.isin([])
@@ -677,6 +679,35 @@ def test_isin(dtype, fixed_now_ts):
677679
expected = pd.Series([True, False, False])
678680
tm.assert_series_equal(result, expected)
679681

682+
result = s.isin([fixed_now_ts])
683+
expected = pd.Series([False, False, False])
684+
tm.assert_series_equal(result, expected)
685+
686+
687+
def test_isin_string_array(dtype, dtype2):
688+
s = pd.Series(["a", "b", None], dtype=dtype)
689+
690+
result = s.isin(pd.array(["a", "c"], dtype=dtype2))
691+
expected = pd.Series([True, False, False])
692+
tm.assert_series_equal(result, expected)
693+
694+
result = s.isin(pd.array(["a", None], dtype=dtype2))
695+
expected = pd.Series([True, False, True])
696+
tm.assert_series_equal(result, expected)
697+
698+
699+
def test_isin_arrow_string_array(dtype):
700+
pa = pytest.importorskip("pyarrow")
701+
s = pd.Series(["a", "b", None], dtype=dtype)
702+
703+
result = s.isin(pd.array(["a", "c"], dtype=pd.ArrowDtype(pa.string())))
704+
expected = pd.Series([True, False, False])
705+
tm.assert_series_equal(result, expected)
706+
707+
result = s.isin(pd.array(["a", None], dtype=pd.ArrowDtype(pa.string())))
708+
expected = pd.Series([True, False, True])
709+
tm.assert_series_equal(result, expected)
710+
680711

681712
def test_setitem_scalar_with_mask_validation(dtype):
682713
# https://github.com/pandas-dev/pandas/issues/47628

pandas/tests/dtypes/test_missing.py

+1-11
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from contextlib import nullcontext
21
from datetime import datetime
32
from decimal import Decimal
43

@@ -7,7 +6,6 @@
76

87
from pandas._libs import missing as libmissing
98
from pandas._libs.tslibs import iNaT
10-
from pandas.compat.numpy import np_version_gte1p25
119

1210
from pandas.core.dtypes.common import (
1311
is_float,
@@ -458,15 +456,7 @@ def test_array_equivalent_dti(dtype_equal):
458456
)
459457
def test_array_equivalent_series(val):
460458
arr = np.array([1, 2])
461-
msg = "elementwise comparison failed"
462-
cm = (
463-
# stacklevel is chosen to make sense when called from .equals
464-
tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False)
465-
if isinstance(val, str) and not np_version_gte1p25
466-
else nullcontext()
467-
)
468-
with cm:
469-
assert not array_equivalent(Series([arr, arr]), Series([arr, val]))
459+
assert not array_equivalent(Series([arr, arr]), Series([arr, val]))
470460

471461

472462
def test_array_equivalent_array_mismatched_shape():

pandas/tests/frame/methods/test_rank.py

+4-19
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,11 @@
66
import numpy as np
77
import pytest
88

9-
from pandas._config import using_string_dtype
10-
119
from pandas._libs.algos import (
1210
Infinity,
1311
NegInfinity,
1412
)
15-
from pandas.compat import HAS_PYARROW
1613

17-
import pandas as pd
1814
from pandas import (
1915
DataFrame,
2016
Index,
@@ -467,23 +463,10 @@ def test_rank_inf_nans_na_option(
467463
("top", False, [2.0, 3.0, 1.0, 4.0]),
468464
],
469465
)
470-
def test_rank_object_first(
471-
self,
472-
request,
473-
frame_or_series,
474-
na_option,
475-
ascending,
476-
expected,
477-
using_infer_string,
478-
):
466+
def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
479467
obj = frame_or_series(["foo", "foo", None, "foo"])
480-
if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series):
481-
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
482-
483468
result = obj.rank(method="first", na_option=na_option, ascending=ascending)
484469
expected = frame_or_series(expected)
485-
if using_infer_string and isinstance(obj, Series):
486-
expected = expected.astype("uint64")
487470
tm.assert_equal(result, expected)
488471

489472
@pytest.mark.parametrize(
@@ -507,7 +490,9 @@ def test_rank_string_dtype(self, string_dtype_no_object):
507490
# GH#55362
508491
obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
509492
result = obj.rank(method="first")
510-
exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64"
493+
exp_dtype = (
494+
"Float64" if string_dtype_no_object == "string[pyarrow]" else "float64"
495+
)
511496
if string_dtype_no_object.storage == "python":
512497
# TODO nullable string[python] should also return nullable Float64
513498
exp_dtype = "float64"

pandas/tests/io/parser/test_na_values.py

+18
Original file line numberDiff line numberDiff line change
@@ -812,3 +812,21 @@ def test_bool_and_nan_to_float(all_parsers):
812812
result = parser.read_csv(StringIO(data), dtype="float")
813813
expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
814814
tm.assert_frame_equal(result, expected)
815+
816+
817+
@xfail_pyarrow
818+
@pytest.mark.parametrize(
819+
"na_values",
820+
[[-99.0, -99], [-99, -99.0]],
821+
)
822+
def test_na_values_dict_without_dtype(all_parsers, na_values):
823+
parser = all_parsers
824+
data = """A
825+
-99
826+
-99
827+
-99.0
828+
-99.0"""
829+
830+
result = parser.read_csv(StringIO(data), na_values=na_values)
831+
expected = DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]})
832+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)