Skip to content

Commit 8aa7a96

Browse files
authored
Adjust tests in array folder for new string option (#56188)
* Adjust tests in array directory for new string option
* BUG: value_counts not preserving object dtype
* Adjust tests in array folder for new string option
* Fixup
* Fix
* Fix
* Revert "BUG: value_counts not preserving object dtype"

  This reverts commit f570a4f
1 parent f0b61c5 commit 8aa7a96

12 files changed

+129
-47
lines changed

pandas/core/algorithms.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -932,7 +932,10 @@ def value_counts_internal(
932932
idx = Index(keys)
933933
if idx.dtype == bool and keys.dtype == object:
934934
idx = idx.astype(object)
935-
elif idx.dtype != keys.dtype:
935+
elif (
936+
idx.dtype != keys.dtype # noqa: PLR1714 # # pylint: disable=R1714
937+
and idx.dtype != "string[pyarrow_numpy]"
938+
):
936939
warnings.warn(
937940
# GH#56161
938941
"The behavior of value_counts with object-dtype is deprecated. "

pandas/tests/arrays/boolean/test_arithmetic.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,16 @@ def test_op_int8(left_array, right_array, opname):
9090
# -----------------------------------------------------------------------------
9191

9292

93-
def test_error_invalid_values(data, all_arithmetic_operators):
93+
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
9494
# invalid ops
9595

96+
if using_infer_string:
97+
import pyarrow as pa
98+
99+
err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
100+
else:
101+
err = TypeError
102+
96103
op = all_arithmetic_operators
97104
s = pd.Series(data)
98105
ops = getattr(s, op)
@@ -110,9 +117,10 @@ def test_error_invalid_values(data, all_arithmetic_operators):
110117
[
111118
r"unsupported operand type\(s\) for",
112119
"Concatenation operation is not implemented for NumPy arrays",
120+
"has no kernel",
113121
]
114122
)
115-
with pytest.raises(TypeError, match=msg):
123+
with pytest.raises(err, match=msg):
116124
ops(pd.Timestamp("20180101"))
117125

118126
# invalid array-likes
@@ -123,7 +131,9 @@ def test_error_invalid_values(data, all_arithmetic_operators):
123131
r"unsupported operand type\(s\) for",
124132
"can only concatenate str",
125133
"not all arguments converted during string formatting",
134+
"has no kernel",
135+
"not implemented",
126136
]
127137
)
128-
with pytest.raises(TypeError, match=msg):
138+
with pytest.raises(err, match=msg):
129139
ops(pd.Series("foo", index=s.index))

pandas/tests/arrays/categorical/test_astype.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def test_astype(self, ordered):
8989
expected = np.array(cat)
9090
tm.assert_numpy_array_equal(result, expected)
9191

92-
msg = r"Cannot cast object dtype to float64"
92+
msg = r"Cannot cast object|string dtype to float64"
9393
with pytest.raises(ValueError, match=msg):
9494
cat.astype(float)
9595

pandas/tests/arrays/categorical/test_constructors.py

+3
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import numpy as np
77
import pytest
88

9+
from pandas._config import using_pyarrow_string_dtype
10+
911
from pandas.core.dtypes.common import (
1012
is_float_dtype,
1113
is_integer_dtype,
@@ -447,6 +449,7 @@ def test_constructor_str_unknown(self):
447449
with pytest.raises(ValueError, match="Unknown dtype"):
448450
Categorical([1, 2], dtype="foo")
449451

452+
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings")
450453
def test_constructor_np_strs(self):
451454
# GH#31499 Hashtable.map_locations needs to work on np.str_ objects
452455
cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])

pandas/tests/arrays/categorical/test_operators.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def test_comparisons(self, factor):
9292
cat > cat_unordered
9393

9494
# comparison (in both directions) with Series will raise
95-
s = Series(["b", "b", "b"])
95+
s = Series(["b", "b", "b"], dtype=object)
9696
msg = (
9797
"Cannot compare a Categorical for op __gt__ with type "
9898
r"<class 'numpy\.ndarray'>"
@@ -108,7 +108,7 @@ def test_comparisons(self, factor):
108108

109109
# comparison with numpy.array will raise in both direction, but only on
110110
# newer numpy versions
111-
a = np.array(["b", "b", "b"])
111+
a = np.array(["b", "b", "b"], dtype=object)
112112
with pytest.raises(TypeError, match=msg):
113113
cat > a
114114
with pytest.raises(TypeError, match=msg):
@@ -248,7 +248,7 @@ def test_comparisons(self, data, reverse, base):
248248
cat_base = Series(
249249
Categorical(base, categories=cat.cat.categories, ordered=True)
250250
)
251-
s = Series(base)
251+
s = Series(base, dtype=object if base == list("bbb") else None)
252252
a = np.array(base)
253253

254254
# comparisons need to take categories ordering into account

pandas/tests/arrays/categorical/test_repr.py

+22-8
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
import numpy as np
2+
import pytest
3+
4+
from pandas._config import using_pyarrow_string_dtype
25

36
from pandas import (
47
Categorical,
58
CategoricalDtype,
69
CategoricalIndex,
10+
Index,
711
Series,
812
date_range,
913
option_context,
@@ -13,11 +17,17 @@
1317

1418

1519
class TestCategoricalReprWithFactor:
16-
def test_print(self, factor):
17-
expected = [
18-
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
19-
"Categories (3, object): ['a' < 'b' < 'c']",
20-
]
20+
def test_print(self, factor, using_infer_string):
21+
if using_infer_string:
22+
expected = [
23+
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
24+
"Categories (3, string): [a < b < c]",
25+
]
26+
else:
27+
expected = [
28+
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
29+
"Categories (3, object): ['a' < 'b' < 'c']",
30+
]
2131
expected = "\n".join(expected)
2232
actual = repr(factor)
2333
assert actual == expected
@@ -26,7 +36,7 @@ def test_print(self, factor):
2636
class TestCategoricalRepr:
2737
def test_big_print(self):
2838
codes = np.array([0, 1, 2, 0, 1, 2] * 100)
29-
dtype = CategoricalDtype(categories=["a", "b", "c"])
39+
dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object))
3040
factor = Categorical.from_codes(codes, dtype=dtype)
3141
expected = [
3242
"['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']",
@@ -40,13 +50,13 @@ def test_big_print(self):
4050
assert actual == expected
4151

4252
def test_empty_print(self):
43-
factor = Categorical([], ["a", "b", "c"])
53+
factor = Categorical([], Index(["a", "b", "c"], dtype=object))
4454
expected = "[], Categories (3, object): ['a', 'b', 'c']"
4555
actual = repr(factor)
4656
assert actual == expected
4757

4858
assert expected == actual
49-
factor = Categorical([], ["a", "b", "c"], ordered=True)
59+
factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True)
5060
expected = "[], Categories (3, object): ['a' < 'b' < 'c']"
5161
actual = repr(factor)
5262
assert expected == actual
@@ -66,6 +76,10 @@ def test_print_none_width(self):
6676
with option_context("display.width", None):
6777
assert exp == repr(a)
6878

79+
@pytest.mark.skipif(
80+
using_pyarrow_string_dtype(),
81+
reason="Change once infer_string is set to True by default",
82+
)
6983
def test_unicode_print(self):
7084
c = Categorical(["aaaaa", "bb", "cccc"] * 20)
7185
expected = """\

pandas/tests/arrays/floating/test_arithmetic.py

+16-5
Original file line numberDiff line numberDiff line change
@@ -122,11 +122,18 @@ def test_arith_zero_dim_ndarray(other):
122122
# -----------------------------------------------------------------------------
123123

124124

125-
def test_error_invalid_values(data, all_arithmetic_operators):
125+
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
126126
op = all_arithmetic_operators
127127
s = pd.Series(data)
128128
ops = getattr(s, op)
129129

130+
if using_infer_string:
131+
import pyarrow as pa
132+
133+
errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
134+
else:
135+
errs = TypeError
136+
130137
# invalid scalars
131138
msg = "|".join(
132139
[
@@ -140,15 +147,17 @@ def test_error_invalid_values(data, all_arithmetic_operators):
140147
"ufunc '.*' not supported for the input types, and the inputs could not",
141148
"ufunc '.*' did not contain a loop with signature matching types",
142149
"Concatenation operation is not implemented for NumPy arrays",
150+
"has no kernel",
151+
"not implemented",
143152
]
144153
)
145-
with pytest.raises(TypeError, match=msg):
154+
with pytest.raises(errs, match=msg):
146155
ops("foo")
147-
with pytest.raises(TypeError, match=msg):
156+
with pytest.raises(errs, match=msg):
148157
ops(pd.Timestamp("20180101"))
149158

150159
# invalid array-likes
151-
with pytest.raises(TypeError, match=msg):
160+
with pytest.raises(errs, match=msg):
152161
ops(pd.Series("foo", index=s.index))
153162

154163
msg = "|".join(
@@ -167,9 +176,11 @@ def test_error_invalid_values(data, all_arithmetic_operators):
167176
),
168177
r"ufunc 'add' cannot use operands with types dtype\('float\d{2}'\)",
169178
"cannot subtract DatetimeArray from ndarray",
179+
"has no kernel",
180+
"not implemented",
170181
]
171182
)
172-
with pytest.raises(TypeError, match=msg):
183+
with pytest.raises(errs, match=msg):
173184
ops(pd.Series(pd.date_range("20180101", periods=len(s))))
174185

175186

pandas/tests/arrays/integer/test_arithmetic.py

+24-9
Original file line numberDiff line numberDiff line change
@@ -172,11 +172,18 @@ def test_numpy_zero_dim_ndarray(other):
172172
# -----------------------------------------------------------------------------
173173

174174

175-
def test_error_invalid_values(data, all_arithmetic_operators):
175+
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
176176
op = all_arithmetic_operators
177177
s = pd.Series(data)
178178
ops = getattr(s, op)
179179

180+
if using_infer_string:
181+
import pyarrow as pa
182+
183+
errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
184+
else:
185+
errs = TypeError
186+
180187
# invalid scalars
181188
msg = "|".join(
182189
[
@@ -188,20 +195,26 @@ def test_error_invalid_values(data, all_arithmetic_operators):
188195
"ufunc '.*' not supported for the input types, and the inputs could not",
189196
"ufunc '.*' did not contain a loop with signature matching types",
190197
"Addition/subtraction of integers and integer-arrays with Timestamp",
198+
"has no kernel",
199+
"not implemented",
191200
]
192201
)
193-
with pytest.raises(TypeError, match=msg):
202+
with pytest.raises(errs, match=msg):
194203
ops("foo")
195-
with pytest.raises(TypeError, match=msg):
204+
with pytest.raises(errs, match=msg):
196205
ops(pd.Timestamp("20180101"))
197206

198207
# invalid array-likes
199208
str_ser = pd.Series("foo", index=s.index)
200209
# with pytest.raises(TypeError, match=msg):
201-
if all_arithmetic_operators in [
202-
"__mul__",
203-
"__rmul__",
204-
]: # (data[~data.isna()] >= 0).all():
210+
if (
211+
all_arithmetic_operators
212+
in [
213+
"__mul__",
214+
"__rmul__",
215+
]
216+
and not using_infer_string
217+
): # (data[~data.isna()] >= 0).all():
205218
res = ops(str_ser)
206219
expected = pd.Series(["foo" * x for x in data], index=s.index)
207220
expected = expected.fillna(np.nan)
@@ -210,7 +223,7 @@ def test_error_invalid_values(data, all_arithmetic_operators):
210223
# more-correct than np.nan here.
211224
tm.assert_series_equal(res, expected)
212225
else:
213-
with pytest.raises(TypeError, match=msg):
226+
with pytest.raises(errs, match=msg):
214227
ops(str_ser)
215228

216229
msg = "|".join(
@@ -223,9 +236,11 @@ def test_error_invalid_values(data, all_arithmetic_operators):
223236
r"can only concatenate str \(not \"int\"\) to str",
224237
"not all arguments converted during string",
225238
"cannot subtract DatetimeArray from ndarray",
239+
"has no kernel",
240+
"not implemented",
226241
]
227242
)
228-
with pytest.raises(TypeError, match=msg):
243+
with pytest.raises(errs, match=msg):
229244
ops(pd.Series(pd.date_range("20180101", periods=len(s))))
230245

231246

pandas/tests/arrays/integer/test_reduction.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,9 @@ def test_groupby_reductions(op, expected):
102102
["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")],
103103
],
104104
)
105-
def test_mixed_reductions(op, expected):
105+
def test_mixed_reductions(op, expected, using_infer_string):
106+
if op in ["any", "all"] and using_infer_string:
107+
expected = expected.astype("bool")
106108
df = DataFrame(
107109
{
108110
"A": ["a", "b", "b"],

pandas/tests/arrays/string_/test_string.py

+19-3
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ def test_mul(dtype):
191191
@pytest.mark.xfail(reason="GH-28527")
192192
def test_add_strings(dtype):
193193
arr = pd.array(["a", "b", "c", "d"], dtype=dtype)
194-
df = pd.DataFrame([["t", "y", "v", "w"]])
194+
df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object)
195195
assert arr.__add__(df) is NotImplemented
196196

197197
result = arr + df
@@ -498,10 +498,17 @@ def test_arrow_array(dtype):
498498

499499

500500
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
501-
def test_arrow_roundtrip(dtype, string_storage2):
501+
def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
502502
# roundtrip possible from arrow 1.0.0
503503
pa = pytest.importorskip("pyarrow")
504504

505+
if using_infer_string and string_storage2 != "pyarrow_numpy":
506+
request.applymarker(
507+
pytest.mark.xfail(
508+
reason="infer_string takes precedence over string storage"
509+
)
510+
)
511+
505512
data = pd.array(["a", "b", None], dtype=dtype)
506513
df = pd.DataFrame({"a": data})
507514
table = pa.table(df)
@@ -516,10 +523,19 @@ def test_arrow_roundtrip(dtype, string_storage2):
516523

517524

518525
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
519-
def test_arrow_load_from_zero_chunks(dtype, string_storage2):
526+
def test_arrow_load_from_zero_chunks(
527+
dtype, string_storage2, request, using_infer_string
528+
):
520529
# GH-41040
521530
pa = pytest.importorskip("pyarrow")
522531

532+
if using_infer_string and string_storage2 != "pyarrow_numpy":
533+
request.applymarker(
534+
pytest.mark.xfail(
535+
reason="infer_string takes precedence over string storage"
536+
)
537+
)
538+
523539
data = pd.array([], dtype=dtype)
524540
df = pd.DataFrame({"a": data})
525541
table = pa.table(df)

0 commit comments

Comments
 (0)