Skip to content

Commit ce4169a

Browse files
authored
Fix new string dtype tests for frame folder (#55409)
* Start fixing string tests
* BUG: interpolate raising wrong error for ea
* Fix more tests
* REGR: join segfaulting for arrow string with nulls
* Fix more tests
* Fix more tests
* BUG: rank raising for arrow string dtypes
* BUG: eq not implemented for categorical and arrow backed strings
* More tests
* BUG: ndim of string block incorrect with string inference
* Fix test
* Fix tests
* Fix tests
* Fix more indexing tests
* BUG: Index.insert raising when inserting None into new string dtype
* Fix tests
* BUG: Inserting ndim=0 array does not infer string dtype
* Fix tests
* Fix tests
* Fix more tests
* Fix more tests
* BUG: idxmax raising for arrow strings
* Fix
* Fix more tests
* Fix more tests
* Fix more tests
* Fix remaining tests
* Fix remaining tests
* Change default
* BUG: Groupby not keeping string dtype for empty objects
* Start fixing gb tests
* Fix tests
* Merge main
* Update config_init.py
* Fixup
* Update
1 parent: 71a3e3c; commit: ce4169a

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

45 files changed

+477
-170
lines changed

pandas/tests/frame/constructors/test_from_dict.py

+5
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,8 @@
33
import numpy as np
44
import pytest
55

6+
from pandas._config import using_pyarrow_string_dtype
7+
68
from pandas import (
79
DataFrame,
810
Index,
@@ -42,6 +44,9 @@ def test_constructor_single_row(self):
4244
)
4345
tm.assert_frame_equal(result, expected)
4446

47+
@pytest.mark.skipif(
48+
using_pyarrow_string_dtype(), reason="columns inferring logic broken"
49+
)
4550
def test_constructor_list_of_series(self):
4651
data = [
4752
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),

pandas/tests/frame/constructors/test_from_records.py

+5
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,8 @@
66
import pytest
77
import pytz
88

9+
from pandas._config import using_pyarrow_string_dtype
10+
911
from pandas.compat import is_platform_little_endian
1012

1113
from pandas import (
@@ -56,6 +58,9 @@ def test_from_records_with_datetimes(self):
5658
expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]")
5759
tm.assert_frame_equal(result, expected)
5860

61+
@pytest.mark.skipif(
62+
using_pyarrow_string_dtype(), reason="dtype checking logic doesn't work"
63+
)
5964
def test_from_records_sequencelike(self):
6065
df = DataFrame(
6166
{

pandas/tests/frame/indexing/test_getitem.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -103,7 +103,7 @@ def test_getitem_list_duplicates(self):
103103

104104
def test_getitem_dupe_cols(self):
105105
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
106-
msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\""
106+
msg = "\"None of [Index(['baf'], dtype="
107107
with pytest.raises(KeyError, match=re.escape(msg)):
108108
df[["baf"]]
109109

pandas/tests/frame/indexing/test_indexing.py

+25-10
Original file line number | Diff line number | Diff line change
@@ -288,7 +288,9 @@ def test_setattr_column(self):
288288
df.foobar = 5
289289
assert (df.foobar == 5).all()
290290

291-
def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write):
291+
def test_setitem(
292+
self, float_frame, using_copy_on_write, warn_copy_on_write, using_infer_string
293+
):
292294
# not sure what else to do here
293295
series = float_frame["A"][::2]
294296
float_frame["col5"] = series
@@ -331,7 +333,10 @@ def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write):
331333
with pytest.raises(SettingWithCopyError, match=msg):
332334
smaller["col10"] = ["1", "2"]
333335

334-
assert smaller["col10"].dtype == np.object_
336+
if using_infer_string:
337+
assert smaller["col10"].dtype == "string"
338+
else:
339+
assert smaller["col10"].dtype == np.object_
335340
assert (smaller["col10"] == ["1", "2"]).all()
336341

337342
def test_setitem2(self):
@@ -426,7 +431,7 @@ def test_setitem_cast(self, float_frame):
426431
float_frame["something"] = 2.5
427432
assert float_frame["something"].dtype == np.float64
428433

429-
def test_setitem_corner(self, float_frame):
434+
def test_setitem_corner(self, float_frame, using_infer_string):
430435
# corner case
431436
df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3))
432437
del df["B"]
@@ -463,10 +468,16 @@ def test_setitem_corner(self, float_frame):
463468
dm["foo"] = "bar"
464469
del dm["foo"]
465470
dm["foo"] = "bar"
466-
assert dm["foo"].dtype == np.object_
471+
if using_infer_string:
472+
assert dm["foo"].dtype == "string"
473+
else:
474+
assert dm["foo"].dtype == np.object_
467475

468476
dm["coercible"] = ["1", "2", "3"]
469-
assert dm["coercible"].dtype == np.object_
477+
if using_infer_string:
478+
assert dm["coercible"].dtype == "string"
479+
else:
480+
assert dm["coercible"].dtype == np.object_
470481

471482
def test_setitem_corner2(self):
472483
data = {
@@ -483,7 +494,7 @@ def test_setitem_corner2(self):
483494
assert df.loc[1, "title"] == "foobar"
484495
assert df.loc[1, "cruft"] == 0
485496

486-
def test_setitem_ambig(self):
497+
def test_setitem_ambig(self, using_infer_string):
487498
# Difficulties with mixed-type data
488499
# Created as float type
489500
dm = DataFrame(index=range(3), columns=range(3))
@@ -499,18 +510,22 @@ def test_setitem_ambig(self):
499510

500511
dm[2] = uncoercable_series
501512
assert len(dm.columns) == 3
502-
assert dm[2].dtype == np.object_
513+
if using_infer_string:
514+
assert dm[2].dtype == "string"
515+
else:
516+
assert dm[2].dtype == np.object_
503517

504-
def test_setitem_None(self, float_frame):
518+
def test_setitem_None(self, float_frame, using_infer_string):
505519
# GH #766
506520
float_frame[None] = float_frame["A"]
521+
key = None if not using_infer_string else np.nan
507522
tm.assert_series_equal(
508523
float_frame.iloc[:, -1], float_frame["A"], check_names=False
509524
)
510525
tm.assert_series_equal(
511-
float_frame.loc[:, None], float_frame["A"], check_names=False
526+
float_frame.loc[:, key], float_frame["A"], check_names=False
512527
)
513-
tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False)
528+
tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False)
514529

515530
def test_loc_setitem_boolean_mask_allfalse(self):
516531
# GH 9596

pandas/tests/frame/indexing/test_set_value.py

+5-3
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ def test_set_value(self, float_frame):
1616
float_frame._set_value(idx, col, 1)
1717
assert float_frame[col][idx] == 1
1818

19-
def test_set_value_resize(self, float_frame):
19+
def test_set_value_resize(self, float_frame, using_infer_string):
2020
res = float_frame._set_value("foobar", "B", 0)
2121
assert res is None
2222
assert float_frame.index[-1] == "foobar"
@@ -27,8 +27,10 @@ def test_set_value_resize(self, float_frame):
2727

2828
res = float_frame.copy()
2929
res._set_value("foobar", "baz", "sam")
30-
assert res["baz"].dtype == np.object_
31-
30+
if using_infer_string:
31+
assert res["baz"].dtype == "string"
32+
else:
33+
assert res["baz"].dtype == np.object_
3234
res = float_frame.copy()
3335
with tm.assert_produces_warning(
3436
FutureWarning, match="Setting an item of incompatible dtype"

pandas/tests/frame/indexing/test_setitem.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1319,7 +1319,7 @@ def test_setitem_column_frame_as_category(self):
13191319
df["col2"] = Series([1, 2, 3], dtype="category")
13201320

13211321
expected_types = Series(
1322-
["int64", "category", "category"], index=[0, "col1", "col2"]
1322+
["int64", "category", "category"], index=[0, "col1", "col2"], dtype=object
13231323
)
13241324
tm.assert_series_equal(df.dtypes, expected_types)
13251325

pandas/tests/frame/indexing/test_where.py

+5-1
Original file line number | Diff line number | Diff line change
@@ -1077,9 +1077,13 @@ def test_where_producing_ea_cond_for_np_dtype():
10771077
@pytest.mark.parametrize(
10781078
"replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)]
10791079
)
1080-
def test_where_int_overflow(replacement):
1080+
def test_where_int_overflow(replacement, using_infer_string, request):
10811081
# GH 31687
10821082
df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]])
1083+
if using_infer_string and replacement not in (None, "snake"):
1084+
request.node.add_marker(
1085+
pytest.mark.xfail(reason="Can't set non-string into string column")
1086+
)
10831087
result = df.where(pd.notnull(df), replacement)
10841088
expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]])
10851089

pandas/tests/frame/methods/test_align.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -107,7 +107,7 @@ def test_align_float(self, float_frame, using_copy_on_write):
107107
af, bf = float_frame.align(
108108
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None
109109
)
110-
tm.assert_index_equal(bf.index, Index([]))
110+
tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))
111111

112112
msg = (
113113
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
@@ -117,7 +117,7 @@ def test_align_float(self, float_frame, using_copy_on_write):
117117
af, bf = float_frame.align(
118118
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
119119
)
120-
tm.assert_index_equal(bf.index, Index([]))
120+
tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))
121121

122122
# Try to align DataFrame to Series along bad axis
123123
msg = "No axis named 2 for object type DataFrame"

pandas/tests/frame/methods/test_astype.py

+15-7
Original file line number | Diff line number | Diff line change
@@ -166,21 +166,22 @@ def test_astype_str(self):
166166
"c": [Timedelta(x)._repr_base() for x in c._values],
167167
"d": list(map(str, d._values)),
168168
"e": list(map(str, e._values)),
169-
}
169+
},
170+
dtype="object",
170171
)
171172

172173
tm.assert_frame_equal(result, expected)
173174

174175
def test_astype_str_float(self):
175176
# see GH#11302
176177
result = DataFrame([np.nan]).astype(str)
177-
expected = DataFrame(["nan"])
178+
expected = DataFrame(["nan"], dtype="object")
178179

179180
tm.assert_frame_equal(result, expected)
180181
result = DataFrame([1.12345678901234567890]).astype(str)
181182

182183
val = "1.1234567890123457"
183-
expected = DataFrame([val])
184+
expected = DataFrame([val], dtype="object")
184185
tm.assert_frame_equal(result, expected)
185186

186187
@pytest.mark.parametrize("dtype_class", [dict, Series])
@@ -199,7 +200,7 @@ def test_astype_dict_like(self, dtype_class):
199200
expected = DataFrame(
200201
{
201202
"a": a,
202-
"b": Series(["0", "1", "2", "3", "4"]),
203+
"b": Series(["0", "1", "2", "3", "4"], dtype="object"),
203204
"c": c,
204205
"d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
205206
}
@@ -282,7 +283,7 @@ def test_astype_duplicate_col_series_arg(self):
282283
result = df.astype(dtypes)
283284
expected = DataFrame(
284285
{
285-
0: vals[:, 0].astype(str),
286+
0: Series(vals[:, 0].astype(str), dtype=object),
286287
1: vals[:, 1],
287288
2: pd.array(vals[:, 2], dtype="Float64"),
288289
3: vals[:, 3],
@@ -620,6 +621,7 @@ def test_astype_arg_for_errors_dictlist(self):
620621
{"a": 2.2, "b": "15.3", "c": "another_test"},
621622
]
622623
)
624+
expected["c"] = expected["c"].astype("object")
623625
type_dict = {"a": "float64", "b": "float64", "c": "object"}
624626

625627
result = df.astype(dtype=type_dict, errors="ignore")
@@ -680,6 +682,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame):
680682
],
681683
],
682684
columns=timezone_frame.columns,
685+
dtype="object",
683686
)
684687
tm.assert_frame_equal(result, expected)
685688

@@ -754,7 +757,9 @@ def test_astype_tz_object_conversion(self, tz):
754757
result = result.astype({"tz": "datetime64[ns, Europe/London]"})
755758
tm.assert_frame_equal(result, expected)
756759

757-
def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
760+
def test_astype_dt64_to_string(
761+
self, frame_or_series, tz_naive_fixture, using_infer_string
762+
):
758763
# GH#41409
759764
tz = tz_naive_fixture
760765

@@ -772,7 +777,10 @@ def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
772777
item = result.iloc[0]
773778
if frame_or_series is DataFrame:
774779
item = item.iloc[0]
775-
assert item is pd.NA
780+
if using_infer_string:
781+
assert item is np.nan
782+
else:
783+
assert item is pd.NA
776784

777785
# For non-NA values, we should match what we get for non-EA str
778786
alt = obj.astype(str)

pandas/tests/frame/methods/test_combine_first.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ def test_combine_first_mixed(self):
3030
combined = f.combine_first(g)
3131
tm.assert_frame_equal(combined, exp)
3232

33-
def test_combine_first(self, float_frame):
33+
def test_combine_first(self, float_frame, using_infer_string):
3434
# disjoint
3535
head, tail = float_frame[:5], float_frame[5:]
3636

@@ -76,7 +76,9 @@ def test_combine_first(self, float_frame):
7676
tm.assert_series_equal(combined["A"].reindex(g.index), g["A"])
7777

7878
# corner cases
79-
comb = float_frame.combine_first(DataFrame())
79+
warning = FutureWarning if using_infer_string else None
80+
with tm.assert_produces_warning(warning, match="empty entries"):
81+
comb = float_frame.combine_first(DataFrame())
8082
tm.assert_frame_equal(comb, float_frame)
8183

8284
comb = DataFrame().combine_first(float_frame)

pandas/tests/frame/methods/test_convert_dtypes.py

+5-1
Original file line number | Diff line number | Diff line change
@@ -11,9 +11,13 @@ class TestConvertDtypes:
1111
@pytest.mark.parametrize(
1212
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
1313
)
14-
def test_convert_dtypes(self, convert_integer, expected, string_storage):
14+
def test_convert_dtypes(
15+
self, convert_integer, expected, string_storage, using_infer_string
16+
):
1517
# Specific types are tested in tests/series/test_dtypes.py
1618
# Just check that it works for DataFrame here
19+
if using_infer_string:
20+
string_storage = "pyarrow_numpy"
1721
df = pd.DataFrame(
1822
{
1923
"a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),

pandas/tests/frame/methods/test_cov_corr.py

+9-3
Original file line number | Diff line number | Diff line change
@@ -326,7 +326,7 @@ def test_corrwith(self, datetime_frame, dtype):
326326
for row in index[:4]:
327327
tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row]))
328328

329-
def test_corrwith_with_objects(self):
329+
def test_corrwith_with_objects(self, using_infer_string):
330330
df1 = DataFrame(
331331
np.random.default_rng(2).standard_normal((10, 4)),
332332
columns=Index(list("ABCD"), dtype=object),
@@ -338,8 +338,14 @@ def test_corrwith_with_objects(self):
338338
df1["obj"] = "foo"
339339
df2["obj"] = "bar"
340340

341-
with pytest.raises(TypeError, match="Could not convert"):
342-
df1.corrwith(df2)
341+
if using_infer_string:
342+
import pyarrow as pa
343+
344+
with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"):
345+
df1.corrwith(df2)
346+
else:
347+
with pytest.raises(TypeError, match="Could not convert"):
348+
df1.corrwith(df2)
343349
result = df1.corrwith(df2, numeric_only=True)
344350
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
345351
tm.assert_series_equal(result, expected)

pandas/tests/frame/methods/test_drop.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -510,7 +510,7 @@ def test_drop_with_duplicate_columns2(self):
510510

511511
def test_drop_inplace_no_leftover_column_reference(self):
512512
# GH 13934
513-
df = DataFrame({"a": [1, 2, 3]})
513+
df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object"))
514514
a = df.a
515515
df.drop(["a"], axis=1, inplace=True)
516516
tm.assert_index_equal(df.columns, Index([], dtype="object"))

pandas/tests/frame/methods/test_drop_duplicates.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
def test_drop_duplicates_with_misspelled_column_name(subset):
1717
# GH 19730
1818
df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
19-
msg = re.escape("Index(['a'], dtype='object')")
19+
msg = re.escape("Index(['a'], dtype=")
2020

2121
with pytest.raises(KeyError, match=msg):
2222
df.drop_duplicates(subset)

pandas/tests/frame/methods/test_dtypes.py

+5-2
Original file line number | Diff line number | Diff line change
@@ -142,9 +142,12 @@ def test_dtypes_timedeltas(self):
142142
)
143143
tm.assert_series_equal(result, expected)
144144

145-
def test_frame_apply_np_array_return_type(self):
145+
def test_frame_apply_np_array_return_type(self, using_infer_string):
146146
# GH 35517
147147
df = DataFrame([["foo"]])
148148
result = df.apply(lambda col: np.array("bar"))
149-
expected = Series(["bar"])
149+
if using_infer_string:
150+
expected = Series([np.array(["bar"])])
151+
else:
152+
expected = Series(["bar"])
150153
tm.assert_series_equal(result, expected)

pandas/tests/frame/methods/test_duplicated.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
def test_duplicated_with_misspelled_column_name(subset):
1717
# GH 19730
1818
df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
19-
msg = re.escape("Index(['a'], dtype='object')")
19+
msg = re.escape("Index(['a'], dtype=")
2020

2121
with pytest.raises(KeyError, match=msg):
2222
df.duplicated(subset)

pandas/tests/frame/methods/test_equals.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -14,11 +14,11 @@ def test_dataframe_not_equal(self):
1414
df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]})
1515
assert df1.equals(df2) is False
1616

17-
def test_equals_different_blocks(self, using_array_manager):
17+
def test_equals_different_blocks(self, using_array_manager, using_infer_string):
1818
# GH#9330
1919
df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]})
2020
df1 = df0.reset_index()[["A", "B", "C"]]
21-
if not using_array_manager:
21+
if not using_array_manager and not using_infer_string:
2222
# this assert verifies that the above operations have
2323
# induced a block rearrangement
2424
assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype

0 commit comments

Comments
 (0)