Skip to content

Fix new string dtype tests for frame folder #55409

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 48 commits into from
Dec 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
5dd7d93
Start fixing string tests
phofl Oct 1, 2023
9320144
BUG: interpolate raising wrong error for ea
phofl Oct 1, 2023
3a6db3d
Merge branch 'ea_interpolate' into string_dtype_tests
phofl Oct 1, 2023
be20fb2
Fix more tests
phofl Oct 1, 2023
2a3af77
REGR: join segfaulting for arrow string with nulls
phofl Oct 1, 2023
31dd1f6
Merge branch 'join_index' into string_dtype_tests
phofl Oct 1, 2023
f48d384
Fix more tests
phofl Oct 1, 2023
1cbeced
Fix more tests
phofl Oct 1, 2023
9dddb80
BUG: rank raising for arrow string dtypes
phofl Oct 2, 2023
e07f639
BUG: eq not implemented for categorical and arrow backed strings
phofl Oct 2, 2023
4c074c1
More tests
phofl Oct 2, 2023
bf4b3ca
BUG: ndim of string block incorrect with string inference
phofl Oct 2, 2023
21a39ca
Merge branch 'rank_string' into string_dtype_tests
phofl Oct 2, 2023
314be85
Merge branch 'cat_eq_string' into string_dtype_tests
phofl Oct 2, 2023
f2016ec
Merge branch 'string_dtype_block_dim' into string_dtype_tests
phofl Oct 2, 2023
3b5974d
Fix test
phofl Oct 2, 2023
df81cc0
Fix tests
phofl Oct 2, 2023
74e09e4
Fix tests
phofl Oct 2, 2023
255267f
Fix more indexing tests
phofl Oct 2, 2023
a623421
BUG: Index.insert raising when inserting None into new string dtype
phofl Oct 2, 2023
1a8a897
Merge branch 'index_insert' into string_dtype_tests
phofl Oct 2, 2023
3cf79ef
Fix tests
phofl Oct 2, 2023
e823c97
BUG: Inserting ndim=0 array does not infer string dtype
phofl Oct 2, 2023
a9efc05
Merge branch 'indexing_ndim_0' into string_dtype_tests
phofl Oct 2, 2023
c89da87
Fix tests
phofl Oct 2, 2023
04f3d9d
Fix tests
phofl Oct 2, 2023
28f5411
Fix more tests
phofl Oct 2, 2023
ca3b8fe
Merge remote-tracking branch 'upstream/main' into string_dtype_tests
phofl Oct 3, 2023
ca296ec
Fix more tests
phofl Oct 3, 2023
ab35982
BUG: idxmax raising for arrow strings
phofl Oct 3, 2023
d910efa
Fix
phofl Oct 3, 2023
52ce001
Merge branch 'idxmax_string' into string_dtype_tests
phofl Oct 3, 2023
d0221e3
Fix more tests
phofl Oct 3, 2023
18d5e62
Fix more tests
phofl Oct 3, 2023
130eeb3
Fix more tests
phofl Oct 4, 2023
1e7b93e
Fix remaining tests
phofl Oct 4, 2023
ccabdb3
Merge remote-tracking branch 'upstream/main' into string_dtype_tests
phofl Oct 4, 2023
6e55ce2
Fix remaining tests
phofl Oct 4, 2023
7048361
Change default
phofl Oct 4, 2023
0cb459c
BUG: Groupby not keeping string dtype for empty objects
phofl Oct 21, 2023
aff4f17
Start fixing gb tests
phofl Oct 21, 2023
ff7f00d
Merge remote-tracking branch 'origin/string_dtype_groupby_len_zero' into string_dtype_tests
phofl Oct 21, 2023
620555e
Fix tests
phofl Oct 21, 2023
05f1c95
Merge main
phofl Dec 8, 2023
e58e8b3
Merge remote-tracking branch 'upstream/main' into string_dtype_tests
phofl Dec 8, 2023
c7ba717
Update config_init.py
phofl Dec 8, 2023
15fb683
Fixup
phofl Dec 8, 2023
fcc8245
Update
phofl Dec 8, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pandas/tests/frame/constructors/test_from_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import numpy as np
import pytest

from pandas._config import using_pyarrow_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -42,6 +44,9 @@ def test_constructor_single_row(self):
)
tm.assert_frame_equal(result, expected)

@pytest.mark.skipif(
using_pyarrow_string_dtype(), reason="columns inferring logic broken"
)
def test_constructor_list_of_series(self):
data = [
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/frame/constructors/test_from_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import pytest
import pytz

from pandas._config import using_pyarrow_string_dtype

from pandas.compat import is_platform_little_endian

from pandas import (
Expand Down Expand Up @@ -56,6 +58,9 @@ def test_from_records_with_datetimes(self):
expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]")
tm.assert_frame_equal(result, expected)

@pytest.mark.skipif(
using_pyarrow_string_dtype(), reason="dtype checking logic doesn't work"
)
def test_from_records_sequencelike(self):
df = DataFrame(
{
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def test_getitem_list_duplicates(self):

def test_getitem_dupe_cols(self):
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\""
msg = "\"None of [Index(['baf'], dtype="
with pytest.raises(KeyError, match=re.escape(msg)):
df[["baf"]]

Expand Down
35 changes: 25 additions & 10 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,9 @@ def test_setattr_column(self):
df.foobar = 5
assert (df.foobar == 5).all()

def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write):
def test_setitem(
self, float_frame, using_copy_on_write, warn_copy_on_write, using_infer_string
):
# not sure what else to do here
series = float_frame["A"][::2]
float_frame["col5"] = series
Expand Down Expand Up @@ -331,7 +333,10 @@ def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write):
with pytest.raises(SettingWithCopyError, match=msg):
smaller["col10"] = ["1", "2"]

assert smaller["col10"].dtype == np.object_
if using_infer_string:
assert smaller["col10"].dtype == "string"
else:
assert smaller["col10"].dtype == np.object_
assert (smaller["col10"] == ["1", "2"]).all()

def test_setitem2(self):
Expand Down Expand Up @@ -426,7 +431,7 @@ def test_setitem_cast(self, float_frame):
float_frame["something"] = 2.5
assert float_frame["something"].dtype == np.float64

def test_setitem_corner(self, float_frame):
def test_setitem_corner(self, float_frame, using_infer_string):
# corner case
df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3))
del df["B"]
Expand Down Expand Up @@ -463,10 +468,16 @@ def test_setitem_corner(self, float_frame):
dm["foo"] = "bar"
del dm["foo"]
dm["foo"] = "bar"
assert dm["foo"].dtype == np.object_
if using_infer_string:
assert dm["foo"].dtype == "string"
else:
assert dm["foo"].dtype == np.object_

dm["coercible"] = ["1", "2", "3"]
assert dm["coercible"].dtype == np.object_
if using_infer_string:
assert dm["coercible"].dtype == "string"
else:
assert dm["coercible"].dtype == np.object_

def test_setitem_corner2(self):
data = {
Expand All @@ -483,7 +494,7 @@ def test_setitem_corner2(self):
assert df.loc[1, "title"] == "foobar"
assert df.loc[1, "cruft"] == 0

def test_setitem_ambig(self):
def test_setitem_ambig(self, using_infer_string):
# Difficulties with mixed-type data
# Created as float type
dm = DataFrame(index=range(3), columns=range(3))
Expand All @@ -499,18 +510,22 @@ def test_setitem_ambig(self):

dm[2] = uncoercable_series
assert len(dm.columns) == 3
assert dm[2].dtype == np.object_
if using_infer_string:
assert dm[2].dtype == "string"
else:
assert dm[2].dtype == np.object_

def test_setitem_None(self, float_frame):
def test_setitem_None(self, float_frame, using_infer_string):
# GH #766
float_frame[None] = float_frame["A"]
key = None if not using_infer_string else np.nan
tm.assert_series_equal(
float_frame.iloc[:, -1], float_frame["A"], check_names=False
)
tm.assert_series_equal(
float_frame.loc[:, None], float_frame["A"], check_names=False
float_frame.loc[:, key], float_frame["A"], check_names=False
)
tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False)
tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False)

def test_loc_setitem_boolean_mask_allfalse(self):
# GH 9596
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/frame/indexing/test_set_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def test_set_value(self, float_frame):
float_frame._set_value(idx, col, 1)
assert float_frame[col][idx] == 1

def test_set_value_resize(self, float_frame):
def test_set_value_resize(self, float_frame, using_infer_string):
res = float_frame._set_value("foobar", "B", 0)
assert res is None
assert float_frame.index[-1] == "foobar"
Expand All @@ -27,8 +27,10 @@ def test_set_value_resize(self, float_frame):

res = float_frame.copy()
res._set_value("foobar", "baz", "sam")
assert res["baz"].dtype == np.object_

if using_infer_string:
assert res["baz"].dtype == "string"
else:
assert res["baz"].dtype == np.object_
res = float_frame.copy()
with tm.assert_produces_warning(
FutureWarning, match="Setting an item of incompatible dtype"
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -1319,7 +1319,7 @@ def test_setitem_column_frame_as_category(self):
df["col2"] = Series([1, 2, 3], dtype="category")

expected_types = Series(
["int64", "category", "category"], index=[0, "col1", "col2"]
["int64", "category", "category"], index=[0, "col1", "col2"], dtype=object
)
tm.assert_series_equal(df.dtypes, expected_types)

Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/frame/indexing/test_where.py
Original file line number Diff line number Diff line change
Expand Up @@ -1077,9 +1077,13 @@ def test_where_producing_ea_cond_for_np_dtype():
@pytest.mark.parametrize(
"replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)]
)
def test_where_int_overflow(replacement):
def test_where_int_overflow(replacement, using_infer_string, request):
# GH 31687
df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]])
if using_infer_string and replacement not in (None, "snake"):
request.node.add_marker(
pytest.mark.xfail(reason="Can't set non-string into string column")
)
result = df.where(pd.notnull(df), replacement)
expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]])

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def test_align_float(self, float_frame, using_copy_on_write):
af, bf = float_frame.align(
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None
)
tm.assert_index_equal(bf.index, Index([]))
tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))

msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
Expand All @@ -117,7 +117,7 @@ def test_align_float(self, float_frame, using_copy_on_write):
af, bf = float_frame.align(
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
)
tm.assert_index_equal(bf.index, Index([]))
tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))

# Try to align DataFrame to Series along bad axis
msg = "No axis named 2 for object type DataFrame"
Expand Down
22 changes: 15 additions & 7 deletions pandas/tests/frame/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,21 +166,22 @@ def test_astype_str(self):
"c": [Timedelta(x)._repr_base() for x in c._values],
"d": list(map(str, d._values)),
"e": list(map(str, e._values)),
}
},
dtype="object",
)

tm.assert_frame_equal(result, expected)

def test_astype_str_float(self):
# see GH#11302
result = DataFrame([np.nan]).astype(str)
expected = DataFrame(["nan"])
expected = DataFrame(["nan"], dtype="object")

tm.assert_frame_equal(result, expected)
result = DataFrame([1.12345678901234567890]).astype(str)

val = "1.1234567890123457"
expected = DataFrame([val])
expected = DataFrame([val], dtype="object")
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("dtype_class", [dict, Series])
Expand All @@ -199,7 +200,7 @@ def test_astype_dict_like(self, dtype_class):
expected = DataFrame(
{
"a": a,
"b": Series(["0", "1", "2", "3", "4"]),
"b": Series(["0", "1", "2", "3", "4"], dtype="object"),
"c": c,
"d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
}
Expand Down Expand Up @@ -282,7 +283,7 @@ def test_astype_duplicate_col_series_arg(self):
result = df.astype(dtypes)
expected = DataFrame(
{
0: vals[:, 0].astype(str),
0: Series(vals[:, 0].astype(str), dtype=object),
1: vals[:, 1],
2: pd.array(vals[:, 2], dtype="Float64"),
3: vals[:, 3],
Expand Down Expand Up @@ -620,6 +621,7 @@ def test_astype_arg_for_errors_dictlist(self):
{"a": 2.2, "b": "15.3", "c": "another_test"},
]
)
expected["c"] = expected["c"].astype("object")
type_dict = {"a": "float64", "b": "float64", "c": "object"}

result = df.astype(dtype=type_dict, errors="ignore")
Expand Down Expand Up @@ -680,6 +682,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame):
],
],
columns=timezone_frame.columns,
dtype="object",
)
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -754,7 +757,9 @@ def test_astype_tz_object_conversion(self, tz):
result = result.astype({"tz": "datetime64[ns, Europe/London]"})
tm.assert_frame_equal(result, expected)

def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
def test_astype_dt64_to_string(
self, frame_or_series, tz_naive_fixture, using_infer_string
):
# GH#41409
tz = tz_naive_fixture

Expand All @@ -772,7 +777,10 @@ def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
item = result.iloc[0]
if frame_or_series is DataFrame:
item = item.iloc[0]
assert item is pd.NA
if using_infer_string:
assert item is np.nan
else:
assert item is pd.NA

# For non-NA values, we should match what we get for non-EA str
alt = obj.astype(str)
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/frame/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def test_combine_first_mixed(self):
combined = f.combine_first(g)
tm.assert_frame_equal(combined, exp)

def test_combine_first(self, float_frame):
def test_combine_first(self, float_frame, using_infer_string):
# disjoint
head, tail = float_frame[:5], float_frame[5:]

Expand Down Expand Up @@ -76,7 +76,9 @@ def test_combine_first(self, float_frame):
tm.assert_series_equal(combined["A"].reindex(g.index), g["A"])

# corner cases
comb = float_frame.combine_first(DataFrame())
warning = FutureWarning if using_infer_string else None
with tm.assert_produces_warning(warning, match="empty entries"):
comb = float_frame.combine_first(DataFrame())
tm.assert_frame_equal(comb, float_frame)

comb = DataFrame().combine_first(float_frame)
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/frame/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,13 @@ class TestConvertDtypes:
@pytest.mark.parametrize(
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
)
def test_convert_dtypes(self, convert_integer, expected, string_storage):
def test_convert_dtypes(
self, convert_integer, expected, string_storage, using_infer_string
):
# Specific types are tested in tests/series/test_dtypes.py
# Just check that it works for DataFrame here
if using_infer_string:
string_storage = "pyarrow_numpy"
df = pd.DataFrame(
{
"a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
Expand Down
12 changes: 9 additions & 3 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ def test_corrwith(self, datetime_frame, dtype):
for row in index[:4]:
tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row]))

def test_corrwith_with_objects(self):
def test_corrwith_with_objects(self, using_infer_string):
df1 = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
Expand All @@ -338,8 +338,14 @@ def test_corrwith_with_objects(self):
df1["obj"] = "foo"
df2["obj"] = "bar"

with pytest.raises(TypeError, match="Could not convert"):
df1.corrwith(df2)
if using_infer_string:
import pyarrow as pa

with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"):
df1.corrwith(df2)
else:
with pytest.raises(TypeError, match="Could not convert"):
df1.corrwith(df2)
result = df1.corrwith(df2, numeric_only=True)
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
tm.assert_series_equal(result, expected)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,7 @@ def test_drop_with_duplicate_columns2(self):

def test_drop_inplace_no_leftover_column_reference(self):
# GH 13934
df = DataFrame({"a": [1, 2, 3]})
df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object"))
a = df.a
df.drop(["a"], axis=1, inplace=True)
tm.assert_index_equal(df.columns, Index([], dtype="object"))
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_drop_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def test_drop_duplicates_with_misspelled_column_name(subset):
# GH 19730
df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
msg = re.escape("Index(['a'], dtype='object')")
msg = re.escape("Index(['a'], dtype=")

with pytest.raises(KeyError, match=msg):
df.drop_duplicates(subset)
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/frame/methods/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,12 @@ def test_dtypes_timedeltas(self):
)
tm.assert_series_equal(result, expected)

def test_frame_apply_np_array_return_type(self):
def test_frame_apply_np_array_return_type(self, using_infer_string):
# GH 35517
df = DataFrame([["foo"]])
result = df.apply(lambda col: np.array("bar"))
expected = Series(["bar"])
if using_infer_string:
expected = Series([np.array(["bar"])])
else:
expected = Series(["bar"])
tm.assert_series_equal(result, expected)
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_duplicated.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def test_duplicated_with_misspelled_column_name(subset):
# GH 19730
df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
msg = re.escape("Index(['a'], dtype='object')")
msg = re.escape("Index(['a'], dtype=")

with pytest.raises(KeyError, match=msg):
df.duplicated(subset)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_equals.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ def test_dataframe_not_equal(self):
df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]})
assert df1.equals(df2) is False

def test_equals_different_blocks(self, using_array_manager):
def test_equals_different_blocks(self, using_array_manager, using_infer_string):
# GH#9330
df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]})
df1 = df0.reset_index()[["A", "B", "C"]]
if not using_array_manager:
if not using_array_manager and not using_infer_string:
# this assert verifies that the above operations have
# induced a block rearrangement
assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype
Expand Down
Loading