Skip to content

Commit ee3c18f

Browse files
TST (string dtype): resolve all xfails in IO parser tests (pandas-dev#60321)
1 parent fba5f08 commit ee3c18f

11 files changed

+49 -63 lines changed

pandas/tests/io/parser/common/test_chunksize.py

+7 -6
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
1311
from pandas._libs import parsers as libparsers
1412
from pandas.errors import DtypeWarning
1513

@@ -231,8 +229,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
231229
assert result.a.dtype == float
232230

233231

234-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
235-
def test_warn_if_chunks_have_mismatched_type(all_parsers):
232+
def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
236233
warning_type = None
237234
parser = all_parsers
238235
size = 10000
@@ -260,8 +257,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
260257
"Specify dtype option on import or set low_memory=False.",
261258
buf,
262259
)
263-
264-
assert df.a.dtype == object
260+
if parser.engine == "c" and parser.low_memory:
261+
assert df.a.dtype == object
262+
elif using_infer_string:
263+
assert df.a.dtype == "str"
264+
else:
265+
assert df.a.dtype == object
265266

266267

267268
@pytest.mark.parametrize("iterator", [True, False])

pandas/tests/io/parser/common/test_file_buffer_url.py

+2 -5
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,6 @@
1515
import numpy as np
1616
import pytest
1717

18-
from pandas._config import using_string_dtype
19-
2018
from pandas.compat import WASM
2119
from pandas.errors import (
2220
EmptyDataError,
@@ -71,14 +69,13 @@ def test_local_file(all_parsers, csv_dir_path):
7169
pytest.skip("Failing on: " + " ".join(platform.uname()))
7270

7371

74-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
7572
@xfail_pyarrow # AssertionError: DataFrame.index are different
7673
def test_path_path_lib(all_parsers):
7774
parser = all_parsers
7875
df = DataFrame(
7976
1.1 * np.arange(120).reshape((30, 4)),
80-
columns=Index(list("ABCD"), dtype=object),
81-
index=Index([f"i-{i}" for i in range(30)], dtype=object),
77+
columns=Index(list("ABCD")),
78+
index=Index([f"i-{i}" for i in range(30)]),
8279
)
8380
result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
8481
tm.assert_frame_equal(df, result)

pandas/tests/io/parser/common/test_index.py

+6 -4
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,6 @@
99

1010
import pytest
1111

12-
from pandas._config import using_string_dtype
13-
1412
from pandas import (
1513
DataFrame,
1614
Index,
@@ -88,9 +86,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
8886
tm.assert_frame_equal(result, expected)
8987

9088

91-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
9289
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
93-
def test_multi_index_no_level_names(all_parsers, index_col):
90+
def test_multi_index_no_level_names(
91+
request, all_parsers, index_col, using_infer_string
92+
):
93+
if using_infer_string and all_parsers.engine == "pyarrow":
94+
# result should have string columns instead of object dtype
95+
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
9496
data = """index1,index2,A,B,C,D
9597
foo,one,2,3,4,5
9698
foo,two,7,8,9,10

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

-4
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,6 @@
99
import numpy as np
1010
import pytest
1111

12-
from pandas._config import using_string_dtype
13-
1412
from pandas.errors import ParserWarning
1513

1614
import pandas as pd
@@ -57,7 +55,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
5755
tm.assert_frame_equal(result, expected)
5856

5957

60-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
6158
@pytest.mark.usefixtures("pyarrow_xfail")
6259
def test_dtype_per_column(all_parsers):
6360
parser = all_parsers
@@ -71,7 +68,6 @@ def test_dtype_per_column(all_parsers):
7168
[[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
7269
)
7370
expected["one"] = expected["one"].astype(np.float64)
74-
expected["two"] = expected["two"].astype(object)
7571

7672
result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
7773
tm.assert_frame_equal(result, expected)

pandas/tests/io/parser/test_c_parser_only.py

+7 -6
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,6 @@
1818
import numpy as np
1919
import pytest
2020

21-
from pandas._config import using_string_dtype
22-
2321
from pandas.compat import WASM
2422
from pandas.compat.numpy import np_version_gte1p24
2523
from pandas.errors import (
@@ -184,8 +182,7 @@ def error(val: float, actual_val: Decimal) -> Decimal:
184182
assert max(precise_errors) <= max(normal_errors)
185183

186184

187-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
188-
def test_usecols_dtypes(c_parser_only):
185+
def test_usecols_dtypes(c_parser_only, using_infer_string):
189186
parser = c_parser_only
190187
data = """\
191188
1,2,3
@@ -210,8 +207,12 @@ def test_usecols_dtypes(c_parser_only):
210207
dtype={"b": int, "c": float},
211208
)
212209

213-
assert (result.dtypes == [object, int, float]).all()
214-
assert (result2.dtypes == [object, float]).all()
210+
if using_infer_string:
211+
assert (result.dtypes == ["string", int, float]).all()
212+
assert (result2.dtypes == ["string", float]).all()
213+
else:
214+
assert (result.dtypes == [object, int, float]).all()
215+
assert (result2.dtypes == [object, float]).all()
215216

216217

217218
def test_disable_bool_parsing(c_parser_only):

pandas/tests/io/parser/test_converters.py

+1 -4
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,6 @@
99
import numpy as np
1010
import pytest
1111

12-
from pandas._config import using_string_dtype
13-
1412
import pandas as pd
1513
from pandas import (
1614
DataFrame,
@@ -188,7 +186,6 @@ def convert_score(x):
188186
tm.assert_frame_equal(results[0], results[1])
189187

190188

191-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
192189
@pytest.mark.parametrize("conv_f", [lambda x: x, str])
193190
def test_converter_index_col_bug(all_parsers, conv_f):
194191
# see gh-1835 , GH#40589
@@ -207,7 +204,7 @@ def test_converter_index_col_bug(all_parsers, conv_f):
207204
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
208205
)
209206

210-
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
207+
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
211208
tm.assert_frame_equal(rs, xp)
212209

213210

pandas/tests/io/parser/test_index_col.py

+1 -4
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,6 @@
99
import numpy as np
1010
import pytest
1111

12-
from pandas._config import using_string_dtype
13-
1412
from pandas import (
1513
DataFrame,
1614
Index,
@@ -345,7 +343,6 @@ def test_infer_types_boolean_sum(all_parsers):
345343
tm.assert_frame_equal(result, expected, check_index_type=False)
346344

347345

348-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
349346
@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
350347
def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
351348
# GH#9435
@@ -356,7 +353,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
356353
pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
357354
)
358355
result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
359-
expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
356+
expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
360357
tm.assert_frame_equal(result, expected)
361358

362359

pandas/tests/io/parser/test_mangle_dupes.py

+5 -5
Original file line number | Diff line number | Diff line change
@@ -8,9 +8,10 @@
88

99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
13-
from pandas import DataFrame
11+
from pandas import (
12+
DataFrame,
13+
Index,
14+
)
1415
import pandas._testing as tm
1516

1617
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@@ -121,7 +122,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
121122
parser.read_csv(StringIO(data), names=names)
122123

123124

124-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
125125
@xfail_pyarrow # AssertionError: DataFrame.columns are different
126126
def test_mangled_unnamed_placeholders(all_parsers):
127127
# xref gh-13017
@@ -133,7 +133,7 @@ def test_mangled_unnamed_placeholders(all_parsers):
133133

134134
# This test recursively updates `df`.
135135
for i in range(3):
136-
expected = DataFrame()
136+
expected = DataFrame(columns=Index([], dtype="str"))
137137

138138
for j in range(i + 1):
139139
col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)

pandas/tests/io/parser/test_na_values.py

+17 -14
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
1311
from pandas._libs.parsers import STR_NA_VALUES
1412

1513
from pandas import (
@@ -261,7 +259,6 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
261259
tm.assert_frame_equal(result, expected)
262260

263261

264-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
265262
@pytest.mark.parametrize(
266263
"kwargs,expected",
267264
[
@@ -299,7 +296,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
299296
),
300297
],
301298
)
302-
def test_na_values_keep_default(all_parsers, kwargs, expected, request):
299+
def test_na_values_keep_default(
300+
all_parsers, kwargs, expected, request, using_infer_string
301+
):
303302
data = """\
304303
A,B,C
305304
a,1,one
@@ -317,8 +316,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request):
317316
with pytest.raises(ValueError, match=msg):
318317
parser.read_csv(StringIO(data), **kwargs)
319318
return
320-
mark = pytest.mark.xfail()
321-
request.applymarker(mark)
319+
if not using_infer_string or "na_values" in kwargs:
320+
mark = pytest.mark.xfail()
321+
request.applymarker(mark)
322322

323323
result = parser.read_csv(StringIO(data), **kwargs)
324324
expected = DataFrame(expected)
@@ -429,23 +429,28 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v
429429
tm.assert_frame_equal(result, expected)
430430

431431

432-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
433-
@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case
434432
@pytest.mark.parametrize(
435433
"na_filter,row_data",
436434
[
437435
(True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
438436
(False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
439437
],
440438
)
441-
def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
439+
def test_na_values_na_filter_override(
440+
request, all_parsers, na_filter, row_data, using_infer_string
441+
):
442+
parser = all_parsers
443+
if parser.engine == "pyarrow":
444+
# mismatched dtypes in both cases, FutureWarning in the True case
445+
if not (using_infer_string and na_filter):
446+
mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
447+
request.applymarker(mark)
442448
data = """\
443449
A,B
444450
1,A
445451
nan,B
446452
3,C
447453
"""
448-
parser = all_parsers
449454
result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)
450455

451456
expected = DataFrame(row_data, columns=["A", "B"])
@@ -536,7 +541,6 @@ def test_na_values_dict_aliasing(all_parsers):
536541
tm.assert_dict_equal(na_values, na_values_copy)
537542

538543

539-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
540544
def test_na_values_dict_null_column_name(all_parsers):
541545
# see gh-57547
542546
parser = all_parsers
@@ -560,11 +564,10 @@ def test_na_values_dict_null_column_name(all_parsers):
560564
return
561565

562566
expected = DataFrame(
563-
{None: ["MA", "NA", "OA"], "x": [1.0, 2.0, np.nan], "y": [2.0, 1.0, 3.0]}
567+
{"x": [1.0, 2.0, np.nan], "y": [2.0, 1.0, 3.0]},
568+
index=Index(["MA", "NA", "OA"], dtype=object),
564569
)
565570

566-
expected = expected.set_index(None)
567-
568571
result = parser.read_csv(
569572
StringIO(data),
570573
index_col=0,

pandas/tests/io/parser/test_parse_dates.py

+3 -8
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,6 @@
1313
import numpy as np
1414
import pytest
1515

16-
from pandas._config import using_string_dtype
17-
1816
import pandas as pd
1917
from pandas import (
2018
DataFrame,
@@ -421,15 +419,14 @@ def test_parse_timezone(all_parsers):
421419
tm.assert_frame_equal(result, expected)
422420

423421

424-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
425422
@skip_pyarrow # pandas.errors.ParserError: CSV parse error
426423
@pytest.mark.parametrize(
427424
"date_string",
428425
["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"],
429426
)
430427
def test_invalid_parse_delimited_date(all_parsers, date_string):
431428
parser = all_parsers
432-
expected = DataFrame({0: [date_string]}, dtype="object")
429+
expected = DataFrame({0: [date_string]}, dtype="str")
433430
result = parser.read_csv(
434431
StringIO(date_string),
435432
header=None,
@@ -609,7 +606,6 @@ def test_date_parser_usecols_thousands(all_parsers):
609606
tm.assert_frame_equal(result, expected)
610607

611608

612-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
613609
def test_dayfirst_warnings():
614610
# GH 12585
615611

@@ -642,7 +638,7 @@ def test_dayfirst_warnings():
642638

643639
# first in DD/MM/YYYY, second in MM/DD/YYYY
644640
input = "date\n31/12/2014\n03/30/2011"
645-
expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date")
641+
expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date")
646642

647643
# A. use dayfirst=True
648644
res5 = read_csv(
@@ -752,7 +748,6 @@ def test_parse_dates_and_string_dtype(all_parsers):
752748
tm.assert_frame_equal(result, expected)
753749

754750

755-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
756751
def test_parse_dot_separated_dates(all_parsers):
757752
# https://github.com/pandas-dev/pandas/issues/2586
758753
parser = all_parsers
@@ -762,7 +757,7 @@ def test_parse_dot_separated_dates(all_parsers):
762757
if parser.engine == "pyarrow":
763758
expected_index = Index(
764759
["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"],
765-
dtype="object",
760+
dtype="str",
766761
name="a",
767762
)
768763
warn = None

pandas/tests/io/parser/test_upcast.py

-3
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
64
from pandas._libs.parsers import (
75
_maybe_upcast,
86
na_values,
@@ -86,7 +84,6 @@ def test_maybe_upcaste_all_nan():
8684
tm.assert_extension_array_equal(result, expected)
8785

8886

89-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
9087
@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
9188
def test_maybe_upcast_object(val, string_storage):
9289
# GH#36712

0 commit comments

Comments (0)