Skip to content

Commit 112c2e9

Browse files
WillAyd, jorisvandenbossche, and mroeschke
authored
Backport PR pandas-dev#60321: TST (string dtype): resolve all xfails in IO parser tests (pandas-dev#60330)
* Backport PR pandas-dev#60321: TST (string dtype): resolve all xfails in IO parser tests (cherry picked from commit ee3c18f)
* BUG: Avoid RangeIndex conversion in read_csv if dtype is specified (pandas-dev#59316)

Co-authored-by: Joris Van den Bossche <[email protected]>
Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 0bcd250 commit 112c2e9

12 files changed

+89
-71
lines changed

pandas/io/parsers/base_parser.py

+25-11
Original file line number | Diff line number | Diff line change
@@ -464,7 +464,11 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
464464
arrays = []
465465
converters = self._clean_mapping(self.converters)
466466

467-
for i, arr in enumerate(index):
467+
if self.index_names is not None:
468+
names: Iterable = self.index_names
469+
else:
470+
names = itertools.cycle([None])
471+
for i, (arr, name) in enumerate(zip(index, names)):
468472
if try_parse_dates and self._should_parse_dates(i):
469473
arr = self._date_conv(
470474
arr,
@@ -504,12 +508,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
504508
arr, _ = self._infer_types(
505509
arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
506510
)
507-
arrays.append(arr)
508-
509-
names = self.index_names
510-
index = ensure_index_from_sequences(arrays, names)
511+
if cast_type is not None:
512+
# Don't perform RangeIndex inference
513+
idx = Index(arr, name=name, dtype=cast_type)
514+
else:
515+
idx = ensure_index_from_sequences([arr], [name])
516+
arrays.append(idx)
511517

512-
return index
518+
if len(arrays) == 1:
519+
return arrays[0]
520+
else:
521+
return MultiIndex.from_arrays(arrays)
513522

514523
@final
515524
def _convert_to_ndarrays(
@@ -1084,12 +1093,11 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
10841093
dtype_dict: defaultdict[Hashable, Any]
10851094
if not is_dict_like(dtype):
10861095
# if dtype == None, default will be object.
1087-
default_dtype = dtype or object
1088-
dtype_dict = defaultdict(lambda: default_dtype)
1096+
dtype_dict = defaultdict(lambda: dtype)
10891097
else:
10901098
dtype = cast(dict, dtype)
10911099
dtype_dict = defaultdict(
1092-
lambda: object,
1100+
lambda: None,
10931101
{columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
10941102
)
10951103

@@ -1106,8 +1114,14 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None):
11061114
if (index_col is None or index_col is False) or index_names is None:
11071115
index = default_index(0)
11081116
else:
1109-
data = [Series([], dtype=dtype_dict[name]) for name in index_names]
1110-
index = ensure_index_from_sequences(data, names=index_names)
1117+
# TODO: We could return default_index(0) if dtype_dict[name] is None
1118+
data = [
1119+
Index([], name=name, dtype=dtype_dict[name]) for name in index_names
1120+
]
1121+
if len(data) == 1:
1122+
index = data[0]
1123+
else:
1124+
index = MultiIndex.from_arrays(data)
11111125
index_col.sort()
11121126

11131127
for i, n in enumerate(index_col):

pandas/tests/io/parser/common/test_chunksize.py

+7-6
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,6 @@
77
import numpy as np
88
import pytest
99

10-
from pandas._config import using_string_dtype
11-
1210
from pandas._libs import parsers as libparsers
1311
from pandas.errors import DtypeWarning
1412

@@ -230,8 +228,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
230228
assert result.a.dtype == float
231229

232230

233-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
234-
def test_warn_if_chunks_have_mismatched_type(all_parsers):
231+
def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
235232
warning_type = None
236233
parser = all_parsers
237234
size = 10000
@@ -259,8 +256,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
259256
"Specify dtype option on import or set low_memory=False.",
260257
buf,
261258
)
262-
263-
assert df.a.dtype == object
259+
if parser.engine == "c" and parser.low_memory:
260+
assert df.a.dtype == object
261+
elif using_infer_string:
262+
assert df.a.dtype == "str"
263+
else:
264+
assert df.a.dtype == object
264265

265266

266267
@pytest.mark.parametrize("iterator", [True, False])

pandas/tests/io/parser/common/test_file_buffer_url.py

+2-5
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,6 @@
1414
import numpy as np
1515
import pytest
1616

17-
from pandas._config import using_string_dtype
18-
1917
from pandas.errors import (
2018
EmptyDataError,
2119
ParserError,
@@ -69,14 +67,13 @@ def test_local_file(all_parsers, csv_dir_path):
6967
pytest.skip("Failing on: " + " ".join(platform.uname()))
7068

7169

72-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
7370
@xfail_pyarrow # AssertionError: DataFrame.index are different
7471
def test_path_path_lib(all_parsers):
7572
parser = all_parsers
7673
df = DataFrame(
7774
1.1 * np.arange(120).reshape((30, 4)),
78-
columns=Index(list("ABCD"), dtype=object),
79-
index=Index([f"i-{i}" for i in range(30)], dtype=object),
75+
columns=Index(list("ABCD")),
76+
index=Index([f"i-{i}" for i in range(30)]),
8077
)
8178
result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
8279
tm.assert_frame_equal(df, result)

pandas/tests/io/parser/common/test_index.py

+6-4
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@
88

99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
1311
from pandas import (
1412
DataFrame,
1513
Index,
@@ -87,9 +85,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
8785
tm.assert_frame_equal(result, expected)
8886

8987

90-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
9188
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
92-
def test_multi_index_no_level_names(all_parsers, index_col):
89+
def test_multi_index_no_level_names(
90+
request, all_parsers, index_col, using_infer_string
91+
):
92+
if using_infer_string and all_parsers.engine == "pyarrow":
93+
# result should have string columns instead of object dtype
94+
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
9395
data = """index1,index2,A,B,C,D
9496
foo,one,2,3,4,5
9597
foo,two,7,8,9,10

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

+17-5
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
1311
from pandas.errors import ParserWarning
1412

1513
import pandas as pd
@@ -24,6 +22,8 @@
2422
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
2523
)
2624

25+
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
26+
2727

2828
@pytest.mark.parametrize("dtype", [str, object])
2929
@pytest.mark.parametrize("check_orig", [True, False])
@@ -54,7 +54,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
5454
tm.assert_frame_equal(result, expected)
5555

5656

57-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
5857
@pytest.mark.usefixtures("pyarrow_xfail")
5958
def test_dtype_per_column(all_parsers):
6059
parser = all_parsers
@@ -68,7 +67,6 @@ def test_dtype_per_column(all_parsers):
6867
[[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
6968
)
7069
expected["one"] = expected["one"].astype(np.float64)
71-
expected["two"] = expected["two"].astype(object)
7270

7371
result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
7472
tm.assert_frame_equal(result, expected)
@@ -598,6 +596,7 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
598596
tm.assert_frame_equal(result, expected)
599597

600598

599+
@xfail_pyarrow
601600
def test_accurate_parsing_of_large_integers(all_parsers):
602601
# GH#52505
603602
data = """SYMBOL,MOMENT,ID,ID_DEAL
@@ -608,7 +607,7 @@ def test_accurate_parsing_of_large_integers(all_parsers):
608607
AMZN,20230301181139587,2023552585717889759,2023552585717263360
609608
MSFT,20230301181139587,2023552585717889863,2023552585717263361
610609
NVDA,20230301181139587,2023552585717889827,2023552585717263361"""
611-
orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
610+
orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()})
612611
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1
613612
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1
614613
assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2
@@ -630,3 +629,16 @@ def test_dtypes_with_usecols(all_parsers):
630629
values = ["1", "4"]
631630
expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]})
632631
tm.assert_frame_equal(result, expected)
632+
633+
634+
def test_index_col_with_dtype_no_rangeindex(all_parsers):
635+
data = StringIO("345.5,519.5,0\n519.5,726.5,1")
636+
result = all_parsers.read_csv(
637+
data,
638+
header=None,
639+
names=["start", "stop", "bin_id"],
640+
dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32},
641+
index_col="bin_id",
642+
).index
643+
expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
644+
tm.assert_index_equal(result, expected)

pandas/tests/io/parser/test_c_parser_only.py

+7-6
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,6 @@
1717
import numpy as np
1818
import pytest
1919

20-
from pandas._config import using_string_dtype
21-
2220
from pandas.compat.numpy import np_version_gte1p24
2321
from pandas.errors import (
2422
ParserError,
@@ -185,8 +183,7 @@ def error(val: float, actual_val: Decimal) -> Decimal:
185183
assert max(precise_errors) <= max(normal_errors)
186184

187185

188-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
189-
def test_usecols_dtypes(c_parser_only):
186+
def test_usecols_dtypes(c_parser_only, using_infer_string):
190187
parser = c_parser_only
191188
data = """\
192189
1,2,3
@@ -211,8 +208,12 @@ def test_usecols_dtypes(c_parser_only):
211208
dtype={"b": int, "c": float},
212209
)
213210

214-
assert (result.dtypes == [object, int, float]).all()
215-
assert (result2.dtypes == [object, float]).all()
211+
if using_infer_string:
212+
assert (result.dtypes == ["string", int, float]).all()
213+
assert (result2.dtypes == ["string", float]).all()
214+
else:
215+
assert (result.dtypes == [object, int, float]).all()
216+
assert (result2.dtypes == [object, float]).all()
216217

217218

218219
def test_disable_bool_parsing(c_parser_only):

pandas/tests/io/parser/test_converters.py

+1-4
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
1311
import pandas as pd
1412
from pandas import (
1513
DataFrame,
@@ -186,7 +184,6 @@ def convert_score(x):
186184
tm.assert_frame_equal(results[0], results[1])
187185

188186

189-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
190187
@pytest.mark.parametrize("conv_f", [lambda x: x, str])
191188
def test_converter_index_col_bug(all_parsers, conv_f):
192189
# see gh-1835 , GH#40589
@@ -205,7 +202,7 @@ def test_converter_index_col_bug(all_parsers, conv_f):
205202
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
206203
)
207204

208-
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
205+
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
209206
tm.assert_frame_equal(rs, xp)
210207

211208

pandas/tests/io/parser/test_index_col.py

+1-4
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
1311
from pandas import (
1412
DataFrame,
1513
Index,
@@ -344,7 +342,6 @@ def test_infer_types_boolean_sum(all_parsers):
344342
tm.assert_frame_equal(result, expected, check_index_type=False)
345343

346344

347-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
348345
@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
349346
def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
350347
# GH#9435
@@ -355,7 +352,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
355352
pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
356353
)
357354
result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
358-
expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
355+
expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
359356
tm.assert_frame_equal(result, expected)
360357

361358

pandas/tests/io/parser/test_mangle_dupes.py

+5-5
Original file line number | Diff line number | Diff line change
@@ -7,9 +7,10 @@
77

88
import pytest
99

10-
from pandas._config import using_string_dtype
11-
12-
from pandas import DataFrame
10+
from pandas import (
11+
DataFrame,
12+
Index,
13+
)
1314
import pandas._testing as tm
1415

1516
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@@ -120,7 +121,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
120121
parser.read_csv(StringIO(data), names=names)
121122

122123

123-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
124124
@xfail_pyarrow # AssertionError: DataFrame.columns are different
125125
def test_mangled_unnamed_placeholders(all_parsers):
126126
# xref gh-13017
@@ -132,7 +132,7 @@ def test_mangled_unnamed_placeholders(all_parsers):
132132

133133
# This test recursively updates `df`.
134134
for i in range(3):
135-
expected = DataFrame()
135+
expected = DataFrame(columns=Index([], dtype="str"))
136136

137137
for j in range(i + 1):
138138
col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)

0 commit comments

Comments
 (0)