Skip to content

Commit 634b940

Browse files
authored
BUG: read_csv with engine pyarrow parsing multiple date columns (#50056)
1 parent c871284 commit 634b940

File tree

4 files changed

+44
-20
lines changed

4 files changed

+44
-20
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,7 @@ I/O
388388
- :meth:`DataFrame.to_orc` now raising ``ValueError`` when a non-default :class:`Index` is given (:issue:`51828`)
389389
- :meth:`DataFrame.to_sql` now raising ``ValueError`` when the name param is left empty while using SQLAlchemy to connect (:issue:`52675`)
390390
- Bug in :func:`json_normalize` where metadata fields of list type could not be parsed (:issue:`37782`)
391+
- Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`)
391392
- Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`)
392393
- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
393394
- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)

pandas/io/parsers/arrow_parser_wrapper.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,21 @@ def _get_pyarrow_options(self) -> None:
5858
if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None:
5959
self.kwds[pyarrow_name] = self.kwds.pop(pandas_name)
6060

61+
# Date format handling
62+
# If we get a string, we need to convert it into a list for pyarrow
63+
# If we get a dict, we want to parse those separately
64+
date_format = self.date_format
65+
if isinstance(date_format, str):
66+
date_format = [date_format]
67+
else:
68+
# For any non-string value (e.g. a dict or None), we don't want to propagate it through, so
69+
# just set to pyarrow default of None
70+
71+
# Ideally, in future we disable pyarrow dtype inference (read in as string)
72+
# to prevent misreads.
73+
date_format = None
74+
self.kwds["timestamp_parsers"] = date_format
75+
6176
self.parse_options = {
6277
option_name: option_value
6378
for option_name, option_value in self.kwds.items()
@@ -76,6 +91,7 @@ def _get_pyarrow_options(self) -> None:
7691
"true_values",
7792
"false_values",
7893
"decimal_point",
94+
"timestamp_parsers",
7995
)
8096
}
8197
self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"]
@@ -116,7 +132,7 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
116132
multi_index_named = False
117133
frame.columns = self.names
118134
# we only need the frame not the names
119-
frame.columns, frame = self._do_date_conversions(frame.columns, frame)
135+
_, frame = self._do_date_conversions(frame.columns, frame)
120136
if self.index_col is not None:
121137
index_to_set = self.index_col.copy()
122138
for i, item in enumerate(self.index_col):

pandas/io/parsers/base_parser.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,10 @@
6161

6262
from pandas import (
6363
ArrowDtype,
64+
DataFrame,
6465
DatetimeIndex,
6566
StringDtype,
67+
concat,
6668
)
6769
from pandas.core import algorithms
6870
from pandas.core.arrays import (
@@ -92,8 +94,6 @@
9294
Scalar,
9395
)
9496

95-
from pandas import DataFrame
96-
9797

9898
class ParserBase:
9999
class BadLineHandleMethod(Enum):
@@ -1304,7 +1304,10 @@ def _isindex(colspec):
13041304
new_cols.append(new_name)
13051305
date_cols.update(old_names)
13061306

1307-
data_dict.update(new_data)
1307+
if isinstance(data_dict, DataFrame):
1308+
data_dict = concat([DataFrame(new_data), data_dict], axis=1, copy=False)
1309+
else:
1310+
data_dict.update(new_data)
13081311
new_cols.extend(columns)
13091312

13101313
if not keep_date_col:

pandas/tests/io/parser/test_parse_dates.py

+20-16
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,8 @@ def test_separator_date_conflict(all_parsers):
139139
tm.assert_frame_equal(df, expected)
140140

141141

142-
@xfail_pyarrow
143142
@pytest.mark.parametrize("keep_date_col", [True, False])
144-
def test_multiple_date_col_custom(all_parsers, keep_date_col):
143+
def test_multiple_date_col_custom(all_parsers, keep_date_col, request):
145144
data = """\
146145
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
147146
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
@@ -152,6 +151,14 @@ def test_multiple_date_col_custom(all_parsers, keep_date_col):
152151
"""
153152
parser = all_parsers
154153

154+
if keep_date_col and parser.engine == "pyarrow":
155+
# For this to pass, we need to disable auto-inference on the date columns
156+
# in parse_dates. We have no way of doing this, though.
157+
mark = pytest.mark.xfail(
158+
reason="pyarrow doesn't support disabling auto-inference on column numbers."
159+
)
160+
request.node.add_marker(mark)
161+
155162
def date_parser(*date_cols):
156163
"""
157164
Test date parser.
@@ -301,9 +308,8 @@ def test_concat_date_col_fail(container, dim):
301308
parsing.concat_date_cols(date_cols)
302309

303310

304-
@xfail_pyarrow
305311
@pytest.mark.parametrize("keep_date_col", [True, False])
306-
def test_multiple_date_col(all_parsers, keep_date_col):
312+
def test_multiple_date_col(all_parsers, keep_date_col, request):
307313
data = """\
308314
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
309315
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
@@ -313,6 +319,15 @@ def test_multiple_date_col(all_parsers, keep_date_col):
313319
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
314320
"""
315321
parser = all_parsers
322+
323+
if keep_date_col and parser.engine == "pyarrow":
324+
# For this to pass, we need to disable auto-inference on the date columns
325+
# in parse_dates. We have no way of doing this, though.
326+
mark = pytest.mark.xfail(
327+
reason="pyarrow doesn't support disabling auto-inference on column numbers."
328+
)
329+
request.node.add_marker(mark)
330+
316331
kwds = {
317332
"header": None,
318333
"parse_dates": [[1, 2], [1, 3]],
@@ -469,7 +484,6 @@ def test_date_col_as_index_col(all_parsers):
469484
tm.assert_frame_equal(result, expected)
470485

471486

472-
@xfail_pyarrow
473487
def test_multiple_date_cols_int_cast(all_parsers):
474488
data = (
475489
"KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
@@ -530,7 +544,6 @@ def test_multiple_date_cols_int_cast(all_parsers):
530544
tm.assert_frame_equal(result, expected)
531545

532546

533-
@xfail_pyarrow
534547
def test_multiple_date_col_timestamp_parse(all_parsers):
535548
parser = all_parsers
536549
data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
@@ -1173,7 +1186,6 @@ def test_multiple_date_cols_chunked(all_parsers):
11731186
tm.assert_frame_equal(chunks[2], expected[4:])
11741187

11751188

1176-
@xfail_pyarrow
11771189
def test_multiple_date_col_named_index_compat(all_parsers):
11781190
parser = all_parsers
11791191
data = """\
@@ -1197,7 +1209,6 @@ def test_multiple_date_col_named_index_compat(all_parsers):
11971209
tm.assert_frame_equal(with_indices, with_names)
11981210

11991211

1200-
@xfail_pyarrow
12011212
def test_multiple_date_col_multiple_index_compat(all_parsers):
12021213
parser = all_parsers
12031214
data = """\
@@ -1413,7 +1424,6 @@ def test_parse_date_time_multi_level_column_name(all_parsers):
14131424
tm.assert_frame_equal(result, expected)
14141425

14151426

1416-
@xfail_pyarrow
14171427
@pytest.mark.parametrize(
14181428
"data,kwargs,expected",
14191429
[
@@ -1503,9 +1513,6 @@ def test_parse_date_time(all_parsers, data, kwargs, expected):
15031513
tm.assert_frame_equal(result, expected)
15041514

15051515

1506-
@xfail_pyarrow
1507-
# From date_parser fallback behavior
1508-
@pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
15091516
def test_parse_date_fields(all_parsers):
15101517
parser = all_parsers
15111518
data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
@@ -1515,7 +1522,7 @@ def test_parse_date_fields(all_parsers):
15151522
StringIO(data),
15161523
header=0,
15171524
parse_dates={"ymd": [0, 1, 2]},
1518-
date_parser=pd.to_datetime,
1525+
date_parser=lambda x: x,
15191526
)
15201527

15211528
expected = DataFrame(
@@ -1525,7 +1532,6 @@ def test_parse_date_fields(all_parsers):
15251532
tm.assert_frame_equal(result, expected)
15261533

15271534

1528-
@xfail_pyarrow
15291535
@pytest.mark.parametrize(
15301536
("key", "value", "warn"),
15311537
[
@@ -1562,7 +1568,6 @@ def test_parse_date_all_fields(all_parsers, key, value, warn):
15621568
tm.assert_frame_equal(result, expected)
15631569

15641570

1565-
@xfail_pyarrow
15661571
@pytest.mark.parametrize(
15671572
("key", "value", "warn"),
15681573
[
@@ -1599,7 +1604,6 @@ def test_datetime_fractional_seconds(all_parsers, key, value, warn):
15991604
tm.assert_frame_equal(result, expected)
16001605

16011606

1602-
@xfail_pyarrow
16031607
def test_generic(all_parsers):
16041608
parser = all_parsers
16051609
data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."

0 commit comments

Comments
 (0)