BUG: read_csv with engine pyarrow parsing multiple date columns (#50056)

lithomas1 · web-flow · commit 634b9402466c · 2023-05-18T23:21:04.000+02:00
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -388,6 +388,7 @@ I/O
 - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
 - :meth:`DataFrame.to_sql` now raising ``ValueError`` when the name param is left empty while using SQLAlchemy to connect (:issue:`52675`)
 - Bug in :func:`json_normalize`, fix json_normalize cannot parse metadata fields list type (:issue:`37782`)
+- Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`)
 - Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`)
 - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
 - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -58,6 +58,21 @@ def _get_pyarrow_options(self) -> None:
             if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None:
                 self.kwds[pyarrow_name] = self.kwds.pop(pandas_name)
 
+        # Date format handling
+        # If we get a string, we need to convert it into a list for pyarrow
+        # If we get a dict, we want to parse those separately
+        date_format = self.date_format
+        if isinstance(date_format, str):
+            date_format = [date_format]
+        else:
+            # In case of dict, we don't want to propagate through, so
+            # just set to pyarrow default of None
+
+            # Ideally, in future we disable pyarrow dtype inference (read in as string)
+            # to prevent misreads.
+            date_format = None
+        self.kwds["timestamp_parsers"] = date_format
+
         self.parse_options = {
             option_name: option_value
             for option_name, option_value in self.kwds.items()
@@ -76,6 +91,7 @@ def _get_pyarrow_options(self) -> None:
                 "true_values",
                 "false_values",
                 "decimal_point",
+                "timestamp_parsers",
             )
         }
         self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"]
@@ -116,7 +132,7 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
                 multi_index_named = False
             frame.columns = self.names
         # we only need the frame not the names
-        frame.columns, frame = self._do_date_conversions(frame.columns, frame)
+        _, frame = self._do_date_conversions(frame.columns, frame)
         if self.index_col is not None:
             index_to_set = self.index_col.copy()
             for i, item in enumerate(self.index_col):
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -61,8 +61,10 @@
 
 from pandas import (
     ArrowDtype,
+    DataFrame,
     DatetimeIndex,
     StringDtype,
+    concat,
 )
 from pandas.core import algorithms
 from pandas.core.arrays import (
@@ -92,8 +94,6 @@
         Scalar,
     )
 
-    from pandas import DataFrame
-
 
 class ParserBase:
     class BadLineHandleMethod(Enum):
@@ -1304,7 +1304,10 @@ def _isindex(colspec):
             new_cols.append(new_name)
             date_cols.update(old_names)
 
-    data_dict.update(new_data)
+    if isinstance(data_dict, DataFrame):
+        data_dict = concat([DataFrame(new_data), data_dict], axis=1, copy=False)
+    else:
+        data_dict.update(new_data)
     new_cols.extend(columns)
 
     if not keep_date_col:
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
@@ -139,9 +139,8 @@ def test_separator_date_conflict(all_parsers):
     tm.assert_frame_equal(df, expected)
 
 
-@xfail_pyarrow
 @pytest.mark.parametrize("keep_date_col", [True, False])
-def test_multiple_date_col_custom(all_parsers, keep_date_col):
+def test_multiple_date_col_custom(all_parsers, keep_date_col, request):
     data = """\
 KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
@@ -152,6 +151,14 @@ def test_multiple_date_col_custom(all_parsers, keep_date_col):
 """
     parser = all_parsers
 
+    if keep_date_col and parser.engine == "pyarrow":
+        # For this to pass, we need to disable auto-inference on the date columns
+        # in parse_dates. We have no way of doing this though
+        mark = pytest.mark.xfail(
+            reason="pyarrow doesn't support disabling auto-inference on column numbers."
+        )
+        request.node.add_marker(mark)
+
     def date_parser(*date_cols):
         """
         Test date parser.
@@ -301,9 +308,8 @@ def test_concat_date_col_fail(container, dim):
         parsing.concat_date_cols(date_cols)
 
 
-@xfail_pyarrow
 @pytest.mark.parametrize("keep_date_col", [True, False])
-def test_multiple_date_col(all_parsers, keep_date_col):
+def test_multiple_date_col(all_parsers, keep_date_col, request):
     data = """\
 KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
@@ -313,6 +319,15 @@ def test_multiple_date_col(all_parsers, keep_date_col):
 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
 """
     parser = all_parsers
+
+    if keep_date_col and parser.engine == "pyarrow":
+        # For this to pass, we need to disable auto-inference on the date columns
+        # in parse_dates. We have no way of doing this though
+        mark = pytest.mark.xfail(
+            reason="pyarrow doesn't support disabling auto-inference on column numbers."
+        )
+        request.node.add_marker(mark)
+
     kwds = {
         "header": None,
         "parse_dates": [[1, 2], [1, 3]],
@@ -469,7 +484,6 @@ def test_date_col_as_index_col(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 def test_multiple_date_cols_int_cast(all_parsers):
     data = (
         "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
@@ -530,7 +544,6 @@ def test_multiple_date_cols_int_cast(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 def test_multiple_date_col_timestamp_parse(all_parsers):
     parser = all_parsers
     data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
@@ -1173,7 +1186,6 @@ def test_multiple_date_cols_chunked(all_parsers):
     tm.assert_frame_equal(chunks[2], expected[4:])
 
 
-@xfail_pyarrow
 def test_multiple_date_col_named_index_compat(all_parsers):
     parser = all_parsers
     data = """\
@@ -1197,7 +1209,6 @@ def test_multiple_date_col_named_index_compat(all_parsers):
     tm.assert_frame_equal(with_indices, with_names)
 
 
-@xfail_pyarrow
 def test_multiple_date_col_multiple_index_compat(all_parsers):
     parser = all_parsers
     data = """\
@@ -1413,7 +1424,6 @@ def test_parse_date_time_multi_level_column_name(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,kwargs,expected",
     [
@@ -1503,9 +1513,6 @@ def test_parse_date_time(all_parsers, data, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
-# From date_parser fallback behavior
-@pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
 def test_parse_date_fields(all_parsers):
     parser = all_parsers
     data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
@@ -1515,7 +1522,7 @@ def test_parse_date_fields(all_parsers):
         StringIO(data),
         header=0,
         parse_dates={"ymd": [0, 1, 2]},
-        date_parser=pd.to_datetime,
+        date_parser=lambda x: x,
     )
 
     expected = DataFrame(
@@ -1525,7 +1532,6 @@ def test_parse_date_fields(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 @pytest.mark.parametrize(
     ("key", "value", "warn"),
     [
@@ -1562,7 +1568,6 @@ def test_parse_date_all_fields(all_parsers, key, value, warn):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 @pytest.mark.parametrize(
     ("key", "value", "warn"),
     [
@@ -1599,7 +1604,6 @@ def test_datetime_fractional_seconds(all_parsers, key, value, warn):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 def test_generic(all_parsers):
     parser = all_parsers
     data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."