ENH: Implement io.nullable_backend config for read_csv(engine="pyarrow") (#49366)

mroeschke · web-flow · commit f332143172c6 · 2022-11-05T21:52:28.000+01:00
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -28,10 +28,24 @@ Available optional dependencies (listed in order of appearance at `install guide
 ``[all, performance, computation, timezone, fss, aws, gcp, excel, parquet, feather, hdf5, spss, postgresql, mysql,
 sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` (:issue:`39164`).
 
-.. _whatsnew_200.enhancements.enhancement2:
+.. _whatsnew_200.enhancements.io_readers_nullable_pyarrow:
 
-enhancement2
-^^^^^^^^^^^^
+Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A new global configuration, ``io.nullable_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet` and :func:`read_csv` (with ``engine="pyarrow"``)
+to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`).
+
+.. ipython:: python
+
+    import io
+    data = io.StringIO("""a,b,c,d,e,f,g,h,i
+        1,2.5,True,a,,,,,
+        3,4.5,False,b,6,7.5,True,a,
+    """)
+    with pd.option_context("io.nullable_backend", "pyarrow"):
+        df = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
+    df
 
 .. _whatsnew_200.enhancements.other:
 
@@ -42,7 +56,6 @@ Other enhancements
 - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
 - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
 - Added new argument ``use_nullable_dtypes`` to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`)
-- Added new global configuration, ``io.nullable_backend`` to allow ``use_nullable_dtypes=True`` to return pyarrow-backed dtypes when set to ``"pyarrow"`` in :func:`read_parquet` (:issue:`48957`)
 - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
 - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
 - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -1,16 +1,17 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
-
 from pandas._typing import ReadBuffer
 from pandas.compat._optional import import_optional_dependency
 
 from pandas.core.dtypes.inference import is_integer
 
-from pandas.io.parsers.base_parser import ParserBase
+from pandas import (
+    DataFrame,
+    arrays,
+    get_option,
+)
 
-if TYPE_CHECKING:
-    from pandas import DataFrame
+from pandas.io.parsers.base_parser import ParserBase
 
 
 class ArrowParserWrapper(ParserBase):
@@ -77,7 +78,7 @@ def _get_pyarrow_options(self) -> None:
             else self.kwds["skiprows"],
         }
 
-    def _finalize_output(self, frame: DataFrame) -> DataFrame:
+    def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
         """
         Processes data read in based on kwargs.
 
@@ -148,6 +149,16 @@ def read(self) -> DataFrame:
             parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
             convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
         )
-
-        frame = table.to_pandas()
-        return self._finalize_output(frame)
+        if (
+            self.kwds["use_nullable_dtypes"]
+            and get_option("io.nullable_backend") == "pyarrow"
+        ):
+            frame = DataFrame(
+                {
+                    col_name: arrays.ArrowExtensionArray(pa_col)
+                    for col_name, pa_col in zip(table.column_names, table.itercolumns())
+                }
+            )
+        else:
+            frame = table.to_pandas()
+        return self._finalize_pandas_output(frame)
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -24,6 +24,8 @@
 
 import numpy as np
 
+from pandas._config import get_option
+
 from pandas._libs import lib
 from pandas._libs.parsers import STR_NA_VALUES
 from pandas._typing import (
@@ -560,6 +562,14 @@ def _read(
             raise ValueError(
                 "The 'chunksize' option is not supported with the 'pyarrow' engine"
             )
+    elif (
+        kwds.get("use_nullable_dtypes", False)
+        and get_option("io.nullable_backend") == "pyarrow"
+    ):
+        raise NotImplementedError(
+            f"use_nullable_dtypes=True and engine={kwds['engine']} with "
+            "io.nullable_backend set to 'pyarrow' is not implemented."
+        )
     else:
         chunksize = validate_integer("chunksize", chunksize, 1)
 
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -9,7 +9,6 @@
 import pytest
 
 from pandas.errors import ParserWarning
-import pandas.util._test_decorators as td
 
 import pandas as pd
 from pandas import (
@@ -22,13 +21,10 @@
     StringArray,
 )
 
-# TODO(1.4): Change me into xfail at release time
-# and xfail individual tests
-pytestmark = pytest.mark.usefixtures("pyarrow_skip")
-
 
 @pytest.mark.parametrize("dtype", [str, object])
 @pytest.mark.parametrize("check_orig", [True, False])
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_all_columns(all_parsers, dtype, check_orig):
     # see gh-3795, gh-6607
     parser = all_parsers
@@ -53,6 +49,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig):
         tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_per_column(all_parsers):
     parser = all_parsers
     data = """\
@@ -71,6 +68,7 @@ def test_dtype_per_column(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_invalid_dtype_per_column(all_parsers):
     parser = all_parsers
     data = """\
@@ -84,6 +82,7 @@ def test_invalid_dtype_per_column(all_parsers):
         parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"})
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_raise_on_passed_int_dtype_with_nas(all_parsers):
     # see gh-2631
     parser = all_parsers
@@ -101,6 +100,7 @@ def test_raise_on_passed_int_dtype_with_nas(all_parsers):
         parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_with_converters(all_parsers):
     parser = all_parsers
     data = """a,b
@@ -132,6 +132,7 @@ def test_numeric_dtype(all_parsers, dtype):
     tm.assert_frame_equal(expected, result)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_boolean_dtype(all_parsers):
     parser = all_parsers
     data = "\n".join(
@@ -184,6 +185,7 @@ def test_boolean_dtype(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_delimiter_with_usecols_and_parse_dates(all_parsers):
     # GH#35873
     result = all_parsers.read_csv(
@@ -264,6 +266,7 @@ def test_skip_whitespace(c_parser_only, float_precision):
     tm.assert_series_equal(df.iloc[:, 1], pd.Series([1.2, 2.1, 1.0, 1.2], name="num"))
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_true_values_cast_to_bool(all_parsers):
     # GH#34655
     text = """a,b
@@ -286,6 +289,7 @@ def test_true_values_cast_to_bool(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
 def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
     # GH#35211
@@ -300,6 +304,7 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
     # GH#42022
     parser = all_parsers
@@ -309,6 +314,7 @@ def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_multi_index(all_parsers):
     # GH 42446
     parser = all_parsers
@@ -355,6 +361,7 @@ def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
     tm.assert_frame_equal(actual, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("default", ["float", "float64"])
 def test_dtypes_defaultdict(all_parsers, default):
     # GH#41574
@@ -368,6 +375,7 @@ def test_dtypes_defaultdict(all_parsers, default):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
     # GH#41574
     data = """a,b,a,b,b.1
@@ -381,6 +389,7 @@ def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtypes_defaultdict_invalid(all_parsers):
     # GH#41574
     data = """a,b
@@ -392,6 +401,7 @@ def test_dtypes_defaultdict_invalid(all_parsers):
         parser.read_csv(StringIO(data), dtype=dtype)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_use_nullable_dtypes(all_parsers):
     # GH#36712
 
@@ -435,11 +445,11 @@ def test_use_nullabla_dtypes_and_dtype(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@td.skip_if_no("pyarrow")
+@pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("storage", ["pyarrow", "python"])
 def test_use_nullable_dtypes_string(all_parsers, storage):
     # GH#36712
-    import pyarrow as pa
+    pa = pytest.importorskip("pyarrow")
 
     with pd.option_context("mode.string_storage", storage):
 
@@ -477,3 +487,40 @@ def test_use_nullable_dtypes_ea_dtype_specified(all_parsers):
     result = parser.read_csv(StringIO(data), dtype="Int64", use_nullable_dtypes=True)
     expected = DataFrame({"a": [1], "b": 2}, dtype="Int64")
     tm.assert_frame_equal(result, expected)
+
+
+def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request):
+    # GH#36712
+    pa = pytest.importorskip("pyarrow")
+    parser = all_parsers
+
+    data = """a,b,c,d,e,f,g,h,i,j
+1,2.5,True,a,,,,,12-31-2019,
+3,4.5,False,b,6,7.5,True,a,12-31-2019,
+"""
+    with pd.option_context("io.nullable_backend", "pyarrow"):
+        if parser.engine != "pyarrow":
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=NotImplementedError,
+                    reason=f"Not implemented with engine={parser.engine}",
+                )
+            )
+        result = parser.read_csv(
+            StringIO(data), use_nullable_dtypes=True, parse_dates=["i"]
+        )
+        expected = DataFrame(
+            {
+                "a": pd.Series([1, 3], dtype="int64[pyarrow]"),
+                "b": pd.Series([2.5, 4.5], dtype="float64[pyarrow]"),
+                "c": pd.Series([True, False], dtype="bool[pyarrow]"),
+                "d": pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
+                "e": pd.Series([pd.NA, 6], dtype="int64[pyarrow]"),
+                "f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"),
+                "g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"),
+                "h": pd.Series(["", "a"], dtype=pd.ArrowDtype(pa.string())),
+                "i": pd.Series([Timestamp("2019-12-31")] * 2),
+                "j": pd.Series([pd.NA, pd.NA], dtype="null[pyarrow]"),
+            }
+        )
+    tm.assert_frame_equal(result, expected)