diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 4df2886454c0a..70018e985af19 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -28,10 +28,24 @@ Available optional dependencies (listed in order of appearance at `install guide
 ``[all, performance, computation, timezone, fss, aws, gcp, excel, parquet, feather, hdf5, spss, postgresql, mysql, sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` (:issue:`39164`).
 
-.. _whatsnew_200.enhancements.enhancement2:
+.. _whatsnew_200.enhancements.io_readers_nullable_pyarrow:
 
-enhancement2
-^^^^^^^^^^^^
+Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A new global configuration, ``io.nullable_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet` and :func:`read_csv` (with ``engine="pyarrow"``)
+to return pyarrow-backed dtypes when the option is set to ``"pyarrow"`` (:issue:`48957`).
+
+.. ipython:: python
+
+    import io
+    data = io.StringIO("""a,b,c,d,e,f,g,h,i
+    1,2.5,True,a,,,,,
+    3,4.5,False,b,6,7.5,True,a,
+    """)
+    with pd.option_context("io.nullable_backend", "pyarrow"):
+        df = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
+    df
 
 .. _whatsnew_200.enhancements.other:
 
@@ -42,7 +56,6 @@ Other enhancements
 - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
 - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
 - Added new argument ``use_nullable_dtypes`` to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`)
-- Added new global configuration, ``io.nullable_backend`` to allow ``use_nullable_dtypes=True`` to return pyarrow-backed dtypes when set to ``"pyarrow"`` in :func:`read_parquet` (:issue:`48957`)
 - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
 - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
 - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index 6cc56bb1c8840..68158a30f7fdf 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -1,16 +1,17 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
-
 from pandas._typing import ReadBuffer
 from pandas.compat._optional import import_optional_dependency
 
 from pandas.core.dtypes.inference import is_integer
 
-from pandas.io.parsers.base_parser import ParserBase
+from pandas import (
+    DataFrame,
+    arrays,
+    get_option,
+)
 
-if TYPE_CHECKING:
-    from pandas import DataFrame
+from pandas.io.parsers.base_parser import ParserBase
 
 
 class ArrowParserWrapper(ParserBase):
@@ -77,7 +78,7 @@ def _get_pyarrow_options(self) -> None:
             else self.kwds["skiprows"],
         }
 
-    def _finalize_output(self, frame: DataFrame) -> DataFrame:
+    def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
         """
         Processes data read in based on kwargs.
 
@@ -148,6 +149,16 @@ def read(self) -> DataFrame:
             parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
             convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
         )
-
-        frame = table.to_pandas()
-        return self._finalize_output(frame)
+        if (
+            self.kwds["use_nullable_dtypes"]
+            and get_option("io.nullable_backend") == "pyarrow"
+        ):
+            frame = DataFrame(
+                {
+                    col_name: arrays.ArrowExtensionArray(pa_col)
+                    for col_name, pa_col in zip(table.column_names, table.itercolumns())
+                }
+            )
+        else:
+            frame = table.to_pandas()
+        return self._finalize_pandas_output(frame)
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index f2b466b06e062..af7b6027574e9 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -24,6 +24,8 @@
 
 import numpy as np
 
+from pandas._config import get_option
+
 from pandas._libs import lib
 from pandas._libs.parsers import STR_NA_VALUES
 from pandas._typing import (
@@ -560,6 +562,14 @@ def _read(
         raise ValueError(
             "The 'chunksize' option is not supported with the 'pyarrow' engine"
         )
+    elif (
+        kwds.get("use_nullable_dtypes", False)
+        and get_option("io.nullable_backend") == "pyarrow"
+    ):
+        raise NotImplementedError(
+            f"use_nullable_dtypes=True and engine={kwds['engine']} with "
+            "io.nullable_backend set to 'pyarrow' is not implemented."
+        )
     else:
         chunksize = validate_integer("chunksize", chunksize, 1)
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index f5b3b608bd59e..030b38cceeb39 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -9,7 +9,6 @@
 import pytest
 
 from pandas.errors import ParserWarning
-import pandas.util._test_decorators as td
 
 import pandas as pd
 from pandas import (
@@ -22,13 +21,10 @@
     StringArray,
 )
 
-# TODO(1.4): Change me into xfail at release time
-# and xfail individual tests
-pytestmark = pytest.mark.usefixtures("pyarrow_skip")
-
 
 @pytest.mark.parametrize("dtype", [str, object])
 @pytest.mark.parametrize("check_orig", [True, False])
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_all_columns(all_parsers, dtype, check_orig):
     # see gh-3795, gh-6607
     parser = all_parsers
@@ -53,6 +49,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_per_column(all_parsers):
     parser = all_parsers
     data = """\
@@ -71,6 +68,7 @@ def test_dtype_per_column(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_invalid_dtype_per_column(all_parsers):
     parser = all_parsers
     data = """\
@@ -84,6 +82,7 @@ def test_invalid_dtype_per_column(all_parsers):
         parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"})
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_raise_on_passed_int_dtype_with_nas(all_parsers):
     # see gh-2631
     parser = all_parsers
@@ -101,6 +100,7 @@ def test_raise_on_passed_int_dtype_with_nas(all_parsers):
         parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_with_converters(all_parsers):
     parser = all_parsers
     data = """a,b
@@ -132,6 +132,7 @@ def test_numeric_dtype(all_parsers, dtype):
     tm.assert_frame_equal(expected, result)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_boolean_dtype(all_parsers):
     parser = all_parsers
     data = "\n".join(
@@ -184,6 +185,7 @@
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_delimiter_with_usecols_and_parse_dates(all_parsers):
     # GH#35873
     result = all_parsers.read_csv(
@@ -264,6 +266,7 @@ def test_skip_whitespace(c_parser_only, float_precision):
     tm.assert_series_equal(df.iloc[:, 1], pd.Series([1.2, 2.1, 1.0, 1.2], name="num"))
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_true_values_cast_to_bool(all_parsers):
     # GH#34655
     text = """a,b
@@ -286,6 +289,7 @@ def test_true_values_cast_to_bool(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
 def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
     # GH#35211
@@ -300,6 +304,7 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
     # GH#42022
     parser = all_parsers
@@ -309,6 +314,7 @@ def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_multi_index(all_parsers):
     # GH 42446
     parser = all_parsers
@@ -355,6 +361,7 @@ def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
     tm.assert_frame_equal(actual, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("default", ["float", "float64"])
 def test_dtypes_defaultdict(all_parsers, default):
     # GH#41574
@@ -368,6 +375,7 @@ def test_dtypes_defaultdict(all_parsers, default):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
     # GH#41574
     data = """a,b,a,b,b.1
@@ -381,6 +389,7 @@ def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtypes_defaultdict_invalid(all_parsers):
     # GH#41574
     data = """a,b
@@ -392,6 +401,7 @@ def test_dtypes_defaultdict_invalid(all_parsers):
         parser.read_csv(StringIO(data), dtype=dtype)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_use_nullable_dtypes(all_parsers):
     # GH#36712
 
@@ -435,11 +445,11 @@ def test_use_nullabla_dtypes_and_dtype(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@td.skip_if_no("pyarrow")
+@pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("storage", ["pyarrow", "python"])
 def test_use_nullable_dtypes_string(all_parsers, storage):
     # GH#36712
-    import pyarrow as pa
+    pa = pytest.importorskip("pyarrow")
 
     with pd.option_context("mode.string_storage", storage):
 
@@ -477,3 +487,40 @@ def test_use_nullable_dtypes_ea_dtype_specified(all_parsers):
     result = parser.read_csv(StringIO(data), dtype="Int64", use_nullable_dtypes=True)
     expected = DataFrame({"a": [1], "b": 2}, dtype="Int64")
     tm.assert_frame_equal(result, expected)
+
+
+def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request):
+    # GH#36712
+    pa = pytest.importorskip("pyarrow")
+    parser = all_parsers
+
+    data = """a,b,c,d,e,f,g,h,i,j
+1,2.5,True,a,,,,,12-31-2019,
+3,4.5,False,b,6,7.5,True,a,12-31-2019,
+"""
+    with pd.option_context("io.nullable_backend", "pyarrow"):
+        if parser.engine != "pyarrow":
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=NotImplementedError,
+                    reason=f"Not implemented with engine={parser.engine}",
+                )
+            )
+        result = parser.read_csv(
+            StringIO(data), use_nullable_dtypes=True, parse_dates=["i"]
+        )
+        expected = DataFrame(
+            {
+                "a": pd.Series([1, 3], dtype="int64[pyarrow]"),
+                "b": pd.Series([2.5, 4.5], dtype="float64[pyarrow]"),
+                "c": pd.Series([True, False], dtype="bool[pyarrow]"),
+                "d": pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
+                "e": pd.Series([pd.NA, 6], dtype="int64[pyarrow]"),
+                "f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"),
+                "g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"),
+                "h": pd.Series(["", "a"], dtype=pd.ArrowDtype(pa.string())),
+                "i": pd.Series([Timestamp("2019-12-31")] * 2),
+                "j": pd.Series([pd.NA, pd.NA], dtype="null[pyarrow]"),
+            }
+        )
+        tm.assert_frame_equal(result, expected)
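
A minimal usage sketch of the behavior this patch adds, assuming pandas is built from this branch with pyarrow installed; the sample data and column names below are illustrative, not taken from the test suite.

    import io

    import pandas as pd

    # Two columns: integers, and strings with one missing value.
    data = io.StringIO("x,y\n1,a\n2,\n")

    # With io.nullable_backend set to "pyarrow", read_csv(engine="pyarrow",
    # use_nullable_dtypes=True) builds the result from ArrowExtensionArray
    # columns (see the read() change in arrow_parser_wrapper.py above).
    with pd.option_context("io.nullable_backend", "pyarrow"):
        df = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")

    print(df.dtypes)  # pyarrow-backed dtypes, e.g. x: int64[pyarrow]

    # Any other engine combined with use_nullable_dtypes=True while this
    # option is "pyarrow" raises NotImplementedError, per the guard added
    # in pandas/io/parsers/readers.py.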