
ENH: Implement io.nullable_backend config for read_csv(engine="pyarrow") #49366


Merged: 17 commits, Nov 5, 2022. Diff below shows changes from 6 commits.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
@@ -42,7 +42,7 @@ Other enhancements
 - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
 - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
 - Added new argument ``use_nullable_dtypes`` to :func:`read_csv` and :func:`read_excel` to enable automatic conversion to nullable dtypes (:issue:`36712`)
-- Added new global configuration, ``io.nullable_backend`` to allow ``use_nullable_dtypes=True`` to return pyarrow-backed dtypes when set to ``"pyarrow"`` in :func:`read_parquet` (:issue:`48957`)
+- Added new global configuration, ``io.nullable_backend`` to allow ``use_nullable_dtypes=True`` to return pyarrow-backed dtypes when set to ``"pyarrow"`` in :func:`read_parquet`, :func:`read_csv` (with ``engine="pyarrow"``) (:issue:`48957`)
Member: It might be worth having this a section for itself.

Member Author: Good idea. Moved to its own section.

 - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
 - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
 - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
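In practice, the whatsnew entry boils down to the following minimal usage sketch (names as of this PR; the ``io.nullable_backend`` option and the ``use_nullable_dtypes`` keyword were reworked later in the 2.0 cycle, so this reflects the PR as written, not the final pandas API):

```python
from io import StringIO

import pandas as pd

data = "a,b\n1,2.5\n,4.5\n"

# Opt in globally; with the pyarrow engine the parsed columns come back as
# ArrowDtype-backed extension arrays and missing values stay pd.NA.
with pd.option_context("io.nullable_backend", "pyarrow"):
    df = pd.read_csv(StringIO(data), engine="pyarrow", use_nullable_dtypes=True)

print(df.dtypes)  # expected: a -> int64[pyarrow], b -> double[pyarrow]
```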
29 changes: 20 additions & 9 deletions pandas/io/parsers/arrow_parser_wrapper.py
@@ -1,16 +1,17 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
-
 from pandas._typing import ReadBuffer
 from pandas.compat._optional import import_optional_dependency
 
 from pandas.core.dtypes.inference import is_integer
 
-from pandas.io.parsers.base_parser import ParserBase
+from pandas import (
+    DataFrame,
+    arrays,
+    get_option,
+)
 
-if TYPE_CHECKING:
-    from pandas import DataFrame
+from pandas.io.parsers.base_parser import ParserBase
 
 
 class ArrowParserWrapper(ParserBase):
@@ -77,7 +78,7 @@ def _get_pyarrow_options(self) -> None:
             else self.kwds["skiprows"],
         }
 
-    def _finalize_output(self, frame: DataFrame) -> DataFrame:
+    def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
         """
         Processes data read in based on kwargs.
 
@@ -150,6 +151,16 @@ def read(self) -> DataFrame:
             parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
             convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
         )
 
-        frame = table.to_pandas()
Member: Do you know if the types_mapper kwarg would work here in the to_pandas call?

Member Author: types_mapper works, but it requires a conversion from arrow -> pandas -> back to arrow. The implementation here just directly converts the arrow table -> arrow ChunkedArray and sticks it into pandas.
-        return self._finalize_output(frame)
+        if (
+            self.kwds["use_nullable_dtypes"]
+            and get_option("io.nullable_backend") == "pyarrow"
+        ):
+            frame = DataFrame(
+                {
+                    col_name: arrays.ArrowExtensionArray(pa_col)
+                    for col_name, pa_col in zip(table.column_names, table.itercolumns())
+                }
+            )
+        else:
+            frame = table.to_pandas()
+        return self._finalize_pandas_output(frame)
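To make the trade-off discussed in the thread above concrete, here is a standalone sketch of the two strategies — not the pandas internals themselves, just the idea, with a toy pyarrow Table standing in for the parsed CSV:

```python
import pyarrow as pa

import pandas as pd

table = pa.table({"a": [1, None, 3]})

# Reviewer's suggestion: have to_pandas() build ArrowDtype-backed columns
# through its types_mapper hook.
df_mapper = table.to_pandas(types_mapper=pd.ArrowDtype)

# The PR's approach: wrap each Arrow ChunkedArray directly in an
# ArrowExtensionArray and assemble the DataFrame from those, skipping the
# intermediate conversion the author describes.
df_direct = pd.DataFrame(
    {
        name: pd.arrays.ArrowExtensionArray(col)
        for name, col in zip(table.column_names, table.itercolumns())
    }
)
```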
10 changes: 10 additions & 0 deletions pandas/io/parsers/readers.py
@@ -22,6 +22,8 @@
 import numpy as np
 
+from pandas._config import get_option
+
 from pandas._libs import lib
 from pandas._libs.parsers import STR_NA_VALUES
 from pandas._typing import (
@@ -600,6 +602,14 @@ def _read(
             raise ValueError(
                 "The 'chunksize' option is not supported with the 'pyarrow' engine"
             )
+    elif (
Member (lithomas1, Oct 29, 2022): Was there a reason to disable this for the C/Python parsers? If not, I'm OK with allowing it for consistency purposes. Given the bugginess/lack of support for some kwargs in the pyarrow engine (sorry for not really maintaining it after adding it), a user might want to read with the C/Python engine first, before operating on arrow arrays.

Member Author: Not entirely; I just wanted to keep the changes of this PR small for now and add support in a follow-up PR.

+        kwds.get("use_nullable_dtypes", False)
+        and get_option("io.nullable_backend") == "pyarrow"
+    ):
+        raise NotImplementedError(
+            f"use_nullable_dtypes=True and engine={kwds['engine']} with "
+            "io.nullable_backend set to 'pyarrow' is not implemented."
+        )
     else:
         chunksize = validate_integer("chunksize", chunksize, 1)
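From the user's side, the new branch above means the unsupported combination fails loudly on the other engines instead of being silently ignored — roughly the following, a sketch against the code as of this PR:

```python
from io import StringIO

import pandas as pd

with pd.option_context("io.nullable_backend", "pyarrow"):
    try:
        # The C engine (the default) now hits the NotImplementedError branch.
        pd.read_csv(StringIO("a\n1\n"), engine="c", use_nullable_dtypes=True)
    except NotImplementedError as err:
        print(err)
        # use_nullable_dtypes=True and engine=c with io.nullable_backend
        # set to 'pyarrow' is not implemented.
```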
61 changes: 54 additions & 7 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -9,7 +9,6 @@
 import pytest
 
 from pandas.errors import ParserWarning
-import pandas.util._test_decorators as td
 
 import pandas as pd
 from pandas import (
@@ -22,13 +21,10 @@
     StringArray,
 )
 
-# TODO(1.4): Change me into xfail at release time
-# and xfail individual tests
-pytestmark = pytest.mark.usefixtures("pyarrow_skip")
-
 
 @pytest.mark.parametrize("dtype", [str, object])
 @pytest.mark.parametrize("check_orig", [True, False])
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_all_columns(all_parsers, dtype, check_orig):
     # see gh-3795, gh-6607
     parser = all_parsers
@@ -53,6 +49,7 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_per_column(all_parsers):
     parser = all_parsers
     data = """\
@@ -71,6 +68,7 @@ def test_dtype_per_column(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_invalid_dtype_per_column(all_parsers):
     parser = all_parsers
     data = """\
@@ -84,6 +82,7 @@ def test_invalid_dtype_per_column(all_parsers):
         parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"})
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_raise_on_passed_int_dtype_with_nas(all_parsers):
     # see gh-2631
     parser = all_parsers
@@ -101,6 +100,7 @@ def test_raise_on_passed_int_dtype_with_nas(all_parsers):
         parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_with_converters(all_parsers):
     parser = all_parsers
     data = """a,b
@@ -132,6 +132,7 @@ def test_numeric_dtype(all_parsers, dtype):
     tm.assert_frame_equal(expected, result)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_boolean_dtype(all_parsers):
     parser = all_parsers
     data = "\n".join(
@@ -184,6 +185,7 @@ def test_boolean_dtype(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_delimiter_with_usecols_and_parse_dates(all_parsers):
     # GH#35873
     result = all_parsers.read_csv(
@@ -264,6 +266,7 @@ def test_skip_whitespace(c_parser_only, float_precision):
     tm.assert_series_equal(df.iloc[:, 1], pd.Series([1.2, 2.1, 1.0, 1.2], name="num"))
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_true_values_cast_to_bool(all_parsers):
     # GH#34655
     text = """a,b
@@ -286,6 +289,7 @@ def test_true_values_cast_to_bool(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
 def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
     # GH#35211
@@ -300,6 +304,7 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
     # GH#42022
     parser = all_parsers
@@ -309,6 +314,7 @@ def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtype_multi_index(all_parsers):
     # GH 42446
     parser = all_parsers
@@ -355,6 +361,7 @@ def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
     tm.assert_frame_equal(actual, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("default", ["float", "float64"])
 def test_dtypes_defaultdict(all_parsers, default):
     # GH#41574
@@ -368,6 +375,7 @@ def test_dtypes_defaultdict(all_parsers, default):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
     # GH#41574
     data = """a,b,a,b,b.1
@@ -381,6 +389,7 @@ def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_dtypes_defaultdict_invalid(all_parsers):
     # GH#41574
     data = """a,b
@@ -392,6 +401,7 @@ def test_dtypes_defaultdict_invalid(all_parsers):
         parser.read_csv(StringIO(data), dtype=dtype)
 
 
+@pytest.mark.usefixtures("pyarrow_xfail")
 def test_use_nullable_dtypes(all_parsers):
     # GH#36712
 
@@ -435,11 +445,11 @@ def test_use_nullabla_dtypes_and_dtype(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@td.skip_if_no("pyarrow")
+@pytest.mark.usefixtures("pyarrow_xfail")
 @pytest.mark.parametrize("storage", ["pyarrow", "python"])
 def test_use_nullabla_dtypes_string(all_parsers, storage):
     # GH#36712
-    import pyarrow as pa
+    pa = pytest.importorskip("pyarrow")
 
     with pd.option_context("mode.string_storage", storage):
 
@@ -477,3 +487,40 @@ def test_use_nullable_dtypes_ea_dtype_specified(all_parsers):
     result = parser.read_csv(StringIO(data), dtype="Int64", use_nullable_dtypes=True)
     expected = DataFrame({"a": [1], "b": 2}, dtype="Int64")
     tm.assert_frame_equal(result, expected)
+
+
+def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request):
+    # GH#36712
+    pa = pytest.importorskip("pyarrow")
+    parser = all_parsers
+
+    data = """a,b,c,d,e,f,g,h,i,j
+1,2.5,True,a,,,,,12-31-2019,
+3,4.5,False,b,6,7.5,True,a,12-31-2019,
+"""
+    with pd.option_context("io.nullable_backend", "pyarrow"):
+        if parser.engine != "pyarrow":
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=NotImplementedError,
+                    reason=f"Not implemented with engine={parser.engine}",
+                )
+            )
+        result = parser.read_csv(
+            StringIO(data), use_nullable_dtypes=True, parse_dates=["i"]
+        )
+        expected = DataFrame(
+            {
+                "a": pd.Series([1, 3], dtype="int64[pyarrow]"),
+                "b": pd.Series([2.5, 4.5], dtype="float64[pyarrow]"),
+                "c": pd.Series([True, False], dtype="bool[pyarrow]"),
+                "d": pd.Series(["a", "b"], dtype=pd.ArrowDtype(pa.string())),
+                "e": pd.Series([pd.NA, 6], dtype="int64[pyarrow]"),
+                "f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"),
+                "g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"),
+                "h": pd.Series(["", "a"], dtype=pd.ArrowDtype(pa.string())),
+                "i": pd.Series([Timestamp("2019-12-31")] * 2),
+                "j": pd.Series([pd.NA, pd.NA], dtype="null[pyarrow]"),
+            }
+        )
+        tm.assert_frame_equal(result, expected)