BUG: Fix some more arrow CSV tests #52087

Merged: 14 commits, Apr 10, 2023

3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.1.0.rst
@@ -199,7 +199,8 @@ Period
- Bug in :class:`PeriodDtype` constructor failing to raise ``TypeError`` when no argument is passed or when ``None`` is passed (:issue:`27388`)
- Bug in :class:`PeriodDtype` constructor raising ``ValueError`` instead of ``TypeError`` when an invalid type is passed (:issue:`51790`)
- Bug in :meth:`arrays.PeriodArray.map` and :meth:`PeriodIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`)
-
- Bug in :func:`read_csv` not processing empty strings as a null value, with ``engine="pyarrow"`` (:issue:`52087`)
- Bug in :func:`read_csv` returning ``object`` dtype columns instead of ``float64`` dtype columns for all-null columns with ``engine="pyarrow"`` (:issue:`52087`)

Plotting
^^^^^^^^
3 changes: 3 additions & 0 deletions pandas/io/_util.py
@@ -8,6 +8,9 @@
def _arrow_dtype_mapping() -> dict:
    pa = import_optional_dependency("pyarrow")
    return {
        # All nulls should still give Float64 not object
        # TODO: This breaks parquet
        # pa.null(): pd.Float64Dtype(),
Member

In what way does it break parquet?

Member Author

Well, first I hit this error.

>>> import pandas as pd
>>> import pyarrow as pa
>>> a = pa.nulls(0)
>>> pd.Float64Dtype().__from__arrow__(a)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: 'Float64Dtype' object has no attribute '__from__arrow__'

After patching that (another PR incoming), an empty DataFrame of object dtype (pd.DataFrame({"value": pd.array([], dtype=object)})) now comes back as Float64Dtype when round-tripped.

Best guess here is that the types_mapper is somehow overriding the pandas metadata.
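
For the record, a rough sketch of the round-trip check (with the pa.null() mapping from above enabled; the file name is just an example):

import pandas as pd

# An empty object-dtype column; with pa.null() mapped to pd.Float64Dtype()
# in the types_mapper, this came back as Float64 instead of object.
df = pd.DataFrame({"value": pd.array([], dtype=object)})
df.to_parquet("empty.parquet")
result = pd.read_parquet("empty.parquet", dtype_backend="numpy_nullable")
print(result.dtypes)  # Float64, not object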

        pa.int8(): pd.Int8Dtype(),
        pa.int16(): pd.Int16Dtype(),
        pa.int32(): pd.Int32Dtype(),
14 changes: 14 additions & 0 deletions pandas/io/parsers/arrow_parser_wrapper.py
@@ -80,6 +80,7 @@ def _get_pyarrow_options(self) -> None:
"decimal_point",
)
}
self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"]
self.read_options = {
"autogenerate_column_names": self.header is None,
"skip_rows": self.header
@@ -149,6 +150,7 @@ def read(self) -> DataFrame:
DataFrame
    The DataFrame created from the CSV file.
"""
pa = import_optional_dependency("pyarrow")
pyarrow_csv = import_optional_dependency("pyarrow.csv")
self._get_pyarrow_options()

@@ -158,6 +160,18 @@
    parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
    convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
)

# Convert all pa.null() cols -> float64
# TODO: There has to be a better way... right?
Member Author

What I'm looking for here is something like a combination of select_dtypes + astype.

I couldn't find anything in the pyarrow docs, but maybe I'm just bad at Googling.

Sadly the types_mapper doesn't work, since you can only feed it EA types.
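
A sketch of that limitation (using an int column, since pa.null() is exactly the case that can't be expressed):

import pyarrow as pa
import pandas as pd

table = pa.table({"a": [1, 2, None]})
# types_mapper maps a pyarrow type to a pandas ExtensionDtype (or None);
# there's no way to request a plain numpy dtype like float64 through it.
df = table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
print(df.dtypes)  # a    Int64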

Member Author

Maybe @jorisvandenbossche knows a better way?

Member

Does Table.select not work?
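
(For reference: Table.select only subsets columns, it doesn't cast, so by itself it can't rewrite the null columns. A quick sketch:)

import pyarrow as pa

table = pa.table({"x": pa.nulls(2), "y": [1.0, 2.0]})
# select picks columns by name or index but leaves their types alone.
print(table.select(["x"]).schema)  # x: null -- still null type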

Member

I don't think there is an easier way. We currently don't have something like select_dtypes, so what you do (iterating over the schema, checking the type, and adapting the schema where needed) seems the best option.

new_schema = table.schema
for i, arrow_type in enumerate(table.schema.types):
    if pa.types.is_null(arrow_type):
        new_schema = new_schema.set(
            i, new_schema.field(i).with_type(pa.float64())
        )

table = table.cast(new_schema)

if self.kwds["dtype_backend"] == "pyarrow":
    frame = table.to_pandas(types_mapper=pd.ArrowDtype)
elif self.kwds["dtype_backend"] == "numpy_nullable":
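
A standalone sketch of that null-to-float64 schema rewrite on a toy table (not the parser code itself):

import pyarrow as pa

table = pa.table({"x": pa.nulls(2), "y": [1, 2]})
new_schema = table.schema
for i, arrow_type in enumerate(table.schema.types):
    if pa.types.is_null(arrow_type):
        new_schema = new_schema.set(
            i, new_schema.field(i).with_type(pa.float64())
        )
print(table.cast(new_schema).schema)  # x: double, y: int64
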
3 changes: 3 additions & 0 deletions pandas/io/parsers/readers.py
@@ -1458,8 +1458,11 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
value = kwds[argname]

if engine != "c" and value != default:
    # TODO: Refactor this logic, it's pretty convoluted
    if "python" in engine and argname not in _python_unsupported:
        pass
    elif "pyarrow" in engine and argname not in _pyarrow_unsupported:
        pass
    else:
        raise ValueError(
            f"The {repr(argname)} option is not supported with the "
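
In practice the check surfaces like this (a sketch; assumes skipfooter is among the options in _pyarrow_unsupported):

import io
import pandas as pd

# skipfooter is set to a non-default value but isn't supported by the
# pyarrow engine, so the check above raises instead of silently ignoring it.
try:
    pd.read_csv(io.StringIO("a,b\n1,2\n"), engine="pyarrow", skipfooter=1)
except ValueError as err:
    print(err)
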
15 changes: 10 additions & 5 deletions pandas/tests/io/parser/test_na_values.py
@@ -20,7 +20,6 @@
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@skip_pyarrow
def test_string_nas(all_parsers):
    parser = all_parsers
    data = """A,B,C
@@ -36,7 +35,6 @@ def test_string_nas(all_parsers):
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_detect_string_na(all_parsers):
    parser = all_parsers
    data = """A,B
@@ -89,7 +87,6 @@ def test_non_string_na_values(all_parsers, data, na_values):
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_default_na_values(all_parsers):
    _NA_VALUES = {
        "-1.#IND",
@@ -138,6 +135,7 @@ def f(i, v):
    tm.assert_frame_equal(result, expected)


# TODO: needs skiprows list support in pyarrow
@skip_pyarrow
@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
def test_custom_na_values(all_parsers, na_values):
@@ -172,6 +170,7 @@ def test_bool_na_values(all_parsers):
    tm.assert_frame_equal(result, expected)


# TODO: Needs pyarrow support for dictionary in na_values
@skip_pyarrow
def test_na_value_dict(all_parsers):
data = """A,B,C
@@ -191,7 +190,6 @@
    tm.assert_frame_equal(df, expected)


@skip_pyarrow
@pytest.mark.parametrize(
"index_col,expected",
[
@@ -225,6 +223,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
    tm.assert_frame_equal(result, expected)


# TODO: xfail components of this test, the first one passes
@skip_pyarrow
@pytest.mark.parametrize(
"kwargs,expected",
@@ -287,7 +286,6 @@ def test_na_values_keep_default(all_parsers, kwargs, expected):
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_no_na_values_no_keep_default(all_parsers):
    # see gh-4318: passing na_values=None and
    # keep_default_na=False yields 'None' as a na_value
@@ -314,6 +312,7 @@ def test_no_na_values_no_keep_default(all_parsers):
    tm.assert_frame_equal(result, expected)


# TODO: Blocked on na_values dict support in pyarrow
@skip_pyarrow
def test_no_keep_default_na_dict_na_values(all_parsers):
    # see gh-19227
@@ -326,6 +325,7 @@ def test_no_keep_default_na_dict_na_values(all_parsers):
    tm.assert_frame_equal(result, expected)


# TODO: Blocked on na_values dict support in pyarrow
@skip_pyarrow
def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
    # see gh-19227
@@ -338,6 +338,7 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
    tm.assert_frame_equal(df, expected)


# TODO: Blocked on na_values dict support in pyarrow
@skip_pyarrow
@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"])
def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values):
@@ -368,6 +369,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values):
    tm.assert_frame_equal(result, expected)


# TODO: Empty null_values doesn't work properly on pyarrow
@skip_pyarrow
@pytest.mark.parametrize(
"na_filter,row_data",
@@ -390,6 +392,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
    tm.assert_frame_equal(result, expected)


# TODO: Arrow parse error
@skip_pyarrow
def test_na_trailing_columns(all_parsers):
    parser = all_parsers
@@ -418,6 +421,7 @@ def test_na_trailing_columns(all_parsers):
    tm.assert_frame_equal(result, expected)


# TODO: xfail the na_values dict case
@skip_pyarrow
@pytest.mark.parametrize(
"na_values,row_data",
@@ -495,6 +499,7 @@ def test_empty_na_values_no_default_with_index(all_parsers):
    tm.assert_frame_equal(result, expected)


# TODO: Missing support for na_filter keyword
@skip_pyarrow
@pytest.mark.parametrize(
"na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])]