diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 9f5d6011a7780..16cee24f5e9df 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -284,7 +284,8 @@ Period - :meth:`PeriodIndex.map` with ``na_action="ignore"`` now works as expected (:issue:`51644`) - Bug in :class:`PeriodDtype` constructor raising ``ValueError`` instead of ``TypeError`` when an invalid type is passed (:issue:`51790`) - Bug in :meth:`arrays.PeriodArray.map` and :meth:`PeriodIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) -- +- Bug in :func:`read_csv` not processing empty strings as a null value, with ``engine="pyarrow"`` (:issue:`52087`) +- Bug in :func:`read_csv` returning ``object`` dtype columns instead of ``float64`` dtype columns with ``engine="pyarrow"`` for columns that are all null (:issue:`52087`) Plotting ^^^^^^^^ diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index a741a11332e99..b7b2ddf0293b5 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING +from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.inference import is_integer @@ -80,6 +81,7 @@ def _get_pyarrow_options(self) -> None: "decimal_point", ) } + self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] self.read_options = { "autogenerate_column_names": self.header is None, "skip_rows": self.header @@ -149,6 +151,7 @@ def read(self) -> DataFrame: DataFrame The DataFrame created from the CSV file. 
""" + pa = import_optional_dependency("pyarrow") pyarrow_csv = import_optional_dependency("pyarrow.csv") self._get_pyarrow_options() @@ -158,10 +161,30 @@ def read(self) -> DataFrame: parse_options=pyarrow_csv.ParseOptions(**self.parse_options), convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), ) - if self.kwds["dtype_backend"] == "pyarrow": + + dtype_backend = self.kwds["dtype_backend"] + + # Convert all pa.null() cols -> float64 (non nullable) + # else Int64 (nullable case, see below) + if dtype_backend is lib.no_default: + new_schema = table.schema + new_type = pa.float64() + for i, arrow_type in enumerate(table.schema.types): + if pa.types.is_null(arrow_type): + new_schema = new_schema.set( + i, new_schema.field(i).with_type(new_type) + ) + + table = table.cast(new_schema) + + if dtype_backend == "pyarrow": frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif self.kwds["dtype_backend"] == "numpy_nullable": - frame = table.to_pandas(types_mapper=_arrow_dtype_mapping().get) + elif dtype_backend == "numpy_nullable": + # Modify the default mapping to also + # map null to Int64 (to match other engines) + dtype_mapping = _arrow_dtype_mapping() + dtype_mapping[pa.null()] = pd.Int64Dtype() + frame = table.to_pandas(types_mapper=dtype_mapping.get) else: frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 2a6c43bff5047..f1f44a71b9a3b 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1460,8 +1460,11 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: value = kwds[argname] if engine != "c" and value != default: + # TODO: Refactor this logic, its pretty convoluted if "python" in engine and argname not in _python_unsupported: pass + elif "pyarrow" in engine and argname not in _pyarrow_unsupported: + pass else: raise ValueError( f"The {repr(argname)} option is not supported with the " diff --git 
a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index bb05b000c184f..915cc9a9a1f95 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -423,13 +423,9 @@ def test_dtype_backend(all_parsers): "e": pd.Series([pd.NA, 6], dtype="Int64"), "f": pd.Series([pd.NA, 7.5], dtype="Float64"), "g": pd.Series([pd.NA, True], dtype="boolean"), - "h": pd.Series( - [pd.NA if parser.engine != "pyarrow" else "", "a"], dtype="string" - ), + "h": pd.Series([pd.NA, "a"], dtype="string"), "i": pd.Series([Timestamp("2019-12-31")] * 2), - "j": pd.Series( - [pd.NA, pd.NA], dtype="Int64" if parser.engine != "pyarrow" else object - ), + "j": pd.Series([pd.NA, pd.NA], dtype="Int64"), } ) tm.assert_frame_equal(result, expected) @@ -451,7 +447,6 @@ def test_dtype_backend_and_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 pa = pytest.importorskip("pyarrow") @@ -499,7 +494,6 @@ def test_dtype_backend_pyarrow(all_parsers, request): # GH#36712 pa = pytest.importorskip("pyarrow") parser = all_parsers - engine = parser.engine data = """a,b,c,d,e,f,g,h,i,j 1,2.5,True,a,,,,,12-31-2019, @@ -516,7 +510,7 @@ def test_dtype_backend_pyarrow(all_parsers, request): "f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"), "g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"), "h": pd.Series( - [pd.NA if engine != "pyarrow" else "", "a"], + [pd.NA, "a"], dtype=pd.ArrowDtype(pa.string()), ), "i": pd.Series([Timestamp("2019-12-31")] * 2), diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 0ca4884153b59..647c1753cd660 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -20,7 +20,6 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@skip_pyarrow def 
test_string_nas(all_parsers): parser = all_parsers data = """A,B,C @@ -36,7 +35,6 @@ def test_string_nas(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_detect_string_na(all_parsers): parser = all_parsers data = """A,B @@ -89,7 +87,6 @@ def test_non_string_na_values(all_parsers, data, na_values): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_default_na_values(all_parsers): _NA_VALUES = { "-1.#IND", @@ -138,6 +135,7 @@ def f(i, v): tm.assert_frame_equal(result, expected) +# TODO: needs skiprows list support in pyarrow @skip_pyarrow @pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): @@ -172,6 +170,7 @@ def test_bool_na_values(all_parsers): tm.assert_frame_equal(result, expected) +# TODO: Needs pyarrow support for dictionary in na_values @skip_pyarrow def test_na_value_dict(all_parsers): data = """A,B,C @@ -191,7 +190,6 @@ def test_na_value_dict(all_parsers): tm.assert_frame_equal(df, expected) -@skip_pyarrow @pytest.mark.parametrize( "index_col,expected", [ @@ -225,6 +223,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) +# TODO: xfail components of this test, the first one passes @skip_pyarrow @pytest.mark.parametrize( "kwargs,expected", @@ -287,7 +286,6 @@ def test_na_values_keep_default(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_no_na_values_no_keep_default(all_parsers): # see gh-4318: passing na_values=None and # keep_default_na=False yields 'None" as a na_value @@ -314,6 +312,7 @@ def test_no_na_values_no_keep_default(all_parsers): tm.assert_frame_equal(result, expected) +# TODO: Blocked on na_values dict support in pyarrow @skip_pyarrow def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 @@ -326,6 +325,7 @@ def test_no_keep_default_na_dict_na_values(all_parsers): tm.assert_frame_equal(result, expected) +# TODO: Blocked on 
na_values dict support in pyarrow @skip_pyarrow def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 @@ -338,6 +338,7 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers): tm.assert_frame_equal(df, expected) +# TODO: Blocked on na_values dict support in pyarrow @skip_pyarrow @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): @@ -368,6 +369,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) +# TODO: Empty null_values doesn't work properly on pyarrow @skip_pyarrow @pytest.mark.parametrize( "na_filter,row_data", @@ -390,6 +392,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): tm.assert_frame_equal(result, expected) +# TODO: Arrow parse error @skip_pyarrow def test_na_trailing_columns(all_parsers): parser = all_parsers @@ -418,6 +421,7 @@ def test_na_trailing_columns(all_parsers): tm.assert_frame_equal(result, expected) +# TODO: xfail the na_values dict case @skip_pyarrow @pytest.mark.parametrize( "na_values,row_data", @@ -495,6 +499,7 @@ def test_empty_na_values_no_default_with_index(all_parsers): tm.assert_frame_equal(result, expected) +# TODO: Missing support for na_filter keyword @skip_pyarrow @pytest.mark.parametrize( "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 8c3474220cde8..55efb9254ee34 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1252,19 +1252,7 @@ def test_bad_date_parse(all_parsers, cache_dates, value): parser = all_parsers s = StringIO((f"{value},\n") * 50000) - if parser.engine == "pyarrow" and not cache_dates: - # None in input gets converted to 'None', for which - # pandas tries to guess the datetime format, triggering - 
# the warning. TODO: parse dates directly in pyarrow, see - # https://github.com/pandas-dev/pandas/issues/48017 - warn = UserWarning - else: - # Note: warning is not raised if 'cache_dates', because here there is only a - # single unique date and hence no risk of inconsistent parsing. - warn = None - parser.read_csv_check_warnings( - warn, - "Could not infer format", + parser.read_csv( s, header=None, names=["foo", "bar"],