From d35ffd26d97098bfe2798bf558155daf7d655914 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 20 Mar 2023 10:44:07 -0400 Subject: [PATCH 1/8] BUG: Fix some more arrow CSV tests --- doc/source/whatsnew/v2.1.0.rst | 3 ++- pandas/io/_util.py | 2 ++ pandas/io/parsers/arrow_parser_wrapper.py | 15 +++++++++++++++ pandas/io/parsers/readers.py | 3 +++ pandas/tests/io/parser/test_na_values.py | 15 +++++++++------ 5 files changed, 31 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 3f898ca23bd6f..f0508947ed161 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -199,7 +199,8 @@ Period - Bug in :class:`PeriodDtype` constructor failing to raise ``TypeError`` when no argument is passed or when ``None`` is passed (:issue:`27388`) - Bug in :class:`PeriodDtype` constructor raising ``ValueError`` instead of ``TypeError`` when an invalid type is passed (:issue:`51790`) - Bug in :meth:`arrays.PeriodArray.map` and :meth:`PeriodIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) -- +- Bug in :func:`read_csv` not processing empty strings as a null value, with ``engine="pyarrow"`` (:issue:`52087 `) +- Bug in :func:`read_csv` returning ``object`` dtype columns instead of ``float64`` dtype columns with ``engine="pyarrow"`` for columns that are all null with ``engine="pyarrow"`` (:issue:`52087 `) Plotting ^^^^^^^^ diff --git a/pandas/io/_util.py b/pandas/io/_util.py index d2a001f0cf925..8834f9d08b23f 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -8,6 +8,8 @@ def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") return { + # All nulls should still give Float64 not object + pa.null(): pd.Float64Dtype(), pa.int8(): pd.Int8Dtype(), pa.int16(): pd.Int16Dtype(), pa.int32(): pd.Int32Dtype(), diff --git a/pandas/io/parsers/arrow_parser_wrapper.py 
b/pandas/io/parsers/arrow_parser_wrapper.py index a741a11332e99..31847fbb1dd59 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -80,6 +80,7 @@ def _get_pyarrow_options(self) -> None: "decimal_point", ) } + self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] self.read_options = { "autogenerate_column_names": self.header is None, "skip_rows": self.header @@ -149,6 +150,7 @@ def read(self) -> DataFrame: DataFrame The DataFrame created from the CSV file. """ + pa = import_optional_dependency("pyarrow") pyarrow_csv = import_optional_dependency("pyarrow.csv") self._get_pyarrow_options() @@ -158,6 +160,19 @@ def read(self) -> DataFrame: parse_options=pyarrow_csv.ParseOptions(**self.parse_options), convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), ) + + # Convert all pa.null() cols -> float64 + # TODO: There has to be a better way... right? + cols_to_convert = [] + new_schema = table.schema + for i, type in enumerate(table.schema.types): + if pa.types.is_null(type): + cols_to_convert.append(i) + for i in cols_to_convert: + new_schema = new_schema.set(i, new_schema.field(i).with_type(pa.float64())) + + table = table.cast(new_schema) + if self.kwds["dtype_backend"] == "pyarrow": frame = table.to_pandas(types_mapper=pd.ArrowDtype) elif self.kwds["dtype_backend"] == "numpy_nullable": frame = table.to_pandas(types_mapper=_arrow_dtype_mapping().get) else: frame = table.to_pandas() diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index df675a0a3a6cc..20ce47de0bdb5 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1458,8 +1458,11 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: value = kwds[argname] if engine != "c" and value != default: + # TODO: Refactor this logic, it's pretty convoluted if "python" in engine and argname not in _python_unsupported: pass + elif "pyarrow" in engine and argname not in _pyarrow_unsupported: + pass else: raise ValueError( f"The {repr(argname)} option is not 
supported with the " diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 0ca4884153b59..eb22be0c91c7c 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -20,7 +20,6 @@ xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@skip_pyarrow def test_string_nas(all_parsers): parser = all_parsers data = """A,B,C @@ -36,7 +35,6 @@ def test_string_nas(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_detect_string_na(all_parsers): parser = all_parsers data = """A,B @@ -89,7 +87,6 @@ def test_non_string_na_values(all_parsers, data, na_values): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_default_na_values(all_parsers): _NA_VALUES = { "-1.#IND", @@ -138,6 +135,7 @@ def f(i, v): tm.assert_frame_equal(result, expected) +# TODO: needs skiprows list support in pyarrow @skip_pyarrow @pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): @@ -172,6 +170,7 @@ def test_bool_na_values(all_parsers): tm.assert_frame_equal(result, expected) +# TODO: Needs pyarrow support for dictionary in na_values @skip_pyarrow def test_na_value_dict(all_parsers): data = """A,B,C @@ -191,7 +190,6 @@ def test_na_value_dict(all_parsers): tm.assert_frame_equal(df, expected) -@skip_pyarrow @pytest.mark.parametrize( "index_col,expected", [ @@ -225,6 +223,7 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) +# TODO: xfail components of this test, the first one passes @skip_pyarrow @pytest.mark.parametrize( "kwargs,expected", @@ -287,7 +286,6 @@ def test_na_values_keep_default(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_no_na_values_no_keep_default(all_parsers): # see gh-4318: passing na_values=None and # keep_default_na=False yields 'None" as a na_value @@ -314,6 +312,7 @@ def 
test_no_na_values_no_keep_default(all_parsers): tm.assert_frame_equal(result, expected) +# TODO: Blocked on na_values dict support in pyarrow @skip_pyarrow def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 @@ -326,6 +325,7 @@ def test_no_keep_default_na_dict_na_values(all_parsers): tm.assert_frame_equal(result, expected) +# TODO: Blocked on na_values dict support in pyarrow @skip_pyarrow def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 @@ -338,6 +338,7 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers): tm.assert_frame_equal(df, expected) +# TODO: Blocked on na_values dict support in pyarrow @skip_pyarrow @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): @@ -368,6 +369,7 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) +# TODO: Empty null_values doesn't work properly on pyarrow @skip_pyarrow @pytest.mark.parametrize( "na_filter,row_data", @@ -390,6 +392,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): tm.assert_frame_equal(result, expected) +# TODO: Arrow parse error @skip_pyarrow def test_na_trailing_columns(all_parsers): parser = all_parsers @@ -418,6 +421,7 @@ def test_na_trailing_columns(all_parsers): tm.assert_frame_equal(result, expected) +# TODO: xfail the na_values dict case @skip_pyarrow @pytest.mark.parametrize( "na_values,row_data", @@ -495,7 +499,6 @@ def test_empty_na_values_no_default_with_index(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] ) From 04857b31641b260956a7232f46e41297dfe1ab2f Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 20 Mar 2023 13:26:51 -0400 Subject: [PATCH 2/8] Apply suggestions from code review 
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index f0508947ed161..d3adc21aad095 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -199,8 +199,8 @@ Period - Bug in :class:`PeriodDtype` constructor failing to raise ``TypeError`` when no argument is passed or when ``None`` is passed (:issue:`27388`) - Bug in :class:`PeriodDtype` constructor raising ``ValueError`` instead of ``TypeError`` when an invalid type is passed (:issue:`51790`) - Bug in :meth:`arrays.PeriodArray.map` and :meth:`PeriodIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`) -- Bug in :func:`read_csv` not processing empty strings as a null value, with ``engine="pyarrow"`` (:issue:`52087 `) -- Bug in :func:`read_csv` returning ``object`` dtype columns instead of ``float64`` dtype columns with ``engine="pyarrow"`` for columns that are all null with ``engine="pyarrow"`` (:issue:`52087 `) +- Bug in :func:`read_csv` not processing empty strings as a null value, with ``engine="pyarrow"`` (:issue:`52087`) +- Bug in :func:`read_csv` returning ``object`` dtype columns instead of ``float64`` dtype columns with ``engine="pyarrow"`` for columns that are all null with ``engine="pyarrow"`` (:issue:`52087`) Plotting ^^^^^^^^ From fb08b13ee4e7cc0bf7f60a43f1a75bfc38306819 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 20 Mar 2023 13:31:37 -0400 Subject: [PATCH 3/8] simplify --- pandas/io/parsers/arrow_parser_wrapper.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 31847fbb1dd59..694d40d7b9234 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ 
b/pandas/io/parsers/arrow_parser_wrapper.py @@ -163,13 +163,12 @@ def read(self) -> DataFrame: # Convert all pa.null() cols -> float64 # TODO: There has to be a better way... right? - cols_to_convert = [] new_schema = table.schema - for i, type in enumerate(table.schema.types): - if pa.types.is_null(type): - cols_to_convert.append(i) - for i in cols_to_convert: - new_schema = new_schema.set(i, new_schema.field(i).with_type(pa.float64())) + for i, arrow_type in enumerate(table.schema.types): + if pa.types.is_null(arrow_type): + new_schema = new_schema.set( + i, new_schema.field(i).with_type(pa.float64()) + ) table = table.cast(new_schema) From c0268d975b12b2a4e43b2d1721acb7e134bfe872 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 20 Mar 2023 16:06:44 -0400 Subject: [PATCH 4/8] skip test --- pandas/tests/io/parser/test_na_values.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index eb22be0c91c7c..647c1753cd660 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -499,6 +499,8 @@ def test_empty_na_values_no_default_with_index(all_parsers): tm.assert_frame_equal(result, expected) +# TODO: Missing support for na_filter keyword +@skip_pyarrow @pytest.mark.parametrize( "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] ) From d7d5e3237aa2d116732df3a0c7d8c7f5231e8687 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 20 Mar 2023 16:09:21 -0400 Subject: [PATCH 5/8] Update _util.py --- pandas/io/_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 8834f9d08b23f..2c3eedc5b73dd 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -9,7 +9,8 @@ def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") return { # All nulls should
still give Float64 not object - pa.null(): pd.Float64Dtype(), + # TODO: This breaks parquet + # pa.null(): pd.Float64Dtype(), pa.int8(): pd.Int8Dtype(), pa.int16(): pd.Int16Dtype(), pa.int32(): pd.Int32Dtype(), From b4014c1897d7fa11214db894e210214d1a790a10 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 24 Mar 2023 10:31:55 -0400 Subject: [PATCH 6/8] fix tests --- pandas/io/parsers/arrow_parser_wrapper.py | 17 +++++++++-------- .../tests/io/parser/dtypes/test_dtypes_basic.py | 12 +++--------- pandas/tests/io/parser/test_parse_dates.py | 12 +----------- 3 files changed, 13 insertions(+), 28 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 694d40d7b9234..39e17a404450a 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -163,14 +163,15 @@ def read(self) -> DataFrame: # Convert all pa.null() cols -> float64 # TODO: There has to be a better way... right? 
- new_schema = table.schema - for i, arrow_type in enumerate(table.schema.types): - if pa.types.is_null(arrow_type): - new_schema = new_schema.set( - i, new_schema.field(i).with_type(pa.float64()) - ) - - table = table.cast(new_schema) + if self.kwds["dtype_backend"] != "pyarrow": + new_schema = table.schema + for i, arrow_type in enumerate(table.schema.types): + if pa.types.is_null(arrow_type): + new_schema = new_schema.set( + i, new_schema.field(i).with_type(pa.float64()) + ) + + table = table.cast(new_schema) if self.kwds["dtype_backend"] == "pyarrow": frame = table.to_pandas(types_mapper=pd.ArrowDtype) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index bb05b000c184f..2f6289ed3bb11 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -423,13 +423,9 @@ def test_dtype_backend(all_parsers): "e": pd.Series([pd.NA, 6], dtype="Int64"), "f": pd.Series([pd.NA, 7.5], dtype="Float64"), "g": pd.Series([pd.NA, True], dtype="boolean"), - "h": pd.Series( - [pd.NA if parser.engine != "pyarrow" else "", "a"], dtype="string" - ), + "h": pd.Series([pd.NA, "a"], dtype="string"), "i": pd.Series([Timestamp("2019-12-31")] * 2), - "j": pd.Series( - [pd.NA, pd.NA], dtype="Int64" if parser.engine != "pyarrow" else object - ), + "j": pd.Series([pd.NA, pd.NA], dtype="Float64"), } ) tm.assert_frame_equal(result, expected) @@ -451,7 +447,6 @@ def test_dtype_backend_and_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_backend_string(all_parsers, string_storage): # GH#36712 pa = pytest.importorskip("pyarrow") @@ -499,7 +494,6 @@ def test_dtype_backend_pyarrow(all_parsers, request): # GH#36712 pa = pytest.importorskip("pyarrow") parser = all_parsers - engine = parser.engine data = """a,b,c,d,e,f,g,h,i,j 1,2.5,True,a,,,,,12-31-2019, @@ -516,7 +510,7 @@ def 
test_dtype_backend_pyarrow(all_parsers, request): "f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"), "g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"), "h": pd.Series( - [pd.NA if engine != "pyarrow" else "", "a"], + [pd.NA, "a"], dtype=pd.ArrowDtype(pa.string()), ), "i": pd.Series([Timestamp("2019-12-31")] * 2), diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index f3c49471b5bb2..9af261753122a 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1252,17 +1252,7 @@ def test_bad_date_parse(all_parsers, cache_dates, value): parser = all_parsers s = StringIO((f"{value},\n") * 50000) - if parser.engine == "pyarrow": - # None in input gets converted to 'None', for which - # pandas tries to guess the datetime format, triggering - # the warning. TODO: parse dates directly in pyarrow, see - # https://github.com/pandas-dev/pandas/issues/48017 - warn = UserWarning - else: - warn = None - parser.read_csv_check_warnings( - warn, - "Could not infer format", + parser.read_csv( s, header=None, names=["foo", "bar"], From 671df0f411ed4057a834ff3081cdde47e8bb1b1c Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 24 Mar 2023 10:51:06 -0400 Subject: [PATCH 7/8] last one? 
--- pandas/io/parsers/arrow_parser_wrapper.py | 17 ++++++++++++----- .../tests/io/parser/dtypes/test_dtypes_basic.py | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 39e17a404450a..5112262cec95b 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -161,21 +161,28 @@ def read(self) -> DataFrame: convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), ) - # Convert all pa.null() cols -> float64 + dtype_backend = self.kwds["dtype_backend"] + + # Convert all pa.null() cols -> float64 (non nullable) + # else Int64 (nullable case) # TODO: There has to be a better way... right? - if self.kwds["dtype_backend"] != "pyarrow": + if dtype_backend != "pyarrow": new_schema = table.schema + if dtype_backend == "numpy_nullable": + new_type = pa.int64() + else: + new_type = pa.float64() for i, arrow_type in enumerate(table.schema.types): if pa.types.is_null(arrow_type): new_schema = new_schema.set( - i, new_schema.field(i).with_type(pa.float64()) + i, new_schema.field(i).with_type(new_type) ) table = table.cast(new_schema) - if self.kwds["dtype_backend"] == "pyarrow": + if dtype_backend == "pyarrow": frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif self.kwds["dtype_backend"] == "numpy_nullable": + elif dtype_backend == "numpy_nullable": frame = table.to_pandas(types_mapper=_arrow_dtype_mapping().get) else: frame = table.to_pandas() diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 2f6289ed3bb11..915cc9a9a1f95 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -425,7 +425,7 @@ def test_dtype_backend(all_parsers): "g": pd.Series([pd.NA, True], dtype="boolean"), "h": pd.Series([pd.NA, "a"], dtype="string"), "i": pd.Series([Timestamp("2019-12-31")] * 2), - 
"j": pd.Series([pd.NA, pd.NA], dtype="Float64"), + "j": pd.Series([pd.NA, pd.NA], dtype="Int64"), } ) tm.assert_frame_equal(result, expected) From 2b4aae7101586237cecdf7f0046711135bd45eee Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 9 Apr 2023 22:21:40 -0400 Subject: [PATCH 8/8] simplify according to code review --- pandas/io/_util.py | 3 --- pandas/io/parsers/arrow_parser_wrapper.py | 17 +++++++++-------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 2c3eedc5b73dd..d2a001f0cf925 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -8,9 +8,6 @@ def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") return { - # All nulls should still give Float64 not object - # TODO: This breaks parquet - # pa.null(): pd.Float64Dtype(), pa.int8(): pd.Int8Dtype(), pa.int16(): pd.Int16Dtype(), pa.int32(): pd.Int32Dtype(), diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 5112262cec95b..b7b2ddf0293b5 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING +from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.inference import is_integer @@ -164,14 +165,10 @@ def read(self) -> DataFrame: dtype_backend = self.kwds["dtype_backend"] # Convert all pa.null() cols -> float64 (non nullable) - # else Int64 (nullable case) - # TODO: There has to be a better way... right? 
- if dtype_backend != "pyarrow": + # else Int64 (nullable case, see below) + if dtype_backend is lib.no_default: new_schema = table.schema - if dtype_backend == "numpy_nullable": - new_type = pa.int64() - else: - new_type = pa.float64() + new_type = pa.float64() for i, arrow_type in enumerate(table.schema.types): if pa.types.is_null(arrow_type): new_schema = new_schema.set( @@ -183,7 +180,11 @@ def read(self) -> DataFrame: if dtype_backend == "pyarrow": frame = table.to_pandas(types_mapper=pd.ArrowDtype) elif dtype_backend == "numpy_nullable": - frame = table.to_pandas(types_mapper=_arrow_dtype_mapping().get) + # Modify the default mapping to also + # map null to Int64 (to match other engines) + dtype_mapping = _arrow_dtype_mapping() + dtype_mapping[pa.null()] = pd.Int64Dtype() + frame = table.to_pandas(types_mapper=dtype_mapping.get) else: frame = table.to_pandas() return self._finalize_pandas_output(frame)