Skip to content

Commit 38565aa

Browse files
Backport PR #60324: REF: centralize pyarrow Table to pandas conversions and types_mapper handling (#60332)
(cherry picked from commit 12d6f60) Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 4f13697 commit 38565aa

File tree

8 files changed

+92
-122
lines changed

8 files changed

+92
-122
lines changed

pandas/io/_util.py

+47-2
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,27 @@
11
from __future__ import annotations
22

3-
from typing import Callable
3+
from typing import (
4+
TYPE_CHECKING,
5+
Literal,
6+
)
47

58
import numpy as np
69

10+
from pandas._config import using_string_dtype
11+
12+
from pandas._libs import lib
713
from pandas.compat import pa_version_under18p0
814
from pandas.compat._optional import import_optional_dependency
915

1016
import pandas as pd
1117

18+
if TYPE_CHECKING:
19+
from collections.abc import Callable
20+
21+
import pyarrow
22+
23+
from pandas._typing import DtypeBackend
24+
1225

1326
def _arrow_dtype_mapping() -> dict:
1427
pa = import_optional_dependency("pyarrow")
@@ -30,7 +43,7 @@ def _arrow_dtype_mapping() -> dict:
3043
}
3144

3245

33-
def arrow_string_types_mapper() -> Callable:
46+
def _arrow_string_types_mapper() -> Callable:
3447
pa = import_optional_dependency("pyarrow")
3548

3649
mapping = {
@@ -41,3 +54,35 @@ def arrow_string_types_mapper() -> Callable:
4154
mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan)
4255

4356
return mapping.get
57+
58+
59+
def arrow_table_to_pandas(
60+
table: pyarrow.Table,
61+
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
62+
null_to_int64: bool = False,
63+
to_pandas_kwargs: dict | None = None,
64+
) -> pd.DataFrame:
65+
if to_pandas_kwargs is None:
66+
to_pandas_kwargs = {}
67+
68+
pa = import_optional_dependency("pyarrow")
69+
70+
types_mapper: type[pd.ArrowDtype] | None | Callable
71+
if dtype_backend == "numpy_nullable":
72+
mapping = _arrow_dtype_mapping()
73+
if null_to_int64:
74+
# Modify the default mapping to also map null to Int64
75+
# (to match other engines - only for CSV parser)
76+
mapping[pa.null()] = pd.Int64Dtype()
77+
types_mapper = mapping.get
78+
elif dtype_backend == "pyarrow":
79+
types_mapper = pd.ArrowDtype
80+
elif using_string_dtype():
81+
types_mapper = _arrow_string_types_mapper()
82+
elif dtype_backend is lib.no_default or dtype_backend == "numpy":
83+
types_mapper = None
84+
else:
85+
raise NotImplementedError
86+
87+
df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
88+
return df

pandas/io/feather_format.py

+2-15
Original file line number | Diff line number | Diff line change
@@ -13,11 +13,10 @@
1313
from pandas.util._decorators import doc
1414
from pandas.util._validators import check_dtype_backend
1515

16-
import pandas as pd
1716
from pandas.core.api import DataFrame
1817
from pandas.core.shared_docs import _shared_docs
1918

20-
from pandas.io._util import arrow_string_types_mapper
19+
from pandas.io._util import arrow_table_to_pandas
2120
from pandas.io.common import get_handle
2221

2322
if TYPE_CHECKING:
@@ -128,16 +127,4 @@ def read_feather(
128127
pa_table = feather.read_table(
129128
handles.handle, columns=columns, use_threads=bool(use_threads)
130129
)
131-
132-
if dtype_backend == "numpy_nullable":
133-
from pandas.io._util import _arrow_dtype_mapping
134-
135-
return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get)
136-
137-
elif dtype_backend == "pyarrow":
138-
return pa_table.to_pandas(types_mapper=pd.ArrowDtype)
139-
140-
elif using_string_dtype():
141-
return pa_table.to_pandas(types_mapper=arrow_string_types_mapper())
142-
else:
143-
raise NotImplementedError
130+
return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)

pandas/io/json/_json.py

+2-13
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,6 @@
4040
from pandas.core.dtypes.dtypes import PeriodDtype
4141

4242
from pandas import (
43-
ArrowDtype,
4443
DataFrame,
4544
Index,
4645
MultiIndex,
@@ -52,6 +51,7 @@
5251
from pandas.core.reshape.concat import concat
5352
from pandas.core.shared_docs import _shared_docs
5453

54+
from pandas.io._util import arrow_table_to_pandas
5555
from pandas.io.common import (
5656
IOHandles,
5757
dedup_names,
@@ -997,18 +997,7 @@ def read(self) -> DataFrame | Series:
997997
if self.engine == "pyarrow":
998998
pyarrow_json = import_optional_dependency("pyarrow.json")
999999
pa_table = pyarrow_json.read_json(self.data)
1000-
1001-
mapping: type[ArrowDtype] | None | Callable
1002-
if self.dtype_backend == "pyarrow":
1003-
mapping = ArrowDtype
1004-
elif self.dtype_backend == "numpy_nullable":
1005-
from pandas.io._util import _arrow_dtype_mapping
1006-
1007-
mapping = _arrow_dtype_mapping().get
1008-
else:
1009-
mapping = None
1010-
1011-
return pa_table.to_pandas(types_mapper=mapping)
1000+
return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend)
10121001
elif self.engine == "ujson":
10131002
if self.lines:
10141003
if self.chunksize:

pandas/io/orc.py

+2-19
Original file line number | Diff line number | Diff line change
@@ -9,16 +9,13 @@
99
Literal,
1010
)
1111

12-
from pandas._config import using_string_dtype
13-
1412
from pandas._libs import lib
1513
from pandas.compat._optional import import_optional_dependency
1614
from pandas.util._validators import check_dtype_backend
1715

18-
import pandas as pd
1916
from pandas.core.indexes.api import default_index
2017

21-
from pandas.io._util import arrow_string_types_mapper
18+
from pandas.io._util import arrow_table_to_pandas
2219
from pandas.io.common import (
2320
get_handle,
2421
is_fsspec_url,
@@ -117,21 +114,7 @@ def read_orc(
117114
pa_table = orc.read_table(
118115
source=source, columns=columns, filesystem=filesystem, **kwargs
119116
)
120-
if dtype_backend is not lib.no_default:
121-
if dtype_backend == "pyarrow":
122-
df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
123-
else:
124-
from pandas.io._util import _arrow_dtype_mapping
125-
126-
mapping = _arrow_dtype_mapping()
127-
df = pa_table.to_pandas(types_mapper=mapping.get)
128-
return df
129-
else:
130-
if using_string_dtype():
131-
types_mapper = arrow_string_types_mapper()
132-
else:
133-
types_mapper = None
134-
return pa_table.to_pandas(types_mapper=types_mapper)
117+
return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
135118

136119

137120
def to_orc(

pandas/io/parquet.py

+18-16
Original file line number | Diff line number | Diff line change
@@ -10,9 +10,11 @@
1010
Literal,
1111
)
1212
import warnings
13-
from warnings import catch_warnings
13+
from warnings import (
14+
catch_warnings,
15+
filterwarnings,
16+
)
1417

15-
from pandas._config import using_string_dtype
1618
from pandas._config.config import _get_option
1719

1820
from pandas._libs import lib
@@ -22,14 +24,13 @@
2224
from pandas.util._exceptions import find_stack_level
2325
from pandas.util._validators import check_dtype_backend
2426

25-
import pandas as pd
2627
from pandas import (
2728
DataFrame,
2829
get_option,
2930
)
3031
from pandas.core.shared_docs import _shared_docs
3132

32-
from pandas.io._util import arrow_string_types_mapper
33+
from pandas.io._util import arrow_table_to_pandas
3334
from pandas.io.common import (
3435
IOHandles,
3536
get_handle,
@@ -250,20 +251,10 @@ def read(
250251
kwargs["use_pandas_metadata"] = True
251252

252253
to_pandas_kwargs = {}
253-
if dtype_backend == "numpy_nullable":
254-
from pandas.io._util import _arrow_dtype_mapping
255-
256-
mapping = _arrow_dtype_mapping()
257-
to_pandas_kwargs["types_mapper"] = mapping.get
258-
elif dtype_backend == "pyarrow":
259-
to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment]
260-
elif using_string_dtype():
261-
to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper()
262254

263255
manager = _get_option("mode.data_manager", silent=True)
264256
if manager == "array":
265-
to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment]
266-
257+
to_pandas_kwargs["split_blocks"] = True
267258
path_or_handle, handles, filesystem = _get_path_or_handle(
268259
path,
269260
filesystem,
@@ -278,7 +269,18 @@ def read(
278269
filters=filters,
279270
**kwargs,
280271
)
281-
result = pa_table.to_pandas(**to_pandas_kwargs)
272+
273+
with catch_warnings():
274+
filterwarnings(
275+
"ignore",
276+
"make_block is deprecated",
277+
DeprecationWarning,
278+
)
279+
result = arrow_table_to_pandas(
280+
pa_table,
281+
dtype_backend=dtype_backend,
282+
to_pandas_kwargs=to_pandas_kwargs,
283+
)
282284

283285
if manager == "array":
284286
result = result._as_manager("array", copy=False)

pandas/io/parsers/arrow_parser_wrapper.py

+12-21
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,6 @@
33
from typing import TYPE_CHECKING
44
import warnings
55

6-
from pandas._config import using_string_dtype
7-
86
from pandas._libs import lib
97
from pandas.compat._optional import import_optional_dependency
108
from pandas.errors import (
@@ -16,18 +14,14 @@
1614
from pandas.core.dtypes.common import pandas_dtype
1715
from pandas.core.dtypes.inference import is_integer
1816

19-
import pandas as pd
20-
from pandas import DataFrame
21-
22-
from pandas.io._util import (
23-
_arrow_dtype_mapping,
24-
arrow_string_types_mapper,
25-
)
17+
from pandas.io._util import arrow_table_to_pandas
2618
from pandas.io.parsers.base_parser import ParserBase
2719

2820
if TYPE_CHECKING:
2921
from pandas._typing import ReadBuffer
3022

23+
from pandas import DataFrame
24+
3125

3226
class ArrowParserWrapper(ParserBase):
3327
"""
@@ -287,17 +281,14 @@ def read(self) -> DataFrame:
287281

288282
table = table.cast(new_schema)
289283

290-
if dtype_backend == "pyarrow":
291-
frame = table.to_pandas(types_mapper=pd.ArrowDtype)
292-
elif dtype_backend == "numpy_nullable":
293-
# Modify the default mapping to also
294-
# map null to Int64 (to match other engines)
295-
dtype_mapping = _arrow_dtype_mapping()
296-
dtype_mapping[pa.null()] = pd.Int64Dtype()
297-
frame = table.to_pandas(types_mapper=dtype_mapping.get)
298-
elif using_string_dtype():
299-
frame = table.to_pandas(types_mapper=arrow_string_types_mapper())
284+
with warnings.catch_warnings():
285+
warnings.filterwarnings(
286+
"ignore",
287+
"make_block is deprecated",
288+
DeprecationWarning,
289+
)
290+
frame = arrow_table_to_pandas(
291+
table, dtype_backend=dtype_backend, null_to_int64=True
292+
)
300293

301-
else:
302-
frame = table.to_pandas()
303294
return self._finalize_pandas_output(frame)

pandas/io/sql.py

+7-34
Original file line number | Diff line number | Diff line change
@@ -49,10 +49,7 @@
4949
is_object_dtype,
5050
is_string_dtype,
5151
)
52-
from pandas.core.dtypes.dtypes import (
53-
ArrowDtype,
54-
DatetimeTZDtype,
55-
)
52+
from pandas.core.dtypes.dtypes import DatetimeTZDtype
5653
from pandas.core.dtypes.missing import isna
5754

5855
from pandas import get_option
@@ -68,6 +65,8 @@
6865
from pandas.core.internals.construction import convert_object_array
6966
from pandas.core.tools.datetimes import to_datetime
7067

68+
from pandas.io._util import arrow_table_to_pandas
69+
7170
if TYPE_CHECKING:
7271
from collections.abc import (
7372
Iterator,
@@ -2221,23 +2220,10 @@ def read_table(
22212220
else:
22222221
stmt = f"SELECT {select_list} FROM {table_name}"
22232222

2224-
mapping: type[ArrowDtype] | None | Callable
2225-
if dtype_backend == "pyarrow":
2226-
mapping = ArrowDtype
2227-
elif dtype_backend == "numpy_nullable":
2228-
from pandas.io._util import _arrow_dtype_mapping
2229-
2230-
mapping = _arrow_dtype_mapping().get
2231-
elif using_string_dtype():
2232-
from pandas.io._util import arrow_string_types_mapper
2233-
2234-
mapping = arrow_string_types_mapper()
2235-
else:
2236-
mapping = None
2237-
22382223
with self.con.cursor() as cur:
22392224
cur.execute(stmt)
2240-
df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping)
2225+
pa_table = cur.fetch_arrow_table()
2226+
df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
22412227

22422228
return _wrap_result_adbc(
22432229
df,
@@ -2305,23 +2291,10 @@ def read_query(
23052291
if chunksize:
23062292
raise NotImplementedError("'chunksize' is not implemented for ADBC drivers")
23072293

2308-
mapping: type[ArrowDtype] | None | Callable
2309-
if dtype_backend == "pyarrow":
2310-
mapping = ArrowDtype
2311-
elif dtype_backend == "numpy_nullable":
2312-
from pandas.io._util import _arrow_dtype_mapping
2313-
2314-
mapping = _arrow_dtype_mapping().get
2315-
elif using_string_dtype():
2316-
from pandas.io._util import arrow_string_types_mapper
2317-
2318-
mapping = arrow_string_types_mapper()
2319-
else:
2320-
mapping = None
2321-
23222294
with self.con.cursor() as cur:
23232295
cur.execute(sql)
2324-
df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping)
2296+
pa_table = cur.fetch_arrow_table()
2297+
df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
23252298

23262299
return _wrap_result_adbc(
23272300
df,

pandas/tests/io/test_sql.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -959,12 +959,12 @@ def sqlite_buildin_types(sqlite_buildin, types_data):
959959

960960
adbc_connectable_iris = [
961961
pytest.param("postgresql_adbc_iris", marks=pytest.mark.db),
962-
pytest.param("sqlite_adbc_iris", marks=pytest.mark.db),
962+
"sqlite_adbc_iris",
963963
]
964964

965965
adbc_connectable_types = [
966966
pytest.param("postgresql_adbc_types", marks=pytest.mark.db),
967-
pytest.param("sqlite_adbc_types", marks=pytest.mark.db),
967+
"sqlite_adbc_types",
968968
]
969969

970970

0 commit comments

Comments (0)