Skip to content

Commit 0d9fdd4

Browse files
authored
ENH: Add dtype_backend support to read_sql (#50985)
* ENH: Add dtype_backend support to read_sql
* Fix indentation
* Update doc
1 parent 1d58928 commit 0d9fdd4

File tree

3 files changed

+118
-40
lines changed

3 files changed

+118
-40
lines changed

doc/source/whatsnew/v2.0.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ to select the nullable dtypes implementation.
6969
* :func:`read_html`
7070
* :func:`read_xml`
7171
* :func:`read_json`
72+
* :func:`read_sql`
73+
* :func:`read_sql_query`
74+
* :func:`read_sql_table`
7275
* :func:`read_parquet`
7376
* :func:`read_orc`
7477
* :func:`read_feather`

pandas/io/sql.py

+39
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
DataFrame,
6262
Series,
6363
)
64+
from pandas.core.arrays import ArrowExtensionArray
6465
from pandas.core.base import PandasObject
6566
import pandas.core.common as com
6667
from pandas.core.internals.construction import convert_object_array
@@ -166,6 +167,12 @@ def _convert_arrays_to_dataframe(
166167
coerce_float=coerce_float,
167168
use_nullable_dtypes=use_nullable_dtypes,
168169
)
170+
dtype_backend = get_option("mode.dtype_backend")
171+
if dtype_backend == "pyarrow":
172+
pa = import_optional_dependency("pyarrow")
173+
arrays = [
174+
ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays
175+
]
169176
if arrays:
170177
return DataFrame(dict(zip(columns, arrays)))
171178
else:
@@ -314,6 +321,14 @@ def read_sql_table(
314321
set to True, nullable dtypes are used for all dtypes that have a nullable
315322
implementation, even if no nulls are present.
316323
324+
.. note::
325+
326+
The nullable dtype implementation can be configured by calling
327+
``pd.set_option("mode.dtype_backend", "pandas")`` to use
328+
numpy-backed nullable dtypes or
329+
``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
330+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
331+
317332
.. versionadded:: 2.0
318333
319334
Returns
@@ -449,6 +464,14 @@ def read_sql_query(
449464
set to True, nullable dtypes are used for all dtypes that have a nullable
450465
implementation, even if no nulls are present.
451466
467+
.. note::
468+
469+
The nullable dtype implementation can be configured by calling
470+
``pd.set_option("mode.dtype_backend", "pandas")`` to use
471+
numpy-backed nullable dtypes or
472+
``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
473+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
474+
452475
.. versionadded:: 2.0
453476
454477
Returns
@@ -579,6 +602,14 @@ def read_sql(
579602
set to True, nullable dtypes are used for all dtypes that have a nullable
580603
implementation, even if no nulls are present.
581604
605+
.. note::
606+
607+
The nullable dtype implementation can be configured by calling
608+
``pd.set_option("mode.dtype_backend", "pandas")`` to use
609+
numpy-backed nullable dtypes or
610+
``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
611+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
612+
582613
.. versionadded:: 2.0
583614
dtype : Type name or dict of columns
584615
Data type for data or columns. E.g. np.float64 or
@@ -1626,6 +1657,14 @@ def read_table(
16261657
set to True, nullable dtypes are used for all dtypes that have a nullable
16271658
implementation, even if no nulls are present.
16281659
1660+
.. note::
1661+
1662+
The nullable dtype implementation can be configured by calling
1663+
``pd.set_option("mode.dtype_backend", "pandas")`` to use
1664+
numpy-backed nullable dtypes or
1665+
``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
1666+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
1667+
16291668
.. versionadded:: 2.0
16301669
16311670
Returns

pandas/tests/io/test_sql.py

+76-40
Original file line numberDiff line numberDiff line change
@@ -2360,61 +2360,73 @@ def test_get_engine_auto_error_message(self):
23602360

23612361
@pytest.mark.parametrize("option", [True, False])
23622362
@pytest.mark.parametrize("func", ["read_sql", "read_sql_query"])
2363-
def test_read_sql_nullable_dtypes(self, string_storage, func, option):
2363+
@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
2364+
def test_read_sql_nullable_dtypes(
2365+
self, string_storage, func, option, dtype_backend
2366+
):
23642367
# GH#50048
23652368
table = "test"
23662369
df = self.nullable_data()
23672370
df.to_sql(table, self.conn, index=False, if_exists="replace")
23682371

23692372
with pd.option_context("mode.string_storage", string_storage):
2370-
if option:
2371-
with pd.option_context("mode.nullable_dtypes", True):
2372-
result = getattr(pd, func)(f"Select * from {table}", self.conn)
2373-
else:
2374-
result = getattr(pd, func)(
2375-
f"Select * from {table}", self.conn, use_nullable_dtypes=True
2376-
)
2377-
expected = self.nullable_expected(string_storage)
2373+
with pd.option_context("mode.dtype_backend", dtype_backend):
2374+
if option:
2375+
with pd.option_context("mode.nullable_dtypes", True):
2376+
result = getattr(pd, func)(f"Select * from {table}", self.conn)
2377+
else:
2378+
result = getattr(pd, func)(
2379+
f"Select * from {table}", self.conn, use_nullable_dtypes=True
2380+
)
2381+
expected = self.nullable_expected(string_storage, dtype_backend)
23782382
tm.assert_frame_equal(result, expected)
23792383

23802384
with pd.option_context("mode.string_storage", string_storage):
2381-
iterator = getattr(pd, func)(
2382-
f"Select * from {table}",
2383-
self.conn,
2384-
use_nullable_dtypes=True,
2385-
chunksize=3,
2386-
)
2387-
expected = self.nullable_expected(string_storage)
2388-
for result in iterator:
2389-
tm.assert_frame_equal(result, expected)
2385+
with pd.option_context("mode.dtype_backend", dtype_backend):
2386+
iterator = getattr(pd, func)(
2387+
f"Select * from {table}",
2388+
self.conn,
2389+
use_nullable_dtypes=True,
2390+
chunksize=3,
2391+
)
2392+
expected = self.nullable_expected(string_storage, dtype_backend)
2393+
for result in iterator:
2394+
tm.assert_frame_equal(result, expected)
23902395

23912396
@pytest.mark.parametrize("option", [True, False])
23922397
@pytest.mark.parametrize("func", ["read_sql", "read_sql_table"])
2393-
def test_read_sql_nullable_dtypes_table(self, string_storage, func, option):
2398+
@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
2399+
def test_read_sql_nullable_dtypes_table(
2400+
self, string_storage, func, option, dtype_backend
2401+
):
23942402
# GH#50048
23952403
table = "test"
23962404
df = self.nullable_data()
23972405
df.to_sql(table, self.conn, index=False, if_exists="replace")
23982406

23992407
with pd.option_context("mode.string_storage", string_storage):
2400-
if option:
2401-
with pd.option_context("mode.nullable_dtypes", True):
2402-
result = getattr(pd, func)(table, self.conn)
2403-
else:
2404-
result = getattr(pd, func)(table, self.conn, use_nullable_dtypes=True)
2405-
expected = self.nullable_expected(string_storage)
2408+
with pd.option_context("mode.dtype_backend", dtype_backend):
2409+
if option:
2410+
with pd.option_context("mode.nullable_dtypes", True):
2411+
result = getattr(pd, func)(table, self.conn)
2412+
else:
2413+
result = getattr(pd, func)(
2414+
table, self.conn, use_nullable_dtypes=True
2415+
)
2416+
expected = self.nullable_expected(string_storage, dtype_backend)
24062417
tm.assert_frame_equal(result, expected)
24072418

24082419
with pd.option_context("mode.string_storage", string_storage):
2409-
iterator = getattr(pd, func)(
2410-
table,
2411-
self.conn,
2412-
use_nullable_dtypes=True,
2413-
chunksize=3,
2414-
)
2415-
expected = self.nullable_expected(string_storage)
2416-
for result in iterator:
2417-
tm.assert_frame_equal(result, expected)
2420+
with pd.option_context("mode.dtype_backend", dtype_backend):
2421+
iterator = getattr(pd, func)(
2422+
table,
2423+
self.conn,
2424+
use_nullable_dtypes=True,
2425+
chunksize=3,
2426+
)
2427+
expected = self.nullable_expected(string_storage, dtype_backend)
2428+
for result in iterator:
2429+
tm.assert_frame_equal(result, expected)
24182430

24192431
def nullable_data(self) -> DataFrame:
24202432
return DataFrame(
@@ -2430,7 +2442,7 @@ def nullable_data(self) -> DataFrame:
24302442
}
24312443
)
24322444

2433-
def nullable_expected(self, storage) -> DataFrame:
2445+
def nullable_expected(self, storage, dtype_backend) -> DataFrame:
24342446

24352447
string_array: StringArray | ArrowStringArray
24362448
string_array_na: StringArray | ArrowStringArray
@@ -2443,7 +2455,7 @@ def nullable_expected(self, storage) -> DataFrame:
24432455
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
24442456
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
24452457

2446-
return DataFrame(
2458+
df = DataFrame(
24472459
{
24482460
"a": Series([1, np.nan, 3], dtype="Int64"),
24492461
"b": Series([1, 2, 3], dtype="Int64"),
@@ -2455,6 +2467,18 @@ def nullable_expected(self, storage) -> DataFrame:
24552467
"h": string_array_na,
24562468
}
24572469
)
2470+
if dtype_backend == "pyarrow":
2471+
pa = pytest.importorskip("pyarrow")
2472+
2473+
from pandas.arrays import ArrowExtensionArray
2474+
2475+
df = DataFrame(
2476+
{
2477+
col: ArrowExtensionArray(pa.array(df[col], from_pandas=True))
2478+
for col in df.columns
2479+
}
2480+
)
2481+
return df
24582482

24592483
def test_chunksize_empty_dtypes(self):
24602484
# GH#50245
@@ -2578,8 +2602,14 @@ class Test(BaseModel):
25782602

25792603
assert list(df.columns) == ["id", "string_column"]
25802604

2581-
def nullable_expected(self, storage) -> DataFrame:
2582-
return super().nullable_expected(storage).astype({"e": "Int64", "f": "Int64"})
2605+
def nullable_expected(self, storage, dtype_backend) -> DataFrame:
2606+
df = super().nullable_expected(storage, dtype_backend)
2607+
if dtype_backend == "pandas":
2608+
df = df.astype({"e": "Int64", "f": "Int64"})
2609+
else:
2610+
df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"})
2611+
2612+
return df
25832613

25842614
@pytest.mark.parametrize("func", ["read_sql", "read_sql_table"])
25852615
def test_read_sql_nullable_dtypes_table(self, string_storage, func):
@@ -2613,8 +2643,14 @@ def setup_driver(cls):
26132643
def test_default_type_conversion(self):
26142644
pass
26152645

2616-
def nullable_expected(self, storage) -> DataFrame:
2617-
return super().nullable_expected(storage).astype({"e": "Int64", "f": "Int64"})
2646+
def nullable_expected(self, storage, dtype_backend) -> DataFrame:
2647+
df = super().nullable_expected(storage, dtype_backend)
2648+
if dtype_backend == "pandas":
2649+
df = df.astype({"e": "Int64", "f": "Int64"})
2650+
else:
2651+
df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"})
2652+
2653+
return df
26182654

26192655

26202656
@pytest.mark.db

0 commit comments

Comments
 (0)