From 2097d9f60d38de9ce620db3d5f47f274ea6abbc3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 25 Jan 2023 21:20:57 -0500 Subject: [PATCH 1/3] ENH: Add dtype_backend support to read_sql --- doc/source/whatsnew/v2.0.0.rst | 3 + pandas/io/sql.py | 31 +++++++++ pandas/tests/io/test_sql.py | 112 ++++++++++++++++++++++----------- 3 files changed, 108 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index c28c9fdad1804..ca98d41070fb3 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -69,6 +69,9 @@ to select the nullable dtypes implementation. * :func:`read_html` * :func:`read_xml` * :func:`read_json` +* :func:`read_sql` +* :func:`read_sql_query` +* :func:`read_sql_table` * :func:`read_parquet` * :func:`read_orc` * :func:`read_feather` diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 8ba208aa84286..aba6df57b0da5 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -58,6 +58,7 @@ DataFrame, Series, ) +from pandas.core.arrays import ArrowExtensionArray from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.internals.construction import convert_object_array @@ -155,6 +156,12 @@ def _convert_arrays_to_dataframe( coerce_float=coerce_float, use_nullable_dtypes=use_nullable_dtypes, ) + dtype_backend = get_option("mode.dtype_backend") + if dtype_backend == "pyarrow": + pa = import_optional_dependency("pyarrow") + arrays = [ + ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays + ] if arrays: return DataFrame(dict(zip(columns, arrays))) else: @@ -303,6 +310,12 @@ def read_sql_table( set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. 
+ The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + .. versionadded:: 2.0 Returns @@ -438,6 +451,12 @@ def read_sql_query( set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + .. versionadded:: 2.0 Returns @@ -568,6 +587,12 @@ def read_sql( set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + .. versionadded:: 2.0 dtype : Type name or dict of columns Data type for data or columns. E.g. np.float64 or @@ -1609,6 +1634,12 @@ def read_table( set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + .. 
versionadded:: 2.0 Returns diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a5bcfa8845785..9483ad1e23c7e 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2293,59 +2293,71 @@ def test_get_engine_auto_error_message(self): @pytest.mark.parametrize("option", [True, False]) @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) - def test_read_sql_nullable_dtypes(self, string_storage, func, option): + @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) + def test_read_sql_nullable_dtypes( + self, string_storage, func, option, dtype_backend + ): # GH#50048 table = "test" df = self.nullable_data() df.to_sql(table, self.conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): - if option: - with pd.option_context("mode.nullable_dtypes", True): - result = getattr(pd, func)(f"Select * from {table}", self.conn) - else: - result = getattr(pd, func)( - f"Select * from {table}", self.conn, use_nullable_dtypes=True - ) - expected = self.nullable_expected(string_storage) + with pd.option_context("mode.dtype_backend", dtype_backend): + if option: + with pd.option_context("mode.nullable_dtypes", True): + result = getattr(pd, func)(f"Select * from {table}", self.conn) + else: + result = getattr(pd, func)( + f"Select * from {table}", self.conn, use_nullable_dtypes=True + ) + expected = self.nullable_expected(string_storage, dtype_backend) tm.assert_frame_equal(result, expected) with pd.option_context("mode.string_storage", string_storage): - iterator = getattr(pd, func)( - f"Select * from {table}", - self.conn, - use_nullable_dtypes=True, - chunksize=3, - ) - expected = self.nullable_expected(string_storage) - for result in iterator: - tm.assert_frame_equal(result, expected) + with pd.option_context("mode.dtype_backend", dtype_backend): + iterator = getattr(pd, func)( + f"Select * from {table}", + self.conn, + use_nullable_dtypes=True, + chunksize=3, + ) + 
expected = self.nullable_expected(string_storage, dtype_backend) + for result in iterator: + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("option", [True, False]) @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) - def test_read_sql_nullable_dtypes_table(self, string_storage, func, option): + @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) + def test_read_sql_nullable_dtypes_table( + self, string_storage, func, option, dtype_backend + ): # GH#50048 table = "test" df = self.nullable_data() df.to_sql(table, self.conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): - if option: - with pd.option_context("mode.nullable_dtypes", True): - result = getattr(pd, func)(table, self.conn) - else: - result = getattr(pd, func)(table, self.conn, use_nullable_dtypes=True) - expected = self.nullable_expected(string_storage) + with pd.option_context("mode.dtype_backend", dtype_backend): + if option: + with pd.option_context("mode.nullable_dtypes", True): + result = getattr(pd, func)(table, self.conn) + else: + result = getattr(pd, func)( + table, self.conn, use_nullable_dtypes=True + ) + expected = self.nullable_expected(string_storage, dtype_backend) tm.assert_frame_equal(result, expected) with pd.option_context("mode.string_storage", string_storage): - iterator = getattr(pd, func)( - table, - self.conn, - use_nullable_dtypes=True, - chunksize=3, - ) - expected = self.nullable_expected(string_storage) + with pd.option_context("mode.dtype_backend", dtype_backend): + iterator = getattr(pd, func)( + table, + self.conn, + use_nullable_dtypes=True, + chunksize=3, + ) + expected = self.nullable_expected(string_storage, dtype_backend) for result in iterator: tm.assert_frame_equal(result, expected) @@ -2363,7 +2375,7 @@ def nullable_data(self) -> DataFrame: } ) - def nullable_expected(self, storage) -> DataFrame: + def nullable_expected(self, storage, dtype_backend) -> DataFrame: 
string_array: StringArray | ArrowStringArray string_array_na: StringArray | ArrowStringArray @@ -2376,7 +2388,7 @@ def nullable_expected(self, storage) -> DataFrame: string_array = ArrowStringArray(pa.array(["a", "b", "c"])) string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - return DataFrame( + df = DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), "b": Series([1, 2, 3], dtype="Int64"), @@ -2388,6 +2400,18 @@ def nullable_expected(self, storage) -> DataFrame: "h": string_array_na, } ) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + + from pandas.arrays import ArrowExtensionArray + + df = DataFrame( + { + col: ArrowExtensionArray(pa.array(df[col], from_pandas=True)) + for col in df.columns + } + ) + return df def test_chunksize_empty_dtypes(self): # GH#50245 @@ -2511,8 +2535,14 @@ class Test(BaseModel): assert list(df.columns) == ["id", "string_column"] - def nullable_expected(self, storage) -> DataFrame: - return super().nullable_expected(storage).astype({"e": "Int64", "f": "Int64"}) + def nullable_expected(self, storage, dtype_backend) -> DataFrame: + df = super().nullable_expected(storage, dtype_backend) + if dtype_backend == "pandas": + df = df.astype({"e": "Int64", "f": "Int64"}) + else: + df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"}) + + return df @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) def test_read_sql_nullable_dtypes_table(self, string_storage, func): @@ -2546,8 +2576,14 @@ def setup_driver(cls): def test_default_type_conversion(self): pass - def nullable_expected(self, storage) -> DataFrame: - return super().nullable_expected(storage).astype({"e": "Int64", "f": "Int64"}) + def nullable_expected(self, storage, dtype_backend) -> DataFrame: + df = super().nullable_expected(storage, dtype_backend) + if dtype_backend == "pandas": + df = df.astype({"e": "Int64", "f": "Int64"}) + else: + df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"}) + + return df 
@pytest.mark.db From 45a1cfe683e06eabba5cc18666f7ce2b57afb785 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 26 Jan 2023 18:20:48 -0500 Subject: [PATCH 2/3] Fix indentation --- pandas/tests/io/test_sql.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 9483ad1e23c7e..e467d4d2bc9a8 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2357,9 +2357,9 @@ def test_read_sql_nullable_dtypes_table( use_nullable_dtypes=True, chunksize=3, ) - expected = self.nullable_expected(string_storage, dtype_backend) - for result in iterator: - tm.assert_frame_equal(result, expected) + expected = self.nullable_expected(string_storage, dtype_backend) + for result in iterator: + tm.assert_frame_equal(result, expected) def nullable_data(self) -> DataFrame: return DataFrame( From 9aaefea38d71a22672a6f317370fcfdb49fd6463 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 30 Jan 2023 21:03:26 +0100 Subject: [PATCH 3/3] Update doc --- pandas/io/sql.py | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index aba6df57b0da5..dc929de9c2888 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -310,11 +310,13 @@ def read_sql_table( set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + ..
note:: + + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). .. versionadded:: 2.0 @@ -451,11 +453,13 @@ def read_sql_query( set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + .. note:: + + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). .. versionadded:: 2.0 @@ -587,11 +591,13 @@ def read_sql( set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + .. note:: + + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). .. 
versionadded:: 2.0 dtype : Type name or dict of columns @@ -1634,11 +1640,13 @@ def read_table( set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. - The nullable dtype implementation can be configured by calling - ``pd.set_option("mode.dtype_backend", "pandas")`` to use - numpy-backed nullable dtypes or - ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use - pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + .. note:: + + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). .. versionadded:: 2.0