From bcbe5ea231e9686631158f5732895677bd838cf4 Mon Sep 17 00:00:00 2001 From: Avinash Pancham Date: Sat, 31 Oct 2020 22:28:13 +0100 Subject: [PATCH 1/6] ENH: Add dtype argument to read_sql_query --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/io/sql.py | 24 ++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 84eb3b3f15780..895ee884892da 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -282,6 +282,7 @@ Other enhancements - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) +- :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 1fea50ecade3c..c95d66c9597c5 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -13,6 +13,7 @@ import numpy as np import pandas._libs.lib as lib +from pandas._typing import Dtype from pandas.core.dtypes.common import is_datetime64tz_dtype, is_dict_like, is_list_like from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -119,10 +120,15 @@ def _parse_date_columns(data_frame, parse_dates): return data_frame -def _wrap_result(data, columns, index_col=None, coerce_float=True, parse_dates=None): +def _wrap_result( + data, columns, index_col=None, coerce_float=True, parse_dates=None, dtype=None +): """Wrap result set of query in a DataFrame.""" frame = DataFrame.from_records(data, columns=columns, coerce_float=coerce_float) + if dtype: + frame = frame.astype(dtype) + frame = _parse_date_columns(frame, parse_dates) if index_col is not None: @@ -295,6 +301,7 @@ def read_sql_query( params=None, parse_dates=None, chunksize: None = None, + dtype: Optional[Dtype] = None, ) -> DataFrame: ... @@ -308,6 +315,7 @@ def read_sql_query( params=None, parse_dates=None, chunksize: int = 1, + dtype: Optional[Dtype] = None, ) -> Iterator[DataFrame]: ... @@ -320,6 +328,7 @@ def read_sql_query( params=None, parse_dates=None, chunksize: Optional[int] = None, + dtype: Optional[Dtype] = None, ) -> Union[DataFrame, Iterator[DataFrame]]: """ Read SQL query into a DataFrame. 
@@ -381,6 +390,7 @@ def read_sql_query( coerce_float=coerce_float, parse_dates=parse_dates, chunksize=chunksize, + dtype=dtype, ) @@ -1225,7 +1235,13 @@ def read_table( @staticmethod def _query_iterator( - result, chunksize, columns, index_col=None, coerce_float=True, parse_dates=None + result, + chunksize, + columns, + index_col=None, + coerce_float=True, + parse_dates=None, + dtype=None, ): """Return generator through chunked result set""" while True: @@ -1239,6 +1255,7 @@ def _query_iterator( index_col=index_col, coerce_float=coerce_float, parse_dates=parse_dates, + dtype=dtype, ) def read_query( @@ -1249,6 +1266,7 @@ def read_query( parse_dates=None, params=None, chunksize=None, + dtype=None, ): """ Read SQL query into a DataFrame. @@ -1304,6 +1322,7 @@ def read_query( index_col=index_col, coerce_float=coerce_float, parse_dates=parse_dates, + dtype=dtype, ) else: data = result.fetchall() @@ -1313,6 +1332,7 @@ def read_query( index_col=index_col, coerce_float=coerce_float, parse_dates=parse_dates, + dtype=dtype, ) return frame From 9c4f0348774b6c478a6d425eee210d7ca718964e Mon Sep 17 00:00:00 2001 From: Avinash Pancham Date: Sun, 1 Nov 2020 12:08:55 +0100 Subject: [PATCH 2/6] Update sql unit tests --- pandas/io/sql.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c95d66c9597c5..ff557a161f2e6 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1715,7 +1715,13 @@ def execute(self, *args, **kwargs): @staticmethod def _query_iterator( - cursor, chunksize, columns, index_col=None, coerce_float=True, parse_dates=None + cursor, + chunksize, + columns, + index_col=None, + coerce_float=True, + parse_dates=None, + dtype=None, ): """Return generator through chunked result set""" while True: @@ -1732,6 +1738,7 @@ def _query_iterator( index_col=index_col, coerce_float=coerce_float, parse_dates=parse_dates, + dtype=dtype, ) def read_query( @@ -1742,6 +1749,7 @@ def read_query( params=None, 
parse_dates=None, chunksize=None, + dtype=None, ): args = _convert_params(sql, params) @@ -1756,6 +1764,7 @@ def read_query( index_col=index_col, coerce_float=coerce_float, parse_dates=parse_dates, + dtype=dtype, ) else: data = self._fetchall_as_list(cursor) @@ -1767,6 +1776,7 @@ def read_query( index_col=index_col, coerce_float=coerce_float, parse_dates=parse_dates, + dtype=dtype, ) return frame From 620c0abee0cab96d24cf7ea852fb1c6d42c214f5 Mon Sep 17 00:00:00 2001 From: Avinash Pancham Date: Fri, 13 Nov 2020 13:46:15 +0100 Subject: [PATCH 3/6] Update type hinting and update doc --- pandas/io/sql.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index ff557a161f2e6..3da80392bbdde 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -7,7 +7,7 @@ from datetime import date, datetime, time from functools import partial import re -from typing import Iterator, Optional, Union, overload +from typing import Dict, Iterator, Optional, Union, overload import warnings import numpy as np @@ -301,7 +301,7 @@ def read_sql_query( params=None, parse_dates=None, chunksize: None = None, - dtype: Optional[Dtype] = None, + dtype: Optional[Union[Dtype, Dict[str, Dtype]]] = None, ) -> DataFrame: ... @@ -315,7 +315,7 @@ def read_sql_query( params=None, parse_dates=None, chunksize: int = 1, - dtype: Optional[Dtype] = None, + dtype: Optional[Union[Dtype, Dict[str, Dtype]]] = None, ) -> Iterator[DataFrame]: ... @@ -328,7 +328,7 @@ def read_sql_query( params=None, parse_dates=None, chunksize: Optional[int] = None, - dtype: Optional[Dtype] = None, + dtype: Optional[Union[Dtype, Dict[str, Dtype]]] = None, ) -> Union[DataFrame, Iterator[DataFrame]]: """ Read SQL query into a DataFrame. @@ -367,6 +367,9 @@ def read_sql_query( chunksize : int, default None If specified, return an iterator where `chunksize` is the number of rows to include in each chunk. + dtype : Type name or dict of columns + Data type for data or columns. 
E.g. np.float64 or + {‘a’: np.float64, ‘b’: np.int32, ‘c’: ‘Int64’} Returns ------- From bcef60e263d982b40db804bcd5e1b31482675784 Mon Sep 17 00:00:00 2001 From: Avinash Pancham Date: Tue, 1 Dec 2020 22:26:42 +0100 Subject: [PATCH 4/6] Add test --- pandas/tests/io/test_sql.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1be6022bc0fcd..c0621d444fd89 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -857,6 +857,25 @@ def test_multiindex_roundtrip(self): ) tm.assert_frame_equal(df, result, check_index_type=True) + @pytest.mark.parametrize( + "dtype, expected", + [ + (None, [float, float]), + (int, [int, int]), + (float, [float, float]), + ({"SepalLength": int, "SepalWidth": float}, [int, float]), + ], + ) + def test_dtype_argument(self, dtype, expected): + # GH10285 Add dtype argument to read_sql_query + result = sql.read_sql_query( + "SELECT SepalLength, SepalWidth FROM iris", self.conn, dtype=dtype + ) + assert result.dtypes.to_dict() == { + "SepalLength": expected[0], + "SepalWidth": expected[1], + } + def test_integer_col_names(self): df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) sql.to_sql(df, "test_frame_integer_col_names", self.conn, if_exists="replace") From 5de64f21f672e33a6687e8b5fbf72b06985b7b56 Mon Sep 17 00:00:00 2001 From: Avinash Pancham Date: Tue, 15 Dec 2020 20:56:43 +0100 Subject: [PATCH 5/6] Address comments --- doc/source/whatsnew/v1.2.0.rst | 1 - doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/_typing.py | 15 ++++++++------- pandas/io/sql.py | 28 ++++++++++++++++++---------- 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index eca20b2b2d067..af9219bc25931 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -307,7 +307,6 @@ Other enhancements - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, 
:meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) -- :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`) - When :func:`read_csv/sas/json` are called with ``chuncksize``/``iterator`` they can be used in a ``with`` statement as they return context-managers (:issue:`38225`) .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index d0afc24aaecac..6ccd65652ddd5 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -19,7 +19,7 @@ Enhancements Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_typing.py b/pandas/_typing.py index 09c490e64957d..a9f0695b17cdc 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -71,13 +71,6 @@ ] Timezone = Union[str, tzinfo] -# other - -Dtype = Union[ - "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool, object]] -] -DtypeObj = Union[np.dtype, "ExtensionDtype"] - # FrameOrSeriesUnion means either a DataFrame or a Series. E.g. 
# `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series # is passed in, either a Series or DataFrame is returned, and if a DataFrame is passed @@ -99,6 +92,14 @@ JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Axes = Collection +# dtypes + +Dtype = Union[ + "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool, object]] +] +DtypeArg = Optional[Union[Dtype, Dict[Label, Dtype]]] +DtypeObj = Union[np.dtype, "ExtensionDtype"] + # For functions like rename that convert one label to another Renamer = Union[Mapping[Label, Any], Callable[[Label], Label]] diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 034133a881721..192355f60299f 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -7,13 +7,13 @@ from datetime import date, datetime, time from functools import partial import re -from typing import Dict, Iterator, List, Optional, Union, overload +from typing import Iterator, List, Optional, Union, overload import warnings import numpy as np import pandas._libs.lib as lib -from pandas._typing import Dtype +from pandas._typing import DtypeArg from pandas.core.dtypes.common import is_datetime64tz_dtype, is_dict_like, is_list_like from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -121,7 +121,12 @@ def _parse_date_columns(data_frame, parse_dates): def _wrap_result( - data, columns, index_col=None, coerce_float=True, parse_dates=None, dtype=None + data, + columns, + index_col=None, + coerce_float=True, + parse_dates=None, + dtype: DtypeArg = None, ): """Wrap result set of query in a DataFrame.""" frame = DataFrame.from_records(data, columns=columns, coerce_float=coerce_float) @@ -301,7 +306,7 @@ def read_sql_query( params=None, parse_dates=None, chunksize: None = None, - dtype: Optional[Union[Dtype, Dict[str, Dtype]]] = None, + dtype: DtypeArg = None, ) -> DataFrame: ... 
@@ -315,7 +320,7 @@ def read_sql_query( params=None, parse_dates=None, chunksize: int = 1, - dtype: Optional[Union[Dtype, Dict[str, Dtype]]] = None, + dtype: DtypeArg = None, ) -> Iterator[DataFrame]: ... @@ -328,7 +333,7 @@ def read_sql_query( params=None, parse_dates=None, chunksize: Optional[int] = None, - dtype: Optional[Union[Dtype, Dict[str, Dtype]]] = None, + dtype: DtypeArg = None, ) -> Union[DataFrame, Iterator[DataFrame]]: """ Read SQL query into a DataFrame. @@ -1244,7 +1249,7 @@ def _query_iterator( index_col=None, coerce_float=True, parse_dates=None, - dtype=None, + dtype: DtypeArg = None, ): """Return generator through chunked result set""" while True: @@ -1269,7 +1274,7 @@ def read_query( parse_dates=None, params=None, chunksize=None, - dtype=None, + dtype: DtypeArg = None, ): """ Read SQL query into a DataFrame. @@ -1301,6 +1306,9 @@ def read_query( chunksize : int, default None If specified, return an iterator where `chunksize` is the number of rows to include in each chunk. + dtype : Type name or dict of columns + Data type for data or columns. E.g. 
np.float64 or + {‘a’: np.float64, ‘b’: np.int32, ‘c’: ‘Int64’} Returns ------- @@ -1741,7 +1749,7 @@ def _query_iterator( index_col=None, coerce_float=True, parse_dates=None, - dtype=None, + dtype: DtypeArg = None, ): """Return generator through chunked result set""" while True: @@ -1769,7 +1777,7 @@ def read_query( params=None, parse_dates=None, chunksize=None, - dtype=None, + dtype: DtypeArg = None, ): args = _convert_params(sql, params) From a4e7cdf4e90255ee3d20611f19e045db8f84ea2a Mon Sep 17 00:00:00 2001 From: Avinash Pancham Date: Wed, 23 Dec 2020 21:42:38 +0100 Subject: [PATCH 6/6] Address comments --- pandas/_typing.py | 4 ++-- pandas/io/sql.py | 18 ++++++++++-------- pandas/tests/io/test_sql.py | 24 +++++++++++++----------- 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 362b54d45f43f..64452bf337361 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -94,11 +94,11 @@ Axes = Collection # dtypes - Dtype = Union[ "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool, object]] ] -DtypeArg = Optional[Union[Dtype, Dict[Label, Dtype]]] +# DtypeArg specifies all allowable dtypes in a function's dtype argument +DtypeArg = Union[Dtype, Dict[Label, Dtype]] DtypeObj = Union[np.dtype, "ExtensionDtype"] # For functions like rename that convert one label to another diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 2f9adf08e8c50..0ad9140f2a757 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -133,7 +133,7 @@ def _wrap_result( index_col=None, coerce_float: bool = True, parse_dates=None, - dtype: DtypeArg = None, + dtype: Optional[DtypeArg] = None, ): """Wrap result set of query in a DataFrame.""" frame = DataFrame.from_records(data, columns=columns, coerce_float=coerce_float) @@ -313,7 +313,7 @@ def read_sql_query( params=None, parse_dates=None, chunksize: None = None, - dtype: DtypeArg = None, + dtype: Optional[DtypeArg] = None, ) -> DataFrame: ... 
@@ -327,7 +327,7 @@ def read_sql_query( params=None, parse_dates=None, chunksize: int = 1, - dtype: DtypeArg = None, + dtype: Optional[DtypeArg] = None, ) -> Iterator[DataFrame]: ... @@ -340,7 +340,7 @@ def read_sql_query( params=None, parse_dates=None, chunksize: Optional[int] = None, - dtype: DtypeArg = None, + dtype: Optional[DtypeArg] = None, ) -> Union[DataFrame, Iterator[DataFrame]]: """ Read SQL query into a DataFrame. @@ -1319,7 +1319,7 @@ def _query_iterator( index_col=None, coerce_float=True, parse_dates=None, - dtype: DtypeArg = None, + dtype: Optional[DtypeArg] = None, ): """Return generator through chunked result set""" while True: @@ -1344,7 +1344,7 @@ def read_query( parse_dates=None, params=None, chunksize: Optional[int] = None, - dtype: DtypeArg = None, + dtype: Optional[DtypeArg] = None, ): """ Read SQL query into a DataFrame. @@ -1380,6 +1380,8 @@ def read_query( Data type for data or columns. E.g. np.float64 or {‘a’: np.float64, ‘b’: np.int32, ‘c’: ‘Int64’} + .. versionadded:: 1.3.0 + Returns ------- DataFrame @@ -1819,7 +1821,7 @@ def _query_iterator( index_col=None, coerce_float: bool = True, parse_dates=None, - dtype: DtypeArg = None, + dtype: Optional[DtypeArg] = None, ): """Return generator through chunked result set""" while True: @@ -1847,7 +1849,7 @@ def read_query( params=None, parse_dates=None, chunksize: Optional[int] = None, - dtype: DtypeArg = None, + dtype: Optional[DtypeArg] = None, ): args = _convert_params(sql, params) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 8e8283f7f1c00..fdd42ec0cc5ab 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -938,23 +938,25 @@ def test_multiindex_roundtrip(self): tm.assert_frame_equal(df, result, check_index_type=True) @pytest.mark.parametrize( - "dtype, expected", + "dtype", [ - (None, [float, float]), - (int, [int, int]), - (float, [float, float]), - ({"SepalLength": int, "SepalWidth": float}, [int, float]), + None, + int, + float, + 
{"A": int, "B": float}, ], ) - def test_dtype_argument(self, dtype, expected): + def test_dtype_argument(self, dtype): # GH10285 Add dtype argument to read_sql_query + df = DataFrame([[1.2, 3.4], [5.6, 7.8]], columns=["A", "B"]) + df.to_sql("test_dtype_argument", self.conn) + + expected = df.astype(dtype) result = sql.read_sql_query( - "SELECT SepalLength, SepalWidth FROM iris", self.conn, dtype=dtype + "SELECT A, B FROM test_dtype_argument", con=self.conn, dtype=dtype ) - assert result.dtypes.to_dict() == { - "SepalLength": expected[0], - "SepalWidth": expected[1], - } + + tm.assert_frame_equal(result, expected) def test_integer_col_names(self): df = DataFrame([[1, 2], [3, 4]], columns=[0, 1])