From 4d44e84e7a2a57d9dfa38294f332eab33bbb3b90 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 19:08:32 +0100 Subject: [PATCH 01/19] Start sql implementation --- pandas/_libs/lib.pyx | 3 +++ pandas/core/internals/construction.py | 9 +++++---- pandas/io/sql.py | 3 +++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e35cf2fb13768..28b3ac896c52c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2406,12 +2406,15 @@ def maybe_convert_objects(ndarray[object] objects, for i in range(n): val = objects[i] + print(val) + print("x") if itemsize_max != -1: itemsize = get_itemsize(val) if itemsize > itemsize_max or itemsize == -1: itemsize_max = itemsize if val is None: + print("hello") seen.null_ = True floats[i] = complexes[i] = fnan mask[i] = True diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 563011abe2c41..5a58b38e4963f 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -900,7 +900,7 @@ def _finalize_columns_and_data( raise ValueError(err) from err if len(contents) and contents[0].dtype == np.object_: - contents = _convert_object_array(contents, dtype=dtype) + contents = convert_object_array(contents, dtype=dtype) return contents, columns @@ -963,8 +963,8 @@ def _validate_or_indexify_columns( return columns -def _convert_object_array( - content: list[npt.NDArray[np.object_]], dtype: DtypeObj | None +def convert_object_array( + content: list[npt.NDArray[np.object_]], dtype: DtypeObj | None, use_nullable_dtypes: bool = False ) -> list[ArrayLike]: """ Internal function to convert object array. @@ -973,6 +973,7 @@ def _convert_object_array( ---------- content: List[np.ndarray] dtype: np.dtype or ExtensionDtype + use_nullable_dtypes: Controls if nullable dtypes are returned. Returns ------- @@ -981,7 +982,7 @@ def _convert_object_array( # provide soft conversion of object dtypes def convert(arr): if dtype != np.dtype("O"): - arr = lib.maybe_convert_objects(arr) + arr = lib.maybe_convert_objects(arr, convert_to_nullable_integer=True) if dtype is None: if arr.dtype == np.dtype("O"): diff --git a/pandas/io/sql.py b/pandas/io/sql.py index e3510f71bd0cd..ae3674254338d 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -36,6 +36,7 @@ IndexLabel, ) from pandas.compat._optional import import_optional_dependency +from pandas.core.internals.construction import convert_object_array from pandas.errors import ( AbstractMethodError, DatabaseError, @@ -148,6 +149,8 @@ def _wrap_result( dtype: DtypeArg | None = None, ): """Wrap result set of query in a DataFrame.""" + content = lib.to_object_array_tuples(data) + # content = convert_object_array(content, use_nullable_dtypes=True) frame = DataFrame.from_records(data, columns=columns, coerce_float=coerce_float) if dtype: From aff00f1657f0838fff66f2756086305221dfc02a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 19:39:24 +0100 Subject: [PATCH 02/19] BUG: Fix bug in maybe_convert_objects with None and nullable --- pandas/_libs/lib.pyx | 9 ++++++--- pandas/tests/dtypes/test_inference.py | 12 ++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e35cf2fb13768..81e0f3de748ff 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2446,7 +2446,7 @@ def maybe_convert_objects(ndarray[object] objects, seen.int_ = True floats[i] = val complexes[i] = val - if not seen.null_: + if not seen.null_ or convert_to_nullable_integer: seen.saw_int(val) if ((seen.uint_ and seen.sint_) or @@ -2616,10 +2616,13 @@ def maybe_convert_objects(ndarray[object] objects, result = complexes elif seen.float_: result = floats - elif seen.int_: + elif seen.int_ or seen.uint_: if convert_to_nullable_integer: from pandas.core.arrays import IntegerArray - result = IntegerArray(ints, mask) + if seen.uint_: + result = IntegerArray(uints, mask) + else: + result = IntegerArray(ints, mask) else: result = floats elif seen.nan_: diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index df2afad51abf8..72bae6b4782eb 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -863,6 +863,18 @@ def test_maybe_convert_objects_nullable_integer(self, exp): tm.assert_extension_array_equal(result, exp) + @pytest.mark.parametrize( + "dtype, val", [("int64", 1), ("uint64", np.iinfo(np.int64).max + 1)] + ) + def test_maybe_convert_objects_nullable_none(self, dtype, val): + # GH# + arr = np.array([val, None, 3], dtype="object") + result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=True) + expected = IntegerArray( + np.array([val, 0, 3], dtype=dtype), np.array([False, True, False]) + ) + tm.assert_extension_array_equal(result, expected) + @pytest.mark.parametrize( "convert_to_masked_nullable, exp", [ From 9070acd4a2ed8ecb41931251c2c62b86e010fa83 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 19:41:00 +0100 Subject: [PATCH 03/19] Add gh ref --- pandas/tests/dtypes/test_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 72bae6b4782eb..015c121ca684a 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -867,7 +867,7 @@ def test_maybe_convert_objects_nullable_integer(self, exp): "dtype, val", [("int64", 1), ("uint64", np.iinfo(np.int64).max + 1)] ) def test_maybe_convert_objects_nullable_none(self, dtype, val): - # GH# + # GH#50043 arr = np.array([val, None, 3], dtype="object") result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=True) expected = IntegerArray( From f69d6d8ec997dd81ac8af65a578cdc11b36fd320 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 20:52:26 +0100 Subject: [PATCH 04/19] Continue sql implementation --- pandas/_libs/lib.pyx | 3 --- pandas/core/internals/construction.py | 29 ++++++++++++++++++++++++--- pandas/io/sql.py | 6 ++++-- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e34010448c6bf..81e0f3de748ff 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2406,15 +2406,12 @@ def maybe_convert_objects(ndarray[object] objects, for i in range(n): val = objects[i] - print(val) - print("x") if itemsize_max != -1: itemsize = get_itemsize(val) if itemsize > itemsize_max or itemsize == -1: itemsize_max = itemsize if val is None: - print("hello") seen.null_ = True floats[i] = complexes[i] = fnan mask[i] = True diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 5a58b38e4963f..6ab82146801a9 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -31,9 +31,11 @@ ) from pandas.core.dtypes.common import ( is_1d_only_ea_dtype, + is_bool_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, is_extension_array_dtype, + is_float_dtype, is_integer_dtype, is_list_like, is_named_tuple, @@ -49,7 +51,12 @@ algorithms, common as com, ) -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ( + BooleanArray, + ExtensionArray, + FloatingArray, + IntegerArray, +) from pandas.core.construction import ( ensure_wrapped_if_datetimelike, extract_array, @@ -964,7 +971,9 @@ def _validate_or_indexify_columns( def convert_object_array( - content: list[npt.NDArray[np.object_]], dtype: DtypeObj | None, use_nullable_dtypes: bool = False + content: list[npt.NDArray[np.object_]], + dtype: DtypeObj | None, + use_nullable_dtypes: bool = False, ) -> list[ArrayLike]: """ Internal function to convert object array. @@ -980,14 +989,28 @@ def convert_object_array( List[ArrayLike] """ # provide soft conversion of object dtypes + + use_nullable_dtypes = True + def convert(arr): if dtype != np.dtype("O"): - arr = lib.maybe_convert_objects(arr, convert_to_nullable_integer=True) + arr = lib.maybe_convert_objects( + arr, convert_to_nullable_integer=use_nullable_dtypes + ) if dtype is None: if arr.dtype == np.dtype("O"): # i.e. maybe_convert_objects didn't convert arr = maybe_infer_to_datetimelike(arr) + else: + if isinstance(arr, np.ndarray): + if is_integer_dtype(arr.dtype): + arr = IntegerArray(arr, np.zeros(arr.shape, dtype=np.bool_)) + elif is_bool_dtype(arr.dtype): + arr = BooleanArray(arr, np.zeros(arr.shape, dtype=np.bool_)) + elif is_float_dtype(arr.dtype): + arr = FloatingArray(arr, np.isnan(arr)) + elif isinstance(dtype, ExtensionDtype): # TODO: test(s) that get here # TODO: try to de-duplicate this convert function with diff --git a/pandas/io/sql.py b/pandas/io/sql.py index ae3674254338d..aac46aaaea3f8 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -36,7 +36,6 @@ IndexLabel, ) from pandas.compat._optional import import_optional_dependency -from pandas.core.internals.construction import convert_object_array from pandas.errors import ( AbstractMethodError, DatabaseError, @@ -59,6 +58,7 @@ ) from pandas.core.base import PandasObject import pandas.core.common as com +from pandas.core.internals.construction import convert_object_array from pandas.core.tools.datetimes import to_datetime if TYPE_CHECKING: @@ -150,7 +150,9 @@ def _wrap_result( ): """Wrap result set of query in a DataFrame.""" content = lib.to_object_array_tuples(data) - # content = convert_object_array(content, use_nullable_dtypes=True) + content = list(content.T) + content = convert_object_array(content, dtype=dtype, use_nullable_dtypes=True) + frame = DataFrame.from_records(data, columns=columns, coerce_float=coerce_float) if dtype: From 3d6958dc8d06b67592035679ca23cddbc41f0b09 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 21:19:33 +0100 Subject: [PATCH 05/19] ENH: maybe_convert_objects add boolean support with NA --- pandas/_libs/lib.pyi | 12 ++++++------ pandas/_libs/lib.pyx | 21 ++++++++++++++++----- pandas/tests/dtypes/test_inference.py | 24 +++++++++++++++++++++++- 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 3cbc04fb2f5cd..9bc02e90ebb9e 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -75,7 +75,7 @@ def maybe_convert_objects( convert_timedelta: Literal[False] = ..., convert_period: Literal[False] = ..., convert_interval: Literal[False] = ..., - convert_to_nullable_integer: Literal[False] = ..., + convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> npt.NDArray[np.object_ | np.number]: ... @overload # both convert_datetime and convert_to_nullable_integer False -> np.ndarray @@ -88,7 +88,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: Literal[False] = ..., convert_interval: Literal[False] = ..., - convert_to_nullable_integer: Literal[False] = ..., + convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> np.ndarray: ... @overload @@ -101,7 +101,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_interval: bool = ..., - convert_to_nullable_integer: Literal[True] = ..., + convert_to_nullable_dtype: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -114,7 +114,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_interval: bool = ..., - convert_to_nullable_integer: bool = ..., + convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -127,7 +127,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: Literal[True] = ..., convert_interval: bool = ..., - convert_to_nullable_integer: bool = ..., + convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -140,7 +140,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_interval: bool = ..., - convert_to_nullable_integer: bool = ..., + convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e35cf2fb13768..de10626f3e574 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1315,6 +1315,14 @@ cdef class Seen: or self.numeric_ or self.nan_ or self.null_ or self.object_ ) + @property + def is_bool_or_na(self): + # i.e. not (anything but bool or missing values) + return not ( + self.datetime_ or self.datetimetz_ or self.nat_ or self.timedelta_ + or self.period_ or self.interval_ or self.numeric_ or self.object_ + ) + cdef object _try_infer_map(object dtype): """ @@ -2335,7 +2343,7 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_timedelta=False, bint convert_period=False, bint convert_interval=False, - bint convert_to_nullable_integer=False, + bint convert_to_nullable_dtype=False, object dtype_if_all_nat=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2362,9 +2370,9 @@ def maybe_convert_objects(ndarray[object] objects, convert_interval : bool, default False If an array-like object contains only Interval objects (with matching dtypes and closedness) or NaN, whether to convert to IntervalArray. - convert_to_nullable_integer : bool, default False - If an array-like object contains only integer values (and NaN) is - encountered, whether to convert and return an IntegerArray. + convert_to_nullable_dtype : bool, default False + If an array-like object contains only integer or boolean values (and NaN) is + encountered, whether to convert and return an Boolean/IntegerArray. dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None Dtype to cast to if we have all-NaT. @@ -2606,6 +2614,9 @@ def maybe_convert_objects(ndarray[object] objects, if seen.is_bool: # is_bool property rules out everything else return bools.view(np.bool_) + elif convert_to_nullable_dtype and seen.is_bool_or_na: + from pandas.core.arrays import BooleanArray + return BooleanArray(bools.view(np.bool_), mask) seen.object_ = True if not seen.object_: @@ -2617,7 +2628,7 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.float_: result = floats elif seen.int_: - if convert_to_nullable_integer: + if convert_to_nullable_dtype: from pandas.core.arrays import IntegerArray result = IntegerArray(ints, mask) else: diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index df2afad51abf8..4a8099d5b2b0f 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -859,7 +859,7 @@ def test_maybe_convert_objects_timedelta64_nat(self): def test_maybe_convert_objects_nullable_integer(self, exp): # GH27335 arr = np.array([2, np.NaN], dtype=object) - result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=True) + result = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) tm.assert_extension_array_equal(result, exp) @@ -918,6 +918,28 @@ def test_maybe_convert_objects_bool_nan(self): out = lib.maybe_convert_objects(ind.values, safe=1) tm.assert_numpy_array_equal(out, exp) + def test_maybe_convert_objects_nullable_boolean(self): + # GH + arr = np.array([True, False], dtype=object) + exp = np.array([True, False]) + out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) + tm.assert_numpy_array_equal(out, exp) + + arr = np.array([True, False, pd.NaT], dtype=object) + exp = np.array([True, False, pd.NaT], dtype=object) + out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) + tm.assert_numpy_array_equal(out, exp) + + @pytest.mark.parametrize("val", [None, np.nan]) + def test_maybe_convert_objects_nullable_boolean_na(self, val): + # GH + arr = np.array([True, False, val], dtype=object) + exp = BooleanArray( + np.array([True, False, False]), np.array([False, False, True]) + ) + out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) + tm.assert_extension_array_equal(out, exp) + @pytest.mark.parametrize( "data0", [ From 43545c5456d42eb64403e5b2260be65c7e7126c0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 21:51:01 +0100 Subject: [PATCH 06/19] Fix merge error --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5361bb499240b..726fe79c2d702 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2454,7 +2454,7 @@ def maybe_convert_objects(ndarray[object] objects, seen.int_ = True floats[i] = val complexes[i] = val - if not seen.null_ or convert_to_nullable_integer: + if not seen.null_ or convert_to_nullable_dtype: seen.saw_int(val) if ((seen.uint_ and seen.sint_) or From 1ed72bfa1fb53925575f00223c4121e234acd78b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 21:52:25 +0100 Subject: [PATCH 07/19] Add gh ref --- pandas/tests/dtypes/test_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 4a6d480dcc988..b075718a678d6 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -931,7 +931,7 @@ def test_maybe_convert_objects_bool_nan(self): tm.assert_numpy_array_equal(out, exp) def test_maybe_convert_objects_nullable_boolean(self): - # GH + # GH50047 arr = np.array([True, False], dtype=object) exp = np.array([True, False]) out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) @@ -944,7 +944,7 @@ def test_maybe_convert_objects_nullable_boolean(self): @pytest.mark.parametrize("val", [None, np.nan]) def test_maybe_convert_objects_nullable_boolean_na(self, val): - # GH + # GH50047 arr = np.array([True, False, val], dtype=object) exp = BooleanArray( np.array([True, False, False]), np.array([False, False, True]) From 4e93577014870f56a25702fef9b03017e108c847 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 22:36:13 +0100 Subject: [PATCH 08/19] Add to api --- pandas/core/internals/construction.py | 23 ++++++++++---------- pandas/io/sql.py | 31 +++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 6ab82146801a9..326a92846d97f 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -974,6 +974,7 @@ def convert_object_array( content: list[npt.NDArray[np.object_]], dtype: DtypeObj | None, use_nullable_dtypes: bool = False, + coerce_float: bool = False, ) -> list[ArrayLike]: """ Internal function to convert object array. @@ -983,6 +984,7 @@ def convert_object_array( content: List[np.ndarray] dtype: np.dtype or ExtensionDtype use_nullable_dtypes: Controls if nullable dtypes are returned. + coerce_float: Cast floats that are integers to int. Returns ------- @@ -990,26 +992,25 @@ def convert_object_array( """ # provide soft conversion of object dtypes - use_nullable_dtypes = True - def convert(arr): if dtype != np.dtype("O"): arr = lib.maybe_convert_objects( - arr, convert_to_nullable_integer=use_nullable_dtypes + arr, + try_float=coerce_float, + convert_to_nullable_dtype=use_nullable_dtypes, ) if dtype is None: if arr.dtype == np.dtype("O"): # i.e. maybe_convert_objects didn't convert arr = maybe_infer_to_datetimelike(arr) - else: - if isinstance(arr, np.ndarray): - if is_integer_dtype(arr.dtype): - arr = IntegerArray(arr, np.zeros(arr.shape, dtype=np.bool_)) - elif is_bool_dtype(arr.dtype): - arr = BooleanArray(arr, np.zeros(arr.shape, dtype=np.bool_)) - elif is_float_dtype(arr.dtype): - arr = FloatingArray(arr, np.isnan(arr)) + elif use_nullable_dtypes and isinstance(arr, np.ndarray): + if is_integer_dtype(arr.dtype): + arr = IntegerArray(arr, np.zeros(arr.shape, dtype=np.bool_)) + elif is_bool_dtype(arr.dtype): + arr = BooleanArray(arr, np.zeros(arr.shape, dtype=np.bool_)) + elif is_float_dtype(arr.dtype): + arr = FloatingArray(arr, np.isnan(arr)) elif isinstance(dtype, ExtensionDtype): # TODO: test(s) that get here diff --git a/pandas/io/sql.py b/pandas/io/sql.py index aac46aaaea3f8..c16028505f524 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -147,13 +147,18 @@ def _wrap_result( coerce_float: bool = True, parse_dates=None, dtype: DtypeArg | None = None, + use_nullable_dtypes: bool = False, ): """Wrap result set of query in a DataFrame.""" content = lib.to_object_array_tuples(data) content = list(content.T) - content = convert_object_array(content, dtype=dtype, use_nullable_dtypes=True) - - frame = DataFrame.from_records(data, columns=columns, coerce_float=coerce_float) + content = convert_object_array( + content, + dtype=None, + coerce_float=coerce_float, + use_nullable_dtypes=use_nullable_dtypes, + ) + frame = DataFrame({col: val for col, val in zip(columns, content)}, columns=columns) if dtype: frame = frame.astype(dtype) @@ -161,7 +166,7 @@ def _wrap_result( frame = _parse_date_columns(frame, parse_dates) if index_col is not None: - frame.set_index(index_col, inplace=True) + frame = frame.set_index(index_col) return frame @@ -423,6 +428,7 @@ def read_sql( parse_dates=..., columns: list[str] = ..., chunksize: None = ..., + use_nullable_dtypes: bool = ..., ) -> DataFrame: ... @@ -437,6 +443,7 @@ def read_sql( parse_dates=..., columns: list[str] = ..., chunksize: int = ..., + use_nullable_dtypes: bool = ..., ) -> Iterator[DataFrame]: ... @@ -450,6 +457,7 @@ def read_sql( parse_dates=None, columns: list[str] | None = None, chunksize: int | None = None, + use_nullable_dtypes: bool = False, ) -> DataFrame | Iterator[DataFrame]: """ Read SQL query or database table into a DataFrame. @@ -497,7 +505,12 @@ def read_sql( chunksize : int, default None If specified, return an iterator where `chunksize` is the number of rows to include in each chunk. + use_nullable_dtypes : bool = False + Whether to use nullable dtypes as default when reading data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. + .. versionadded:: 2.0 Returns ------- DataFrame or Iterator[DataFrame] @@ -576,6 +589,7 @@ def read_sql( coerce_float=coerce_float, parse_dates=parse_dates, chunksize=chunksize, + use_nullable_dtypes=use_nullable_dtypes, ) try: @@ -601,6 +615,7 @@ def read_sql( coerce_float=coerce_float, parse_dates=parse_dates, chunksize=chunksize, + use_nullable_dtypes=use_nullable_dtypes, ) @@ -1308,6 +1323,7 @@ def read_query( params=None, chunksize: int | None = None, dtype: DtypeArg | None = None, + use_nullable_dtypes: bool = False, ) -> DataFrame | Iterator[DataFrame]: pass @@ -1567,6 +1583,7 @@ def read_query( params=None, chunksize: int | None = None, dtype: DtypeArg | None = None, + use_nullable_dtypes: bool = False, ) -> DataFrame | Iterator[DataFrame]: """ Read SQL query into a DataFrame. @@ -1638,6 +1655,7 @@ def read_query( coerce_float=coerce_float, parse_dates=parse_dates, dtype=dtype, + use_nullable_dtypes=use_nullable_dtypes, ) return frame @@ -2094,6 +2112,7 @@ def _query_iterator( coerce_float: bool = True, parse_dates=None, dtype: DtypeArg | None = None, + use_nullable_dtypes: bool = False, ): """Return generator through chunked result set""" has_read_data = False @@ -2117,6 +2136,7 @@ def _query_iterator( coerce_float=coerce_float, parse_dates=parse_dates, dtype=dtype, + use_nullable_dtypes=use_nullable_dtypes, ) def read_query( @@ -2128,6 +2148,7 @@ def read_query( params=None, chunksize: int | None = None, dtype: DtypeArg | None = None, + use_nullable_dtypes: bool = False, ) -> DataFrame | Iterator[DataFrame]: args = _convert_params(sql, params) @@ -2143,6 +2164,7 @@ def read_query( coerce_float=coerce_float, parse_dates=parse_dates, dtype=dtype, + use_nullable_dtypes=use_nullable_dtypes, ) else: data = self._fetchall_as_list(cursor) @@ -2155,6 +2177,7 @@ def read_query( coerce_float=coerce_float, parse_dates=parse_dates, dtype=dtype, + use_nullable_dtypes=use_nullable_dtypes, ) return frame From d5d00476fa3e5b48dd382489debbe061bcb1aba3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 22:59:24 +0100 Subject: [PATCH 09/19] Add tests --- pandas/io/sql.py | 4 +++ pandas/tests/io/test_sql.py | 52 +++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c16028505f524..e29b5ccbba45f 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1548,6 +1548,7 @@ def _query_iterator( coerce_float: bool = True, parse_dates=None, dtype: DtypeArg | None = None, + use_nullable_dtypes: bool = False, ): """Return generator through chunked result set""" has_read_data = False @@ -1561,6 +1562,7 @@ def _query_iterator( index_col=index_col, coerce_float=coerce_float, parse_dates=parse_dates, + use_nullable_dtypes=use_nullable_dtypes, ) break @@ -1572,6 +1574,7 @@ def _query_iterator( coerce_float=coerce_float, parse_dates=parse_dates, dtype=dtype, + use_nullable_dtypes=use_nullable_dtypes, ) def read_query( @@ -1645,6 +1648,7 @@ def read_query( coerce_float=coerce_float, parse_dates=parse_dates, dtype=dtype, + use_nullable_dtypes=use_nullable_dtypes, ) else: data = result.fetchall() diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 394fceb69b788..8d06abaf05293 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2266,6 +2266,46 @@ def test_get_engine_auto_error_message(self): pass # TODO(GH#36893) fill this in when we add more engines + def test_read_sql_nullable_dtypes(self): + # GH# + table = "test" + df = DataFrame( + { + "a": Series([1, np.nan, 3], dtype="Int64"), + "b": Series([1, 2, 3], dtype="Int64"), + "c": Series([1.5, np.nan, 2.5], dtype="Float64"), + "d": Series([1.5, 2.0, 2.5], dtype="Float64"), + "e": [True, False, None], + "f": [True, False, True], + } + ) + df.to_sql(table, self.conn, index=False, if_exists="replace") + + result = pd.read_sql( + f"Select * from {table}", self.conn, use_nullable_dtypes=True + ) + expected = self.nullable_expected() + tm.assert_frame_equal(result, expected) + + iterator = pd.read_sql( + f"Select * from {table}", self.conn, use_nullable_dtypes=True, chunksize=3 + ) + expected = self.nullable_expected() + for result in iterator: + tm.assert_frame_equal(result, expected) + + def nullable_expected(self) -> DataFrame: + return DataFrame( + { + "a": Series([1, np.nan, 3], dtype="Int64"), + "b": Series([1, 2, 3], dtype="Int64"), + "c": Series([1.5, np.nan, 2.5], dtype="Float64"), + "d": Series([1.5, 2.0, 2.5], dtype="Float64"), + "e": Series([True, False, pd.NA], dtype="boolean"), + "f": Series([True, False, True], dtype="boolean"), + } + ) + class TestSQLiteAlchemy(_TestSQLAlchemy): """ @@ -2349,6 +2389,18 @@ class Test(BaseModel): assert list(df.columns) == ["id", "string_column"] + def nullable_expected(self) -> DataFrame: + return DataFrame( + { + "a": Series([1, np.nan, 3], dtype="Int64"), + "b": Series([1, 2, 3], dtype="Int64"), + "c": Series([1.5, np.nan, 2.5], dtype="Float64"), + "d": Series([1.5, 2.0, 2.5], dtype="Float64"), + "e": Series([1, 0, pd.NA], dtype="Int64"), + "f": Series([1, 0, 1], dtype="Int64"), + } + ) + @pytest.mark.db class TestMySQLAlchemy(_TestSQLAlchemy): From 62c798f664545b9250dba58248f6d0f915d0621d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 3 Dec 2022 23:00:26 +0100 Subject: [PATCH 10/19] Fix test --- pandas/tests/dtypes/test_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b075718a678d6..c9b61afb5eb25 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -869,7 +869,7 @@ def test_maybe_convert_objects_nullable_integer(self, exp): def test_maybe_convert_objects_nullable_none(self, dtype, val): # GH#50043 arr = np.array([val, None, 3], dtype="object") - result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=True) + result = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True) expected = IntegerArray( np.array([val, 0, 3], dtype=dtype), np.array([False, True, False]) ) From da4f4aec6421e5292b67c362b620ff1b5b1b6357 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Dec 2022 01:26:25 +0100 Subject: [PATCH 11/19] Fix test --- pandas/tests/io/test_sql.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 8d06abaf05293..bcc48d1ac8ca7 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2390,16 +2390,7 @@ class Test(BaseModel): assert list(df.columns) == ["id", "string_column"] def nullable_expected(self) -> DataFrame: - return DataFrame( - { - "a": Series([1, np.nan, 3], dtype="Int64"), - "b": Series([1, 2, 3], dtype="Int64"), - "c": Series([1.5, np.nan, 2.5], dtype="Float64"), - "d": Series([1.5, 2.0, 2.5], dtype="Float64"), - "e": Series([1, 0, pd.NA], dtype="Int64"), - "f": Series([1, 0, 1], dtype="Int64"), - } - ) + return super().nullable_expected().astype({"e": "Int64", "f": "Int64"}) @pytest.mark.db @@ -2428,6 +2419,9 @@ def setup_driver(cls): def test_default_type_conversion(self): pass + def nullable_expected(self) -> DataFrame: + return super().nullable_expected().astype({"e": "Int64", "f": "Int64"}) + @pytest.mark.db class TestPostgreSQLAlchemy(_TestSQLAlchemy): From 85c995ae9f08284e11a352cc9f65b251d29aaf9c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Dec 2022 01:30:39 +0100 Subject: [PATCH 12/19] Simplify --- pandas/_libs/lib.pyx | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 726fe79c2d702..462537af3383a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1309,16 +1309,12 @@ cdef class Seen: @property def is_bool(self): # i.e. not (anything but bool) - return not ( - self.datetime_ or self.datetimetz_ or self.timedelta_ or self.nat_ - or self.period_ or self.interval_ - or self.numeric_ or self.nan_ or self.null_ or self.object_ - ) + return self.is_bool_or_na and not (self.nan_ or self.null_) @property def is_bool_or_na(self): # i.e. not (anything but bool or missing values) - return not ( + return self.bool_ and not ( self.datetime_ or self.datetimetz_ or self.nat_ or self.timedelta_ or self.period_ or self.interval_ or self.numeric_ or self.object_ ) From 87743f319b94bd14de17b732f32b92bdf69790db Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Dec 2022 14:19:14 +0100 Subject: [PATCH 13/19] Implement string support --- pandas/core/internals/construction.py | 3 ++ pandas/tests/io/test_sql.py | 56 +++++++++++++++++++-------- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 326a92846d97f..401c9566baf19 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -57,6 +57,7 @@ FloatingArray, IntegerArray, ) +from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import ( ensure_wrapped_if_datetimelike, extract_array, @@ -1004,6 +1005,8 @@ def convert(arr): if arr.dtype == np.dtype("O"): # i.e. maybe_convert_objects didn't convert arr = maybe_infer_to_datetimelike(arr) + if use_nullable_dtypes and arr.dtype == np.dtype("O"): + arr = StringDtype().construct_array_type()._from_sequence(arr) elif use_nullable_dtypes and isinstance(arr, np.ndarray): if is_integer_dtype(arr.dtype): arr = IntegerArray(arr, np.zeros(arr.shape, dtype=np.bool_)) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index bcc48d1ac8ca7..eb15d285eb8aa 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -53,6 +53,10 @@ to_timedelta, ) import pandas._testing as tm +from pandas.core.arrays import ( + ArrowStringArray, + StringArray, +) from pandas.io import sql from pandas.io.sql import ( @@ -2266,7 +2270,8 @@ def test_get_engine_auto_error_message(self): pass # TODO(GH#36893) fill this in when we add more engines - def test_read_sql_nullable_dtypes(self): + @pytest.mark.parametrize("storage", ["pyarrow", "python"]) + def test_read_sql_nullable_dtypes(self, storage): # GH# table = "test" df = DataFrame( @@ -2277,24 +2282,41 @@ def test_read_sql_nullable_dtypes(self): "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": [True, False, None], "f": [True, False, True], + "g": ["a", "b", "c"], + "h": ["a", "b", None], } ) df.to_sql(table, self.conn, index=False, if_exists="replace") - result = pd.read_sql( - f"Select * from {table}", self.conn, use_nullable_dtypes=True - ) - expected = self.nullable_expected() + with pd.option_context("mode.string_storage", storage): + result = pd.read_sql( + f"Select * from {table}", self.conn, use_nullable_dtypes=True + ) + expected = self.nullable_expected(storage) tm.assert_frame_equal(result, expected) - iterator = pd.read_sql( - f"Select * from {table}", self.conn, use_nullable_dtypes=True, chunksize=3 - ) - expected = self.nullable_expected() - for result in iterator: - tm.assert_frame_equal(result, expected) + with pd.option_context("mode.string_storage", storage): + iterator = pd.read_sql( + f"Select * from {table}", + self.conn, + use_nullable_dtypes=True, + chunksize=3, + ) + expected = self.nullable_expected(storage) + for result in iterator: + tm.assert_frame_equal(result, expected) + + def nullable_expected(self, storage) -> DataFrame: + + if storage == "python": + string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) + string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) + + else: + pa = pytest.importorskip("pyarrow") + string_array = ArrowStringArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - def nullable_expected(self) -> DataFrame: return DataFrame( { "a": Series([1, np.nan, 3], dtype="Int64"), @@ -2303,6 +2325,8 @@ def nullable_expected(self) -> DataFrame: "d": Series([1.5, 2.0, 2.5], dtype="Float64"), "e": Series([True, False, pd.NA], dtype="boolean"), "f": Series([True, False, True], dtype="boolean"), + "g": string_array, + "h": string_array_na, } ) @@ -2389,8 +2413,8 @@ class Test(BaseModel): assert list(df.columns) == ["id", "string_column"] - def nullable_expected(self) -> DataFrame: - return super().nullable_expected().astype({"e": "Int64", "f": "Int64"}) + def nullable_expected(self, storage) -> DataFrame: + return super().nullable_expected(storage).astype({"e": "Int64", "f": "Int64"}) @pytest.mark.db @@ -2419,8 +2443,8 @@ def setup_driver(cls): def test_default_type_conversion(self): pass - def nullable_expected(self) -> DataFrame: - return super().nullable_expected().astype({"e": "Int64", "f": "Int64"}) + def nullable_expected(self, storage) -> DataFrame: + return super().nullable_expected(storage).astype({"e": "Int64", "f": "Int64"}) @pytest.mark.db From 3363466159963c8118f4fb0ee089c00ec85a816e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Dec 2022 14:43:19 +0100 Subject: [PATCH 14/19] Add support for table --- pandas/io/sql.py | 56 ++++++++++++++++++++++++++----------- pandas/tests/io/test_sql.py | 52 +++++++++++++++++++++++++--------- 2 files changed, 78 insertions(+), 30 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index e29b5ccbba45f..5110c2ff70d7e 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -140,6 +140,22 @@ def _parse_date_columns(data_frame, parse_dates): return data_frame +def _convert_arrays_to_dataframe( + data, + columns, + coerce_float: bool = True, + use_nullable_dtypes: bool = False, +) -> DataFrame: + content = lib.to_object_array_tuples(data) + arrays = convert_object_array( + list(content.T), + dtype=None, + coerce_float=coerce_float, + use_nullable_dtypes=use_nullable_dtypes, + ) + return DataFrame({col: val for col, val in zip(columns, arrays)}) + + def _wrap_result( data, columns, @@ -150,15 +166,9 @@ def _wrap_result( use_nullable_dtypes: bool = False, ): """Wrap result set of query in a DataFrame.""" - content = lib.to_object_array_tuples(data) - content = list(content.T) - content = convert_object_array( - content, - dtype=None, - coerce_float=coerce_float, - use_nullable_dtypes=use_nullable_dtypes, + frame = _convert_arrays_to_dataframe( + data, columns, coerce_float, use_nullable_dtypes ) - frame = DataFrame({col: val for col, val in zip(columns, content)}, columns=columns) if dtype: frame = frame.astype(dtype) @@ -606,6 +616,7 @@ def read_sql( parse_dates=parse_dates, columns=columns, chunksize=chunksize, + use_nullable_dtypes=use_nullable_dtypes, ) else: return pandas_sql.read_query( @@ -1003,6 +1014,7 @@ def _query_iterator( columns, coerce_float: bool = True, parse_dates=None, + use_nullable_dtypes: bool = False, ): """Return generator through chunked result set.""" has_read_data = False @@ -1016,11 +1028,13 @@ def _query_iterator( break has_read_data = True - self.frame = DataFrame.from_records( - data, columns=columns, coerce_float=coerce_float + self.frame = _convert_arrays_to_dataframe( + data, columns, coerce_float, use_nullable_dtypes ) - self._harmonize_columns(parse_dates=parse_dates) + self._harmonize_columns( + parse_dates=parse_dates, use_nullable_dtypes=use_nullable_dtypes + ) if self.index is not None: self.frame.set_index(self.index, inplace=True) @@ -1033,6 +1047,7 @@ def read( parse_dates=None, columns=None, chunksize=None, + use_nullable_dtypes: bool = False, ) -> DataFrame | Iterator[DataFrame]: from sqlalchemy import select @@ -1054,14 +1069,17 @@ def read( column_names, coerce_float=coerce_float, parse_dates=parse_dates, + use_nullable_dtypes=use_nullable_dtypes, ) else: data = result.fetchall() - self.frame = DataFrame.from_records( - data, columns=column_names, coerce_float=coerce_float + self.frame = _convert_arrays_to_dataframe( + data, column_names, coerce_float, use_nullable_dtypes ) - self._harmonize_columns(parse_dates=parse_dates) + self._harmonize_columns( + parse_dates=parse_dates, use_nullable_dtypes=use_nullable_dtypes + ) if self.index is not None: self.frame.set_index(self.index, inplace=True) @@ -1144,7 +1162,9 @@ def _create_table_setup(self): meta = MetaData() return Table(self.name, meta, *columns, schema=schema) - def _harmonize_columns(self, parse_dates=None) -> None: + def _harmonize_columns( + self, parse_dates=None, use_nullable_dtypes: bool = False + ) -> None: """ Make the DataFrame's column types align with the SQL table column types. @@ -1184,11 +1204,11 @@ def _harmonize_columns(self, parse_dates=None) -> None: # Convert tz-aware Datetime SQL columns to UTC utc = col_type is DatetimeTZDtype self.frame[col_name] = _handle_date_column(df_col, utc=utc) - elif col_type is float: + elif not use_nullable_dtypes and col_type is float: # floats support NA, can always convert! self.frame[col_name] = df_col.astype(col_type, copy=False) - elif len(df_col) == df_col.count(): + elif not use_nullable_dtypes and len(df_col) == df_col.count(): # No NA values, can convert ints and bools if col_type is np.dtype("int64") or col_type is bool: self.frame[col_name] = df_col.astype(col_type, copy=False) @@ -1487,6 +1507,7 @@ def read_table( columns=None, schema: str | None = None, chunksize: int | None = None, + use_nullable_dtypes: bool = False, ) -> DataFrame | Iterator[DataFrame]: """ Read SQL database table into a DataFrame. @@ -1537,6 +1558,7 @@ def read_table( parse_dates=parse_dates, columns=columns, chunksize=chunksize, + use_nullable_dtypes=use_nullable_dtypes, ) @staticmethod diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index eb15d285eb8aa..b0cf8d2de7384 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2272,20 +2272,9 @@ def test_get_engine_auto_error_message(self): @pytest.mark.parametrize("storage", ["pyarrow", "python"]) def test_read_sql_nullable_dtypes(self, storage): - # GH# + # GH#50048 table = "test" - df = DataFrame( - { - "a": Series([1, np.nan, 3], dtype="Int64"), - "b": Series([1, 2, 3], dtype="Int64"), - "c": Series([1.5, np.nan, 2.5], dtype="Float64"), - "d": Series([1.5, 2.0, 2.5], dtype="Float64"), - "e": [True, False, None], - "f": [True, False, True], - "g": ["a", "b", "c"], - "h": ["a", "b", None], - } - ) + df = self.nullable_data() df.to_sql(table, self.conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", storage): @@ -2306,6 +2295,43 @@ def test_read_sql_nullable_dtypes(self, storage): for result in iterator: tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("storage", ["pyarrow", "python"]) + def test_read_sql_nullable_dtypes_table(self, storage): + # GH#50048 + table = "test" + df = self.nullable_data() + df.to_sql(table, self.conn, index=False, if_exists="replace") + + with pd.option_context("mode.string_storage", storage): + result = pd.read_sql(table, self.conn, use_nullable_dtypes=True) + expected = self.nullable_expected(storage) + tm.assert_frame_equal(result, expected) + + with pd.option_context("mode.string_storage", storage): + iterator = pd.read_sql( + f"Select * from {table}", + self.conn, + use_nullable_dtypes=True, + chunksize=3, + ) + expected = self.nullable_expected(storage) + for result in iterator: + tm.assert_frame_equal(result, expected) + + def nullable_data(self) -> DataFrame: + return DataFrame( + { + "a": Series([1, np.nan, 3], dtype="Int64"), + "b": Series([1, 2, 3], dtype="Int64"), + "c": Series([1.5, np.nan, 2.5], dtype="Float64"), + "d": Series([1.5, 2.0, 2.5], dtype="Float64"), + "e": [True, False, None], + "f": [True, False, True], + "g": ["a", "b", "c"], + "h": ["a", "b", None], + } + ) + def nullable_expected(self, storage) -> DataFrame: if storage == "python": From a0233cd35389087c8527a03e7b33c6bfae7a586a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Dec 2022 14:45:00 +0100 Subject: [PATCH 15/19] Add docstring --- pandas/io/sql.py | 7 +++++++ pandas/tests/io/test_sql.py | 2 ++ 2 files changed, 9 insertions(+) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 5110c2ff70d7e..bcf7b65d8aa53 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1330,6 +1330,7 @@ def read_table( columns=None, schema: str | None = None, chunksize: int | None = None, + use_nullable_dtypes: bool = False, ) -> DataFrame | Iterator[DataFrame]: raise NotImplementedError @@ -1540,6 +1541,12 @@ def read_table( chunksize : int, default None If specified, return an iterator where `chunksize` is the number of rows to include in each chunk. + use_nullable_dtypes : bool = False + Whether to use nullable dtypes as default when reading data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. + + .. versionadded:: 2.0 Returns ------- diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index b0cf8d2de7384..cd5729b9931a6 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2334,6 +2334,8 @@ def nullable_data(self) -> DataFrame: def nullable_expected(self, storage) -> DataFrame: + string_array: StringArray | ArrowStringArray + string_array_na: StringArray | ArrowStringArray if storage == "python": string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) From 58561d7c26354a99f5b7d1112e94d3c52890989f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Dec 2022 14:45:46 +0100 Subject: [PATCH 16/19] Add whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d8609737b8c7a..060acabc2a489 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -37,6 +37,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_csv` * :func:`read_excel` +* :func:`read_sql` Additionally a new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions to select the nullable dtypes implementation. From 65408592c608c66985c76dc914eb85c7a41140af Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Dec 2022 15:59:07 +0100 Subject: [PATCH 17/19] Fix tests --- pandas/io/sql.py | 5 ++++- pandas/tests/io/test_sql.py | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index bcf7b65d8aa53..69d9c8faa3360 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -153,7 +153,10 @@ def _convert_arrays_to_dataframe( coerce_float=coerce_float, use_nullable_dtypes=use_nullable_dtypes, ) - return DataFrame({col: val for col, val in zip(columns, arrays)}) + if arrays: + return DataFrame({col: val for col, val in zip(columns, arrays)}) + else: + return DataFrame(columns=columns) def _wrap_result( diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index cd5729b9931a6..db37b1785af5c 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2444,6 +2444,11 @@ class Test(BaseModel): def nullable_expected(self, storage) -> DataFrame: return super().nullable_expected(storage).astype({"e": "Int64", "f": "Int64"}) + @pytest.mark.parametrize("storage", ["pyarrow", "python"]) + def test_read_sql_nullable_dtypes_table(self, storage): + # GH#50048 Not supported for sqlite + pass + @pytest.mark.db class TestMySQLAlchemy(_TestSQLAlchemy): From 80b61e31b669bc5b91456b8aa71969f673a47206 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 5 Dec 2022 11:43:02 +0100 Subject: [PATCH 18/19] Fix pylint --- pandas/io/sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 69d9c8faa3360..ca6440da56a19 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -154,7 +154,7 @@ def _convert_arrays_to_dataframe( use_nullable_dtypes=use_nullable_dtypes, ) if arrays: - return DataFrame({col: val for col, val in zip(columns, arrays)}) + return DataFrame(dict(zip(columns, arrays))) else: return DataFrame(columns=columns) From 940bca7f45a15d8b64b2a6353d9af65c9ca79f7e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 5 Dec 2022 12:49:38 +0100 Subject: [PATCH 19/19] Fix docstring --- pandas/io/sql.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index ca6440da56a19..4c1dca180c6e9 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -524,6 +524,7 @@ def read_sql( implementation, even if no nulls are present. .. versionadded:: 2.0 + Returns ------- DataFrame or Iterator[DataFrame]