From 7e4853156e730881ed198ecb1485ac203e9d8741 Mon Sep 17 00:00:00 2001 From: Anna Daglis Date: Fri, 6 Mar 2020 12:29:34 +0000 Subject: [PATCH 1/6] Fixed bug, where BooleanDtype columns were converted to Int64 --- doc/source/whatsnew/v1.0.2.rst | 1 + pandas/core/dtypes/cast.py | 6 ++++-- pandas/tests/series/methods/test_convert_dtypes.py | 13 +++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 808e6ae709ce9..ebdf0c09e0b10 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -83,6 +83,7 @@ Bug fixes - Fixed bug in setting values using a slice indexer with string dtype (:issue:`31772`) - Fixed bug where :meth:`GroupBy.first` and :meth:`GroupBy.last` would raise a ``TypeError`` when groups contained ``pd.NA`` in a column of object dtype (:issue:`32123`) - Fix bug in :meth:`Series.convert_dtypes` for series with mix of integers and strings (:issue:`32117`) +- Fixed bug in :meth:`DataFrame.convert_dtypes`, where ``BooleanDtype`` columns were converted to ``Int64`` (:issue:`32287`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c06bd8a1d6e36..c83759564188f 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -6,6 +6,7 @@ import numpy as np +import pandas as pd from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import ( NaT, @@ -1078,8 +1079,9 @@ def convert_dtypes( inferred_dtype = input_array.dtype if convert_boolean: - if is_bool_dtype(input_array.dtype) and not is_extension_array_dtype( - input_array.dtype + if is_bool_dtype(input_array.dtype) and ( + (not is_extension_array_dtype(input_array.dtype)) + or (input_array.dtype == pd.BooleanDtype()) ): inferred_dtype = "boolean" else: diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 17527a09f07a1..946b78184b395 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -279,3 +279,16 @@ def test_convert_string_dtype(self): ) result = df.convert_dtypes() tm.assert_frame_equal(df, result) + + def test_convert_bool_dtype(self): + # GH32287 + df = pd.DataFrame([["abc", 123, True]]) + exp_dtypes = pd.Series([np.object, np.int64, np.bool]) + tm.assert_series_equal(df.dtypes, exp_dtypes) + + df_1 = df.convert_dtypes() + exp_dtypes_1 = pd.Series([pd.StringDtype(), pd.Int64Dtype(), pd.BooleanDtype()]) + tm.assert_series_equal(df_1.dtypes, exp_dtypes_1) + + df_2 = df_1.convert_dtypes() + tm.assert_series_equal(df_2.dtypes, exp_dtypes_1) From 06fcb7f7fe7c63581c98d676de83ea4fd9fa1642 Mon Sep 17 00:00:00 2001 From: Anna Daglis Date: Fri, 6 Mar 2020 13:10:36 +0000 Subject: [PATCH 2/6] Fix linting --- pandas/core/dtypes/cast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c83759564188f..726b83a76b830 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -6,7 +6,6 @@ import numpy as np -import pandas as pd from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import ( NaT, @@ -71,6 +70,8 @@ from pandas.core.dtypes.inference import is_list_like from pandas.core.dtypes.missing import isna, notna +import pandas as pd + _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max _int32_max = np.iinfo(np.int32).max From bfcda0d85217cbeaab2fc5089ebf56bbb1bd547a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 10 Mar 2020 09:30:57 -0500 Subject: [PATCH 3/6] Simplify --- pandas/core/dtypes/cast.py | 14 ++++---------- pandas/tests/series/methods/test_convert_dtypes.py | 12 ++---------- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7f2ecf470321b..97c02428cbdf9 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -70,8 +70,6 @@ from pandas.core.dtypes.inference import is_list_like from pandas.core.dtypes.missing import isna, notna -import pandas as pd - _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max _int32_max = np.iinfo(np.int32).max @@ -1051,7 +1049,8 @@ def convert_dtypes( dtype new dtype """ - if convert_string or convert_integer or convert_boolean: + is_extension = is_extension_array_dtype(input_array.dtype) + if (convert_string or convert_integer or convert_boolean) and not is_extension: try: inferred_dtype = lib.infer_dtype(input_array) except ValueError: @@ -1064,9 +1063,7 @@ def convert_dtypes( if convert_integer: target_int_dtype = "Int64" - if is_integer_dtype(input_array.dtype) and not is_extension_array_dtype( - input_array.dtype - ): + if is_integer_dtype(input_array.dtype): from pandas.core.arrays.integer import _dtypes inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype) @@ -1080,10 +1077,7 @@ def convert_dtypes( inferred_dtype = input_array.dtype if convert_boolean: - if is_bool_dtype(input_array.dtype) and ( - (not is_extension_array_dtype(input_array.dtype)) - or (input_array.dtype == pd.BooleanDtype()) - ): + if is_bool_dtype(input_array.dtype): inferred_dtype = "boolean" else: if isinstance(inferred_dtype, str) and inferred_dtype == "boolean": diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 946b78184b395..a41f893e3753f 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -282,13 +282,5 @@ def test_convert_string_dtype(self): def test_convert_bool_dtype(self): # GH32287 - df = pd.DataFrame([["abc", 123, True]]) - exp_dtypes = pd.Series([np.object, np.int64, np.bool]) - tm.assert_series_equal(df.dtypes, exp_dtypes) - - df_1 = df.convert_dtypes() - exp_dtypes_1 = pd.Series([pd.StringDtype(), pd.Int64Dtype(), pd.BooleanDtype()]) - tm.assert_series_equal(df_1.dtypes, exp_dtypes_1) - - df_2 = df_1.convert_dtypes() - tm.assert_series_equal(df_2.dtypes, exp_dtypes_1) + df = pd.DataFrame({"A": pd.array([True])}) + tm.assert_frame_equal(df, df.convert_dtypes()) From 934147b492d265cfd798c5288821f665c82a7a7c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 10 Mar 2020 09:42:04 -0500 Subject: [PATCH 4/6] BUG: Fixed failure to copy in astype --- doc/source/whatsnew/v1.0.2.rst | 1 + pandas/core/arrays/datetimes.py | 3 +++ pandas/core/internals/blocks.py | 3 +++ pandas/tests/arrays/test_datetimes.py | 12 ++++++++++++ 4 files changed, 19 insertions(+) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 7e961487e7209..0a9a9c226417c 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -64,6 +64,7 @@ Bug fixes **Datetimelike** - Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when reindexing with a tz-aware index (:issue:`26683`) +- Bug in :meth:`Series.astype` not copying for tz-naive and tz-aware datetime64 dtyep (:issue:`32490`) - Bug where :func:`to_datetime` would raise when passed ``pd.NA`` (:issue:`32213`) **Categorical** diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 9f19c7ba0be6e..eb58e22968be6 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -579,6 +579,7 @@ def astype(self, dtype, copy=True): # --> datetime # --> period # DatetimeLikeArrayMixin Super handles the rest. + # breakpoint() dtype = pandas_dtype(dtype) if is_datetime64_ns_dtype(dtype) and not is_dtype_equal(dtype, self.dtype): @@ -587,6 +588,8 @@ def astype(self, dtype, copy=True): if getattr(self.dtype, "tz", None) is None: return self.tz_localize(new_tz) result = self.tz_convert(new_tz) + if copy: + result = result.copy() if new_tz is None: # Do we want .astype('datetime64[ns]') to be an ndarray. # The astype in Block._astype expects this to return an diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c088b7020927b..bce440fe09319 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2228,6 +2228,9 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): # if we are passed a datetime64[ns, tz] if is_datetime64tz_dtype(dtype): values = self.values + if copy: + # this should be the only copy + values = values.copy() if getattr(values, "tz", None) is None: values = DatetimeArray(values).tz_localize("UTC") values = values.tz_convert(dtype.tz) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index a59ed429cc404..2c26e72a245f7 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -151,6 +151,18 @@ def test_astype_to_same(self): result = arr.astype(DatetimeTZDtype(tz="US/Central"), copy=False) assert result is arr + @pytest.mark.parametrize("dtype", ["datetime64[ns]", "datetime64[ns, UTC]"]) + @pytest.mark.parametrize( + "other", ["datetime64[ns]", "datetime64[ns, UTC]", "datetime64[ns, CET]"] + ) + def test_astype_copies(self, dtype, other): + # https://github.com/pandas-dev/pandas/pull/32490 + s = pd.Series([1, 2], dtype=dtype) + orig = s.copy() + t = s.astype(other) + t[:] = pd.NaT + tm.assert_series_equal(s, orig) + @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype_int(self, dtype): arr = DatetimeArray._from_sequence([pd.Timestamp("2000"), pd.Timestamp("2001")]) From 3e6f3a7cd14e5f200ef3e97cc231852cc8ff46a9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 10 Mar 2020 09:51:06 -0500 Subject: [PATCH 5/6] remove breakpoint --- pandas/core/arrays/datetimes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index eb58e22968be6..7223eda22b3d9 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -579,7 +579,6 @@ def astype(self, dtype, copy=True): # --> datetime # --> period # DatetimeLikeArrayMixin Super handles the rest. - # breakpoint() dtype = pandas_dtype(dtype) if is_datetime64_ns_dtype(dtype) and not is_dtype_equal(dtype, self.dtype): From b1f964a4d4620a50fbfdc7d542dbc90e2f316796 Mon Sep 17 00:00:00 2001 From: Anna Daglis <40292327+AnnaDaglis@users.noreply.github.com> Date: Tue, 10 Mar 2020 14:53:51 +0000 Subject: [PATCH 6/6] Update v1.0.2.rst --- doc/source/whatsnew/v1.0.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 0a9a9c226417c..7a25f7b58c2e7 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -64,7 +64,7 @@ Bug fixes **Datetimelike** - Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` when reindexing with a tz-aware index (:issue:`26683`) -- Bug in :meth:`Series.astype` not copying for tz-naive and tz-aware datetime64 dtyep (:issue:`32490`) +- Bug in :meth:`Series.astype` not copying for tz-naive and tz-aware datetime64 dtype (:issue:`32490`) - Bug where :func:`to_datetime` would raise when passed ``pd.NA`` (:issue:`32213`) **Categorical**