From 59b3467840509c916caf5f0f123c2a8ee74b5a4a Mon Sep 17 00:00:00 2001 From: Carsten van Weelden Date: Sat, 20 Jun 2020 12:33:23 +0200 Subject: [PATCH 1/7] Added test case --- pandas/tests/plotting/test_frame.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index e4299490e7601..c69b8df0b9455 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3363,6 +3363,23 @@ def test_colors_of_columns_with_same_name(self): for legend, line in zip(result.get_legend().legendHandles, result.lines): assert legend.get_color() == line.get_color() + def test_nullable_int_plot(self): + # GH 32073 + dates = ["2008", "2009", None, "2011", "2012"] + df = pd.DataFrame( + { + "A": [1, 2, 3, 4, 5], + "B": [7, 5, np.nan, 3, 2], + "C": pd.to_datetime(dates, format="%Y"), + "D": pd.to_datetime(dates, format="%Y", utc=True), + } + ) + + _check_plot_works(df.plot, x="A", y="B") + _check_plot_works(df[["A", "B"]].astype("Int64").plot, x="A", y="B") + _check_plot_works(df[["A", "C"]].plot, x="A", y="C") + _check_plot_works(df[["A", "D"]].plot, x="A", y="D") + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt From 5daa630f72083ed52024df28811bd7e37e5c1e91 Mon Sep 17 00:00:00 2001 From: Carsten van Weelden Date: Sat, 20 Jun 2020 14:56:36 +0200 Subject: [PATCH 2/7] Convert nullable integers to float before matplotlib call --- pandas/plotting/_matplotlib/core.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index f3682e0a008a6..19fe8983fd7d0 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -8,9 +8,11 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( + is_extension_array_dtype, is_float, is_hashable, is_integer, + is_integer_dtype, is_iterator, is_list_like, is_number, @@ -411,11 +413,22 @@ def _compute_plot_data(self): if is_empty: raise TypeError("no numeric data to plot") + def convert_to_ndarray(data): + # GH32073: cast to float if values can contain nulled integers + if ( + hasattr(data, "dtype") + and is_integer_dtype(data.dtype) + and is_extension_array_dtype(data.dtype) + ): + return data.to_numpy(dtype="float", na_value=np.nan) + else: + return np.asarray(data) + # GH25587: cast ExtensionArray of pandas (IntegerArray, etc.) to # np.ndarray before plot. numeric_data = numeric_data.copy() for col in numeric_data: - numeric_data[col] = np.asarray(numeric_data[col]) + numeric_data[col] = convert_to_ndarray(numeric_data[col]) self.data = numeric_data From 1c35f23835712780d73cb229f8290d48a86bbe53 Mon Sep 17 00:00:00 2001 From: Carsten van Weelden Date: Fri, 26 Jun 2020 17:42:35 +0200 Subject: [PATCH 3/7] Added safe_convert_to_numpy function --- pandas/core/dtypes/cast.py | 29 +++++++++++ pandas/core/groupby/groupby.py | 9 ++-- pandas/plotting/_matplotlib/core.py | 14 +---- .../cast/test_safe_convert_to_ndarray.py | 51 +++++++++++++++++++ 4 files changed, 85 insertions(+), 18 deletions(-) create mode 100644 pandas/tests/dtypes/cast/test_safe_convert_to_ndarray.py diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d0417d51da497..68bc650a82249 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1747,3 +1747,32 @@ def validate_numeric_casting(dtype: np.dtype, value): ): if is_bool(value): raise ValueError("Cannot assign bool to float/integer series") + + +def safe_convert_to_ndarray(values) -> np.ndarray: + """ + Convert values to ndarray while casting nullable dtype arrays to float. + + Parameters + ---------- + values + Series or array. + + Returns + ------- + converted_values : np.ndarray + Values cast to np.ndarray. + """ + if hasattr(values, "dtype") and is_extension_array_dtype(values.dtype): + if is_integer_dtype(values.dtype): + converted_values = values.to_numpy(dtype=float, na_value=np.nan) + elif is_bool_dtype(values.dtype): + converted_values = values.to_numpy(dtype=float, na_value=np.nan) + elif is_datetime64tz_dtype(values.dtype): + print(values) + converted_values = np.asarray(values.dt.tz_localize(tz=None)) + else: + converted_values = np.asarray(values) + else: + converted_values = np.asarray(values) + return converted_values diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 02f7f605a7605..9eacd9528a570 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -41,7 +41,7 @@ class providing the base-class of operations. from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly, doc -from pandas.core.dtypes.cast import maybe_cast_result +from pandas.core.dtypes.cast import maybe_cast_result, safe_convert_to_ndarray from pandas.core.dtypes.common import ( ensure_float, is_bool_dtype, @@ -2052,14 +2052,11 @@ def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]: inference = None if is_integer_dtype(vals.dtype): - if is_extension_array_dtype(vals.dtype): - vals = vals.to_numpy(dtype=float, na_value=np.nan) inference = np.int64 - elif is_bool_dtype(vals.dtype) and is_extension_array_dtype(vals.dtype): - vals = vals.to_numpy(dtype=float, na_value=np.nan) elif is_datetime64_dtype(vals.dtype): inference = "datetime64[ns]" - vals = np.asarray(vals).astype(float) + + vals = safe_convert_to_ndarray(vals) return vals, inference diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 19fe8983fd7d0..fdbdd553df305 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -7,6 +7,7 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly +from pandas.core.dtypes.cast import safe_convert_to_ndarray from pandas.core.dtypes.common import ( is_extension_array_dtype, is_float, @@ -413,22 +414,11 @@ def _compute_plot_data(self): if is_empty: raise TypeError("no numeric data to plot") - def convert_to_ndarray(data): - # GH32073: cast to float if values can contain nulled integers - if ( - hasattr(data, "dtype") - and is_integer_dtype(data.dtype) - and is_extension_array_dtype(data.dtype) - ): - return data.to_numpy(dtype="float", na_value=np.nan) - else: - return np.asarray(data) - # GH25587: cast ExtensionArray of pandas (IntegerArray, etc.) to # np.ndarray before plot. numeric_data = numeric_data.copy() for col in numeric_data: - numeric_data[col] = convert_to_ndarray(numeric_data[col]) + numeric_data[col] = safe_convert_to_ndarray(numeric_data[col]) self.data = numeric_data diff --git a/pandas/tests/dtypes/cast/test_safe_convert_to_ndarray.py b/pandas/tests/dtypes/cast/test_safe_convert_to_ndarray.py new file mode 100644 index 0000000000000..d47f4eeb56e74 --- /dev/null +++ b/pandas/tests/dtypes/cast/test_safe_convert_to_ndarray.py @@ -0,0 +1,51 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.cast import safe_convert_to_ndarray + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "values, expected", + [ + (pd.Series([1, 2, 3], dtype=int), np.array([1, 2, 3], dtype=int)), + ( + # Nullable integer type cast to float to handle missing values + pd.Series([1, np.NaN, 3], dtype="Int64"), + np.array([1, np.NaN, 3], dtype=float), + ), + ( + # Nullable boolean type cast to float to handle missing values + pd.Series([True, np.NaN, False], dtype="boolean"), + np.array([1.0, np.NaN, 0.0], dtype=float), + ), + ( + # Normal datetime cast not changed + pd.to_datetime([2001, None, 2003], format="%Y"), + np.array(["2001", "NaT", "2003"], dtype="datetime64").astype( + "datetime64[ns]" + ), + ), + ( + # Extended datetime should be downcast to normal datetime + pd.to_datetime([2001, None, 2003], format="%Y", utc=True), + np.array(["2001", "NaT", "2003"], dtype="datetime64").astype( + "datetime64[ns]" + ), + ), + ( + # Downcast to naive datetime should result in local dates, not UTC + pd.to_datetime([2001, None, 2003], format="%Y").tz_localize( + tz="US/Eastern" + ), + np.array(["2001", "NaT", "2003"], dtype="datetime64").astype( + "datetime64[ns]" + ), + ), + ], +) +def test_safe_convert_to_ndarray(values, expected): + result = safe_convert_to_ndarray(values) + tm.assert_numpy_array_equal(result, expected) From eee0e2aa17b2d4574c7a4be885a3cc3be9e49d54 Mon Sep 17 00:00:00 2001 From: Carsten van Weelden Date: Fri, 26 Jun 2020 18:09:21 +0200 Subject: [PATCH 4/7] Remove now-unused imports --- pandas/core/groupby/groupby.py | 2 -- pandas/plotting/_matplotlib/core.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c06412d4572a9..61d26e57f0469 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -44,9 +44,7 @@ class providing the base-class of operations. from pandas.core.dtypes.cast import maybe_cast_result, safe_convert_to_ndarray from pandas.core.dtypes.common import ( ensure_float, - is_bool_dtype, is_datetime64_dtype, - is_extension_array_dtype, is_integer_dtype, is_numeric_dtype, is_object_dtype, diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 09d40c30f3d54..b3aaa78020751 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -11,11 +11,9 @@ from pandas.core.dtypes.cast import safe_convert_to_ndarray from pandas.core.dtypes.common import ( - is_extension_array_dtype, is_float, is_hashable, is_integer, - is_integer_dtype, is_iterator, is_list_like, is_number, From 3b6b25769dca4b393ccf5561513d0ac06e9c49cb Mon Sep 17 00:00:00 2001 From: Carsten van Weelden Date: Sat, 27 Jun 2020 18:10:58 +0200 Subject: [PATCH 5/7] Only pass values to safe_convert_to_ndarray --- pandas/core/dtypes/cast.py | 3 +-- pandas/plotting/_matplotlib/core.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 68bc650a82249..fe8f32a2e8a51 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1769,8 +1769,7 @@ def safe_convert_to_ndarray(values) -> np.ndarray: elif is_bool_dtype(values.dtype): converted_values = values.to_numpy(dtype=float, na_value=np.nan) elif is_datetime64tz_dtype(values.dtype): - print(values) - converted_values = np.asarray(values.dt.tz_localize(tz=None)) + converted_values = np.asarray(values.tz_localize(tz=None)) else: converted_values = np.asarray(values) else: diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index b3aaa78020751..5a90477b3a332 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -422,7 +422,7 @@ def _compute_plot_data(self): # np.ndarray before plot. numeric_data = numeric_data.copy() for col in numeric_data: - numeric_data[col] = safe_convert_to_ndarray(numeric_data[col]) + numeric_data[col] = safe_convert_to_ndarray(numeric_data[col].values) self.data = numeric_data From e4f21174c90dc0d4a9ce91acf5f2864ddc40f8e1 Mon Sep 17 00:00:00 2001 From: Carsten van Weelden Date: Sat, 27 Jun 2020 18:46:17 +0200 Subject: [PATCH 6/7] Updated typing and docstring --- pandas/core/dtypes/cast.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index fe8f32a2e8a51..d0a2ceb4ca31e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -3,7 +3,7 @@ """ from datetime import date, datetime, timedelta -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type, Union import numpy as np @@ -76,6 +76,7 @@ if TYPE_CHECKING: from pandas import Series from pandas.core.arrays import ExtensionArray # noqa: F401 + from pandas.core.indexes.base import Index # noqa: F401 _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max @@ -1749,14 +1750,18 @@ def validate_numeric_casting(dtype: np.dtype, value): raise ValueError("Cannot assign bool to float/integer series") -def safe_convert_to_ndarray(values) -> np.ndarray: +def safe_convert_to_ndarray(values: Union[ArrayLike, Index]) -> np.ndarray: """ - Convert values to ndarray while casting nullable dtype arrays to float. + Converts values to ndarray with special handling for extension arrays. + + Cast to ndarray but tries to avoid returning an array of `object` dtype. + Nullable integer and boolean arrays will be cast to float, and datetime + arrays with timezone information will lose their timezone information. Parameters ---------- - values - Series or array. + values : Union[ArrayLike, Index] + Values to be converted to ndarray. Returns ------- From 8866c961ef15fd0358e11dd429ab41456168849c Mon Sep 17 00:00:00 2001 From: Carsten van Weelden Date: Sat, 27 Jun 2020 18:57:10 +0200 Subject: [PATCH 7/7] Use forward reference for Index type --- pandas/core/dtypes/cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d0a2ceb4ca31e..41c455ca41a6f 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1750,7 +1750,7 @@ def validate_numeric_casting(dtype: np.dtype, value): raise ValueError("Cannot assign bool to float/integer series") -def safe_convert_to_ndarray(values: Union[ArrayLike, Index]) -> np.ndarray: +def safe_convert_to_ndarray(values: Union[ArrayLike, "Index"]) -> np.ndarray: """ Converts values to ndarray with special handling for extension arrays.