From ee64bd67a6083fc0074c31c8f974d7eb61431216 Mon Sep 17 00:00:00 2001 From: yuwang Date: Thu, 5 Dec 2019 16:35:11 -0500 Subject: [PATCH 1/7] PERF: improve conversion to BooleanArray from int/float array --- pandas/core/arrays/boolean.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index c118b6fe26549..ba2d62bcbf796 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -130,9 +130,18 @@ def coerce_to_array(values, mask=None, copy: bool = False): if isinstance(values, np.ndarray) and values.dtype == np.bool_: if copy: values = values.copy() + elif isinstance(values, np.ndarray) and values.dtype in (np.int_, np.float_): + values_copy = values.copy() + + mask_values = isna(values) + values = np.zeros(len(values), dtype=bool) + values[~mask_values] = values_copy[~mask_values].astype(bool) + + if not np.all( + values[~mask_values].astype(values.dtype) == values_copy[~mask_values] + ): + raise TypeError("Need to pass bool-like values") else: - # TODO conversion from integer/float ndarray can be done more efficiently - # (avoid roundtrip through object) values_object = np.asarray(values, dtype=object) inferred_dtype = lib.infer_dtype(values_object, skipna=True) From 3da324eba5a9ce467e0369989f2b5708625cd4ff Mon Sep 17 00:00:00 2001 From: yuwang Date: Fri, 6 Dec 2019 00:19:09 -0500 Subject: [PATCH 2/7] DOC: add issue number into whatsnew list --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 1a07b424fa884..5261b30ef9c1e 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -156,7 +156,7 @@ type dedicated to boolean data that can hold missing values. With the default ``'bool`` data type based on a numpy bool array, the column can only hold True or False values and not missing values. 
This new :class:`BooleanDtype` can store missing values as well by keeping track of this in a separate mask. -(:issue:`29555`) +(:issue:`29555`, :issue:`30095`) .. ipython:: python From 788820fd9ec06d2a0074aef3431bed282d19faf0 Mon Sep 17 00:00:00 2001 From: yuwang Date: Fri, 6 Dec 2019 00:20:09 -0500 Subject: [PATCH 3/7] PERF: remove redundant copy() in BooleanArray --- pandas/core/arrays/boolean.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index cda618317bf6d..583bb1723b390 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -131,16 +131,17 @@ def coerce_to_array(values, mask=None, copy: bool = False): if copy: values = values.copy() elif isinstance(values, np.ndarray) and values.dtype in (np.int_, np.float_): - values_copy = values.copy() - mask_values = isna(values) - values = np.zeros(len(values), dtype=bool) - values[~mask_values] = values_copy[~mask_values].astype(bool) + + values_bool = np.zeros(len(values), dtype=bool) + values_bool[~mask_values] = values[~mask_values].astype(bool) if not np.all( - values[~mask_values].astype(values.dtype) == values_copy[~mask_values] + values_bool[~mask_values].astype(values.dtype) == values[~mask_values] ): raise TypeError("Need to pass bool-like values") + + values = values_bool else: values_object = np.asarray(values, dtype=object) From 2d5455cbb14c39093396c6a21d67c99dc52175e9 Mon Sep 17 00:00:00 2001 From: yuwang Date: Fri, 6 Dec 2019 00:21:21 -0500 Subject: [PATCH 4/7] TST: Update some tests of boolean array from int/float np.ndarray --- pandas/tests/arrays/test_boolean.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index a13bb8edc8e48..6f45e6318664d 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -133,24 +133,37 @@ def
test_to_boolean_array_error(values): pd.array(values, dtype="boolean") -def test_to_boolean_array_integer_like(): - # integers of 0's and 1's - result = pd.array([1, 0, 1, 0], dtype="boolean") +def test_to_boolean_array_from_integer_array(): + result = pd.array(np.array([1, 0, 1, 0]), dtype="boolean") expected = pd.array([True, False, True, False], dtype="boolean") tm.assert_extension_array_equal(result, expected) - result = pd.array(np.array([1, 0, 1, 0]), dtype="boolean") + # with missing values + result = pd.array(np.array([1, 0, 1, None]), dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") tm.assert_extension_array_equal(result, expected) + +def test_to_boolean_array_from_float_array(): result = pd.array(np.array([1.0, 0.0, 1.0, 0.0]), dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") tm.assert_extension_array_equal(result, expected) # with missing values - result = pd.array([1, 0, 1, None], dtype="boolean") + result = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean") expected = pd.array([True, False, True, None], dtype="boolean") tm.assert_extension_array_equal(result, expected) - result = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean") + +def test_to_boolean_array_integer_like(): + # integers of 0's and 1's + result = pd.array([1, 0, 1, 0], dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array([1, 0, 1, None], dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") tm.assert_extension_array_equal(result, expected) From 5c3e4aae5ebee6cd2ccf34d5bdc19c329b7e96f9 Mon Sep 17 00:00:00 2001 From: yuwang Date: Fri, 6 Dec 2019 16:58:51 -0500 Subject: [PATCH 5/7] STY: Change values.dtype comparison --- pandas/core/arrays/boolean.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/boolean.py
b/pandas/core/arrays/boolean.py index 583bb1723b390..349cbd1919e76 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -17,6 +17,7 @@ is_integer, is_integer_dtype, is_list_like, + is_numeric_dtype, is_scalar, pandas_dtype, ) @@ -130,7 +131,7 @@ def coerce_to_array(values, mask=None, copy: bool = False): if isinstance(values, np.ndarray) and values.dtype == np.bool_: if copy: values = values.copy() - elif isinstance(values, np.ndarray) and values.dtype in (np.int_, np.float_): + elif isinstance(values, np.ndarray) and is_numeric_dtype(values.dtype): mask_values = isna(values) values_bool = np.zeros(len(values), dtype=bool) From 66fa55969def9a930179764a7e0022b64aa05ab5 Mon Sep 17 00:00:00 2001 From: yuwang Date: Sun, 8 Dec 2019 13:11:15 -0500 Subject: [PATCH 6/7] TST: Add exception test from np.array --- asv_bench/benchmarks/array.py | 0 pandas/tests/arrays/test_boolean.py | 2 ++ 2 files changed, 2 insertions(+) create mode 100644 asv_bench/benchmarks/array.py diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index 6f45e6318664d..2b946a2a925d5 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -124,6 +124,8 @@ def test_to_boolean_array_missing_indicators(a, b): [1.0, 2.0], pd.date_range("20130101", periods=2), np.array(["foo"]), + np.array([1, 2]), + np.array([1.0, 2.0]), [np.nan, {"a": 1}], ], ) From 3dcbbd6156acd4db1dc18a71dcab7c9b925b7960 Mon Sep 17 00:00:00 2001 From: yuwang Date: Sun, 8 Dec 2019 13:13:40 -0500 Subject: [PATCH 7/7] ASV: add array benchmark --- asv_bench/benchmarks/array.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index e69de29bb2d1d..8cbf8c8592661 100644 --- a/asv_bench/benchmarks/array.py +++ 
b/asv_bench/benchmarks/array.py @@ -0,0 +1,23 @@ +import numpy as np + +import pandas as pd + + +class BooleanArray: + def setup(self): + self.values_bool = np.array([True, False, True, False]) + self.values_float = np.array([1.0, 0.0, 1.0, 0.0]) + self.values_integer = np.array([1, 0, 1, 0]) + self.values_integer_like = [1, 0, 1, 0] + + def time_from_bool_array(self): + pd.array(self.values_bool, dtype="boolean") + + def time_from_integer_array(self): + pd.array(self.values_integer, dtype="boolean") + + def time_from_integer_like(self): + pd.array(self.values_integer_like, dtype="boolean") + + def time_from_float_array(self): + pd.array(self.values_float, dtype="boolean")