From acaa8c4a15f5322f4cc84f1a68d2f793b34b74c2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 7 Apr 2020 08:50:08 +0200 Subject: [PATCH 1/3] PERF: improve IntegerArray fast constructor --- pandas/core/arrays/integer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 4f3c68aa03b16..e02a61689d3a5 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -343,12 +343,12 @@ def dtype(self) -> _IntegerDtype: return _dtypes[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): - if not (isinstance(values, np.ndarray) and is_integer_dtype(values.dtype)): + if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]): raise TypeError( "values should be integer numpy array. Use " "the 'integer_array' function instead" ) - if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)): + if not (isinstance(mask, np.ndarray) and mask.dtype.kind == "b"): raise TypeError( "mask should be boolean numpy array. Use " "the 'integer_array' function instead" From 357a59f7e0b2172b43dc13637d640f896656b719 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 7 Apr 2020 10:48:15 +0200 Subject: [PATCH 2/3] add benchmarks --- asv_bench/benchmarks/array.py | 18 ++++++++++++++++++ pandas/core/arrays/integer.py | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 8cbf8c8592661..103df0fd94847 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -9,6 +9,11 @@ def setup(self): self.values_float = np.array([1.0, 0.0, 1.0, 0.0]) self.values_integer = np.array([1, 0, 1, 0]) self.values_integer_like = [1, 0, 1, 0] + self.data = np.array([True, False, True, False]) + self.mask = np.array([False, False, True, False]) + + def time_constructor(self): + pd.arrays.BooleanArray(self.data, self.mask) def time_from_bool_array(self): pd.array(self.values_bool, dtype="boolean") @@ -21,3 +26,16 @@ def time_from_integer_like(self): def time_from_float_array(self): pd.array(self.values_float, dtype="boolean") + + +class IntegerArray: + def setup(self): + self.values_integer = np.array([1, 0, 1, 0]) + self.data = np.array([1, 2, 3, 4], dtype="int64") + self.mask = np.array([False, False, True, False]) + + def time_constructor(self): + pd.arrays.IntegerArray(self.data, self.mask) + + def time_from_integer_array(self): + pd.array(self.values_integer, dtype="Int64") diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index e02a61689d3a5..fce42fb5623f0 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -348,7 +348,7 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): "values should be integer numpy array. Use " "the 'integer_array' function instead" ) - if not (isinstance(mask, np.ndarray) and mask.dtype.kind == "b"): + if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): raise TypeError( "mask should be boolean numpy array. Use " "the 'integer_array' function instead" From d1cc90ec27bb042378df31cebff3415668e303fa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Apr 2020 10:34:16 +0200 Subject: [PATCH 3/3] consolidate common checks in the base class --- pandas/core/arrays/boolean.py | 12 +----------- pandas/core/arrays/integer.py | 7 +------ pandas/core/arrays/masked.py | 11 +++++++++++ pandas/tests/arrays/integer/test_construction.py | 2 +- 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index e85534def6b97..b78a10efa04a0 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -271,18 +271,8 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): raise TypeError( "values should be boolean numpy array. Use " - "the 'array' function instead" + "the 'pd.array' function instead" ) - if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): - raise TypeError( - "mask should be boolean numpy array. Use " - "the 'array' function instead" - ) - if not values.ndim == 1: - raise ValueError("values must be a 1D array") - if not mask.ndim == 1: - raise ValueError("mask must be a 1D array") - self._dtype = BooleanDtype() super().__init__(values, mask, copy=copy) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index f01dd3a4f286f..5d6f49852e696 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -346,12 +346,7 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]): raise TypeError( "values should be integer numpy array. Use " - "the 'integer_array' function instead" - ) - if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): - raise TypeError( - "mask should be boolean numpy array. Use " - "the 'integer_array' function instead" + "the 'pd.array' function instead" ) super().__init__(values, mask, copy=copy) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d23d26d870f75..fc5b307bd5754 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -30,6 +30,17 @@ class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): _internal_fill_value: Scalar def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + # values is supposed to already be validated in the subclass + if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + raise TypeError( + "mask should be boolean numpy array. Use " + "the 'pd.array' function instead" + ) + if not values.ndim == 1: + raise ValueError("values must be a 1D array") + if not mask.ndim == 1: + raise ValueError("mask must be a 1D array") + if copy: values = values.copy() mask = mask.copy() diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index 4a62a35e23d93..43936d8b95bd6 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -70,7 +70,7 @@ def test_integer_array_constructor(): expected = integer_array([1, 2, 3, np.nan], dtype="int64") tm.assert_extension_array_equal(result, expected) - msg = r".* should be .* numpy array. Use the 'integer_array' function instead" + msg = r".* should be .* numpy array. Use the 'pd.array' function instead" with pytest.raises(TypeError, match=msg): IntegerArray(values.tolist(), mask)