From b0ea2f19166d4f0795f069bb4a70f3a1ecca7267 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 26 Jul 2018 12:12:56 +0200 Subject: [PATCH 1/6] Split fastpath IntegerArray constructor and general purpose constructor --- pandas/core/arrays/__init__.py | 2 +- pandas/core/arrays/integer.py | 40 +++++++++++-------- pandas/core/indexes/base.py | 2 +- pandas/core/series.py | 2 +- pandas/tests/extension/base/missing.py | 2 +- .../tests/extension/integer/test_integer.py | 25 ++++++------ 6 files changed, 40 insertions(+), 33 deletions(-) diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 9132c74091410..29f258bf1b29e 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -7,4 +7,4 @@ from .period import PeriodArrayMixin # noqa from .timedeltas import TimedeltaArrayMixin # noqa from .integer import ( # noqa - IntegerArray, to_integer_array) + IntegerArray, integer_array) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index c126117060c3d..3c45b9bf6a6cb 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -76,7 +76,7 @@ def construct_from_string(cls, string): "'{}'".format(cls, string)) -def to_integer_array(values, dtype=None): +def integer_array(values, dtype=None, copy=False): """ Infer and return an integer array of the values. @@ -94,7 +94,8 @@ def to_integer_array(values, dtype=None): ------ TypeError if incompatible types """ - return IntegerArray(values, dtype=dtype, copy=False) + values, mask = coerce_to_array(values, dtype=dtype, copy=copy) + return IntegerArray(values, mask) def safe_cast(values, dtype, copy): @@ -206,7 +207,7 @@ class IntegerArray(ExtensionArray, ExtensionOpsMixin): def dtype(self): return _dtypes[str(self._data.dtype)] - def __init__(self, values, mask=None, dtype=None, copy=False): + def __init__(self, values, mask, copy=False): """ Parameters ---------- @@ -219,25 +220,33 @@ def __init__(self, values, mask=None, dtype=None, copy=False): ------- IntegerArray """ - self._data, self._mask = coerce_to_array( - values, dtype=dtype, mask=mask, copy=copy) + if not (isinstance(values, np.ndarray) + and np.issubdtype(values.dtype, np.integer)): + raise TypeError("values should be integer numpy array") + if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + raise TypeError("mask should be boolean numpy array") + + if copy: + values = values.copy() + mask = mask.copy() + + self._data = values + self._mask = mask @classmethod def _from_sequence(cls, scalars, dtype=None, copy=False): - return cls(scalars, dtype=dtype, copy=copy) + return integer_array(scalars, dtype=dtype, copy=copy) @classmethod def _from_factorized(cls, values, original): - return cls(values, dtype=original.dtype) + return integer_array(values, dtype=original.dtype) def __getitem__(self, item): if is_integer(item): if self._mask[item]: return self.dtype.na_value return self._data[item] - return type(self)(self._data[item], - mask=self._mask[item], - dtype=self.dtype) + return type(self)(self._data[item], self._mask[item]) def _coerce_to_ndarray(self): """ @@ -294,7 +303,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): result[fill_mask] = fill_value mask = mask ^ fill_mask - return type(self)(result, mask=mask, dtype=self.dtype, copy=False) + return type(self)(result, mask, copy=False) def copy(self, deep=False): data, mask = self._data, self._mask @@ -304,7 +313,7 @@ def copy(self, deep=False): else: data = data.copy() mask = mask.copy() - return type(self)(data, mask, dtype=self.dtype, copy=False) + return type(self)(data, mask, copy=False) def __setitem__(self, key, value): _is_scalar = is_scalar(value) @@ -356,7 +365,7 @@ def _na_value(self): def _concat_same_type(cls, to_concat): data = np.concatenate([x._data for x in to_concat]) mask = np.concatenate([x._mask for x in to_concat]) - return cls(data, mask=mask, dtype=to_concat[0].dtype) + return cls(data, mask) def astype(self, dtype, copy=True): """Cast to a NumPy array or IntegerArray with 'dtype'. @@ -386,8 +395,7 @@ def astype(self, dtype, copy=True): if isinstance(dtype, _IntegerDtype): result = self._data.astype(dtype.numpy_dtype, casting='same_kind', copy=False) - return type(self)(result, mask=self._mask, - dtype=dtype, copy=False) + return type(self)(result, mask=self._mask, copy=False) # coerce data = self._coerce_to_ndarray() @@ -523,7 +531,7 @@ def _maybe_mask_result(self, result, mask, other, op_name): result[mask] = np.nan return result - return type(self)(result, mask=mask, dtype=self.dtype, copy=False) + return type(self)(result, mask, copy=False) @classmethod def _create_arithmetic_method(cls, op): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 20926ea5163af..0389367ee769d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -280,7 +280,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if not (dtype is None or is_object_dtype(dtype)): # coerce to the provided dtype - data = dtype.construct_array_type()( + data = dtype.construct_array_type()._from_sequence( data, dtype=dtype, copy=False) # coerce to the object dtype diff --git a/pandas/core/series.py b/pandas/core/series.py index d4c11b19082ab..9603a29b5d126 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4095,7 +4095,7 @@ def _try_cast(arr, take_fast_path): ordered=dtype.ordered) elif is_extension_array_dtype(dtype): # create an extension array from its dtype - array_type = dtype.construct_array_type() + array_type = dtype.construct_array_type()._from_sequence subarr = array_type(subarr, dtype=dtype, copy=copy) elif dtype is not None and raise_cast_failure: diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 43b2702c72193..6e18c9ff72267 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -94,7 +94,7 @@ def test_fillna_series_method(self, data_missing, method): fill_value = data_missing[1] if method == 'ffill': - data_missing = type(data_missing)(data_missing[::-1]) + data_missing = data_missing[::-1] result = pd.Series(data_missing).fillna(method=method) expected = pd.Series( diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index 451f7488bd38a..3d19978223c43 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -9,7 +9,7 @@ from pandas.core.dtypes.generic import ABCIndexClass from pandas.core.arrays import ( - to_integer_array, IntegerArray) + integer_array, IntegerArray) from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype) @@ -31,12 +31,12 @@ def dtype(request): @pytest.fixture def data(dtype): - return IntegerArray(make_data(), dtype=dtype) + return integer_array(make_data(), dtype=dtype) @pytest.fixture def data_missing(dtype): - return IntegerArray([np.nan, 1], dtype=dtype) + return integer_array([np.nan, 1], dtype=dtype) @pytest.fixture @@ -49,12 +49,12 @@ def gen(count): @pytest.fixture def data_for_sorting(dtype): - return IntegerArray([1, 2, 0], dtype=dtype) + return integer_array([1, 2, 0], dtype=dtype) @pytest.fixture def data_missing_for_sorting(dtype): - return IntegerArray([1, np.nan, 0], dtype=dtype) + return integer_array([1, np.nan, 0], dtype=dtype) @pytest.fixture @@ -74,7 +74,7 @@ def data_for_grouping(dtype): a = 0 c = 2 na = np.nan - return IntegerArray([b, b, na, na, a, a, b, c], dtype=dtype) + return integer_array([b, b, na, na, a, a, b, c], dtype=dtype) def test_dtypes(dtype): @@ -494,8 +494,7 @@ def test_construct_index(self, all_data, dropna): else: other = all_data - result = pd.Index(IntegerArray(other, - dtype=all_data.dtype)) + result = pd.Index(integer_array(other, dtype=all_data.dtype)) expected = pd.Index(other, dtype=object) self.assert_index_equal(result, expected) @@ -584,14 +583,14 @@ def test_construct_cast_invalid(self, dtype): msg = "cannot safely" arr = [1.2, 2.3, 3.7] with tm.assert_raises_regex(TypeError, msg): - IntegerArray(arr, dtype=dtype) + integer_array(arr, dtype=dtype) with tm.assert_raises_regex(TypeError, msg): pd.Series(arr).astype(dtype) arr = [1.2, 2.3, 3.7, np.nan] with tm.assert_raises_regex(TypeError, msg): - IntegerArray(arr, dtype=dtype) + integer_array(arr, dtype=dtype) with tm.assert_raises_regex(TypeError, msg): pd.Series(arr).astype(dtype) @@ -658,7 +657,7 @@ def test_conversions(data_missing): def test_to_integer_array_error(values): # error in converting existing arrays to IntegerArrays with pytest.raises(TypeError): - to_integer_array(values) + integer_array(values) @pytest.mark.parametrize( @@ -669,8 +668,8 @@ def test_to_integer_array_error(values): (np.array([1, np.nan]), 'int8', Int8Dtype)]) def test_to_integer_array(values, to_dtype, result_dtype): # convert existing arrays to IntegerArrays - result = to_integer_array(values, dtype=to_dtype) - expected = IntegerArray(values, dtype=result_dtype()) + result = integer_array(values, dtype=to_dtype) + expected = integer_array(values, dtype=result_dtype()) tm.assert_extension_array_equal(result, expected) From c5de1f27132a57f3c2ded170d18d7458aa0ad726 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 29 Jul 2018 00:44:01 +0200 Subject: [PATCH 2/6] some docs and tests --- pandas/core/arrays/integer.py | 45 +++++++++++-------- .../tests/extension/integer/test_integer.py | 45 +++++++++++++++++++ 2 files changed, 72 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 3c45b9bf6a6cb..80ba652252bd3 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -11,6 +11,7 @@ from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass from pandas.core.dtypes.common import ( is_integer, is_scalar, is_float, + is_bool_dtype, is_float_dtype, is_integer_dtype, is_object_dtype, @@ -85,6 +86,7 @@ def integer_array(values, dtype=None, copy=False): values : 1D list-like dtype : dtype, optional dtype to coerce + copy : boolean, default False Returns ------- @@ -198,9 +200,26 @@ def coerce_to_array(values, dtype, mask=None, copy=False): class IntegerArray(ExtensionArray, ExtensionOpsMixin): """ - We represent an IntegerArray with 2 numpy arrays + Array of integer (optional missing) values. + + We represent an IntegerArray with 2 numpy arrays: + - data: contains a numpy integer array of the appropriate dtype - - mask: a boolean array holding a mask on the data, False is missing + - mask: a boolean array holding a mask on the data, True is missing + + To construct an IntegerArray from generic array-like input, use + ``integer_array`` function instead. + + Parameters + ---------- + values : integer 1D numpy array + mask : boolean 1D numpy array + copy : bool, default False + + Returns + ------- + IntegerArray + """ @cache_readonly @@ -208,23 +227,13 @@ def dtype(self): return _dtypes[str(self._data.dtype)] def __init__(self, values, mask, copy=False): - """ - Parameters - ---------- - values : 1D list-like / IntegerArray - mask : 1D list-like, optional - dtype : subclass of _IntegerDtype, optional - copy : bool, default False - - Returns - ------- - IntegerArray - """ if not (isinstance(values, np.ndarray) - and np.issubdtype(values.dtype, np.integer)): - raise TypeError("values should be integer numpy array") - if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): - raise TypeError("mask should be boolean numpy array") + and is_integer_dtype(values.dtype)): + raise TypeError("values should be integer numpy array. Use " + "the 'integer_array' function instead") + if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)): + raise TypeError("mask should be boolean numpy array. Use " + "the 'integer_array' function instead") if copy: values = values.copy() diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index 3d19978223c43..17e558ebd1dcc 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -645,10 +645,45 @@ def test_conversions(data_missing): assert type(r) == type(e) +def test_integer_array_constructor(): + values = np.array([1, 2, 3, 4], dtype='int64') + mask = np.array([False, False, False, True], dtype='bool') + + result = IntegerArray(values, mask) + expected = integer_array([1, 2, 3, np.nan], dtype='int64') + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError): + IntegerArray(values.tolist(), mask) + + with pytest.raises(TypeError): + IntegerArray(values, mask.tolist()) + + with pytest.raises(TypeError): + IntegerArray(values.astype(float), mask) + + with pytest.raises(TypeError): + IntegerArray(values) + + +def test_integer_array_constructor_copy(): + values = np.array([1, 2, 3, 4], dtype='int64') + mask = np.array([False, False, False, True], dtype='bool') + + result = IntegerArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = IntegerArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + @pytest.mark.parametrize( 'values', [ ['foo', 'bar'], + ['1', '2'], 'foo', 1, 1.0, @@ -660,6 +695,15 @@ def test_to_integer_array_error(values): integer_array(values) +def test_to_integer_array_float(): + result = integer_array([1., 2.]) + expected = integer_array([1, 2]) + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError, match="cannot safely cast non-equivalent"): + integer_array([1.5, 2.]) + + @pytest.mark.parametrize( 'values, to_dtype, result_dtype', [ @@ -669,6 +713,7 @@ def test_to_integer_array_error(values): def test_to_integer_array(values, to_dtype, result_dtype): # convert existing arrays to IntegerArrays result = integer_array(values, dtype=to_dtype) + assert result.dtype == result_dtype() expected = integer_array(values, dtype=result_dtype()) tm.assert_extension_array_equal(result, expected) From 5e8c1a4370b3bf8b6e350f61644dd46c4640c72b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 29 Jul 2018 12:02:41 +0200 Subject: [PATCH 3/6] fix test for 32 bit --- pandas/tests/extension/integer/test_integer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index 17e558ebd1dcc..1717f009a188e 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -697,7 +697,7 @@ def test_to_integer_array_error(values): def test_to_integer_array_float(): result = integer_array([1., 2.]) - expected = integer_array([1, 2]) + expected = integer_array(np.array([1, 2], dtype='int64')) tm.assert_extension_array_equal(result, expected) with pytest.raises(TypeError, match="cannot safely cast non-equivalent"): From 90a9c131109b1b9e5eea05ba8bbafdd0ee5190c6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 29 Jul 2018 12:34:10 +0200 Subject: [PATCH 4/6] add dtype tests --- .../tests/extension/integer/test_integer.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index 1717f009a188e..7e8cdd87c467b 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -695,6 +695,27 @@ def test_to_integer_array_error(values): integer_array(values) +def test_to_integer_array_inferred_dtype(): + # if values has dtype -> respect it + result = integer_array(np.array([1, 2], dtype='int8')) + assert result.dtype == Int8Dtype() + result = integer_array(np.array([1, 2], dtype='int32')) + assert result.dtype == Int32Dtype() + + # if values have no dtype -> always int64 + result = integer_array([1, 2]) + assert result.dtype == Int64Dtype() + + +def test_to_integer_array_dtype_keyword(): + result = integer_array([1, 2], dtype='int8') + assert result.dtype == Int8Dtype() + + # if values has dtype -> override it + result = integer_array(np.array([1, 2], dtype='int8'), dtype='int32') + assert result.dtype == Int32Dtype() + + def test_to_integer_array_float(): result = integer_array([1., 2.]) expected = integer_array(np.array([1, 2], dtype='int64')) From 2fc70e1f35b566c7f0786ceb42e7a264d833d105 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 29 Jul 2018 14:15:26 +0200 Subject: [PATCH 5/6] ensure output is always int64 for all platforms if values have no int dtype --- pandas/core/arrays/integer.py | 10 ++++++---- pandas/tests/extension/integer/test_integer.py | 6 +++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 80ba652252bd3..48a5db7793921 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -136,6 +136,11 @@ def coerce_to_array(values, dtype, mask=None, copy=False): ------- tuple of (values, mask) """ + # if values is integer numpy array, preserve it's dtype + if dtype is None and hasattr(values, 'dtype'): + if is_integer_dtype(values.dtype): + dtype = values.dtype + if dtype is not None: if not issubclass(type(dtype), _IntegerDtype): try: @@ -177,10 +182,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False): # infer dtype if needed if dtype is None: - if is_integer_dtype(values): - dtype = values.dtype - else: - dtype = np.dtype('int64') + dtype = np.dtype('int64') else: dtype = dtype.type diff --git a/pandas/tests/extension/integer/test_integer.py b/pandas/tests/extension/integer/test_integer.py index 7e8cdd87c467b..1298958921b2b 100644 --- a/pandas/tests/extension/integer/test_integer.py +++ b/pandas/tests/extension/integer/test_integer.py @@ -718,12 +718,16 @@ def test_to_integer_array_dtype_keyword(): def test_to_integer_array_float(): result = integer_array([1., 2.]) - expected = integer_array(np.array([1, 2], dtype='int64')) + expected = integer_array([1, 2]) tm.assert_extension_array_equal(result, expected) with pytest.raises(TypeError, match="cannot safely cast non-equivalent"): integer_array([1.5, 2.]) + # for float dtypes, the itemsize is not preserved + result = integer_array(np.array([1., 2.], dtype='float32')) + assert result.dtype == Int64Dtype() + @pytest.mark.parametrize( 'values, to_dtype, result_dtype', From 5948ae5d3a89a5e8418210d00de9fe29c0faa0ea Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 14 Aug 2018 17:58:55 +0200 Subject: [PATCH 6/6] specify dtype in _from_sequence where needed --- pandas/tests/extension/base/getitem.py | 2 +- pandas/tests/extension/base/missing.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 886a0f66b5f66..22b21102fa4ae 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -213,7 +213,7 @@ def test_take_series(self, data): s = pd.Series(data) result = s.take([0, -1]) expected = pd.Series( - data._from_sequence([data[0], data[len(data) - 1]]), + data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype), index=[0, len(data) - 1]) self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 6e18c9ff72267..2fe547e50a34b 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -77,8 +77,8 @@ def test_fillna_series(self, data_missing): ser = pd.Series(data_missing) result = ser.fillna(fill_value) - expected = pd.Series( - data_missing._from_sequence([fill_value, fill_value])) + expected = pd.Series(data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype)) self.assert_series_equal(result, expected) # Fill with a series @@ -97,8 +97,8 @@ def test_fillna_series_method(self, data_missing, method): data_missing = data_missing[::-1] result = pd.Series(data_missing).fillna(method=method) - expected = pd.Series( - data_missing._from_sequence([fill_value, fill_value])) + expected = pd.Series(data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype)) self.assert_series_equal(result, expected) @@ -111,7 +111,8 @@ def test_fillna_frame(self, data_missing): }).fillna(fill_value) expected = pd.DataFrame({ - "A": data_missing._from_sequence([fill_value, fill_value]), + "A": data_missing._from_sequence([fill_value, fill_value], + dtype=data_missing.dtype), "B": [1, 2], })